diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0ef3739dd..2a1bf5a0f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -114,7 +114,7 @@ test_py39_tf2_intel-1: PYTHON: "3.9" TEST_PACKAGE: coremltools.converters.mil.frontend.tensorflow WHEEL_PATH: build/dist/*cp39*10_15* - REQUIREMENTS: reqs/test.pip + REQUIREMENTS: reqs/test_tf2.pip test_py39_tf2_intel-2: <<: *test_macos_pkg @@ -126,7 +126,7 @@ test_py39_tf2_intel-2: PYTHON: "3.9" TEST_PACKAGE: coremltools.converters.mil.frontend.tensorflow2 WHEEL_PATH: build/dist/*cp39*10_15* - REQUIREMENTS: reqs/test.pip + REQUIREMENTS: reqs/test_tf2.pip test_py39_mil_intel: <<: *test_macos_pkg @@ -174,7 +174,7 @@ test_py39_milproto_intel: WHEEL_PATH: build/dist/*cp39*10_15* TEST_PACKAGE: coremltools.converters.mil.frontend.milproto PYTHON: "3.9" - REQUIREMENTS: reqs/test.pip + REQUIREMENTS: reqs/test_tf2.pip @@ -212,7 +212,7 @@ test_py310_tf2-1: PYTHON: "3.10" TEST_PACKAGE: coremltools.converters.mil.frontend.tensorflow WHEEL_PATH: build/dist/*cp310*11* - REQUIREMENTS: reqs/test.pip + REQUIREMENTS: reqs/test_tf2.pip test_py310_tf2-2: <<: *test_macos_pkg @@ -224,7 +224,7 @@ test_py310_tf2-2: PYTHON: "3.10" TEST_PACKAGE: coremltools.converters.mil.frontend.tensorflow2 WHEEL_PATH: build/dist/*cp310*11* - REQUIREMENTS: reqs/test.pip + REQUIREMENTS: reqs/test_tf2.pip test_py310_mil: <<: *test_macos_pkg @@ -272,7 +272,7 @@ test_py310_milproto: PYTHON: "3.10" TEST_PACKAGE: coremltools.converters.mil.frontend.milproto WHEEL_PATH: build/dist/*cp310*11* - REQUIREMENTS: reqs/test.pip + REQUIREMENTS: reqs/test_tf2.pip diff --git a/coremlpython/CoreMLPython.h b/coremlpython/CoreMLPython.h index 6dbfd5cf1..320ef6f6e 100644 --- a/coremlpython/CoreMLPython.h +++ b/coremlpython/CoreMLPython.h @@ -3,6 +3,8 @@ // Use of this source code is governed by a BSD-3-clause license that can be // found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +// Disable a few warnings and include pybind first, then re-enable warnings #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wexit-time-destructors" #pragma clang diagnostic ignored "-Wdocumentation" @@ -28,11 +30,13 @@ namespace CoreML { static py::bytes autoSetSpecificationVersion(const py::bytes& modelBytes); static py::str compileModel(const std::string& urlStr); static int32_t maximumSupportedSpecificationVersion(); + static void setComputeUnit(MLModelConfiguration *configuration, const std::string& computeUnits); Model(const Model&) = delete; Model& operator=(const Model&) = delete; ~Model(); explicit Model(const std::string& urlStr, const std::string& computeUnits); + explicit Model(MLModel* m_model, NSURL* compiledUrl, bool deleteCompiledModelOnExit); py::dict predict(const py::dict& input) const; py::list batchPredict(const py::list& batch) const; diff --git a/coremlpython/CoreMLPython.mm b/coremlpython/CoreMLPython.mm index e01374d34..7f65f3af1 100644 --- a/coremlpython/CoreMLPython.mm +++ b/coremlpython/CoreMLPython.mm @@ -33,18 +33,6 @@ bool usingMacOS13OrHigher() { return (NSProtocolFromString(@"MLProgram") != nil); } -bool isCompiledModelPath(const std::string& path) { - const std::string fileExtension = ".mlmodelc"; - - size_t start = path.length() - fileExtension.length(); - if (path.back() == '/') { - start--; - } - const std::string match = path.substr(start, fileExtension.length()); - - return (match == fileExtension); -} - Model::~Model() { @autoreleasepool { NSFileManager *fileManager = [NSFileManager defaultManager]; @@ -58,7 +46,7 @@ bool 
isCompiledModelPath(const std::string& path) { @autoreleasepool { NSError *error = nil; - if (! isCompiledModelPath(urlStr)) { + if (! Utils::isCompiledModelPath(urlStr)) { // Compile the model NSURL *specUrl = Utils::stringToNSURL(urlStr); @@ -89,24 +77,8 @@ bool isCompiledModelPath(const std::string& path) { compiledUrl = Utils::stringToNSURL(urlStr); } - // Set compute unit MLModelConfiguration *configuration = [MLModelConfiguration new]; - if (computeUnits == "CPU_ONLY") { - configuration.computeUnits = MLComputeUnitsCPUOnly; - } else if (computeUnits == "CPU_AND_GPU") { - configuration.computeUnits = MLComputeUnitsCPUAndGPU; - } else if (computeUnits == "CPU_AND_NE") { - if (usingMacOS13OrHigher()) { -#if BUILT_WITH_MACOS13_SDK - configuration.computeUnits = MLComputeUnitsCPUAndNeuralEngine; -#endif // BUILT_WITH_MACOS13_SDK - } else { - throw std::runtime_error("CPU_AND_NE is only available on macOS >= 13.0"); - } - } else { - assert(computeUnits == "ALL"); - configuration.computeUnits = MLComputeUnitsAll; - } + setComputeUnit(configuration, computeUnits); // Create MLModel m_model = [MLModel modelWithContentsOfURL:compiledUrl configuration:configuration error:&error]; @@ -114,6 +86,14 @@ bool isCompiledModelPath(const std::string& path) { } } + +Model::Model(MLModel* mlModel, NSURL* compiledUrl, bool deleteCompiledModelOnExit) + : m_model(mlModel), + compiledUrl(compiledUrl), + m_deleteCompiledModelOnExit(deleteCompiledModelOnExit) +{ +} + py::dict Model::predict(const py::dict& input) const { @autoreleasepool { NSError *error = nil; @@ -127,6 +107,26 @@ bool isCompiledModelPath(const std::string& path) { } +void Model::setComputeUnit(MLModelConfiguration *configuration, const std::string& computeUnits) { + if (computeUnits == "CPU_ONLY") { + configuration.computeUnits = MLComputeUnitsCPUOnly; + } else if (computeUnits == "CPU_AND_GPU") { + configuration.computeUnits = MLComputeUnitsCPUAndGPU; + } else if (computeUnits == "CPU_AND_NE") { + if (usingMacOS13OrHigher()) { +#if BUILT_WITH_MACOS13_SDK + configuration.computeUnits = MLComputeUnitsCPUAndNeuralEngine; +#endif // BUILT_WITH_MACOS13_SDK + } else { + throw std::runtime_error("CPU_AND_NE is only available on macOS >= 13.0"); + } + } else { + assert(computeUnits == "ALL"); + configuration.computeUnits = MLComputeUnitsAll; + } +} + + py::list Model::batchPredict(const py::list& batch) const { @autoreleasepool { NSError* error = nil; @@ -156,6 +156,9 @@ bool isCompiledModelPath(const std::string& path) { py::str Model::getCompiledModelPath() const { + if (this->compiledUrl == nil) { + return nil; + } return [this->compiledUrl.path UTF8String]; } diff --git a/coremlpython/CoreMLPythonUtils.h b/coremlpython/CoreMLPythonUtils.h index a970746df..dd326e6f7 100644 --- a/coremlpython/CoreMLPythonUtils.h +++ b/coremlpython/CoreMLPythonUtils.h @@ -23,6 +23,7 @@ namespace CoreML { namespace Python { namespace Utils { + bool isCompiledModelPath(const std::string& path); NSURL * stringToNSURL(const std::string& str); void handleError(NSError *error); diff --git a/coremlpython/CoreMLPythonUtils.mm b/coremlpython/CoreMLPythonUtils.mm index edfc2de1e..1186182af 100644 --- a/coremlpython/CoreMLPythonUtils.mm +++ b/coremlpython/CoreMLPythonUtils.mm @@ -29,6 +29,18 @@ using namespace CoreML::Python; +bool Utils::isCompiledModelPath(const std::string& path) { + const std::string fileExtension = ".mlmodelc"; + + size_t start = path.length() - fileExtension.length(); + if (path.back() == '/') { + start--; + } + const std::string match = 
path.substr(start, fileExtension.length()); + + return (match == fileExtension); +} + NSURL * Utils::stringToNSURL(const std::string& str) { NSString *nsstr = [NSString stringWithUTF8String:str.c_str()]; return [NSURL fileURLWithPath:nsstr]; diff --git a/coremltools/__init__.py b/coremltools/__init__.py index 84821cea1..30130e1ba 100644 --- a/coremltools/__init__.py +++ b/coremltools/__init__.py @@ -21,6 +21,7 @@ For more information: http://developer.apple.com/documentation/coreml """ + from enum import Enum as _Enum from logging import getLogger as _getLogger @@ -90,15 +91,14 @@ class ComputeUnit(_Enum): # expose sub packages as directories from . import converters, models, optimize, proto - # expose unified converter in coremltools package level from .converters import ClassifierConfig from .converters import ColorLayout as colorlayout from .converters import EnumeratedShapes, ImageType, RangeDim, Shape, TensorType, convert from .converters.mil._deployment_compatibility import AvailableTarget as target from .converters.mil.mil.passes.defs import quantization as transform -from .converters.mil.mil.passes.pass_pipeline import PassPipeline from .converters.mil.mil.passes.defs.quantization import ComputePrecision as precision +from .converters.mil.mil.passes.pass_pipeline import PassPipeline from .models import utils from .models.ml_program import compression_utils diff --git a/coremltools/_deps/__init__.py b/coremltools/_deps/__init__.py index 7afd4d05f..bc98849fc 100644 --- a/coremltools/_deps/__init__.py +++ b/coremltools/_deps/__init__.py @@ -154,7 +154,7 @@ def __get_sklearn_version(version): # --------------------------------------------------------------------------------------- _HAS_TORCH = True -_TORCH_MAX_VERSION = "2.1.0" +_TORCH_MAX_VERSION = "2.2.0" _HAS_TORCH_EXPORT_API = False try: import torch diff --git a/coremltools/converters/mil/backend/mil/helper.py b/coremltools/converters/mil/backend/mil/helper.py index 9a88b4fc9..d6c7cd66a 100644 --- a/coremltools/converters/mil/backend/mil/helper.py +++ b/coremltools/converters/mil/backend/mil/helper.py @@ -3,21 +3,11 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -import os import numpy as np -import coremltools.proto.FeatureTypes_pb2 as ft -import coremltools.proto.MIL_pb2 as pm +from coremltools import proto from coremltools.converters.mil.mil import types -from coremltools.converters.mil.mil.types import ( - BUILTIN_TO_PROTO_TYPES, - builtin_to_string, - numpy_type_to_builtin_type, - type_to_builtin_type, -) -from coremltools.converters.mil.mil.types.type_mapping import np_val_to_py_type -from coremltools.models.utils import _WEIGHTS_DIR_NAME, _WEIGHTS_FILE_NAME # For immediate values, those types are stored in bytes (MIL parser reads those types from bytes). IMMEDIATE_VALUE_TYPES_IN_BYTES = (types.fp16, types.int8, types.uint8, types.uint32) @@ -25,9 +15,9 @@ def create_valuetype_scalar(data_type): """ - Return pm.ValueType with DataType set + Return proto.MIL_pb2.ValueType with DataType set """ - v_type = pm.ValueType() + v_type = proto.MIL_pb2.ValueType() update_tensortype(v_type.tensorType, (), data_type) return v_type @@ -45,18 +35,18 @@ def update_listtype(l_type, length, elem_shape, dtype): def create_valuetype_list(length, elem_shape, dtype): """ - Return pm.ValueType with List (ListType) set. + Return proto.MIL_pb2.ValueType with List (ListType) set. 
length: length of list (int) """ - v_type = pm.ValueType() + v_type = proto.MIL_pb2.ValueType() update_listtype(v_type.listType, length, elem_shape, dtype) return v_type def create_valuetype_dict(key_type, value_type): """ - Return pm.ValueType with dict (dictionaryType) set + Return proto.MIL_pb2.ValueType with dict (dictionaryType) set """ - v_type = pm.ValueType() + v_type = proto.MIL_pb2.ValueType() v_type.dictionaryType.keyType.CopyFrom(types_to_proto(key_type)) v_type.dictionaryType.valueType.CopyFrom(types_to_proto(value_type)) return v_type @@ -64,10 +54,10 @@ def create_valuetype_dict(key_type, value_type): def create_valuetype_tensor(shape, data_type): """ - Return pm.ValueType with tensor (TensorType) set. + Return proto.MIL_pb2.ValueType with tensor (TensorType) set. shape: list of ints """ - v_type = pm.ValueType() + v_type = proto.MIL_pb2.ValueType() update_tensortype(v_type.tensorType, shape, data_type) return v_type @@ -123,7 +113,10 @@ def _tensor_field_by_type(tensor_val, builtin_type): return tensor_val.bytes.values else: raise TypeError( - "Unsupported float dtype for MIL proto serialization: {}".format(builtin_to_string(builtin_type))) + "Unsupported float dtype for MIL proto serialization: {}".format( + types.builtin_to_string(builtin_type) + ) + ) elif builtin_type == types.str: return tensor_val.strings.values else: @@ -147,7 +140,11 @@ def _set_empty_tensor_field_by_type(tensor_val, builtin_type): elif (builtin_type == types.fp16): tensor_val.bytes.SetInParent() else: - raise TypeError("Unsupported float dtype for MIL proto serialization: {}".format(builtin_to_string(builtin_type))) + raise TypeError( + "Unsupported float dtype for MIL proto serialization: {}".format( + types.builtin_to_string(builtin_type) + ) + ) elif builtin_type == types.str: tensor_val.strings.SetInParent() else: @@ -157,10 +154,10 @@ def create_tensor_value(np_tensor): """ Return TensorValue. 
""" - builtin_type = numpy_type_to_builtin_type(np_tensor.dtype) + builtin_type = types.numpy_type_to_builtin_type(np_tensor.dtype) value_type = create_valuetype_tensor(np_tensor.shape, types_to_proto_primitive(builtin_type)) - val = pm.Value(type=value_type) + val = proto.MIL_pb2.Value(type=value_type) t_val = val.immediateValue.tensor # Copy the tensor values from the input tensor @@ -171,10 +168,10 @@ def create_tensor_value(np_tensor): for x in np.nditer(np_tensor): t_field.append(x.encode("utf-8")) elif builtin_type in IMMEDIATE_VALUE_TYPES_IN_BYTES: - val.immediateValue.tensor.bytes.values = np_val_to_py_type(np_tensor) + val.immediateValue.tensor.bytes.values = types.type_mapping.np_val_to_py_type(np_tensor) else: for x in np_tensor.flatten(): - t_field.append(np_val_to_py_type(x)) + t_field.append(types.type_mapping.np_val_to_py_type(x)) else: # This is an "empty" tensor (tensor with a dimension being size 0) _set_empty_tensor_field_by_type(t_val, builtin_type) return val @@ -185,20 +182,20 @@ def create_scalar_value(py_scalar): Return TensorValue (since there's no ScalarValue) """ # Create the "scalar" (rank 0) tensor - builtin_type = type_to_builtin_type(type(py_scalar)) + builtin_type = types.type_to_builtin_type(type(py_scalar)) value_type = create_valuetype_scalar(types_to_proto_primitive(builtin_type)) - val = pm.Value(type=value_type) + val = proto.MIL_pb2.Value(type=value_type) t_val = val.immediateValue.tensor # Set the tensor value t_field = _tensor_field_by_type(t_val, builtin_type) if builtin_type in IMMEDIATE_VALUE_TYPES_IN_BYTES: # Serialize to bytes because MIL read them from the "bytes" field in TensorValue. - val.immediateValue.tensor.bytes.values = np_val_to_py_type(py_scalar) + val.immediateValue.tensor.bytes.values = types.type_mapping.np_val_to_py_type(py_scalar) else: if builtin_type == types.str: py_scalar = py_scalar.encode("utf-8") - t_field.append(np_val_to_py_type(py_scalar)) + t_field.append(types.type_mapping.np_val_to_py_type(py_scalar)) return val @@ -207,7 +204,7 @@ def create_tuple_value(py_tuple): """ Return type of Tuple """ - tp_val = pm.TupleValue() + tp_val = proto.MIL_pb2.TupleValue() for t in py_tuple: item_val = tp_val.values.add() item_type = item_val.type # ValueType @@ -227,11 +224,11 @@ def create_list_scalarvalue(py_list, np_type): """ Return a Value of type List, which holds scalar values """ - builtin_type = numpy_type_to_builtin_type(np_type) + builtin_type = types.numpy_type_to_builtin_type(np_type) value_type = create_valuetype_list(length=len(py_list), elem_shape=(), dtype=types_to_proto_primitive(builtin_type)) - val = pm.Value(type=value_type) + val = proto.MIL_pb2.Value(type=value_type) list_val = val.immediateValue.list for v in py_list: @@ -244,15 +241,15 @@ def create_file_value_tensor(file_name, offset, dim, data_type): """ Create a Value Type to store File Value """ - val = pm.Value( - blobFileValue=pm.Value.BlobFileValue(fileName=file_name, offset=offset), + val = proto.MIL_pb2.Value( + blobFileValue=proto.MIL_pb2.Value.BlobFileValue(fileName=file_name, offset=offset), type=create_valuetype_tensor(dim, data_type), ) return val def types_to_proto_primitive(valuetype): - if valuetype not in BUILTIN_TO_PROTO_TYPES: + if valuetype not in types.BUILTIN_TO_PROTO_TYPES: additional_error_msg = "" if valuetype in (types.complex64, types.complex128): additional_error_msg = ( @@ -262,7 +259,7 @@ def types_to_proto_primitive(valuetype): raise ValueError( f"Unknown map from SSA type {valuetype} to Proto type. 
{additional_error_msg}" ) - return BUILTIN_TO_PROTO_TYPES[valuetype] + return types.BUILTIN_TO_PROTO_TYPES[valuetype] def types_to_proto(valuetype): @@ -270,7 +267,7 @@ def types_to_proto(valuetype): primitive = types_to_proto_primitive(valuetype.get_primitive()) return create_valuetype_tensor(valuetype.get_shape(), primitive) elif types.is_tuple(valuetype): - v_type = pm.ValueType() + v_type = proto.MIL_pb2.ValueType() t_type = v_type.tupleType for t in valuetype.T: new_v_type = t_type.types.add() @@ -321,17 +318,6 @@ def _get_offset_by_writing_data(output_var, blob_writer): return offset - -def create_file_value(output_var, blob_writer): - offset = _get_offset_by_writing_data(output_var, blob_writer) - - return create_file_value_tensor( - file_name=os.path.join(os.path.join('@model_path', _WEIGHTS_DIR_NAME), _WEIGHTS_FILE_NAME), - offset=offset, - dim=output_var.val.shape, - data_type=types_to_proto_primitive(output_var.sym_type.get_primitive()), - ) - def create_immediate_value(var): if types.is_tensor(var.sym_type): return create_tensor_value(var.val) @@ -347,13 +333,20 @@ def create_immediate_value(var): def cast_to_framework_io_dtype(var, is_output): if var.dtype == types.fp32: - return ft.ArrayFeatureType.ArrayDataType.FLOAT32 + return proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.FLOAT32 elif var.dtype == types.int32: - return ft.ArrayFeatureType.ArrayDataType.INT32 + return proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.INT32 elif var.dtype == types.fp16: - return ft.ArrayFeatureType.ArrayDataType.FLOAT16 + return proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.FLOAT16 else: ioname = "Output " if is_output else "Input " ioname2 = "outputs" if is_output else "inputs" - raise NotImplementedError(ioname + var.name + " has data type " + builtin_to_string(var.dtype) + \ - ". ML Program models only support fp32 and int32 " + ioname2 + ".") + raise NotImplementedError( + ioname + + var.name + + " has data type " + + types.builtin_to_string(var.dtype) + + ". ML Program models only support fp32 and int32 " + + ioname2 + + "." 
+ ) diff --git a/coremltools/converters/mil/backend/mil/load.py b/coremltools/converters/mil/backend/mil/load.py index 4d070c708..b57b590df 100644 --- a/coremltools/converters/mil/backend/mil/load.py +++ b/coremltools/converters/mil/backend/mil/load.py @@ -5,39 +5,46 @@ import os import warnings -from typing import Optional +from collections import OrderedDict +from typing import Any, Dict, List, Optional import numpy as np -from coremltools import _OPSET, _SPECIFICATION_VERSION_IOS_15 +from coremltools import _OPSET, _SPECIFICATION_VERSION_IOS_15, _SPECIFICATION_VERSION_IOS_17 from coremltools import _logger as logger +from coremltools import proto +from coremltools.converters.mil import mil from coremltools.converters.mil.backend.backend_helper import _get_probability_var_for_classifier +from coremltools.converters.mil.backend.mil import helper from coremltools.converters.mil.backend.mil.helper import ( cast_to_framework_io_dtype, - create_file_value, + create_file_value_tensor, create_immediate_value, create_list_scalarvalue, create_scalar_value, types_to_proto, + types_to_proto_primitive, ) from coremltools.converters.mil.backend.nn.load import _set_optional_inputs -from coremltools.converters.mil.input_types import EnumeratedShapes, ImageType, RangeDim, TensorType +from coremltools.converters.mil.input_types import ( + ClassifierConfig, + EnumeratedShapes, + ImageType, + RangeDim, + TensorType, +) +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, Program, mil_list, types +from coremltools.converters.mil.mil import Function, Operation, Program, Var, mil_list, types from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource from coremltools.converters.mil.mil.types.symbolic import any_symbolic, any_variadic, is_symbolic +from coremltools.models.neural_network import flexible_shape_utils from coremltools.models.neural_network.flexible_shape_utils import ( NeuralNetworkImageSize, NeuralNetworkImageSizeRange, - add_enumerated_image_sizes, - add_multiarray_ndshape_enumeration, - set_multiarray_ndshape_range, - update_image_size_range, ) -from coremltools.models.utils import _WEIGHTS_FILE_NAME -from coremltools.proto import FeatureTypes_pb2 as ft -from coremltools.proto import MIL_pb2 as pm -from coremltools.proto import Model_pb2 as ml +from coremltools.models.utils import _WEIGHTS_DIR_NAME, _WEIGHTS_FILE_NAME from ..backend_helper import _get_colorspace_enum, _validate_image_input_output_shapes @@ -56,174 +63,304 @@ def should_use_weight_file(val): and val.dtype in ['float16', 'float32', 'uint8', 'int8'] ) +class MILProtoExporter: + """ + An utility class to export a pymil program to milproto. 
+ """ -def translate_const(op, blob_writer): - output_var = op.outputs[0] - - if should_use_weight_file(output_var.val): - value = create_file_value(output_var, blob_writer) - else: - value = create_immediate_value(output_var) - - return pm.Operation( - type="const", - attributes={"name": create_scalar_value(op.name), "val": value}, - outputs=[ - pm.NamedValueType( - name=output_var.name, type=types_to_proto(output_var.sym_type) - ) - ], - ) + def __init__( + self, + prog: Program, + weights_dir: str, + ): + self.prog = prog + self.weights_dir = weights_dir + self.blob_writers = {} + self.prog.validate(check_essential_scope=True) + + def translate_program_attributes(self) -> Dict[str, Any]: + """ + Get the program attributes which need to be exported to mil proto. + """ + return {} + + def get_weight_path(self, op: Operation) -> str: + """ + Get the weight path for a constant operation. + By default, the weight is saved in {weight_dir}/weight.bin + """ + assert ( + op.op_type == "const" + ), f"Expected op (op.name) be a const op. Got op_type of {op.op_type}." + return os.path.join(self.weights_dir, _WEIGHTS_FILE_NAME) + + def get_blob_writer(self, weight_path: str) -> BlobWriter: + """ + Get a blob writer given a weight_path. + """ + if weight_path not in self.blob_writers: + self.blob_writers[weight_path] = BlobWriter(weight_path) + return self.blob_writers[weight_path] + + def create_file_value(self, var: Var) -> proto.MIL_pb2.Value: + """ + Returns the mil proto file value of a var. + """ + weight_path = self.get_weight_path(var.op) + blob_writer = self.get_blob_writer(weight_path) + offset = helper._get_offset_by_writing_data(var, blob_writer) + weight_file_name = os.path.basename(weight_path) + + return create_file_value_tensor( + file_name=os.path.join( + os.path.join("@model_path", _WEIGHTS_DIR_NAME), weight_file_name + ), + offset=offset, + dim=var.val.shape, + data_type=types_to_proto_primitive(var.sym_type.get_primitive()), + ) + def get_milproto_value(self, var: Var) -> proto.MIL_pb2.Value: + """ + Translate a pymil Var into milproto value. + """ + if should_use_weight_file(var.val): + return self.create_file_value(var) + else: + return create_immediate_value(var) + + @staticmethod + def _get_input_dict(op: Operation) -> Dict[str, Any]: + """ + Given an op, returns a dict that maps the param name into the corresponding Var. + """ + return op.inputs + + @staticmethod + def _get_attr_dict(op: Operation) -> Dict[str, Any]: + """ + Return the initial attribute dict for an op. + """ + return {"name": create_scalar_value(op.name)} + + def translate_const(self, op: Operation) -> proto.MIL_pb2.Operation: + """ + Translate constant operation. + """ + if len(op.outputs) != 1: + raise AssertionError(f"const {op.name} must have 1 output, but got {len(op.outputs)}") + + output_var = op.outputs[0] + value = self.get_milproto_value(output_var) + + return proto.MIL_pb2.Operation( + type="const", + attributes={"name": create_scalar_value(op.name), "val": value}, + outputs=[ + proto.MIL_pb2.NamedValueType( + name=output_var.name, type=types_to_proto(output_var.sym_type) + ) + ], + ) -def translate_constexpr(op, blob_writer): + def translate_constexpr(self, op: Operation) -> proto.MIL_pb2.Operation: + """ + Translate constexpr operation. 
+ """ + inputs = {} + attributes = {"name": create_scalar_value(op.name)} - def get_value(var): - if should_use_weight_file(var.val): - value = create_file_value(var, blob_writer) + if op.opset_version <= _SPECIFICATION_VERSION_IOS_17: + attributes.update( + {param_name: self.get_milproto_value(var) for param_name, var in op.inputs.items()} + ) else: - value = create_immediate_value(var) + for param_name, var in op.inputs.items(): + if var.op.op_type.startswith("constexpr_"): + arguments = [proto.MIL_pb2.Argument.Binding(name=var.name)] + else: + arguments = [proto.MIL_pb2.Argument.Binding(value=self.get_milproto_value(var))] + args = proto.MIL_pb2.Argument() + args.arguments.extend(arguments) + inputs[param_name] = args + + return proto.MIL_pb2.Operation( + type=op.op_type, + inputs=inputs, + attributes=attributes, + outputs=[ + proto.MIL_pb2.NamedValueType( + name=output_var.name, type=types_to_proto(output_var.sym_type) + ) + for output_var in op.outputs + ], + ) - return value + def translate_generic_op( + self, op: Operation, literal_params: Optional[List[str]] = None + ) -> proto.MIL_pb2.Operation: + """ + Translate a generic pymil Operation. + """ + if literal_params is None: + literal_params = [] - output_var = op.outputs[0] + inputs = {} - attributes = {"name": create_scalar_value(op.name)} - attributes.update({k: get_value(v) for k, v in op.inputs.items()}) + for param_name, vars in self._get_input_dict(op).items(): + if param_name.startswith("_"): + continue + if not isinstance(vars, (list, tuple)): + vars = [vars] + + arguments = [] + for _var in vars: + binding = proto.MIL_pb2.Argument.Binding() + # use const value literals if requested + if param_name in literal_params: + binding.value.CopyFrom(create_immediate_value(_var)) + else: + binding.name = _var.name + arguments.append(binding) + + args = proto.MIL_pb2.Argument() + args.arguments.extend(arguments) + inputs[param_name] = args + + outputs = [ + proto.MIL_pb2.NamedValueType(name=v.name, type=types_to_proto(v.sym_type)) + for v in op.outputs + ] + blocks = None + if len(op.blocks) > 0: + blocks = [self.create_block(b) for b in op.blocks] + + op_type = op.op_type + attr_dict = self._get_attr_dict(op) + if op.op_type in SSAOpRegistry.custom_ops: + op_type = "custom_layer" + class_name = op.bindings.get("class_name", op.name) + input_order = op.bindings.get("input_order", []) + parameters = op.bindings.get("parameters", []) + weights = op.bindings.get("weights", []) + description = op.bindings.get("description", "") + + attr_dict["class_name"] = create_scalar_value(class_name) + attr_dict["input_order"] = create_list_scalarvalue(input_order, str) + attr_dict["parameters"] = create_list_scalarvalue(parameters, str) + attr_dict["weights"] = create_list_scalarvalue(weights, str) + attr_dict["description"] = create_scalar_value(description) + + return proto.MIL_pb2.Operation( + type=op_type, + blocks=blocks, + inputs=inputs, + attributes=attr_dict, + outputs=outputs, + ) - return pm.Operation( - type=op.op_type, - attributes=attributes, - outputs=[ - pm.NamedValueType( - name=output_var.name, type=types_to_proto(output_var.sym_type) + def create_block(self, block: Block) -> proto.MIL_pb2.Block: + """ + Translate pymil Block. 
+ """ + def feeds_to_only_constexprs(op: Operation) -> bool: + return ( + (op.op_type == "const") + and len(op.outputs[0].child_ops) > 0 + and all( + (child_op.op_type.startswith("constexpr_")) + for child_op in op.outputs[0].child_ops + ) ) - ], - ) + proto_ops = [] + + # Find the const op that generates classify's "label" / "class" string vec. + classify_const_classes_op = None + if len(block.operations) > 0: + # Classify is always the last operation in the block. + op = block.operations[-1] + op_cls_name = type(op).__name__ + if op_cls_name == "classify": + classes_var = op.inputs["classes"] + classify_const_classes_op = classes_var.op + if len(classes_var.child_ops) != 1: + raise ValueError( + "Classify's labels/classes should be input to only 1 op (classify)." + ) -def translate_generic_op(op, parameters, blob_writer, literal_params=[]): - inputs = {} - for param_name, vars in op.inputs.items(): - if param_name.startswith("_"): - continue - if not isinstance(vars, (list, tuple)): - vars = [vars] - - arguments = [] - for _var in vars: - binding = pm.Argument.Binding() - # use const value literals if requested - if param_name in literal_params: - binding.value.CopyFrom(create_immediate_value(_var)) + for op in block.operations: + op_cls_name = type(op).__name__ + if op_cls_name == "const": + if feeds_to_only_constexprs(op): + continue + # Do not serialize the const op that creates the var bound to the classifier's "classes" param. + # The variable's value will be bound directly to classify's "classes" param instead. + if op != classify_const_classes_op: + proto_ops.append(self.translate_const(op)) + elif op_cls_name.startswith("constexpr_"): + proto_ops.append(self.translate_constexpr(op)) + elif op_cls_name == "classify": + # Classify's "classes" param should be serialized as a value literal bound + # directly to the param, rather than as a const-generated variable. 
+ proto_ops.append(self.translate_generic_op(op, ["classes"])) + elif op_cls_name == "reshape_like": + # The reshape_like should also be able to take value from a const op + # This is a workaround solution + # rdar://98689808 (Reshape_like should also accept const value from non literal input) + literal_params = ["begins", "ends", "end_masks"] + proto_ops.append(self.translate_generic_op(op, literal_params)) else: - binding.name = _var.name - arguments.append(binding) - - args = pm.Argument() - args.arguments.extend(arguments) - inputs[param_name] = args - - outputs = [ - pm.NamedValueType(name=v.name, type=types_to_proto(v.sym_type)) - for v in op.outputs - ] - blocks = None - if len(op.blocks) > 0: - blocks = [create_block(b, parameters, blob_writer) for b in op.blocks] - - op_type = op.op_type - attr_dict = {} - if op.op_type in SSAOpRegistry.custom_ops: - op_type = "custom_layer" - class_name = op.bindings.get("class_name", op.name) - input_order = op.bindings.get("input_order", []) - parameters = op.bindings.get("parameters", []) - weights = op.bindings.get("weights", []) - description = op.bindings.get("description", "") - - attr_dict["name"] = create_scalar_value(op.name) - attr_dict["class_name"] = create_scalar_value(class_name) - attr_dict["input_order"] = create_list_scalarvalue(input_order, str) - attr_dict["parameters"] = create_list_scalarvalue(parameters, str) - attr_dict["weights"] = create_list_scalarvalue(weights, str) - attr_dict["description"] = create_scalar_value(description) - - attr_dict["name"] = create_scalar_value(op.name) - - return pm.Operation( - type=op_type, - blocks=blocks, - inputs=inputs, - attributes=attr_dict, - outputs=outputs, - ) + proto_ops.append(self.translate_generic_op(op)) + + inputs = [] + if not isinstance(block, Function): + # Function is subclass of Block, but function's block has no input, + # and hence skipping reading the block inputs. + for var in block.inputs: + proto_type = types_to_proto(var.sym_type) + inputs.append(proto.MIL_pb2.NamedValueType(name=var.name, type=proto_type)) + output_names = [v.name for v in block.outputs] + return proto.MIL_pb2.Block(inputs=inputs, outputs=output_names, operations=proto_ops) + + def convert_function(self, function: Function, opset: str) -> proto.MIL_pb2.Function: + """ + Translate pymil Function. + """ + block = self.create_block(function) + + inputs = [] + for name, var in function.inputs.items(): + proto_type = types_to_proto(var.sym_type) + inputs.append(proto.MIL_pb2.NamedValueType(name=name, type=proto_type)) -def create_block(block, parameters, blob_writer): - - def feeds_to_only_constexprs(op): - return (op.op_type == 'const') \ - and len(op.outputs[0].child_ops) > 0 \ - and all((child_op.op_type.startswith("constexpr_")) for child_op in op.outputs[0].child_ops) - - proto_ops = [] - - # Find the const op that generates classify's "label" / "class" string vec. - classify_const_classes_op = None - if len(block.operations) > 0: - # Classify is always the last operation in the block. 
- op = block.operations[-1] - op_cls_name = type(op).__name__ - if (op_cls_name == "classify"): - classes_var = op.inputs["classes"] - classify_const_classes_op = classes_var.op - if (len(classes_var.child_ops) != 1): - raise ValueError("Classify's labels/classes should be input to only 1 op (classify).") - - for op in block.operations: - op_cls_name = type(op).__name__ - if op_cls_name == "const": - if feeds_to_only_constexprs(op): - continue - # Do not serialize the const op that creates the var bound to the classifier's "classes" param. - # The variable's value will be bound directly to classify's "classes" param instead. - if op != classify_const_classes_op: - proto_ops.append(translate_const(op, blob_writer)) - elif op_cls_name.startswith("constexpr_"): - proto_ops.append(translate_constexpr(op, blob_writer)) - elif op_cls_name == "classify": - # Classify's "classes" param should be serialized as a value literal bound - # directly to the param, rather than as a const-generated variable. - proto_ops.append(translate_generic_op(op, parameters, blob_writer, ["classes"])) - elif op_cls_name == "reshape_like": - # The reshape_like should also be able to take value from a const op - # This is a workaround solution - # rdar://98689808 (Reshape_like should also accept const value from non literal input) - literal_params = ["begins", "ends", "end_masks"] - proto_ops.append(translate_generic_op(op, parameters, blob_writer, literal_params)) - else: - proto_ops.append(translate_generic_op(op, parameters, blob_writer)) + return proto.MIL_pb2.Function( + inputs=inputs, opset=opset, block_specializations={opset: block} + ) - inputs = [] - if not isinstance(block, Function): - # Function is subclass of Block, but function's block has no input, - # and hence skipping reading the block inputs. - for var in block.inputs: - proto_type = types_to_proto(var.sym_type) - inputs.append(pm.NamedValueType(name=var.name, type=proto_type)) - output_names = [v.name for v in block.outputs] - return pm.Block(inputs=inputs, outputs=output_names, operations=proto_ops) + def export( + self, specification_version: Optional[str] = _SPECIFICATION_VERSION_IOS_15 + ) -> proto.MIL_pb2.Program: + """ + Export a pymil program into mil proto with the given specification version. + """ + if BlobWriter is None: + raise RuntimeError("BlobWriter not loaded") + function_protos = {} + for func_name, func in self.prog.functions.items(): + function_protos[func_name] = self.convert_function(func, _OPSET[specification_version]) -def convert_function(function, parameters, blob_writer, opset): - block = create_block(function, parameters, blob_writer) + kwargs = { + "version": 1, + "functions": function_protos, + } - inputs = [] - for name, var in function.inputs.items(): - proto_type = types_to_proto(var.sym_type) - inputs.append(pm.NamedValueType(name=name, type=proto_type)) + prog_attributes = self.translate_program_attributes() + if len(prog_attributes) > 0: + kwargs["attributes"] = prog_attributes - return pm.Function(inputs=inputs, opset=opset, block_specializations={opset: block}) + return proto.MIL_pb2.Program(**kwargs) # Add a classify op to the output. 
# Replaces the original probabilities output (in the containing MIL block) @@ -237,6 +374,8 @@ def remove_output(block, prob_var): for i in range(len(block.outputs)): if block.outputs[i] is prob_var: block.outputs.pop(i) + if block in prob_var.consuming_blocks: + prob_var.consuming_blocks.remove(block) break block = prog.functions["main"] @@ -258,342 +397,461 @@ def remove_output(block, prob_var): raise ValueError(message) probability_var = _get_probability_var_for_classifier(prog, classifier_config) + original_probability_var = probability_var # add the classify op now - with block: - # cast the int label to np.int64 - if isinstance(classes[0], int): - classes = [np.int64(x) for x in classes] - classes_var = mb.const(val=mil_list(classes)) - if probability_var.dtype != types.fp32: + # we consider this step as a scope of coremltools graph pass + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["add_classify_op"])): + with block: + # cast the int label to np.int64 + if isinstance(classes[0], int): + classes = [np.int64(x) for x in classes] + classes_var = mb.const(val=mil_list(classes)) + if probability_var.dtype != types.fp32: + remove_output(block, probability_var) + probability_var = mb.cast( + x=probability_var, dtype="fp32", name=probability_var.name + "_cast_to_fp32" + ) + out = mb.classify(probabilities=probability_var, classes=classes_var) + + predicted_feature_name = ( + "classLabel" + if classifier_config.predicted_feature_name is None + else classifier_config.predicted_feature_name + ) + out[0].name = predicted_feature_name + out[1].name = predicted_feature_name + "_probs" + + # Remove probabilities from block outputs, replace with classify's outputs remove_output(block, probability_var) - probability_var = mb.cast(x=probability_var, dtype="fp32", name=probability_var.name + "_cast_to_fp32") - out = mb.classify(probabilities=probability_var, - classes=classes_var - ) + block.outputs[:0] = out + out[0].consuming_blocks.append(block) + out[1].consuming_blocks.append(block) - predicted_feature_name = "classLabel" if classifier_config.predicted_feature_name is None \ - else classifier_config.predicted_feature_name - out[0].name = predicted_feature_name - out[1].name = predicted_feature_name + "_probs" + # The new classifier op should have scope information + Block._copy_scope_info(original_probability_var, out[0]) - # Remove probabilities from block outputs, replace with classify's outputs - remove_output(block, probability_var) - block.outputs[:0] = out - return out[0].name, out[1].name + return out[0].name, out[1].name -def _pymil_to_milproto( - prog: Program, - weights_dir: str, - specification_version: Optional[int] = _SPECIFICATION_VERSION_IOS_15, -) -> pm.Program: +class CoreMLProtoExporter: """ - Convert a pymil program into mil proto. + An utility class to export a pymil program to coreml model. 
""" - if BlobWriter is None: - raise RuntimeError("BlobWriter not loaded") - - weight_path = os.path.join(weights_dir, _WEIGHTS_FILE_NAME) - blob_writer = BlobWriter(weight_path) - - opset = _OPSET[specification_version] - - function_protos = {} - for func_name, func in prog.functions.items(): - function_protos[func_name] = convert_function(func, prog.parameters, blob_writer, opset) - proto = pm.Program( - version=1, - functions=function_protos, - ) - return proto - - -def load( - prog: Program, - weights_dir: str, - resume_on_errors: Optional[bool] = False, - specification_version: Optional[int] = _SPECIFICATION_VERSION_IOS_15, - **kwargs, -): - if "main" not in prog.functions: - raise ValueError("main function not found in program") - - # if user has specified "ClassifierConfig", then add the "classify" op to the prog - classifier_config = kwargs.get("classifier_config", None) - predicted_feature_name = None - predicted_probabilities_name = None - if classifier_config is not None: - predicted_feature_name, predicted_probabilities_name = _add_classify_op( - prog, classifier_config - ) - - # convert pymil program into mil proto - proto = _pymil_to_milproto(prog, weights_dir, specification_version) - - input_types = prog.main_input_types - output_types = prog.main_output_types - - desc = kwargs.get("model_description", None) - if desc and not isinstance(desc, ml.ModelDescription): - raise ValueError("Invalid model descriptor") - - if desc: - if classifier_config is not None: - raise AssertionError("Both model_description and classifier_config can't be provided") - model = ml.Model(description=desc, specificationVersion=specification_version) - model.mlProgram.CopyFrom(proto) - return model + _DEFAULT_FUNCTION_NAME = "main" + + def __init__( + self, + prog: mil.Program, + mil_proto: proto.MIL_pb2.Program, + predicted_feature_name: str, + predicted_probabilities_name: str, + classifier_config: ClassifierConfig, + convert_to: str, + convert_from: str, + ): + self.prog = prog + self.mil_proto = mil_proto + self.predicted_feature_name = predicted_feature_name + self.predicted_probabilities_name = predicted_probabilities_name + self.classifier_config = classifier_config + self.convert_to = convert_to + self.convert_from = convert_from + self.prog.validate(check_essential_scope=True) + + @staticmethod + def get_additional_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]: + """ + Get additional coreml proto related kwargs. + """ + return {} + + def get_func_input(self, func: mil.Function) -> List[proto.Model_pb2.FeatureDescription]: + """ + Utils to get function input feature description. + """ + input_types = func.input_types + + input_features = [] + image_input_names = {} # these are the model inputs marked as image by the user + input_shape_map = {} + + for input_type in input_types: + if isinstance(input_type, ImageType): + image_input_names[input_type.name] = input_type + # error checking for input(s) marked as images + if input_type.name not in list(func.inputs.keys()): + raise ValueError( + f"Provided image input '{input_type.name}' is not one of the inputs of the MIL program" + ) + if input_type.name is None: + raise ValueError( + 'Fail to auto-determine the input name. Please specify the "name" ' + 'parameter when use "inputs" in ct.convert().' 
+ ) + input_shape_map[input_type.name] = input_type - input_features = [] - output_features = [] - symbolic_inputs = [] - image_input_names = {} # these are the model inputs marked as image by the user - input_shape_map = {} + for name, var in func.inputs.items(): + input_feature_type = proto.FeatureTypes_pb2.FeatureType() + is_input_shape_symbolic = False - for input_type in input_types: - if isinstance(input_type, ImageType): - image_input_names[input_type.name] = input_type # error checking for input(s) marked as images - if input_type.name not in list(prog.functions["main"].inputs.keys()): + # an image input must be of type tensor in program proto + # (since an image type does not exist in MIL program) + if name in image_input_names and not types.is_tensor(var.sym_type): raise ValueError( - f"Provided image input '{input_type.name}' is not one of the inputs of the MIL program" + "For the image input, '{}', its type in the MIL program must be tensor. " + "Instead it is {}.".format(name, var.sym_type.__type_info__()) ) - if input_type.name is None: - raise ValueError( - 'Fail to auto-determine the input name. Please specify the "name" ' - 'parameter when use "inputs" in ct.convert().' - ) - input_shape_map[input_type.name] = input_type - - for name, var in prog.functions["main"].inputs.items(): - input_feature_type = ft.FeatureType() - - # error checking for input(s) marked as images - # an image input must be of type tensor in program proto - # (since an image type does not exist in MIL program) - if name in image_input_names and \ - not types.is_tensor(var.sym_type): - raise ValueError("For the image input, '{}', its type in the MIL program must be tensor. " - "Instead it is {}.".format(name, var.sym_type.__type_info__())) - - if types.is_tensor(var.sym_type): - shape = var.sym_type.get_shape() - if any_variadic(shape): - raise ValueError("Variable rank model inputs are not supported!") - if any_symbolic(shape): - symbolic_inputs.append(name) - # We extract the default input shape given by user first - if name in input_shape_map: - shape = input_shape_map[name].shape.default - else: - logger.warning("Input shape not fully specified by enumerated shapes or range dim! 1 will be used for dimension not specified instead.") - # If no input shape is provided (ex. auto conversion of -1 in Tensorflow) - shape = [1 if is_symbolic(d) else d for d in shape] - if name not in image_input_names: - # make a feature type of Type "multiArrayType" - array_type = ft.ArrayFeatureType(shape=shape, dataType=cast_to_framework_io_dtype(var, False)) - input_feature_type.multiArrayType.CopyFrom(array_type) - else: - # make a feature type of Type "imageType" - input_type = image_input_names[name] - _validate_image_input_output_shapes(input_type.color_layout, shape, name, is_input=True) - if not input_type.channel_first: - raise ValueError("Image input, '{}', must be in the channel_first format". 
- format(name)) - clr_space = _get_colorspace_enum(input_type.color_layout) - image_type = ft.ImageFeatureType(width=shape[-1], - height=shape[-2], - colorSpace=clr_space) - input_feature_type.imageType.CopyFrom(image_type) - - input_features.append( - ml.FeatureDescription(name=name, type=input_feature_type) - ) - elif types.is_scalar(var.sym_type): - array_type = ft.ArrayFeatureType(shape=[1], dataType=cast_to_framework_io_dtype(var, False)) - input_feature_type.multiArrayType.CopyFrom(array_type) - input_features.append(ml.FeatureDescription(name=var.name, type=input_feature_type)) - else: - raise NotImplementedError() - - if output_types is not None and classifier_config is None: - assert len(output_types) == len(prog.functions["main"].outputs), \ - "number of mil program outputs do not match the number of outputs provided by the user" - - for i, var in enumerate(prog.functions["main"].outputs): - output_feature_type = ft.FeatureType() - if types.is_tensor(var.sym_type) or types.is_primitive(var.sym_type): - if output_types is not None and isinstance(output_types[i], ImageType): - if not types.is_tensor(var.sym_type): - raise ValueError("Image output, '{}', is a scalar, but it should be a tensor of rank 4".format( - var.name)) + if types.is_tensor(var.sym_type): shape = var.sym_type.get_shape() if any_variadic(shape): - raise ValueError("Variable rank model outputs, that are ImageTypes, are not supported") - if any([is_symbolic(d) for d in shape]): - raise NotImplementedError("Image output '{}' has symbolic dimensions in its shape". - format(var.name)) - _validate_image_input_output_shapes(output_types[i].color_layout, shape, var.name, is_input=False) - clr_space = _get_colorspace_enum(output_types[i].color_layout) - image_type = ft.ImageFeatureType(width=shape[-1], - height=shape[-2], - colorSpace=clr_space) - output_feature_type.imageType.CopyFrom(image_type) - output_features.append( - ml.FeatureDescription(name=var.name, type=output_feature_type) - ) - else: - dataType = None - if classifier_config is None or var.name != predicted_feature_name: - # Not a classifier output, make sure model output type matches with ML Program type. - dataType = cast_to_framework_io_dtype(var, True) + raise ValueError("Variable rank model inputs are not supported!") + if any_symbolic(shape): + is_input_shape_symbolic = True + # We extract the default input shape given by user first + if name in input_shape_map: + shape = input_shape_map[name].shape.default + else: + logger.warning( + "Input shape not fully specified by enumerated shapes or range dim! 1 will be used for dimension not specified instead." + ) + # If no input shape is provided (ex. auto conversion of -1 in Tensorflow) + shape = [1 if is_symbolic(d) else d for d in shape] + + if name not in image_input_names: + # make a feature type of Type "multiArrayType" + array_type = proto.FeatureTypes_pb2.ArrayFeatureType( + shape=shape, dataType=cast_to_framework_io_dtype(var, False) + ) + input_feature_type.multiArrayType.CopyFrom(array_type) else: - # Classifier outputs are set up separately, so default to fp32 for now. 
- dataType = ft.ArrayFeatureType.ArrayDataType.FLOAT32 + # make a feature type of Type "imageType" + input_type = image_input_names[name] + _validate_image_input_output_shapes( + input_type.color_layout, shape, name, is_input=True + ) + if not input_type.channel_first: + raise ValueError( + "Image input, '{}', must be in the channel_first format".format(name) + ) + clr_space = _get_colorspace_enum(input_type.color_layout) + image_type = proto.FeatureTypes_pb2.ImageFeatureType( + width=shape[-1], height=shape[-2], colorSpace=clr_space + ) + input_feature_type.imageType.CopyFrom(image_type) - output_shape = ( - None - if any_symbolic(var.shape) or types.is_primitive(var.sym_type) - else var.shape + input_features.append( + proto.Model_pb2.FeatureDescription(name=name, type=input_feature_type) + ) + elif types.is_scalar(var.sym_type): + array_type = proto.FeatureTypes_pb2.ArrayFeatureType( + shape=[1], dataType=cast_to_framework_io_dtype(var, False) + ) + input_feature_type.multiArrayType.CopyFrom(array_type) + input_features.append( + proto.Model_pb2.FeatureDescription(name=var.name, type=input_feature_type) ) - array_type = ft.ArrayFeatureType(shape=output_shape, dataType=dataType) - output_feature_type.multiArrayType.CopyFrom(array_type) - output_features.append(ml.FeatureDescription(name=var.name, type=output_feature_type)) - elif (types.is_dict(var.sym_type)): - output_feature_type.dictionaryType.MergeFromString(b"") - keytype, valtype = var.sym_type.T - if types.is_str(keytype): - output_feature_type.dictionaryType.stringKeyType.MergeFromString(b"") - elif (keytype == types.int64): - output_feature_type.dictionaryType.int64KeyType.MergeFromString(b"") else: - raise ValueError("Dictionary key type not supported.") - output_features.append(ml.FeatureDescription(name=var.name, type=output_feature_type)) - else: - raise NotImplementedError() + raise NotImplementedError(f"Unsupported input type {var.sym_type}.") - # Model description - desc = ml.ModelDescription(input=input_features, output=output_features) - if classifier_config is not None: - desc.predictedFeatureName = predicted_feature_name - desc.predictedProbabilitiesName = predicted_probabilities_name - - # Manually edit output type of predictedFeatureName. - # It doesn't use MLMultiArray and really uses a "primitive" type. 
- for output in desc.output: - if output.name == predicted_feature_name: - if type(classifier_config.class_labels[0]) == int: - output.type.int64Type.MergeFromString(b"") - else: - output.type.stringType.MergeFromString(b"") - break - - # Create ML Model - model = ml.Model(description=desc, specificationVersion=specification_version) - model.mlProgram.CopyFrom(proto) + if not is_input_shape_symbolic: + continue - # Set symbolic shapes - default_lower_bound = 1 - default_upper_bound = ( - default_lower_bound + 1 if kwargs.get("convert_to", None) == "mlprogram" else -1 - ) - default_bound_used = False - for input_name in symbolic_inputs: - input_type = input_shape_map.get(input_name, None) - - if isinstance(input_type, ImageType): - if isinstance(input_type.shape, EnumeratedShapes): - enumerated_shapes = [] - for s in input_type.shape.shapes: - enumerated_shapes.append( - NeuralNetworkImageSize( - height=s.shape[-2], width=s.shape[-1] + # Set symbolic shapes + default_lower_bound = 1 + default_upper_bound = default_lower_bound + 1 if self.convert_to == "mlprogram" else -1 + default_bound_used = False + input_type = input_shape_map.get(name, None) + + if isinstance(input_type, ImageType): + if isinstance(input_type.shape, EnumeratedShapes): + enumerated_shapes = [] + for s in input_type.shape.shapes: + enumerated_shapes.append( + NeuralNetworkImageSize(height=s.shape[-2], width=s.shape[-1]) ) + flexible_shape_utils._add_enumerated_image_sizes_for_feature( + input_features[-1], sizes=enumerated_shapes ) - add_enumerated_image_sizes( - model, input_name, sizes=enumerated_shapes - ) - else: - img_range = NeuralNetworkImageSizeRange() - H = input_type.shape.shape[-2] - W = input_type.shape.shape[-1] - - if isinstance(H, RangeDim): - img_range.add_height_range((H.lower_bound, H.upper_bound)) - elif is_symbolic(H): - img_range.add_height_range((default_lower_bound, default_upper_bound)) - default_bound_used = True - else: - img_range.add_height_range((H, H)) - if isinstance(W, RangeDim): - img_range.add_width_range((W.lower_bound, W.upper_bound)) - elif is_symbolic(W): - img_range.add_width_range((default_lower_bound, default_upper_bound)) - default_bound_used = True else: - img_range.add_width_range((W, W)) + img_range = NeuralNetworkImageSizeRange() + H = input_type.shape.shape[-2] + W = input_type.shape.shape[-1] + + if isinstance(H, RangeDim): + img_range.add_height_range((H.lower_bound, H.upper_bound)) + elif is_symbolic(H): + img_range.add_height_range((default_lower_bound, default_upper_bound)) + default_bound_used = True + else: + img_range.add_height_range((H, H)) + if isinstance(W, RangeDim): + img_range.add_width_range((W.lower_bound, W.upper_bound)) + elif is_symbolic(W): + img_range.add_width_range((default_lower_bound, default_upper_bound)) + default_bound_used = True + else: + img_range.add_width_range((W, W)) - update_image_size_range( - model, input_name, img_range - ) - elif isinstance(input_type, TensorType): - if isinstance(input_type.shape, EnumeratedShapes): - add_multiarray_ndshape_enumeration( - model, input_name, [tuple(s.shape) for s in input_type.shape.shapes] - ) - else: + flexible_shape_utils._update_image_size_range_for_feature( + input_features[-1], img_range + ) + elif isinstance(input_type, TensorType): + if isinstance(input_type.shape, EnumeratedShapes): + flexible_shape_utils._add_multiarray_ndshape_enumeration_for_feature( + input_features[-1], [tuple(s.shape) for s in input_type.shape.shapes] + ) + else: + lb = [] + ub = [] + for s in input_type.shape.shape: + 
if isinstance(s, RangeDim): + lb.append(s.lower_bound) + ub.append(s.upper_bound) + elif is_symbolic(s): + lb.append(default_lower_bound) + ub.append(default_upper_bound) + default_bound_used = True + else: + lb.append(s) + ub.append(s) + flexible_shape_utils._set_multiarray_ndshape_range_for_feature( + input_features[-1], lower_bounds=lb, upper_bounds=ub + ) + elif input_type is None: + sym_type = func.inputs[name].sym_type lb = [] ub = [] - for s in input_type.shape.shape: - if isinstance(s, RangeDim): - lb.append(s.lower_bound) - ub.append(s.upper_bound) - elif is_symbolic(s): + for s in sym_type.get_shape(): + if is_symbolic(s): lb.append(default_lower_bound) ub.append(default_upper_bound) default_bound_used = True else: lb.append(s) ub.append(s) - set_multiarray_ndshape_range( - model, input_name, lower_bounds=lb, upper_bounds=ub + flexible_shape_utils._set_multiarray_ndshape_range_for_feature( + input_features[-1], lower_bounds=lb, upper_bounds=ub + ) + + if default_bound_used and self.convert_to == "mlprogram": + warnings.warn( + "Some dimensions in the input shape are unknown, hence they are set to flexible ranges " + f"with lower bound and default value = {default_lower_bound}, and upper bound = " + f"{default_upper_bound}. To set different values for the default shape and upper bound, " + "please use the ct.RangeDim() method as described here: " + "https://coremltools.readme.io/docs/flexible-inputs#set-the-range-for-each-dimension.", + UserWarning, ) - elif input_type is None: - sym_type = prog.functions["main"].inputs[input_name].sym_type - lb = [] - ub = [] - for s in sym_type.get_shape(): - if is_symbolic(s): - lb.append(default_lower_bound) - ub.append(default_upper_bound) - default_bound_used = True + convert_from = self.convert_from + if convert_from is not None and convert_from.startswith("tensorflow"): + warnings.warn( + 'There is "None" dim in TF input placeholder. Please consider specifying ' + 'input shapes by using the "inputs" param in ct.convert().' + ) + + return input_features + + def get_func_output(self, func: mil.Function) -> List[proto.Model_pb2.FeatureDescription]: + """ + Utils to get function output feature description. 
+ """ + + output_types = func.output_types + output_features = [] + + if output_types is not None and self.classifier_config is None: + assert len(output_types) == len( + func.outputs + ), "number of mil program outputs do not match the number of outputs provided by the user" + + for i, var in enumerate(func.outputs): + output_feature_type = proto.FeatureTypes_pb2.FeatureType() + if types.is_tensor(var.sym_type) or types.is_primitive(var.sym_type): + if output_types is not None and isinstance(output_types[i], ImageType): + if not types.is_tensor(var.sym_type): + raise ValueError( + "Image output, '{}', is a scalar, but it should be a tensor of rank 4".format( + var.name + ) + ) + + clr_space = _get_colorspace_enum(output_types[i].color_layout) + + shape = var.sym_type.get_shape() + if any_variadic(shape): + raise ValueError( + "Variable rank model outputs, that are ImageTypes, are not supported" + ) + if any_symbolic(shape): + # For flexible shape output, we set the imageSizeRange to [1, -1], + # util this radar is fixed in CoreML: rdar://122895892 ([Bug] CoreML produce empty dictionary with image output with dynamic shape) + image_type = proto.FeatureTypes_pb2.ImageFeatureType( + width=1, height=1, colorSpace=clr_space + ) + image_type.imageSizeRange.widthRange.lowerBound = 1 + image_type.imageSizeRange.widthRange.upperBound = -1 + image_type.imageSizeRange.heightRange.lowerBound = 1 + image_type.imageSizeRange.heightRange.upperBound = -1 + else: + image_type = proto.FeatureTypes_pb2.ImageFeatureType( + width=shape[-1], height=shape[-2], colorSpace=clr_space + ) + _validate_image_input_output_shapes( + output_types[i].color_layout, shape, var.name, is_input=False + ) + + output_feature_type.imageType.CopyFrom(image_type) + output_features.append( + proto.Model_pb2.FeatureDescription(name=var.name, type=output_feature_type) + ) else: - lb.append(s) - ub.append(s) - set_multiarray_ndshape_range( - model, input_name, lower_bounds=lb, upper_bounds=ub - ) + dataType = None + if self.classifier_config is None or var.name != self.predicted_feature_name: + # Not a classifier output, make sure model output type matches with ML Program type. + dataType = cast_to_framework_io_dtype(var, True) + else: + # Classifier outputs are set up separately, so default to fp32 for now. + dataType = proto.FeatureTypes_pb2.ArrayFeatureType.ArrayDataType.FLOAT32 - if default_bound_used and kwargs.get("convert_to", None) == "mlprogram": - warnings.warn( - "Some dimensions in the input shape are unknown, hence they are set to flexible ranges " - f"with lower bound and default value = {default_lower_bound}, and upper bound = " - f"{default_upper_bound}. 
To set different values for the default shape and upper bound, " - "please use the ct.RangeDim() method as described here: " - "https://coremltools.readme.io/docs/flexible-inputs#set-the-range-for-each-dimension.", - UserWarning, + output_shape = ( + None + if any_symbolic(var.shape) or types.is_primitive(var.sym_type) + else var.shape + ) + array_type = proto.FeatureTypes_pb2.ArrayFeatureType( + shape=output_shape, dataType=dataType + ) + output_feature_type.multiArrayType.CopyFrom(array_type) + output_features.append( + proto.Model_pb2.FeatureDescription(name=var.name, type=output_feature_type) + ) + elif types.is_dict(var.sym_type): + output_feature_type.dictionaryType.MergeFromString(b"") + keytype, valtype = var.sym_type.T + if types.is_str(keytype): + output_feature_type.dictionaryType.stringKeyType.MergeFromString(b"") + elif keytype == types.int64: + output_feature_type.dictionaryType.int64KeyType.MergeFromString(b"") + else: + raise ValueError("Dictionary key type not supported.") + output_features.append( + proto.Model_pb2.FeatureDescription(name=var.name, type=output_feature_type) + ) + else: + raise NotImplementedError(f"Unsupported output type {var.sym_type}.") + + return output_features + + def get_coreml_model( + self, + input: Dict[str, List[proto.Model_pb2.FeatureDescription]], + output: Dict[str, List[proto.Model_pb2.FeatureDescription]], + specification_version: int, + ) -> proto.Model_pb2.Model: + """ + Utils to get a coreml model description. + """ + # Model description + input_features = input[self._DEFAULT_FUNCTION_NAME] + output_features = output[self._DEFAULT_FUNCTION_NAME] + desc = proto.Model_pb2.ModelDescription(input=input_features, output=output_features) + + if self.classifier_config is not None: + desc.predictedFeatureName = self.predicted_feature_name + desc.predictedProbabilitiesName = self.predicted_probabilities_name + + # Manually edit output type of predictedFeatureName. + # It doesn't use MLMultiArray and really uses a "primitive" type. + for output in desc.output: + if output.name == self.predicted_feature_name: + if type(self.classifier_config.class_labels[0]) == int: + output.type.int64Type.MergeFromString(b"") + else: + output.type.stringType.MergeFromString(b"") + break + + # Create ML Model + model = proto.Model_pb2.Model(description=desc, specificationVersion=specification_version) + model.mlProgram.CopyFrom(self.mil_proto) + + return model + + def export( + self, specification_version: Optional[int] = _SPECIFICATION_VERSION_IOS_15 + ) -> proto.Model_pb2.Model: + + # get functions input / output description + func_to_input = OrderedDict() + func_to_output = OrderedDict() + + for name, func in self.prog.functions.items(): + func_to_input[name] = self.get_func_input(func) + func_to_output[name] = self.get_func_output(func) + + # create a coreml model with I/O description and mil proto + model = self.get_coreml_model( + func_to_input, + func_to_output, + specification_version, ) - convert_from = kwargs.get("convert_from", None) - if convert_from is not None and convert_from.startswith("tensorflow"): - warnings.warn( - 'There is "None" dim in TF input placeholder. Please consider specifying ' - 'input shapes by using the "inputs" param in ct.convert().' 
- ) - # Set optional inputs - _set_optional_inputs(model, input_types) + # Set optional inputs for main function + _set_optional_inputs(model, self.prog.functions["main"].input_types) + + return model + - return model +def load( + prog: Program, + weights_dir: str, + resume_on_errors: Optional[bool] = False, + specification_version: Optional[int] = _SPECIFICATION_VERSION_IOS_15, + **kwargs, +) -> proto.Model_pb2.Model: + if "main" not in prog.functions: + raise ValueError("main function not found in program") + + # if user has specified "ClassifierConfig", then add the "classify" op to the prog + classifier_config = kwargs.get("classifier_config", None) + predicted_feature_name, predicted_probabilities_name = None, None + if classifier_config is not None: + predicted_feature_name, predicted_probabilities_name = _add_classify_op( + prog, classifier_config + ) + + # convert pymil program into mil proto + mil_proto_exporter = MILProtoExporter( + prog, + weights_dir, + ) + mil_proto = mil_proto_exporter.export(specification_version) + + # return the model provided by users + desc = kwargs.get("model_description", None) + if desc and not isinstance(desc, proto.Model_pb2.ModelDescription): + raise ValueError("Invalid model descriptor") + + if desc: + if classifier_config is not None: + raise AssertionError("Both model_description and classifier_config can't be provided") + model = proto.Model_pb2.Model(description=desc, specificationVersion=specification_version) + model.mlProgram.CopyFrom(mil_proto) + return model + + # create a CoreML model protobuf + exporter_kwargs = CoreMLProtoExporter.get_additional_kwargs(kwargs) + coreml_proto_exporter = CoreMLProtoExporter( + prog, + mil_proto, + predicted_feature_name, + predicted_probabilities_name, + classifier_config=kwargs.get("classifier_config", None), + convert_to=kwargs.get("convert_to", None), + convert_from=kwargs.get("convert_from", None), + **exporter_kwargs, + ) + return coreml_proto_exporter.export(specification_version) diff --git a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py index df8d9349b..acbc729d8 100644 --- a/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py +++ b/coremltools/converters/mil/backend/mil/passes/adjust_io_to_supported_types.py @@ -7,6 +7,7 @@ from coremltools import _logger as logger from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types as types from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass @@ -14,6 +15,7 @@ from coremltools.converters.mil.mil.passes.pass_registry import register_pass +# TODO: rdar://122845072 ([Infra] Refactor the transform_function_signatures, adjust_io_to_supported_types and update_output_dtypes using a shared graph pass) @register_pass(namespace="mil_backend") class adjust_io_to_supported_types(AbstractGraphPass): """ @@ -182,8 +184,10 @@ def _adjust_main_outputs(func): output_var_name = output_var.name output_var.set_name(f"{output_var_name}__pre__output__{target_dtype}__cast") + old_output_var = output_var output_var = mb.cast(x=output_var, dtype=target_dtype) output_var.set_name(output_var_name) + Block._copy_scope_info(old_output_var, output_var) new_outputs.append(output_var) func.set_outputs(new_outputs) diff --git 
a/coremltools/converters/mil/backend/mil/passes/fuse_activation_silu.py b/coremltools/converters/mil/backend/mil/passes/fuse_activation_silu.py index 5f9270df2..b473c45f1 100644 --- a/coremltools/converters/mil/backend/mil/passes/fuse_activation_silu.py +++ b/coremltools/converters/mil/backend/mil/passes/fuse_activation_silu.py @@ -43,8 +43,11 @@ def _try_to_transform(sigmoid_op, mul_op, block): @block_context_manager def _fuse_activation_silu_block(block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -54,11 +57,9 @@ def _fuse_activation_silu_block(block): mul_op = _match_pattern(op) if mul_op is not None: - fusion_status = _try_to_transform(op, mul_op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if _try_to_transform(op, mul_op, block): + fusion_occurred = True + return fusion_occurred @register_pass(namespace="mil_backend") diff --git a/coremltools/converters/mil/backend/mil/passes/fuse_pow2_sqrt.py b/coremltools/converters/mil/backend/mil/passes/fuse_pow2_sqrt.py index 45e17b77a..0f87c83fb 100644 --- a/coremltools/converters/mil/backend/mil/passes/fuse_pow2_sqrt.py +++ b/coremltools/converters/mil/backend/mil/passes/fuse_pow2_sqrt.py @@ -29,7 +29,7 @@ def _match_pattern(op): # if we have sqrt, check for pow(2) elif sqrt_op and child_ops[0].op_type == "pow" and child_ops[0].y.val == 2: pow_op = child_ops[0] - + # if we don't have both ops, fast fail if not pow_op or not sqrt_op: return None @@ -59,8 +59,10 @@ def _try_to_transform(op1, op2, block): @block_context_manager def _fuse_pow2_sqrt(block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue for b in op.blocks: block_changed = True while block_changed: @@ -70,11 +72,9 @@ def _fuse_pow2_sqrt(block): op2 = _match_pattern(op) if op2 is not None: - fusion_status = _try_to_transform(op, op2, block) - # has to break as the downstream iterator is affected. 
- if fusion_status: - return fusion_status - return fusion_status + if _try_to_transform(op, op2, block): + fusion_occurred = True + return fusion_occurred @register_pass(namespace="mil_backend") diff --git a/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py b/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py index b83a2b430..d9124f742 100644 --- a/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py +++ b/coremltools/converters/mil/backend/mil/passes/insert_image_preprocessing_op.py @@ -26,7 +26,7 @@ def apply(self, prog): @block_context_manager def _insert_image_preprocessing_ops(block, prog): - input_types = list(prog.main_input_types) + input_types = list(prog.functions["main"].input_types) for input_type in input_types: if isinstance(input_type, ImageType): diff --git a/coremltools/converters/mil/backend/mil/passes/sanitize_name_strings.py b/coremltools/converters/mil/backend/mil/passes/sanitize_name_strings.py index 9ec899091..b5704ceab 100644 --- a/coremltools/converters/mil/backend/mil/passes/sanitize_name_strings.py +++ b/coremltools/converters/mil/backend/mil/passes/sanitize_name_strings.py @@ -19,4 +19,6 @@ def apply(self, prog): for f in prog.functions.values(): sanitizer_vars = NameSanitizer(prefix="var_") sanitizer_ops = NameSanitizer(prefix="op_") - NameSanitizer.sanitize_block(f, sanitizer_vars, sanitizer_ops, prog.main_input_types) + NameSanitizer.sanitize_block( + f, sanitizer_vars, sanitizer_ops, prog.functions["main"].input_types + ) diff --git a/coremltools/converters/mil/backend/mil/passes/test_passes.py b/coremltools/converters/mil/backend/mil/passes/test_passes.py index e1b8c5de7..84b7cf5ca 100644 --- a/coremltools/converters/mil/backend/mil/passes/test_passes.py +++ b/coremltools/converters/mil/backend/mil/passes/test_passes.py @@ -387,8 +387,8 @@ def assert_block_inputs(prev_inputs, inputs): assert prev_inputs[i].name == inputs[i].name assert inputs[i].dtype == types.fp32 - subblocks = prog.functions['main'].operations[0].blocks - prev_subblocks = prev_prog.functions['main'].operations[0].blocks + subblocks = prog.functions["main"].operations[0].blocks + prev_subblocks = prev_prog.functions["main"].operations[0].blocks for i in range(0, len(subblocks)): assert_block_inputs(prev_subblocks[i].inputs, subblocks[i].inputs) @@ -482,10 +482,9 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 1, 20, 20], - color_layout="G", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType(name="x", shape=[1, 1, 20, 20], color_layout="G", channel_first=True), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -520,11 +519,11 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 1, 20, 20], - scale=2.0, - color_layout="G", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType( + name="x", shape=[1, 1, 20, 20], scale=2.0, color_layout="G", channel_first=True + ), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -561,11 +560,11 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 1, 20, 20], - bias=2.0, - color_layout="G", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType( + name="x", shape=[1, 1, 20, 20], bias=2.0, 
color_layout="G", channel_first=True + ), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -603,12 +602,16 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 1, 20, 20], - scale=2.0, - bias=2.0, - color_layout="G", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType( + name="x", + shape=[1, 1, 20, 20], + scale=2.0, + bias=2.0, + color_layout="G", + channel_first=True, + ), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -646,10 +649,9 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 3, 20, 20], - color_layout="RGB", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType(name="x", shape=[1, 3, 20, 20], color_layout="RGB", channel_first=True), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -685,12 +687,16 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 3, 20, 20], - scale=2.0, - bias=[1.0, 2.0, 3.0], - color_layout="RGB", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType( + name="x", + shape=[1, 3, 20, 20], + scale=2.0, + bias=[1.0, 2.0, 3.0], + color_layout="RGB", + channel_first=True, + ), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -728,10 +734,9 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 3, 20, 20], - color_layout="BGR", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType(name="x", shape=[1, 3, 20, 20], color_layout="BGR", channel_first=True), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -767,12 +772,16 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 3, 20, 20], - scale=2.0, - bias=[1.0, 2.0, 3.0], - color_layout="BGR", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType( + name="x", + shape=[1, 3, 20, 20], + scale=2.0, + bias=[1.0, 2.0, 3.0], + color_layout="BGR", + channel_first=True, + ), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -815,12 +824,16 @@ def prog(x): z = mb.add(x=y1, y=y2) return z - prog.main_input_types = (ct.ImageType(name='x', - shape=[1, 3, 20, 20], - scale=scale_type(2.0), - bias=np.array([1, 2, 3]).astype(bias_type), - color_layout="RGB", - channel_first=True),) + prog.functions["main"].input_types = ( + ct.ImageType( + name="x", + shape=[1, 3, 20, 20], + scale=scale_type(2.0), + bias=np.array([1, 2, 3]).astype(bias_type), + color_layout="RGB", + channel_first=True, + ), + ) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "mil_backend::insert_image_preprocessing_ops" @@ -1029,7 +1042,7 @@ def program(x): backend=("mlprogram", "fp32"), expected_output_shapes={block.outputs[0].name: tuple(x_shape)}, ) - + @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="mlprogram predict available only on macOS12+") def test_no_pow(self): x_shape = tuple(np.random.randint(low=1, high=4, size=5)) @@ -1051,7 +1064,7 @@ def program(x): backend=("mlprogram", "fp32"), 
expected_output_shapes={block.outputs[0].name: tuple(x_shape)}, ) - + @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="mlprogram predict available only on macOS12+") def test_no_sqrt(self): x_shape = tuple(np.random.randint(low=1, high=4, size=5)) @@ -1073,7 +1086,7 @@ def program(x): backend=("mlprogram", "fp32"), expected_output_shapes={block.outputs[0].name: tuple(x_shape)}, ) - + @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="mlprogram predict available only on macOS12+") @pytest.mark.parametrize( "reverse_order", itertools.product([True, False]), diff --git a/coremltools/converters/mil/backend/nn/load.py b/coremltools/converters/mil/backend/nn/load.py index 6825b6e63..8c825cb09 100644 --- a/coremltools/converters/mil/backend/nn/load.py +++ b/coremltools/converters/mil/backend/nn/load.py @@ -204,8 +204,8 @@ def load(prog, **kwargs): ) raise ValueError(msg.format(prog)) - input_types = prog.main_input_types - output_types = prog.main_output_types + input_types = prog.functions["main"].input_types + output_types = prog.functions["main"].output_types v1_inputs = [] symbolic_inputs = {} diff --git a/coremltools/converters/mil/backend/nn/passes/commingle_loop_vars.py b/coremltools/converters/mil/backend/nn/passes/commingle_loop_vars.py index 7105ea09d..993ad022e 100644 --- a/coremltools/converters/mil/backend/nn/passes/commingle_loop_vars.py +++ b/coremltools/converters/mil/backend/nn/passes/commingle_loop_vars.py @@ -9,7 +9,7 @@ def _commingle_loop_vars_block(block): - for op in list(block.operations): + for op in block.operations: for b in op.blocks: _commingle_loop_vars_block(b) @@ -23,7 +23,6 @@ def _commingle_loop_vars_block(block): anchor_op=None, old_var=vx_in, new_var=v_out, - no_check_var_visibility=True, ) # replace block inputs diff --git a/coremltools/converters/mil/backend/nn/passes/conv1d_decomposition.py b/coremltools/converters/mil/backend/nn/passes/conv1d_decomposition.py index 48c207c55..6a477a17d 100644 --- a/coremltools/converters/mil/backend/nn/passes/conv1d_decomposition.py +++ b/coremltools/converters/mil/backend/nn/passes/conv1d_decomposition.py @@ -39,7 +39,11 @@ def apply(self, prog): @block_context_manager def _decompose_conv1d_block(self, block: Block): def help_decompose_conv1d_block(block: Block) -> bool: + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -50,10 +54,9 @@ def help_decompose_conv1d_block(block: Block) -> bool: continue if self._try_apply_transform(op, block): - # has to break as the downstream iterator is affected - return True + fusion_occurred = True - return False + return fusion_occurred block_changed = True while block_changed: diff --git a/coremltools/converters/mil/backend/nn/passes/handle_unused_inputs.py b/coremltools/converters/mil/backend/nn/passes/handle_unused_inputs.py index 2effac4f3..a5398441f 100644 --- a/coremltools/converters/mil/backend/nn/passes/handle_unused_inputs.py +++ b/coremltools/converters/mil/backend/nn/passes/handle_unused_inputs.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.pass_registry import register_pass 
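# Note on the fusion-pass rewrites above (fuse_activation_silu, fuse_pow2_sqrt,
# conv1d_decomposition): each pass moves from "return on the first match" to a
# single sweep that accumulates a flag and skips ops already detached by an
# earlier rewrite. A minimal sketch of that shared pattern follows; the
# _try_to_transform helper is a placeholder, and the real passes additionally
# wrap the sweep in @block_context_manager.

def _try_to_transform(op, block) -> bool:
    # Placeholder for the per-pass logic: match the local pattern rooted at
    # `op` and rewrite it in place, returning True if a rewrite happened.
    return False


def _fuse_example_block(block) -> bool:
    fusion_occurred = False
    for op in list(block.operations):
        # An op removed by a fusion applied earlier in this same sweep has
        # enclosing_block set to None; skip it instead of restarting the walk.
        if op.enclosing_block is None:
            continue
        # Recurse into child blocks (e.g. cond/while bodies) until stable.
        for b in op.blocks:
            block_changed = True
            while block_changed:
                block_changed = _fuse_example_block(b)
        # Accumulate the result rather than returning on the first match.
        if _try_to_transform(op, block):
            fusion_occurred = True
    return fusion_occurred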
@@ -15,6 +16,7 @@ def _handle_unused_inputs_func(f): for v in unused_inputs: # copy the input v_tmp = mb.identity(x=v, name=v.name + "_tmp") + Block._copy_scope_info(v, v_tmp) @register_pass(namespace="nn_backend") diff --git a/coremltools/converters/mil/converter.py b/coremltools/converters/mil/converter.py index 72f11769c..f9421b1ad 100644 --- a/coremltools/converters/mil/converter.py +++ b/coremltools/converters/mil/converter.py @@ -72,7 +72,7 @@ def __call__(self, model, *args, **kwargs): # natively supported by MIL ops (ex. Conv/Pool/etc.) if isinstance(inp, ImageType) and inputs[idx].channel_first is None: inputs[idx].channel_first = True - model.set_main_input_types(tuple(inputs)) + model.functions["main"].set_input_types(tuple(inputs)) return model @@ -236,10 +236,12 @@ def _mil_convert( compute_units=compute_units, ) - return modelClass(proto, - mil_program=mil_program, - skip_model_load=kwargs.get('skip_model_load', False), - compute_units=compute_units) + return modelClass( + proto, + mil_program=mil_program, + skip_model_load=kwargs.get("skip_model_load", False), + compute_units=compute_units, + ) def mil_convert_to_proto( diff --git a/coremltools/converters/mil/debugging_utils.py b/coremltools/converters/mil/debugging_utils.py index d6bc5d456..30282659b 100644 --- a/coremltools/converters/mil/debugging_utils.py +++ b/coremltools/converters/mil/debugging_utils.py @@ -23,11 +23,11 @@ def extract_submodel( ) -> MLModel: """ This utility function lets you extract a submodel from a Core ML model. - + For a NeuralNetwork model, the function extracts only in-memory Core ML models. You should always call this function to a model directly from ``ct.convert``. It is not allowed to load the model from disk and then call this API. - + For an ML program model, both cases (in-memory and from disk) are supported. Parameters @@ -37,14 +37,14 @@ def extract_submodel( outputs: list[str] A list of names of Vars, which are the outputs of the extracted submodel. - + inputs: list[str] (Optional) A list of names of Vars, which are the inputs of the extracted submodel. If not provided, the inputs from the original model are used. function_name: str (Optional) Name of the function where the subgraph is extracted. Default ``main``. - + Examples -------- @@ -54,7 +54,7 @@ def extract_submodel( >>> mlmodel = ct.convert(model, convert_to="neuralnetwork") >>> outputs = ["output_0", "output_1"] >>> submodel = extract_submodel(mlmodel, outputs) - + ML Program: >>> from coremltools.converters.mil.debugging_utils import extract_submodel @@ -75,15 +75,15 @@ def validate_inputs(func, input_vars): for op in func.operations: if op.op_type == "const": reachable_vars.add(op.outputs[0]) - + for op in func.operations: if all([x in reachable_vars for x in op.inputs.values()]): reachable_vars.update(op.outputs) - + for out in func.outputs: if out not in reachable_vars: raise ValueError(f"output {output} not reachable from inputs") - + @block_context_manager def replace_inputs(func, input_vars): func_inputs = {} @@ -94,13 +94,12 @@ def replace_inputs(func, input_vars): anchor_op=input.op, old_var=input, new_var=func_inputs[name].outputs[0], - no_check_var_visibility=True, ) func._input_dict = OrderedDict() for k, v in func_inputs.items(): v.set_name(k) func._input_dict[k] = v.outputs[0] - + if not isinstance(outputs, (list, tuple)): raise ValueError(f"outputs must be of type list/tuple. 
Got {type(outputs)}.") @@ -126,7 +125,7 @@ def replace_inputs(func, input_vars): ) else: program = model._mil_program - + # extract subgraph prog = copy.deepcopy(program) func = prog.functions[function_name] @@ -147,7 +146,7 @@ def replace_inputs(func, input_vars): # Clean up the graph PASS_REGISTRY["common::dead_code_elimination"](prog) - + # If the inputs are provided, we subtract the subgraph starting from them if inputs is not None: if not isinstance(inputs, (list, tuple)): @@ -169,8 +168,8 @@ def replace_inputs(func, input_vars): validate_inputs(func, input_vars) replace_inputs(func, input_vars) PASS_REGISTRY["common::dead_code_elimination"](prog) - + prog.skip_all_passes = True submodel = ct.convert(prog, convert_to=backend, compute_units=model.compute_unit) - + return submodel diff --git a/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py b/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py index 462a3b35f..cf8f8e416 100644 --- a/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py +++ b/coremltools/converters/mil/experimental/passes/generic_pass_infrastructure.py @@ -7,7 +7,9 @@ import warnings from functools import partial +from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil.passes.helper import block_context_manager +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource from ...mil.passes import pass_registry @@ -172,24 +174,26 @@ def _detect_pattern(program_op, ops_arrangement_root_var, block): @block_context_manager def _fuse_one_block(block, ops_arrangement, var_constraints, transform_pattern): - fusion_status = False + fusion_occurred = False for op in list(block.operations): for b in op.blocks: block_changed = True while block_changed: block_changed = _fuse_one_block(b, ops_arrangement, var_constraints, transform_pattern) - ops_arrangement_root_var = list(ops_arrangement.functions.values())[0].function_inputs[0] - fusion_status, pattern = _detect_pattern(op, ops_arrangement_root_var, block) + ops_arrangement_root_var = list( + list(ops_arrangement.functions.values())[0].inputs.values() + )[0] + fusion_occurred, pattern = _detect_pattern(op, ops_arrangement_root_var, block) - if fusion_status: - fusion_status &= var_constraints(pattern) + if fusion_occurred: + fusion_occurred &= var_constraints(pattern) - if fusion_status: + if fusion_occurred: transform_pattern(pattern) - return fusion_status + return fusion_occurred - return fusion_status + return fusion_occurred def fuse_all_blocks(ops_arrangement, var_constraints, transform_pattern, prog): @@ -208,9 +212,10 @@ def __call__(self, prog): if len(self.passes) == 0: raise ValueError("no pass functions associated with " + self.pass_name) - for one_pass in self.passes: - one_pass(prog) - prog.validate() + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=[self.pass_name])): + for one_pass in self.passes: + one_pass(prog) + prog.validate(check_essential_scope=True) def add(self, pass_function): self.passes.append(pass_function) diff --git a/coremltools/converters/mil/frontend/_utils.py b/coremltools/converters/mil/frontend/_utils.py index 3d7d8e168..dc7de2acc 100644 --- a/coremltools/converters/mil/frontend/_utils.py +++ b/coremltools/converters/mil/frontend/_utils.py @@ -2,14 +2,20 @@ # # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + import itertools +import 
math as math +from typing import List, Optional, Union -from typing import List, Optional +import numpy as _np from coremltools.converters.mil.input_types import InputType from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Var, types -from coremltools.converters.mil.mil.ops.defs._utils import parse_einsum_equation +from coremltools.converters.mil.mil import Operation, Var, types +from coremltools.converters.mil.mil.ops.defs._utils import ( + parse_einsum_equation, + promote_input_dtypes, +) from coremltools.converters.mil.mil.types.symbolic import any_symbolic, is_symbolic @@ -230,6 +236,31 @@ def get_output_names(outputs) -> Optional[List[str]]: return output_names +# This is a workaround in Core ML for topk with dynamic `k`: +# * Core ML topk supports only constant `k` +# * Luckily, Core ML gather supports dynamic `end`, so we workaround by argsort then gather +# This leads to a slightly different behaviour, though: top-k elements are always sorted +def dynamic_topk( + x: Var, k: Var, axis: int, ascending: Optional[bool] = False, name: Optional[str] = None +): + assert k.val is None, "Please use mb.topk directly if k is compile time known" + + indices = mb.argsort(x=x, axis=axis, ascending=ascending) + if name is None: + values = mb.gather_along_axis(x=x, indices=indices, axis=axis) + else: + values = mb.gather_along_axis(x=x, indices=indices, axis=axis, name=name) + + k_indices = mb.range_1d(end=k, start=0, step=1) + values = mb.gather(x=values, indices=k_indices, axis=axis) + if name is None: + indices = mb.gather(x=indices, indices=k_indices, axis=axis) + else: + indices = mb.gather(x=indices, indices=k_indices, axis=axis, name=name) + + return values, indices + + def solve_diagonal_einsum(parsed_vectors, vars): def solve_diagonal_einsum_one_step(parsed_vector, x): for i in range(len(parsed_vector)): @@ -436,3 +467,80 @@ def _concat_dims(dims, none_if_empty=False): else: ab = mb.transpose(x=ab, perm=get_perm_transpose_einsum(ab_reshaped_axes, out_axes), name=name) return ab + + +def _lower_scaled_dot_product_attention(q: Var, k: Var, v: Var, mask: Var, name: str) -> Var: + # scale the query input + embed_size = q.shape[-1] + if is_symbolic(embed_size): + raise ValueError( + "The embedding size, i.e. 
last dimension of the query tensor's shape," + " cannot be symbolic in the scaled_dot_product_attention op" + ) + multiplicative_scale_factor = 1 / math.sqrt(embed_size) + q, k, v, multiplicative_scale_factor = promote_input_dtypes( + [q, k, v, multiplicative_scale_factor] + ) + q = mb.mul(x=q, y=multiplicative_scale_factor) + + # multiply query and key input tensors + # shape of output: (target_seq, source_seq) or (B,...,target_seq, source_seq) + attn_weights = mb.matmul(x=q, y=k, transpose_y=True) + + # add mask if applicable + if mask is not None: + attn_weights = mb.add(x=attn_weights, y=mask) + + # do softmax + attn_weights_normalized = mb.softmax(x=attn_weights, axis=-1) + + # multiply attn_weights and value tensor + res = mb.matmul(x=attn_weights_normalized, y=v, name=name) + return res + + +def _construct_constexpr_affine_op( + quantized_weights: _np.ndarray, + zero_point: Optional[Union[Var, _np.ndarray, _np.generic]], + scale: Union[Var, _np.ndarray, _np.generic], + axis: Optional[Union[Var, int]] = None, + name: Optional[str] = None, + before_op: Optional[Operation] = None, +) -> Operation: + """Constructs the constexpr op to represent the dequantized weight from PyTorch's data.""" + # The constexpr_affine_dequantize op requires axis. + if axis is None: + # Infer the axis based on scale's shape. + non_single_dim = [dim for dim, dim_size in enumerate(scale.shape) if dim_size > 1] + if len(non_single_dim) > 2: + raise ValueError( + "The constexpr_affine_dequantize op doesn't support a scale that has " + "more than one non-single dimension. Got scale with shape " + f"{scale.shape}" + ) + # If non_single_dim is empty, it means it's per-tensor quantization, so just use a dummy axis. + axis = 0 if len(non_single_dim) == 0 else non_single_dim[0] + if isinstance(axis, int): + axis = _np.int32(axis) + + # The constexpr_affine_dequantize op requires zero_point. + if zero_point is None: + zero_point = _np.zeros_like(scale).astype(quantized_weights.dtype) + + # The constexpr_affine_dequantize op requires scale and zero_point to have rank 0 or 1. 
+ if isinstance(scale, (_np.ndarray, _np.generic)): + scale = _np.squeeze(scale) + if isinstance(zero_point, (_np.ndarray, _np.generic)): + zero_point = _np.squeeze(zero_point) + + kwargs = { + "quantized_data": quantized_weights, + "zero_point": zero_point, + "scale": scale, + "axis": axis, + } + if name is not None: + kwargs["name"] = name + if before_op is not None: + kwargs["before_op"] = before_op + return mb.constexpr_affine_dequantize(**kwargs) diff --git a/coremltools/converters/mil/frontend/milproto/load.py b/coremltools/converters/mil/frontend/milproto/load.py index e7ecb0110..b6e39e406 100644 --- a/coremltools/converters/mil/frontend/milproto/load.py +++ b/coremltools/converters/mil/frontend/milproto/load.py @@ -4,10 +4,13 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import os +from typing import Tuple import numpy as np from coremltools import _logger as logger +from coremltools import proto +from coremltools.converters.mil import mil from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target from coremltools.converters.mil.backend.mil import helper from coremltools.converters.mil.mil import Block @@ -16,7 +19,6 @@ Function, ListVar, Placeholder, - Program, TupleInputType, Var, mil_list, @@ -24,8 +26,6 @@ ) from coremltools.converters.mil.mil.block import curr_block from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry as _SSAOpRegistry -from coremltools.proto import MIL_pb2 as pm -from coremltools.proto import Model_pb2 as ml from .helper import proto_to_types @@ -63,7 +63,7 @@ def get_var_from_name(self, name): def _load_tensorvalue(tensorvalue_spec): - if not isinstance(tensorvalue_spec, pm.TensorValue): + if not isinstance(tensorvalue_spec, proto.MIL_pb2.TensorValue): raise TypeError("Invalid TensorValue spec object") if tensorvalue_spec.WhichOneof("value") == "floats": @@ -85,7 +85,7 @@ def _load_tensorvalue(tensorvalue_spec): def _load_immediate_value(immediatevalue_spec): - if not isinstance(immediatevalue_spec, pm.Value.ImmediateValue): + if not isinstance(immediatevalue_spec, proto.MIL_pb2.Value.ImmediateValue): raise TypeError("Invalid ImmedidateValue spec object") if immediatevalue_spec.WhichOneof("value") == "tensor": @@ -101,7 +101,7 @@ def _load_immediate_value(immediatevalue_spec): def _load_file_value(context, filevalue_spec, dtype): if BlobReader is None: raise RuntimeError("BlobReader not loaded") - if not isinstance(filevalue_spec, pm.Value.BlobFileValue): + if not isinstance(filevalue_spec, proto.MIL_pb2.Value.BlobFileValue): raise TypeError("Invalid BlobFileValue spec object") filename = os.path.join(context.weights_dir, filevalue_spec.fileName.split("/")[-1]) @@ -132,13 +132,18 @@ def _load_file_value(context, filevalue_spec, dtype): return np_value +def _restore_np_from_bytes_value(value: bytes, dtype: types, shape: Tuple[int]) -> np.ndarray: + return np.frombuffer(value, types.nptype_from_builtin(dtype)).reshape(shape) + + def _load_value(context, value_spec): - if not isinstance(value_spec, pm.Value): + if not isinstance(value_spec, proto.MIL_pb2.Value): raise TypeError("Invalid Value spec object") if value_spec.docString: raise ValueError("Docstring would get lost in the process.") + value_spec_type = value_spec.type.WhichOneof("type") if value_spec.type.WhichOneof("type") == "tensorType": valuetype = proto_to_types(value_spec.type) @@ -152,16 +157,21 @@ def _load_value(context, value_spec): else: value = _load_file_value(context, value_spec.blobFileValue, dtype) 
+ target_np_dtype = types.nptype_from_builtin(dtype) if dtype in helper.IMMEDIATE_VALUE_TYPES_IN_BYTES: - value = np.frombuffer(value, types.nptype_from_builtin(dtype)).reshape( - shape - ) + value = _restore_np_from_bytes_value(value, dtype, shape).astype(target_np_dtype) elif dtype == types.str and shape == (): value = str(value[0]) - elif dtype in (types.fp32, types.str, types.bool, types.int32, types.int64): - value = ( - np.array(value).astype(types.nptype_from_builtin(dtype)).reshape(shape) - ) + elif dtype in ( + types.fp32, + types.str, + types.bool, + types.int16, + types.uint16, + types.int32, + types.int64, + ): + value = np.array(value).astype(target_np_dtype).reshape(shape) else: raise ValueError("Invalid dtype for tensor value") else: @@ -178,7 +188,7 @@ def _create_var_from_spec(spec): This helper function is used for creating PyMIL Var/ListVar from the proto spec. Mainly used for the construction of the control flow ops. """ - assert isinstance(spec, pm.NamedValueType) + assert isinstance(spec, proto.MIL_pb2.NamedValueType) sym_type = proto_to_types(spec.type) name = spec.name if types.is_list(sym_type): @@ -255,20 +265,44 @@ def _dummy_false_fn(*loop_vars): def _load_const_op(context, op_spec): inputs = {k: _load_value(context, v) for k, v in op_spec.attributes.items()} - pymil_var = getattr(mb, op_spec.type)(**inputs) - context.register_var_with_name(op_spec.outputs[0].name, pymil_var) + if len(op_spec.inputs) > 0: + for param_name, argument in op_spec.inputs.items(): + vars = [] + for binding in argument.arguments: + binding_type = binding.WhichOneof("binding") + if binding_type == "name": + vars.append(context.get_var_from_name(binding.name)) + elif binding_type == "value": + vars.append(_load_value(context, binding.value)) + else: + raise ValueError(f"Invalid binding_type {binding_type}") + if len(vars) == 1: + inputs[param_name] = vars[0] + else: + inputs[param_name] = vars + + output_var = getattr(mb, op_spec.type)(**inputs) + + if not isinstance(output_var, (tuple, list)): + output_var = [output_var] + if len(output_var) != len(op_spec.outputs): + raise AssertionError( + "Mismatch between number of outputs in operation specification vs PyMIL outputs" + ) + for spec, var in zip(op_spec.outputs, output_var): + context.register_var_with_name(spec.name, var) -def _load_operation(context, op_spec): - if not isinstance(op_spec, pm.Operation): +def _load_operation(context: TranscriptionContext, op_spec: proto.MIL_pb2.Operation): + if not isinstance(op_spec, proto.MIL_pb2.Operation): raise TypeError("Invalid Operation spec object") op_type = op_spec.type if op_type == "const" or "constexpr_" in op_type: if op_spec.blocks: raise ValueError("const / constexpr operation can't have any block") - if op_spec.inputs: - raise ValueError("const / constexpr operation can't have any input") + if op_type == "const" and op_spec.inputs: + raise ValueError("const operation can't have any input") _load_const_op(context, op_spec) else: @@ -363,7 +397,7 @@ def _load_operation(context, op_spec): def _load_block(context, block_spec): - if not isinstance(block_spec, pm.Block): + if not isinstance(block_spec, proto.MIL_pb2.Block): raise TypeError("Invalid Block spec object") if block_spec.attributes: @@ -383,7 +417,7 @@ def _load_block(context, block_spec): def _load_function(context, func_spec, spec_version): - if not isinstance(func_spec, pm.Function): + if not isinstance(func_spec, proto.MIL_pb2.Function): raise TypeError("Invalid Function spec object") if func_spec.attributes: @@ -415,7 
+449,7 @@ def load_mil_proto(program_spec, specification_version, file_weights_dir=""): """ Load in-memory Proto specification of MILSpec.Program(.Proto) object to PyMIL """ - if not isinstance(program_spec, pm.Program): + if not isinstance(program_spec, proto.MIL_pb2.Program): raise TypeError("Invalid Program spec object") if program_spec.docString: @@ -425,7 +459,7 @@ def load_mil_proto(program_spec, specification_version, file_weights_dir=""): raise ValueError("Invalid program version") context = TranscriptionContext(file_weights_dir) - pymil_program = Program() + pymil_program = mil.Program() for func_name, func_spec in program_spec.functions.items(): pymil_program.add_function( func_name, _load_function(context, func_spec, specification_version) @@ -433,7 +467,7 @@ def load_mil_proto(program_spec, specification_version, file_weights_dir=""): for attr_name, attr_spec in program_spec.attributes.items(): if attr_name not in ("buildInfo",): - raise ValueError("Invalid attribute for program") + raise ValueError(f"Invalid attribute {attr_name} for program") return pymil_program @@ -444,7 +478,7 @@ def load(model_spec, specification_version, file_weights_dir="", **kwargs): Set force_spec_version to force override the spec version. """ - if not isinstance(model_spec, ml.Model): + if not isinstance(model_spec, proto.Model_pb2.Model): raise TypeError("Invalid Model sepc object") if specification_version < model_spec.specificationVersion: diff --git a/coremltools/converters/mil/frontend/milproto/test_load.py b/coremltools/converters/mil/frontend/milproto/test_load.py index 69a90e8ca..9e3e10c1b 100644 --- a/coremltools/converters/mil/frontend/milproto/test_load.py +++ b/coremltools/converters/mil/frontend/milproto/test_load.py @@ -150,6 +150,26 @@ def prog(x): assert op_names == new_op_names + def test_mil_uint16(self): + @mb.program( + input_specs=[mb.TensorSpec(shape=(2, 2, 3))], + opset_version=ct.target.iOS17, + ) + def prog(x): + indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=np.uint16) + res = mb.gather(x=x, indices=indices, axis=2, batch_dims=2) + return res + + mlmodel = ct.convert( + prog, + convert_to="mlprogram", + compute_units=ct.ComputeUnit.CPU_ONLY, + minimum_deployment_target=ct.target.iOS17, + ) + loaded_pymil_prog = get_pymil_prog_from_mlmodel(mlmodel) + assert get_op_types_in_program(loaded_pymil_prog) == get_op_types_in_program(prog) + + @pytest.mark.skipif(ct.utils._macos_version() < (12, 0), reason="mlprogram predict available only on macOS12+") class TestE2ENumericalCorrectness: @pytest.mark.skipif(not _HAS_TORCH, reason="requires torch") diff --git a/coremltools/converters/mil/frontend/tensorflow/converter.py b/coremltools/converters/mil/frontend/tensorflow/converter.py index 8c89f0035..e9131c201 100644 --- a/coremltools/converters/mil/frontend/tensorflow/converter.py +++ b/coremltools/converters/mil/frontend/tensorflow/converter.py @@ -5,12 +5,13 @@ from coremltools import _logger as logger from coremltools.converters._profile_utils import _profile +from coremltools.converters.mil import mil from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target from coremltools.converters.mil.input_types import ImageType, InputType, RangeDim from coremltools.converters.mil.input_types import Shape as InputShape from coremltools.converters.mil.input_types import TensorType, _get_shaping_class from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, Program, get_new_symbol, types 
+from coremltools.converters.mil.mil import Function, get_new_symbol, types from coremltools.converters.mil.mil.types.symbolic import is_symbolic from coremltools.converters.mil.mil.var import Var @@ -407,7 +408,6 @@ def convert_main_graph(self, prog, graph): func_inputs[input_type.name] = mb.placeholder( input_type.shape.symbolic_shape, dtype=dtype ) - prog.set_main_input_types(self.inputs) with Function(func_inputs, opset_version=self.opset_version) as ssa_func: # Get the input Var @@ -421,6 +421,8 @@ def convert_main_graph(self, prog, graph): outputs = convert_graph(self.context, graph, self.output_names) ssa_func.set_outputs(outputs) prog.add_function("main", ssa_func) + prog.functions["main"].set_input_types(self.inputs) + # check duplicate output # Note: sometimes two outputs are pointing to the same Var, we should # create mb.identity for those cases @@ -506,11 +508,11 @@ def convert_main_graph(self, prog, graph): main_output_types.append(TensorType(name=val.name, dtype=dtype)) self.main_output_types = main_output_types - prog.set_main_output_types(self.main_output_types) + prog.functions["main"].set_output_types(self.main_output_types) @_profile def convert(self): - prog = Program() + prog = mil.Program() if len(self.graph_stack) == 0: raise ValueError("At least one TF function must be present") if self.graph_stack[0] != "main": diff --git a/coremltools/converters/mil/frontend/tensorflow/ops.py b/coremltools/converters/mil/frontend/tensorflow/ops.py index 5b647a78c..632b4b43a 100644 --- a/coremltools/converters/mil/frontend/tensorflow/ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/ops.py @@ -8,6 +8,7 @@ from coremltools import _logger as logger from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target +from coremltools.converters.mil.frontend._utils import dynamic_topk from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with @@ -2292,34 +2293,40 @@ def Tanh(context, node): @register_tf_op(tf_alias=["TopKV2"]) def TopK(context, node): x = context[node.inputs[0]] - k = context[node.inputs[1]].val - sort = node.attr["sorted"] + k = context[node.inputs[1]] - kwargs = { - "x": x, - "k": k, - "axis": -1, - "name": node.name - } + if k.val is not None: + sort = node.attr["sorted"] - if is_current_opset_version_compatible_with(target.iOS16): - kwargs["sort"] = sort - elif not sort: - raise ValueError("For opset <= iOS16, only sorted=True supported for the topk") + kwargs = {"x": x, "k": k, "axis": -1, "name": node.name} + + if is_current_opset_version_compatible_with(target.iOS16): + kwargs["sort"] = sort + elif not sort: + raise ValueError("For opset <= iOS16, only sorted=True supported for the topk") + + context.add(node.name, mb.topk(**kwargs)) + + else: + context.add(node.name, dynamic_topk(x, k, -1, name=node.name)) - context.add(node.name, mb.topk(**kwargs)) @register_tf_op(tf_alias=["InTopKV2"]) def InTopK(context, node): x = context[node.inputs[0]] target = context[node.inputs[1]] - k = context[node.inputs[2]].val + k = context[node.inputs[2]] _, class_num = x.shape - if not is_symbolic(class_num): - k = min(k, class_num) + if k.val is not None and not is_symbolic(class_num): + k = min(k.val, class_num) + _, indices = mb.topk(x=x, k=k, axis=-1) + else: + x_shape = mb.shape(x=x) + class_num = mb.slice_by_index(x=x_shape, begin=(-1,), end=(-1,), squeeze_mask=(True,)) + k = mb.minimum(x=k, y=class_num) 
+ _, indices = dynamic_topk(x, k, -1) - _, indices = mb.topk(x=x, k=k, axis=-1) target = mb.expand_dims(x=target, axes=[-1]) x = mb.equal(x=target, y=indices) x = mb.cast(x=x, dtype="fp32") diff --git a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/backfill_make_list_elem_type.py b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/backfill_make_list_elem_type.py index 81d8423e0..14e750f0c 100644 --- a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/backfill_make_list_elem_type.py +++ b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/backfill_make_list_elem_type.py @@ -30,7 +30,7 @@ def apply(self, prog): @block_context_manager def _backfill_make_list_elem_type_block(block): # shallow copy hides changes on f.operations during the loop - for op in block.operations: + for op in list(block.operations): for b in op.blocks: _backfill_make_list_elem_type_block(b) diff --git a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/expand_tf_lstm.py b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/expand_tf_lstm.py index 1f28bfad3..4573ba51b 100644 --- a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/expand_tf_lstm.py +++ b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/expand_tf_lstm.py @@ -41,7 +41,7 @@ def apply(self, prog): def _expand_tf_lstm_helper(block): # shallow copy hides changes on f.operations during the loop - for op in block.operations[:]: + for op in list(block.operations): for b in op.blocks: _expand_tf_lstm_helper(b) diff --git a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/tf_lstm_to_core_lstm.py b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/tf_lstm_to_core_lstm.py index f22b95bf3..4020db50b 100644 --- a/coremltools/converters/mil/frontend/tensorflow/ssa_passes/tf_lstm_to_core_lstm.py +++ b/coremltools/converters/mil/frontend/tensorflow/ssa_passes/tf_lstm_to_core_lstm.py @@ -47,7 +47,7 @@ def apply(self, prog): @block_context_manager def _tf_lstm_to_core_lstm_block(block: Block): # shallow copy hides changes on f.operations during the loop - for op in block.operations: + for op in list(block.operations): for b in op.blocks: _tf_lstm_to_core_lstm_block(b) diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py index 8140bfc36..e31ef6b55 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py @@ -5607,7 +5607,7 @@ class TestTopK(TensorFlowBaseTest): compute_units, backends, [1, 3, 5], - [1, 3], + [1, 3, None], # None denotes dynamic k [True, False], ), ) @@ -5616,19 +5616,38 @@ def test_top_k(self, compute_unit, backend, rank, k, sort): pytest.skip("iOS16 version topk needed for sort = False") if not sort and _macos_version() < (13, 0): pytest.skip("New functionality in macOS13/iOS16") + if rank == 5 and k is None and sort and ( + backend[0] == "neuralnetwork" or ( + platform.machine() == "x86_64" and _macos_version() < (15, 0) + ) + ): + pytest.xfail("rdar://120891130: TopK failing randomly") # TensorFlow only supports last dimension (axis = -1). 
shape = np.random.randint(low=3, high=4, size=rank) - @make_tf_graph([shape]) - def build_model(x): - ref = tf.math.top_k(x, k=k, sorted=sort) - if not sort: - ref = (tf.sort(ref[0]), tf.sort(ref[1])) - return ref + if k is None: + + @make_tf_graph([shape, (1, tf.int32)]) + def build_model(x, k): + ref = tf.math.top_k(x, k=k[0], sorted=sort) + if not sort: + ref = (tf.sort(ref[0]), tf.sort(ref[1])) + return ref + + else: + + @make_tf_graph([shape]) + def build_model(x): + ref = tf.math.top_k(x, k=k, sorted=sort) + if not sort: + ref = (tf.sort(ref[0]), tf.sort(ref[1])) + return ref model, inputs, outputs = build_model input_values = [random_gen(shape, rand_min=-100, rand_max=100)] + if k is None: + input_values.append(np.random.randint(low=1, high=shape[-1], size=1, dtype=np.int32)) input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( model, @@ -5645,21 +5664,76 @@ def build_model(x): compute_units, backends, [(1, 3), (1, 10), (3, 50)], - [1, 3, 20], + [1, 3, 20, None], # None denotes dynamic k ), ) def test_in_top_k(self, compute_unit, backend, shape, k): # TensorFlow only supports last dimension (axis = -1). batch_size, class_num = shape - @make_tf_graph([shape, (batch_size, tf.int32)]) - def build_model(predictions, targets): - return tf.math.in_top_k(predictions=predictions, targets=targets, k=k) + if k is None: + + @make_tf_graph([shape, (batch_size, tf.int32), (1, tf.int32)]) + def build_model(predictions, targets, k): + return tf.math.in_top_k(predictions=predictions, targets=targets, k=k[0]) + + else: + + @make_tf_graph([shape, (batch_size, tf.int32)]) + def build_model(predictions, targets): + return tf.math.in_top_k(predictions=predictions, targets=targets, k=k) model, inputs, outputs = build_model pred_values = random_gen(shape, rand_min=-2, rand_max=2) target_values = np.random.randint(class_num, size=batch_size).astype(np.int32) input_values = [pred_values, target_values] + if k is None: + input_values.append(np.random.randint(low=1, high=shape[-1], size=1, dtype=np.int32)) + + input_dict = dict(zip(inputs, input_values)) + TensorFlowBaseTest.run_compare_tf( + model, + input_dict, + outputs, + compute_unit=compute_unit, + backend=backend, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, rank, dynamic", + itertools.product( + compute_units, + backends, + (1, 3, 5), + (True, False), + ), + ) + def test_sort(self, compute_unit, backend, rank, dynamic): + """ + tf.sort dispatches to tf.math.top_k, and k = size of the axis to be sorted + """ + if backend[0] == "mlprogram" and dynamic: + pytest.xfail( + "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" + ) + + # Here we test the conversion of tf.sort(x, axis=0) + # If dynamic, we prepend None to x shape as the dynamic shape axis + if rank == 5 and dynamic: + rank -= 1 + shape = tuple(np.random.randint(low=3, high=8, size=rank)) + + tf_input_shape = (None,) + shape if dynamic else shape + @make_tf_graph([tf_input_shape]) + def build_model(x): + return tf.sort(x, axis=0) + + model, inputs, outputs = build_model + + if dynamic: + input_values = [random_gen((5,) + shape, rand_min=-100, rand_max=100)] + else: + input_values = [random_gen(shape, rand_min=-100, rand_max=100)] input_dict = dict(zip(inputs, input_values)) TensorFlowBaseTest.run_compare_tf( @@ -5670,6 +5744,7 @@ def build_model(predictions, targets): backend=backend, ) + class TestConcat(TensorFlowBaseTest): @pytest.mark.parametrize( "compute_unit, backend, op_version, rank, num_inputs", diff --git 
a/coremltools/converters/mil/frontend/torch/converter.py b/coremltools/converters/mil/frontend/torch/converter.py index 4f5b9d7fc..a58401131 100644 --- a/coremltools/converters/mil/frontend/torch/converter.py +++ b/coremltools/converters/mil/frontend/torch/converter.py @@ -12,10 +12,13 @@ from coremltools import _logger as logger from coremltools._deps import _HAS_TORCH_EXPORT_API +from coremltools.converters.mil import mil from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target from coremltools.converters.mil.input_types import ImageType, InputType, TensorType from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, Placeholder, Program, types +from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource from coremltools.converters.mil.mil.types import is_float from coremltools.converters.mil.mil.var import Var @@ -28,6 +31,7 @@ flatten_graph_input_values, flatten_graph_output_values, generate_tensor_assignment_ops, + populate_native_const_model_hierarchy, remove_getattr_nodes, transform_inplace_ops, ) @@ -57,12 +61,17 @@ def _convert_to_torch_inputtype(inputs: List[TensorType]) -> List[TensorType]: raise ValueError("Unknown type {} for conversion to InputType.".format(type(_input))) return input_type + class QuantizationContext: """ Utilities to manage information pertaining to quantization of tensors in a PyTorch graph. + + This is necessary only for TorchScript (not ExecuTorch) """ def __init__(self, context: "TranscriptionContext") -> None: + if context.frontend != TorchFrontend.TORCHSCRIPT: + raise ValueError("QuantizationContext is necessary only for TorchScript") self._context = context # Maps var name to tuple of (torch dtype, scale, zero_point) @@ -204,7 +213,8 @@ def __init__( self.frontend = frontend self._current_graph = [{}] self._torch_graph = None - self._quant_context = QuantizationContext(self) + if frontend == TorchFrontend.TORCHSCRIPT: + self._quant_context = QuantizationContext(self) @property def torch_graph(self): @@ -348,13 +358,12 @@ def __init__( self.outputs = outputs self.output_names = get_output_names(self.outputs) self.opset_version = _target(opset_version) if opset_version is not None else None - self.context = TranscriptionContext() - self._prog = Program() + self._prog = mil.Program() if isinstance(loaded_model, torch.jit.ScriptModule): - self.context.frontend = TorchFrontend.TORCHSCRIPT - self.graph, self.params_dict, self.buffer_dict = InternalTorchIRGraph.from_torchscript( - torchscript=loaded_model, input_values=self.inputs, cut_at_symbols=cut_at_symbols + self.context = TranscriptionContext(frontend=TorchFrontend.TORCHSCRIPT) + self.graph = InternalTorchIRGraph.from_torchscript( + torchscript=loaded_model, inputs=self.inputs, cut_at_symbols=cut_at_symbols ) # TODO (rdar://106161395): Register Torch IR passes and unify them into the pass pipeline. 
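# Context for the ScopeInfo / ScopeSource imports added above: ops built inside
# an mb.scope(...) context are tagged with their origin (for TorchScript, the
# producing module's type and name), which is what prog.validate(check_essential_scope=True)
# relies on later in this file. A minimal, self-contained sketch of the pattern;
# the program, op, and scope names below are illustrative, not taken from this diff.

import numpy as np

from coremltools.converters.mil.mil import Builder as mb
from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource


@mb.program(input_specs=[mb.TensorSpec(shape=(4,))])
def prog(x):
    with mb.scope(
        ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="Linear"),
        ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data="encoder.fc1"),
    ):
        # The add op created here carries the two scope entries defined above.
        y = mb.add(x=x, y=np.float32(1.0))
    return y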
@@ -365,14 +374,14 @@ def __init__( flatten_graph_output_values, remove_getattr_nodes, generate_tensor_assignment_ops, + populate_native_const_model_hierarchy, ] for p in passes: p(self.graph) elif _HAS_TORCH_EXPORT_API and isinstance(loaded_model, ExportedProgram): - self.context.frontend = TorchFrontend.EXIR + self.context = TranscriptionContext(frontend=TorchFrontend.EXIR) self.graph = InternalTorchIRGraph.from_exir(exir=loaded_model) - self.params_dict, self.buffer_dict = None, None else: raise ValueError( "Model should be an instance of either torch.jit.ScriptModule or ExportedProgram" @@ -452,13 +461,27 @@ def _create_placeholder( dtype = types.fp32 return mb.placeholder(shape, dtype=dtype) - @staticmethod - def _preprocess_input_vars(input_var): - if ( - types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) - ) and input_var.dtype == types.fp16: - input_var = mb.cast(x=input_var, dtype="fp32") - return input_var + def _add_const(self, name: str, val: Union[torch.Tensor, torch._C.ScriptObject]) -> None: + """Create a const op and add it to the graph.""" + if isinstance(val, torch._C.ScriptObject): + logger.info(f"Encountered constant {name} of type _torch._C.ScriptObject") + return + elif isinstance(val, torch.Tensor) and val.is_quantized: + const = _dequantized_weight(val.cpu(), name) + self.context.add(const) + return + elif not isinstance(val, torch.Tensor): + raise ValueError(f"unsupported class for {name} in PyTorch graph: {type(val)}") + val = val.detach().cpu().numpy() + # TODO (rdar://107718371): support uint8 activation quantization in torchscript + # Some torchscript models store indices with uint8, which are unrelated to quantization and + # need to be cast to int32 since many non-quantized Core ML ops do not support int8. + # We need a way to distinguish whether an uint8 is quantization (so should be kept) + # or not (so should be cast to int32). + if self.context.frontend == TorchFrontend.TORCHSCRIPT and val.dtype == np.uint8: + val = val.astype(np.int32) + const = mb.const(val=val, name=name) + self.context.add(const) def check_ops(self): """ @@ -469,24 +492,24 @@ def check_ops(self): def convert_const(self) -> None: for name, val in self.graph.params.items(): - if isinstance(val, torch._C.ScriptObject): - logger.info(f"Encountered constant {name} of type _torch._C.ScriptObject") - continue - elif isinstance(val, torch.Tensor) and val.is_quantized: - const = _dequantized_weight(val.cpu(), name) - self.context.add(const) - continue - elif not isinstance(val, np.ndarray): - raise ValueError(f"unsupported class for {name} in PyTorch graph: {type(val)}") - # TODO (rdar://107718371): support uint8 quantization - # Some torch models store indices with uint8, which are unrelated to quantization and - # need to be cast to int32 since Core ML does not support int8. - # We need a way to distinguish whether an uint8 is quantization (so should be kept) - # or not (so should be cast to int32). 
- if val.dtype == np.uint8: - val = val.astype(np.int32) - const = mb.const(val=val, name=name) - self.context.add(const) + if self.context.frontend == TorchFrontend.TORCHSCRIPT: + scope_name, scope_type = self.graph.params_scope[name] + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=scope_type), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=scope_name), + ): + self._add_const(name, val) + elif self.context.frontend == TorchFrontend.EXIR: + # ExecuTorch has constants lifted as inputs, yet we have not sorted out + # how to support IO metadata, so for now just put a dummy metadata + # since inputs/constants will not contribute to debugging/profiling + # TODO (rdar://125572392): Support torch.export IO metadata + with mb.scope( + ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None]), + ): + self._add_const(name, val) + else: + raise ValueError(f"Invalid PyTorch frontend {self.context.frontend}") def convert(self) -> Program: logger.info("Converting graph.") @@ -499,7 +522,6 @@ def convert(self) -> Program: # This will hold the converted model. prog = self._prog - prog.set_main_input_types(tuple(self.inputs)) # Construct placeholder for input to SSA function ssa_func_inputs = OrderedDict() @@ -517,7 +539,39 @@ def convert(self) -> Program: internal_names = list(self.graph.inputs.keys()) internal_names.extend(user_names[len(internal_names) :]) for torch_name, ssa_name in zip(internal_names, user_names): - input_var = self._preprocess_input_vars(ssa_func.inputs[ssa_name]) + input_var = ssa_func.inputs[ssa_name] + if self.context.frontend == TorchFrontend.TORCHSCRIPT: + # To create fp16 Core ML model from fp32 torch model, we + # 1. Cast input to fp32 (if specified fp16 input) + # 2. Convert fp32 torch model to fp32 Core ML model + # 3. Graph passes `add_fp16_cast` and `cast_optimization` + # then cast fp32 Core ML model to fp16 + # So here we perform the "cast input to fp32" step + if ( + types.is_tensor(input_var.sym_type) or types.is_scalar(input_var.sym_type) + ) and input_var.dtype == types.fp16: + # This cast should have placeholder scope + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="placeholder" + ), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=torch_name), + ): + input_var = mb.cast(x=input_var, dtype="fp32") + elif self.context.frontend == TorchFrontend.EXIR: + # EXIR has dtypes all determined, so for now we just stick to EXIR dtypes + # TODO (rdar://115845792): Handle fp16 IO dtypes + # When handle user provided IO dtypes, we will also need to handle IO metadata + # TODO (rdar://125572392): Support torch.export IO metadata + if ( + input_var.dtype == types.fp16 + and not is_current_opset_version_compatible_with(_target.iOS16) + ): + raise ValueError( + "To use fp16 input, please set minimum deployment target to iOS16+" + ) + else: + raise ValueError(f"Invalid PyTorch frontend {self.context.frontend}") self.context.add(input_var, torch_name=torch_name) # Convert constants @@ -554,5 +608,21 @@ def convert(self) -> Program: # is True. 
Make the default output type to fp16 self._adjust_default_output_to_fp16(graph_outputs) if self.outputs is not None: - prog.set_main_output_types(self.outputs) + prog.functions["main"].set_output_types(self.outputs) + + prog.functions["main"].set_input_types(tuple(self.inputs)) + + # Make sure the prog is not missing any scope information + essential_scope_sources = [] + if self.context.frontend == TorchFrontend.TORCHSCRIPT: + essential_scope_sources = [ + ScopeSource.TORCHSCRIPT_MODULE_NAME, + ScopeSource.TORCHSCRIPT_MODULE_TYPE, + ] + elif self.context.frontend == TorchFrontend.EXIR: + essential_scope_sources = [ScopeSource.EXIR_DEBUG_HANDLE] + else: + raise ValueError(f"Invalid PyTorch frontend {self.context.frontend}") + prog._add_essential_scope_source(essential_scope_sources) + prog.validate(check_essential_scope=True) return prog diff --git a/coremltools/converters/mil/frontend/torch/internal_graph.py b/coremltools/converters/mil/frontend/torch/internal_graph.py index bb3cacdee..f22f32c97 100644 --- a/coremltools/converters/mil/frontend/torch/internal_graph.py +++ b/coremltools/converters/mil/frontend/torch/internal_graph.py @@ -4,20 +4,20 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from collections import OrderedDict +from typing import Any, Dict, List, Optional, Tuple, Union +import numpy as np import torch -import torch.fx -import torch.fx.immutable_collections -import torch.export from coremltools import _logger as logger +from coremltools.converters.mil.input_types import TensorType from .utils import TORCH_DTYPE_TO_NUM, sanitize_op_kind from .exir_utils import extract_inputs_from_exir_program from .torchscript_utils import _expand_and_optimize_ir -def _make_ssa_name(name): +def _make_ssa_name(name: str) -> str: """ Converts a symbol name (string) into an SSA name, by prepending '%'. Only used for pretty printing the graph. @@ -27,7 +27,7 @@ def _make_ssa_name(name): return "%" + name -def _ssa_name_list(names): +def _ssa_name_list(names: List[str]) -> List[str]: """ Take a list of symbol names (strings) and return them as SSA names. Only used for pretty printing the graph. @@ -35,7 +35,7 @@ def _ssa_name_list(names): return [_make_ssa_name(x) for x in names] -def _find_new_name(old_name, node_names): +def _find_new_name(old_name: str, node_names: List[str]) -> str: """ Disambiguate a node's name from a list of existing node names by adding successively larger integers. @@ -48,7 +48,7 @@ def _find_new_name(old_name, node_names): return new_name -def _replace_in_list(ls, old_val, new_val): +def _replace_in_list(ls: List[Any], old_val: Any, new_val: Any) -> None: """Helper function to replace a value in a list.""" try: idx = ls.index(old_val) @@ -63,11 +63,17 @@ class InternalTorchIRBlock: coremltools internal representation of a torch IR block. """ - def __init__(self, parent=None, nodes=None, inputs=None, outputs=None): + def __init__( + self, + parent: Optional["InternalTorchIRNode"] = None, + nodes: Optional[List["InternalTorchIRNode"]] = None, + inputs: Optional[List[str]] = None, + outputs: Optional[List[str]] = None, + ): """ Arguments: parent: The InternalTorchIRNode this block belongs to. - nodes: list of InternalTorchIRNodes in the block + nodes: list of InternalTorchIRNode in the block inputs: list of input symbols. outputs: list of output symbols. 
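_find_new_name above only gains type annotations in this change; its documented behavior (disambiguate by appending successively larger integers) is the usual counter-suffix scheme. A rough sketch of that behavior, written from the docstring rather than copied from the source:

from typing import List


def find_new_name(old_name: str, node_names: List[str]) -> str:
    # Keep incrementing a numeric suffix until the candidate name is unused.
    if old_name not in node_names:
        return old_name
    suffix = 1
    while f"{old_name}.{suffix}" in node_names:
        suffix += 1
    return f"{old_name}.{suffix}"


print(find_new_name("x", ["x", "x.1"]))  # x.2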
""" @@ -152,13 +158,15 @@ class InternalTorchIRNode: def __init__( self, - kind, - inputs, - outputs, - name=None, - parent=None, - attr=None, - blocks=None, + kind: str, + inputs: List[str], + outputs: List[str], + name: Optional[str] = None, + parent: Optional[Union["InternalTorchIRGraph", "InternalTorchIRBlock"]] = None, + attr: Optional[Dict[str, Any]] = None, + blocks: Optional[List["InternalTorchIRBlock"]] = None, + model_hierarchy: Optional[str] = None, + meta: Optional[Dict] = None, ): """ Arguments: @@ -169,6 +177,8 @@ def __init__( parent: The InternalTorchIRGraph/Block this node belongs to. attr: dict of named attributes. blocks: list of InternalTorchIRBlock. + model_hierarchy: str represents TorchScript node's model hierarchy. + meta: A dictionary of torch fx node metadata inherited from torch.fx.Node.meta """ if not name and not outputs: self.name = "" @@ -181,6 +191,8 @@ def __init__( self.parent = parent self.attr = attr if attr is not None else {"value": None} self.blocks = blocks if blocks is not None else [] + self.model_hierarchy = model_hierarchy + self.meta = meta @classmethod def from_torchscript_node(cls, node, parent): @@ -211,6 +223,7 @@ def from_torchscript_node(cls, node, parent): outputs=outputs, attr=attr, blocks=None, + model_hierarchy=node.getModuleHierarchy(), ) internal_node.blocks = [ InternalTorchIRBlock.from_torchscript_block(block=b, parent=internal_node) @@ -267,6 +280,7 @@ def get_arguments(alist): parent=None, attr=None, blocks=None, + meta=node.meta, ) def __str__(self, indent=2): @@ -297,13 +311,89 @@ def replace_name(self, old_name, new_name): for block in self.blocks: block.replace_name(old_name, new_name) + def get_scope_info(self) -> Tuple[List[str], List[str]]: + """ + Get the scope information (``scope_name``, ``scope_type``) of a TorchScript node. + In a TorchScript node, a model hierarchy is represented in a string of format: + ``scope_name_1(scope_type_1).scope_name_2(scope_type_1).<...>.scope_name_n(scope_type_n)`` + For instance, given a torch model: + + class SubModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear_1 = torch.nn.Linear(2, 3) + + def forward(self, x): + x_1 = self.linear(x) + x_2 = torch.relu(x_1) + return x_2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.submodule_1 = SubModule() + + def forward(self, x): + return self.submodule_1(x) + + The model hierarchy of ``x_1`` is ``submodule_1(SubModule).linear_1(Linear)``, + and ``x_2`` has ``submodule_1(SubModule)``. + + We consider the ``node.name`` as the most inner ``scope_name``, and + ``node.kind`` (aten op type) as the most inner ``scope_type``. + + ``x_1`` results in: + { + "scope_name": ["submodule_1", "linear_1", "x_1"], + "scope_type": ["SubModule", "Linear", "linear"], + }, + and ``x_2`` gets: + { + "scope_name": ["submodule_1", "x_2"], + "scope_type": ["SubModule", "relu"], + }. + + Note that, for the model weight const ops, the names are in the following format: + "submodule_1.linear_1.weight", which would result in a long ``scope_name``: + ``["submodule_1", "linear_1", "submodule_1.linear_1.weight"]``. 
+ This function does a special handling to trim it to: + ``["submodule_1", "linear_1", "weight"]`` + """ + + def _trim_scopename_for_weight(scope_names: List[str]) -> List[str]: + weight_name = scope_names[-1] + if scope_names[:-1] != weight_name.split(".")[:-1]: + return scope_names + scope_names[-1] = weight_name.split(".")[-1] + return scope_names + + if self.model_hierarchy == "" or self.model_hierarchy is None: + scopes = [] + else: + scopes = self.model_hierarchy.split(".") + scope_names, scope_types = [], [] + for val in scopes: + if val == "": + scope_names.append("UNKNOWN_SCOPE_NAME") + scope_types.append("UNKNOWN_SCOPE_TYPE") + continue + if val.count("(") != 1 or val.count(")") != 1: + raise ValueError(f"{val} is not a valid model hierarchy string.") + lower_idx, upper_idx = val.index("("), val.index(")") + scope_names.append(val[:lower_idx]) + scope_types.append(val[lower_idx + 1 : upper_idx]) + scope_names.append(self.name) + scope_types.append(self.kind) + if self.kind == "getattr": + scope_names = _trim_scopename_for_weight(scope_names) + return scope_names, scope_types class InternalTorchIRGraph: """ - CoreML internal representation of a torch IR graph. A torch._C.Graph + Core ML internal representation of a torch IR graph. A torch._C.Graph object is not an ideal structure to use in converting to CoreML. Conversion to an InternalTorchIRGraph is inserted between the original graph and the - final CoreML model to address several issues: + final Core ML model to address several issues: 1. A torch._C.graph is hard to work with. For example, its .inputs() and .outputs() functions return iterators, so the only way to determine the number of inputs/outputs is by counting to the end. @@ -322,33 +412,35 @@ class InternalTorchIRGraph: def __init__( self, - params, - inputs, - outputs, - nodes=None, + params: Dict[str, np.ndarray], + inputs: Dict[str, TensorType], + outputs: List[str], + nodes: Optional[List["InternalTorchIRNode"]] = None, + buffers: Optional[Dict[str, torch.Tensor]] = None, ): """ Arguments: params: dict mapping parameter names to their numpy value. - inputs: OrderedDict mapping input names to their example values. + inputs: OrderedDict mapping input names to their input types. outputs: list[str], list of outputs from the graph. - nodes: list of InternalTorchIRNodes in the graph. + nodes: list of InternalTorchIRNode in the graph. + buffers: Dict mapping torch model buffers to their names. """ self.nodes = nodes self.params = params self.inputs = inputs self.outputs = outputs + self.buffers = buffers + self.params_scope = {} @classmethod - def from_torchscript(cls, torchscript, input_values=None, cut_at_symbols=None): + def from_torchscript(cls, torchscript, inputs=None, cut_at_symbols=None): """ Arguments: torchscript: TorchScript object representing the model to convert. - input_values: A list of inputs to the graph. Must be given is - @raw_graph if not None. + inputs: A list of input types to the graph. cut_at_symbols: The list of desired outputs from the graph. Symbols - must be present in the graph. For debugging use only. Can only - be given if @raw_graph is not None. + must be present in the graph. For debugging use only. 
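The parsing in get_scope_info above is the piece that turns a TorchScript module-hierarchy string into parallel scope-name/scope-type lists. A standalone sketch of the same parsing, runnable outside the converter (weight-name trimming and the trailing node name/kind entries are omitted for brevity):

from typing import List, Tuple


def parse_model_hierarchy(hierarchy: str) -> Tuple[List[str], List[str]]:
    """Split 'name_1(Type_1).name_2(Type_2)' into scope names and scope types."""
    scope_names, scope_types = [], []
    for part in hierarchy.split(".") if hierarchy else []:
        if part == "":
            scope_names.append("UNKNOWN_SCOPE_NAME")
            scope_types.append("UNKNOWN_SCOPE_TYPE")
            continue
        lower, upper = part.index("("), part.index(")")
        scope_names.append(part[:lower])
        scope_types.append(part[lower + 1 : upper])
    return scope_names, scope_types


print(parse_model_hierarchy("submodule_1(SubModule).linear_1(Linear)"))
# (['submodule_1', 'linear_1'], ['SubModule', 'Linear'])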
""" if not isinstance(torchscript, torch.jit.ScriptModule): raise AssertionError( @@ -367,34 +459,21 @@ def from_torchscript(cls, torchscript, input_values=None, cut_at_symbols=None): ) nodes = [] - params = {} - inputs = OrderedDict() + inputs_name_to_type = OrderedDict() outputs = [] - raw_graph, params_dict, buffer_dict = _expand_and_optimize_ir(torchscript) - - # Add params - for name, param in params_dict.items(): - if isinstance(param, torch.Tensor): - if param.is_quantized: - value = param - else: - value = param.detach().cpu().numpy() - else: - value = param - params[name] = value + raw_graph, params, buffers = _expand_and_optimize_ir(torchscript) # Add inputs # The first element of the raw_graph.inputs() is the 'self' of the module, which is not used. graph_inputs = list(raw_graph.inputs())[1:] - if len(graph_inputs) != len(input_values): - raise ValueError( - f"Number of TorchScript inputs ({len(graph_inputs)}) must match the user provided inputs ({len(input_values)})." - ) + if len(graph_inputs) != len(inputs): + raise ValueError( + f"Number of TorchScript inputs ({len(graph_inputs)}) must match the user provided inputs ({len(inputs)})." + ) for index, _input in enumerate(graph_inputs): name = _input.debugName() - value = input_values[index] - inputs[name] = value + inputs_name_to_type[name] = inputs[index] # Add outputs, cutting if @cut_at_symbols is set output_names = cut_at_symbols @@ -403,10 +482,16 @@ def from_torchscript(cls, torchscript, input_values=None, cut_at_symbols=None): for output in output_names: outputs.append(output) - internal_graph = cls(nodes=nodes, params=params, inputs=inputs, outputs=outputs) + internal_graph = cls( + nodes=nodes, + params=params, + inputs=inputs_name_to_type, + outputs=outputs, + buffers=buffers, + ) - node_names = set() # Add nodes + node_names = set() for raw_node in raw_graph.nodes(): new_node = InternalTorchIRNode.from_torchscript_node( node=raw_node, parent=internal_graph @@ -416,10 +501,24 @@ def from_torchscript(cls, torchscript, input_values=None, cut_at_symbols=None): internal_graph.nodes.append(new_node) node_names.add(new_node.name) - return internal_graph, params_dict, buffer_dict + internal_graph._cache_model_hierarchy_for_params() + + return internal_graph + + def _cache_model_hierarchy_for_params(self) -> None: + # We cache the model hierarchy information for model weights in self.params_scope, + # since self.params doesn't contain the information. 
+ def cache_model_hierarchy_block(block): + for node in block.nodes: + for b in node.blocks: + cache_model_hierarchy_block(b) + if node.name in self.params: + self.params_scope[node.name] = node.get_scope_info() + cache_model_hierarchy_block(self) @classmethod - def from_exir(cls, exir: torch.export.ExportedProgram): + def from_exir(cls, exir): + # exir: torch.export.ExportedProgram exported_program = exir nodes = [] @@ -436,19 +535,29 @@ def from_exir(cls, exir: torch.export.ExportedProgram): inputs_to_buffers = exported_program.graph_signature.inputs_to_buffers inputs_to_consts = {**inputs_to_parameters, **inputs_to_buffers} - - parameters_to_inputs = { + consts_to_inputs = { v: k if not k.startswith("%") else k[1:] for k, v in inputs_to_consts.items() } # Add params for name, param in exported_program.state_dict.items(): - if isinstance(param, torch.Tensor): - value = param.detach().cpu().numpy() + if not isinstance(param, torch.Tensor): + raise NotImplementedError( + f"For ExecuTorch paramter, only support torch.Tensor, but got {type(param)}" + ) + params[name if name not in consts_to_inputs else consts_to_inputs[name]] = param + # Non-persistent buffers may be missing from state_dict, but we still need their values + # Reference: https://github.com/pytorch/executorch/pull/1802 + for name, buffer in zip(exported_program.graph_signature.buffers, exported_program.buffers()): + if not isinstance(buffer, torch.Tensor): + raise NotImplementedError( + f"For ExecuTorch buffer, only support torch.Tensor, but got {type(buffer)}" + ) + params_name = consts_to_inputs[name] + if params_name in params: + assert torch.equal(params[params_name], buffer) else: - raise NotImplementedError("Only torch.Tensor handled yet") - - params[name if name not in parameters_to_inputs else parameters_to_inputs[name]] = value + params[params_name] = buffer graph_module = exported_program.graph_module graph = graph_module.graph @@ -466,7 +575,7 @@ def from_exir(cls, exir: torch.export.ExportedProgram): # e.g. higher-level callables such as "call_delegate" if not isinstance(attr, torch.Tensor): raise NotImplementedError("Only torch.Tensor attr handled yet") - params[name] = attr.detach().cpu().numpy() + params[name] = attr elif node.op == "placeholder": continue elif node.op == "output": diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py index e6694be13..febb96586 100644 --- a/coremltools/converters/mil/frontend/torch/ops.py +++ b/coremltools/converters/mil/frontend/torch/ops.py @@ -17,6 +17,8 @@ from coremltools import _logger as logger from coremltools.converters.mil._deployment_compatibility import AvailableTarget as target +from coremltools.converters.mil.frontend import _utils +from coremltools.converters.mil.frontend._utils import dynamic_topk from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Symbol, types from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with @@ -25,6 +27,7 @@ promote_input_dtypes, solve_slice_by_index_shape, ) +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource from coremltools.converters.mil.mil.types import is_bool, nptype_from_builtin from coremltools.converters.mil.mil.types.symbolic import any_symbolic, is_symbolic from coremltools.converters.mil.mil.types.type_mapping import builtin_to_string @@ -75,35 +78,64 @@ def convert_nodes(context, graph): graph: An InternalTorchIRGraph or InternalTorchIRBlock object. 
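The buffer loop added above exists because non-persistent buffers can be missing from state_dict while still being reachable through exported_program.buffers(). A simplified sketch of that merge, with plain dicts standing in for the ExportedProgram pieces (the real code also remaps names through consts_to_inputs):

import torch


def merge_exir_consts(state_dict, buffer_names, buffer_values):
    """Collect params from state_dict, then add buffers that state_dict is missing."""
    params = dict(state_dict)
    for name, value in zip(buffer_names, buffer_values):
        if name in params:
            # Persistent buffer: already present, the two sources must agree.
            assert torch.equal(params[name], value)
        else:
            # Non-persistent buffer: only available via buffers().
            params[name] = value
    return params


merged = merge_exir_consts(
    {"weight": torch.ones(2)},
    ["weight", "running_mean"],
    [torch.ones(2), torch.zeros(2)],
)
print(sorted(merged))  # ['running_mean', 'weight']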
""" for node in _tqdm(graph.nodes, desc="Converting PyTorch Frontend ==> MIL Ops", unit=" ops"): - op_lookup = node.kind - add_op = _TORCH_OPS_REGISTRY.get_func(op_lookup) - if add_op is None: - if re.match(r".*_dynamic", op_lookup): - raise RuntimeError( - f"PyTorch convert function for op '{op_lookup}' not implemented.\n" - "Dynamic quantized models are not supported by Core ML.\n" - "Please use static quantization or the APIs in coremltools.optimize to quantize/compress models." - ) - else: - raise RuntimeError( - f"PyTorch convert function for op '{op_lookup}' not implemented." - ) + try: + convert_single_node(context, node) + except Exception as e: + scope_names = node.get_scope_info()[0] + op_location = '/'.join(scope_names[:-1]) + logger.error(f"\n\nERROR - converting '{node.kind}' op (located at: '{op_location}'):\n") + raise e # re-raise exception + + if _all_outputs_present(context, graph): + # We've generated all the outputs the graph needs, terminate conversion. + break + + +def convert_single_node(context, node): + """ + Converts a single lowered PyTorch op to MIL. - logger.info("Converting op {} : {}".format(node.name, op_lookup)) + Arguments: + context: A TranscriptionContext object to pull node inputs and + assign node outputs. + node: lowered PyTorch op to convert. + """ + op_lookup = node.kind + add_op = _TORCH_OPS_REGISTRY.get_func(op_lookup) + if add_op is None: + if re.match(r".*_dynamic", op_lookup): + raise RuntimeError( + f"PyTorch convert function for op '{op_lookup}' not implemented.\n" + "Dynamic quantized models are not supported by Core ML.\n" + "Please use static quantization or the APIs in coremltools.optimize to quantize/compress models." + ) + else: + raise RuntimeError( + f"PyTorch convert function for op '{op_lookup}' not implemented." + ) + + logger.info("Converting op {} : {}".format(node.name, op_lookup)) + + scopes = [] + if context.frontend == TorchFrontend.TORCHSCRIPT: + scope_name, scope_type = node.get_scope_info() + scopes = [ + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=scope_type), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=scope_name), + ] + elif context.frontend == TorchFrontend.EXIR: + scopes = [ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[node.meta["debug_handle"]])] + else: + raise ValueError(f"Invalid PyTorch frontend {context.frontend}") + with mb.scope(*scopes): if context.frontend == TorchFrontend.TORCHSCRIPT: context.quant_context.maybe_handle_quantized_inputs(node) context.prepare_for_conversion(node) - add_op(context, node) - if _TORCH_OPS_REGISTRY.is_inplace_op(op_lookup): context.process_inplace_op(node) - # We've generated all the outputs the graph needs, terminate conversion. - if _all_outputs_present(context, graph): - break - def convert_block(context, block, inputs): """Convert a block (sub-graph) to MIL. 
Conversion happens within a new @@ -177,7 +209,7 @@ def get_bindings(alist) -> List[Any]: raise NotImplementedError(f"Binding of inputs of type {type(i)} not handled yet") return results - + def check_if_number_of_inputs_expected(num_inputs: int, expected: Union[int, List, Tuple]) -> None: expected = [expected] if isinstance(expected, int) else expected if num_inputs not in expected: @@ -186,7 +218,7 @@ def check_if_number_of_inputs_expected(num_inputs: int, expected: Union[int, Lis node.name, node.kind, num_inputs, expected ) ) - + def check_if_number_of_inputs_more_than_min_expected(num_inputs: int, min_expected: int) -> None: if num_inputs < min_expected: raise ValueError( @@ -242,7 +274,16 @@ def _is_const(var, optional=False): return True if isinstance(var, np.ndarray): return True - return var is not None and (var.val is not None or var.op.op_type.startswith("constexpr_")) + return ( + var is not None + and isinstance(var, Var) + and var.op is not None + and ( + var.op.op_type.startswith("constexpr_") + or (var.op.op_type == "dequantize" and var.op.can_materialize_val()) + or var.val is not None + ) + ) def _create_linear_layer(x, w, bias): """ @@ -287,6 +328,15 @@ def _construct_constant(val, name): return mb.const(val=val, name=name) +@register_torch_op +def native_dropout(context, node): + if context.frontend == TorchFrontend.EXIR: + inputs = _get_inputs(context, node, min_expected=2) + context.add((inputs[0],), node.name) + else: + raise ValueError(f"native_dropout should only appear in EXIR, but got {context.frontend}") + + @register_torch_op def affine_grid_generator(context, node): # rdar://73165386 (Improve error handling of coremltools "affine" op PyTorch conversion.) @@ -822,9 +872,12 @@ def pixel_unshuffle(context, node): @register_torch_op(torch_alias=["bmm", "mm"]) def matmul(context, node): inputs = _get_inputs(context, node, expected=2) - if inputs[1].val is not None and \ - len(inputs[1].shape) == 2 and len(inputs[0].shape) <= 3: - res = mb.linear(x=inputs[0], weight=_np.transpose(inputs[1].val), name=node.name) + if (len(inputs[1].shape) == 2 and len(inputs[0].shape) <= 3) and ( + _is_const(inputs[1]) or inputs[1].is_descendant_of_const + ): + linear_x, weight = inputs + transposed_weight = mb.transpose(x=weight, perm=(1, 0)) + res = mb.linear(x=linear_x, weight=transposed_weight, name=node.name) else: x, y = promote_input_dtypes([inputs[0], inputs[1]]) res = mb.matmul(x=x, y=y, name=node.name) @@ -1403,9 +1456,9 @@ def max_pool3d(context, node): @register_torch_op def minimum(context, node): inputs = _get_inputs(context, node, expected=2) + x, y = promote_input_dtypes(inputs) assert len(node.outputs) == 1 - x = context[node.inputs[0]] - y = context[node.inputs[1]] + out = mb.minimum(x=x, y=y, name=node.name) context.add(out) @@ -1420,9 +1473,9 @@ def clamp_min(context, node): @register_torch_op def maximum(context, node): inputs = _get_inputs(context, node, expected=2) + x, y = promote_input_dtypes(inputs) assert len(node.outputs) == 1 - x = context[node.inputs[0]] - y = context[node.inputs[1]] + out = mb.maximum(x=x, y=y, name=node.name) context.add(out) @@ -1532,7 +1585,7 @@ def sub(context, node): ] ) def mean(context, node): - inputs = _get_inputs(context, node) + inputs = _get_inputs(context, node, min_expected=1) x = inputs[0] if types.is_bool(x.dtype): @@ -1542,7 +1595,7 @@ def mean(context, node): kwargs = {"x": x, "name": node.name} # @axes is optional, so omit if None. 
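The matmul rewrite above routes a rank-2 constant (or const-descended) right operand through mb.linear, which expects the weight pre-transposed because linear computes x @ weight.T. A quick numpy check of the equivalence the rewrite relies on:

import numpy as np


def linear(x, weight):
    # Models the MIL linear op's documented behavior (bias omitted): y = x @ weight.T
    return x @ weight.T


x = np.random.rand(3, 4).astype(np.float32)
w = np.random.rand(4, 5).astype(np.float32)
assert np.allclose(x @ w, linear(x, weight=w.T))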
- axes = inputs[1] + axes = None if len(inputs) < 2 else inputs[1] if axes is not None: # @axes needs to be a list, but if only one axis was specified in the # model, it will be constructed as an int. Construct a new constant as a @@ -1718,7 +1771,7 @@ def _adaptive_pool1d(context, node, reduce_op): keep_dims=True ) pool_results.append(cur_result) - + context.add( mb.reshape( x=mb.concat(values=pool_results, axis=-1), @@ -3333,6 +3386,63 @@ def _loop_body(*loop_vars): context.add(output_var, torch_name=output_name) +@register_torch_op +def _unique2(context, node): + (x, sorted, return_inverse, return_counts) = _get_inputs(context, node, expected=4) + + # Unsupported case + if sorted.val is not True: + raise NotImplementedError("sorted=False not supported for unique op") + + x_flatten = mb.reshape(x=x, shape=[-1]) + + # Sort flattened input + indices = mb.argsort(x=x_flatten, ascending=True) + x_sorted = mb.gather_along_axis(x=x_flatten, indices=indices) + + # Subtract n_th+1 element from n_th element + neg_inf = np.float32(-np.inf) + x_sorted = mb.cast(x=x_sorted, dtype="fp32") + x_sorted_shifted = mb.pad(x=x_sorted, pad=[1, 0], constant_val=neg_inf) + x_sorted_padded = mb.pad(x=x_sorted, pad=[0, 1], mode="replicate") + diff = mb.sub(x=x_sorted_padded, y=x_sorted_shifted) + + # Get non-zero element after subtraction to determine unique values + non_zero_indices = mb.non_zero(x=diff) + unique_values_unsqueeze = mb.gather(x=x_sorted, indices=non_zero_indices) + unique_values = mb.squeeze(x = unique_values_unsqueeze) + + # Add unique values to output and see if we're done. + context.add(unique_values, torch_name=node.outputs[0]) + if return_counts.val is False and return_inverse.val is False: + # only the unique values are needed + return + + # Calculate a UxN boolean tensor, where: + # U - number of unique values + # N - number of input elements + num_unique_values = mb.shape(x=unique_values) + x_tile = mb.tile(x=x_flatten, reps=num_unique_values) + tile_shape = mb.concat(values=(num_unique_values, mb.shape(x=x_flatten)), axis=0) + x_tile = mb.reshape(x=x_tile, shape=tile_shape) + unique_values_unsqueeze = mb.cast(x=unique_values_unsqueeze, dtype="int32") + x_tile, unique_values_unsqueeze = promote_input_dtypes([x_tile, unique_values_unsqueeze]) + diff = mb.sub(x=x_tile, y=unique_values_unsqueeze) + bool_tensor = mb.logical_not(x=mb.cast(x=diff, dtype="bool")) + + if return_inverse.val is True: + # Get indices + range = mb.range_1d(start=0, end=mb.squeeze(x=num_unique_values), step=1) + indices = mb.matmul(x=range, y=mb.cast(x=bool_tensor, dtype="int32")) + indices = mb.reshape(x=indices, shape=mb.shape(x=x)) + context.add(indices, torch_name=node.outputs[1]) + + if return_counts.val is True: + # Get counts + counts = mb.reduce_sum(x=mb.cast(x=bool_tensor, dtype='int32'), axes=(-1,)) + context.add(counts, torch_name=node.outputs[2]) + + @register_torch_op(torch_alias=["if"]) def _if(context, node): """ In TorchIR, a conditional looks like: @@ -3959,9 +4069,10 @@ def full(context, node): @register_torch_op def full_like(context, node): - inputs = _get_inputs(context, node, expected=7) + inputs = _get_inputs(context, node, min_expected=2) x = inputs[0] val = inputs[1].val + if is_current_opset_version_compatible_with(target.iOS16): result = mb.fill_like(ref_tensor=x, value=val, name=node.name) else: @@ -4066,13 +4177,13 @@ def _avg_pool(context, node, inputs): kernel_sizes = inputs[1] - strides = None + strides = kernel_sizes # default strides = kernel sizes if len(inputs) > 2: - strides = ( - 
mb.const(val=kernel_sizes.val, name=inputs[2].name) - if inputs[2].op.op_type == "const" and (not list(inputs[2].val)) - else inputs[2] - ) + strides = inputs[2] + # TorchScript may give us empty stride, in such case + # we still default strides to kernel sizes, but name conform to TorchScript + if strides.op.op_type == "const" and (not list(strides.val)): + strides = mb.const(val=kernel_sizes.val, name=strides.name) pad_type = "custom" # Need to explicitly state L-R, T-B pad @@ -4082,7 +4193,7 @@ def _avg_pool(context, node, inputs): include_pad = True if len(inputs) < 6 else inputs[5].val - spatial_rank = len(pad) // 2 + spatial_rank = 0 if pad is None else len(pad) // 2 if spatial_rank > 2 and ceil_mode is True and list(strides.val) != [1] * len(strides.val): # since MIL does not support ceil_mode for 3D pool, # need to adjust padding values if ceil_mode is True @@ -4390,8 +4501,14 @@ def to(context, node): casted_input = torch.tensor(_input.val).type(torch_dtype).cpu().numpy() res = mb.const(val=casted_input, name=node.name) else: - if dtype in NUM_TO_DTYPE_STRING: - res = mb.cast(x=_input, dtype=NUM_TO_DTYPE_STRING[dtype], name=node.name) + dtype_str = NUM_TO_DTYPE_STRING[dtype] + valid_dtypes = ( + {"int8", "uint8", "int16", "uint16", "int32", "fp16", "fp32", "bool"} + if is_current_opset_version_compatible_with(target.iOS17) + else {"int32", "fp16", "fp32", "bool"} + ) + if dtype_str in valid_dtypes: + res = mb.cast(x=_input, dtype=dtype_str, name=node.name) else: # For dtype that is not supported by mb.cast, we do it in best-effort to cast it to int # or float based on the dtype. @@ -4648,7 +4765,7 @@ def meshgrid(context, node): ] ) def noop(context, node): - logger.info("Setting pytorch op: {} to no-op.".format(node)) + logger.info(f"Setting pytorch op: {node.kind} to no-op.") inputs = _get_inputs(context, node) _input = inputs[0] context.add(_input, torch_name=node.name) @@ -4892,8 +5009,7 @@ def _abs(context, node): @register_torch_op def repeat(context, node): - x = context[node.inputs[0]] - reps = context[node.inputs[1]] + x, reps = _get_inputs(context, node, expected=2) if isinstance(reps, list): reps = mb.concat(values=reps, axis=0) @@ -5334,21 +5450,15 @@ def where(context, node): return assert len(inputs) == 3 - cond = inputs[0] + cond, a, b = inputs + a, b = promote_input_dtypes([a, b]) if not types.is_bool(cond.dtype): # cond must be bool type cond = mb.cast(x=cond, dtype="bool") - if not any([any_symbolic(x.shape) for x in inputs[:3]]): + if not any([any_symbolic(x.shape) for x in (cond, a, b)]): # broadcast all tensors to the same shape - broadcast_inputs = _broadcast_tensors([cond, inputs[1], inputs[2]]) - result = mb.select( - cond=broadcast_inputs[0], - a=broadcast_inputs[1], - b=broadcast_inputs[2], - name=node.name, - ) - else: - result = mb.select(cond=cond, a=inputs[1], b=inputs[2], name=node.name) + cond, a, b = _broadcast_tensors([cond, a, b]) + result = mb.select(cond=cond, a=a, b=b, name=node.name) context.add(result) @@ -5366,17 +5476,6 @@ def neg(context, node): @register_torch_op def topk(context, node): - def dynamic_topk(x, k, axis, ascending): - assert k.val is None, "Please use mb.topk directly if k is compile time known" - indices = mb.argsort(x=x, axis=axis, ascending=ascending) - values = mb.gather_along_axis(x=x, indices=indices, axis=axis) - - k_indices = mb.range_1d(end=k, start=0, step=1) - values = mb.gather(x=values, indices=k_indices, axis=axis) - indices = mb.gather(x=indices, indices=k_indices, axis=axis) - - return values, indices - 
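The local dynamic_topk removed above now lives in frontend._utils (see the dynamic_topk import earlier in this diff); the approach is unchanged: when k is not known at compile time, do a full sort and gather the first k entries. A numpy sketch of the same idea (descending order, single axis assumed):

import numpy as np


def dynamic_topk(x, k, axis=-1, ascending=False):
    # Fall back to a full argsort, then gather the first k positions.
    order = np.argsort(x, axis=axis)
    if not ascending:
        order = np.flip(order, axis=axis)
    indices = np.take(order, np.arange(k), axis=axis)
    values = np.take_along_axis(x, indices, axis=axis)
    return values, indices


vals, idx = dynamic_topk(np.array([3.0, 1.0, 4.0, 1.5]), k=2)
print(vals, idx)  # [4. 3.] [2 0]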
inputs = _get_inputs(context, node) kwargs = {"name": node.name, "x": inputs[0], "k": inputs[1]} @@ -6447,33 +6546,6 @@ def _cast_bool_attn_mask(attn_mask: Var, query_var: Var) -> Var: compliment_of_mask = mb.mul(x=negative_inf, y=compliment_of_mask) return mb.add(x=mask, y=compliment_of_mask) - -def _lower_scaled_dot_product_attention(q: Var, k: Var, v: Var, mask: Var, name: str) -> Var: - # scale the query input - embed_size = q.shape[-1] - if is_symbolic(embed_size): - raise ValueError( - "The embedding size, i.e. last dimension of the shape of query tensor" - " cannot be symbolic, in scaled_dot_product_attention op" - ) - multiplicative_scale_factor = 1 / _math.sqrt(embed_size) - q = mb.mul(x=q, y=multiplicative_scale_factor) - - # multiply query and key input tensors - # shape of output: (target_seq, source_seq) or (B,...,target_seq, source_seq) - attn_weights = mb.matmul(x=q, y=k, transpose_y=True) - - # add mask if applicable - if mask is not None: - attn_weights = mb.add(x=attn_weights, y=mask) - - # do softmax - attn_weights_normalized = mb.softmax(x=attn_weights, axis=-1) - - # multiply attn_weights and value tensor - res = mb.matmul(x=attn_weights_normalized, y=v, name=name) - return res - @register_torch_op def scaled_dot_product_attention(context, node): """ @@ -6500,13 +6572,13 @@ def scaled_dot_product_attention(context, node): attn_mask = None if len(inputs) < 4 else inputs[3] dropout = 0.0 if len(inputs) < 5 else inputs[4] is_causal = False if len(inputs) < 6 else inputs[5].val - + # When len(inputs) == 7, the inputs are (q, k, v, attn_mask, dropout, is_causal, scale) if len(inputs) == 7 and inputs[6] is not None: raise NotImplementedError( "scaled_dot_product_attention op: scale parameter is not handled." ) - + if attn_mask is not None and is_causal: raise ValueError( "scaled_dot_product_attention op: attn_mask cannot be provided when is_causal is set to True." 
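The helper deleted above is not gone: the next hunk switches the call site to _utils._lower_scaled_dot_product_attention, and the math is the standard attention formula the old helper spelled out. A numpy sketch of that computation, assuming a static embedding size and an optional additive mask:

import numpy as np


def sdpa(q, k, v, mask=None):
    # softmax(q @ k.T / sqrt(d)) @ v, with an optional additive mask.
    d = q.shape[-1]
    attn = (q / np.sqrt(d)) @ np.swapaxes(k, -1, -2)
    if mask is not None:
        attn = attn + mask
    attn = np.exp(attn - attn.max(axis=-1, keepdims=True))
    attn = attn / attn.sum(axis=-1, keepdims=True)
    return attn @ v


q = k = v = np.random.rand(2, 4, 8).astype(np.float32)
print(sdpa(q, k, v).shape)  # (2, 4, 8)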
@@ -6534,7 +6606,7 @@ def scaled_dot_product_attention(context, node): else: mask = attn_mask - res = _lower_scaled_dot_product_attention(q, k, v, mask, node.name) + res = _utils._lower_scaled_dot_product_attention(q, k, v, mask, node.name) context.add(res) diff --git a/coremltools/converters/mil/frontend/torch/quantization_ops.py b/coremltools/converters/mil/frontend/torch/quantization_ops.py index d33b66877..236f428e4 100644 --- a/coremltools/converters/mil/frontend/torch/quantization_ops.py +++ b/coremltools/converters/mil/frontend/torch/quantization_ops.py @@ -3,21 +3,24 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + import numpy as _np import torch as _torch from coremltools import _logger as logger +from coremltools.converters.mil.frontend import _utils from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Var, types +from .ops import _create_linear_layer, _get_inputs, promote_input_dtypes +from .torch_op_registry import register_torch_op from .utils import ( NUM_TO_TORCH_DTYPE, TORCH_QTYPE_TO_NP_TYPE, TORCH_QTYPE_TO_STR, + TYPE_TO_DTYPE_STRING, TorchFrontend, ) -from .ops import _create_linear_layer, _get_inputs, promote_input_dtypes -from .torch_op_registry import register_torch_op def _quantize_general( @@ -72,10 +75,16 @@ def _quantize_general( axis=axis, ) context.add(result, node.name) - context.quant_context.add_quantization_info(node.name, torch_dtype, scale, zero_point, axis) + if context.frontend == TorchFrontend.TORCHSCRIPT: + context.quant_context.add_quantization_info(node.name, torch_dtype, scale, zero_point, axis) -@register_torch_op(torch_alias=["quantized_decomposed::quantize_per_tensor"]) +@register_torch_op( + torch_alias=[ + "quantized_decomposed::quantize_per_tensor", + "quantized_decomposed.quantize_per_tensor", + ] +) def quantize_per_tensor(context, node): inputs = _get_inputs( context, @@ -101,25 +110,81 @@ def quantize_per_channel(context, node): _quantize_general(context, node, input, scale, zero_point, torch_dtype, axis.val) -@register_torch_op(torch_alias=["quantized_decomposed::dequantize_per_tensor"]) -def dequantize(context, node): - context.quant_context.get_dequantized_var(node.inputs[0], node.name) +def _dequantize_general( + context, + node, + input: Var, + scale: Var, + zero_point: Var, + axis: Var = None, +) -> None: + # torch may use different dtype for input and zero_point, + # but Core ML requires input and zero_point to have a same dtype, + # so cast zero_point dtype to input dtype + if input.dtype != zero_point.dtype: + zero_point = mb.cast(x=zero_point, dtype=TYPE_TO_DTYPE_STRING[input.dtype]) + # Not sure why torch may quantize a scalar... 
does not make sense, + # since the floating point scale is as big as the original floating point input data scalar + if input.rank == 0: + # For const input, translate to the const floating point scalar output + if input.val is not None: + output_value = scale.val * (input.val - zero_point.val) + output = mb.const(val=output_value) + # For variable input, we have no choice but to expand and squeeze, + # since CoreML dequantize op requires tensor input + else: + expanded_input = mb.expand_dims(x=input, axes=(0,)) + dequantize_output = mb.dequantize( + input=expanded_input, + zero_point=zero_point, + scale=scale, + axis=axis, + ) + output = mb.squeeze(x=dequantize_output) + else: + output = mb.dequantize( + input=input, + zero_point=zero_point, + scale=scale, + axis=axis, + ) + context.add(output, node.name) -def _construct_constexpr_affine_op(quantized_weights, zero_point, scale, axis=None, name=None): - """Constructs the constexpr op to represent the dequantized weight from PyTorch's data.""" - if axis is None: - # It's per-tensor quantization, just use a dummy value for axis. - axis = _np.int32(0) - kwargs = { - "quantized_data": quantized_weights, - "zero_point": zero_point, - "scale": scale, - "axis": axis, - } - if name is not None: - kwargs["name"] = name - return mb.constexpr_affine_dequantize(**kwargs) +@register_torch_op( + torch_alias=[ + "quantized_decomposed::dequantize_per_tensor", + "quantized_decomposed.dequantize_per_tensor", + "quantized_decomposed::dequantize_per_channel", + "quantized_decomposed.dequantize_per_channel", + ] +) +def dequantize(context, node): + if context.frontend == TorchFrontend.TORCHSCRIPT: + context.quant_context.get_dequantized_var(node.inputs[0], node.name) + elif context.frontend == TorchFrontend.EXIR: + # ExecuTorch intends to use `min` and `max` to indicate quantization dtype, e.g. + # min = -64, max = 63, torch_dtype = torch.int8 + # means int4 quantization (torch_dtype = torch.int8 due to there is no torch.int4 yet) + # For now (2024-02-27), 2 issues preventing us from translating `min` and `max` + # 1. ExecuTorch has not fully added 4-bit quantization support yet, so no way to test + # 2. 
CoreML supports only 8-bit quantization yet, so no way to translate + # TODO(rdar://123421506): Translate `min` and `max` once the above 2 issues get resolved + inputs = _get_inputs(context, node, min_expected={TorchFrontend.EXIR: 6}) + num_inputs = len(inputs) + if num_inputs == 6: + input, scale, zero_point, min, max, torch_dtype_number = inputs + axis = None + elif num_inputs == 7: + input, scale, zero_point, axis, min, max, torch_dtype_number = inputs + else: + raise ValueError(f"dequantize should have 6 or 7 inputs, but got {num_inputs}") + _dequantize_general(context, node, input, scale, zero_point, axis) + else: + raise ValueError( + "dequantize is supported only in TorchScript and EXIR frontends, " + f"but got {context.frontend}" + ) def _dequantized_weight(qweight, name: str = None): @@ -132,7 +197,7 @@ def _dequantized_weight(qweight, name: str = None): scale = _np.float32(qweight.q_scale()) zero_point = quant_dtype_np(qweight.q_zero_point()) quantized_weights = _torch.int_repr(qweight).numpy() - dequant_weights = _construct_constexpr_affine_op( + dequant_weights = _utils._construct_constexpr_affine_op( quantized_weights, zero_point, scale, axis=None, name=name ) # per_channel_affine_float_qparams is same as per_channel_affine except that it @@ -158,7 +223,7 @@ def _dequantized_weight(qweight, name: str = None): zero_point = quant_dtype_np(val) quantized_weights = _torch.int_repr(qweight).numpy() axis = _np.int32(qweight.q_per_channel_axis()) - dequant_weights = _construct_constexpr_affine_op( + dequant_weights = _utils._construct_constexpr_affine_op( quantized_weights, zero_point, scale, axis=axis, name=name ) else: diff --git a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py index 49b85ef1f..9033fb5b0 100644 --- a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py +++ b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_tensor_assign_to_core.py @@ -22,7 +22,7 @@ def apply(self, prog): @block_context_manager def _torch_tensor_assign_to_core_block(block): - for op in block.operations[:]: + for op in list(block.operations): for b in op.blocks: _torch_tensor_assign_to_core_block(b) diff --git a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py index d8864f80d..6d76375aa 100644 --- a/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py +++ b/coremltools/converters/mil/frontend/torch/ssa_passes/torch_upsample_to_core_upsample.py @@ -36,7 +36,7 @@ def apply(self, prog): @block_context_manager def _torch_upsample_to_core_upsample_block(block): - for op in block.operations[:]: + for op in list(block.operations): for b in op.blocks: _torch_upsample_to_core_upsample_block(b) diff --git a/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py b/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py index 69ad8d47b..f128b8fdd 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py +++ b/coremltools/converters/mil/frontend/torch/test/test_executorch_e2e.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import itertools import pytest from coremltools._deps import _HAS_EXECUTORCH, 
_HAS_TORCH_VISION @@ -18,28 +19,72 @@ import timm import transformers +from coremltools.converters.mil import testing_reqs +from coremltools.converters.mil.mil.scope import ScopeSource + from .testing_utils import TorchBaseTest, TorchFrontend +backends = testing_reqs.backends +compute_units = testing_reqs.compute_units -class TestExecutorch(TorchBaseTest): - def test_mul(self): - class MulModule(torch.nn.Module): - def __init__(self) -> None: - super().__init__() +class TestExecutorchExampleModels(TorchBaseTest): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_mul(self, compute_unit, backend): + class MulModule(torch.nn.Module): def forward(self, input, other): return input * other - model = MulModule() - model.eval() - - self.run_compare_torch( + _, coreml_model, _, _, _, _ = self.run_compare_torch( [(3, 2), (3, 2)], - model, + MulModule(), + compute_unit=compute_unit, + backend=backend, frontend=TorchFrontend.EXIR, ) - def test_linear(self): + mil_program = coreml_model._mil_program + mul = mil_program.functions["main"].find_ops(op_type="mul")[0] + + debug_handle = mul.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] + assert isinstance(debug_handle, int) + + debug_handle_to_ops_mapping = mil_program.construct_debug_handle_to_ops_mapping() + assert debug_handle_to_ops_mapping.keys() == {debug_handle} + + ops = debug_handle_to_ops_mapping[debug_handle] + index_mul = 0 + indices_const = () + indices_cast = () + if backend[1] == "fp32": + assert len(ops) == 1 + index_mul = 0 + else: + # fp16 introduces additional io casts + # each cast introduces 1 const to store destination dtype + assert len(ops) == 7 + index_mul = 4 + indices_const = (0, 1, 5) + indices_cast = (2, 3, 6) + assert ops[index_mul] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + {"Type": "Operation", "Operation_Type": "mul", "Output": mul.outputs[0].name}, + ] + for index_const_cast in indices_const + indices_cast: + assert ops[index_const_cast][:-1] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + ] + for index_const in indices_const: + assert ops[index_const][-1]["Operation_Type"] == "const" + for index_cast in indices_cast: + assert ops[index_cast][-1]["Operation_Type"] == "cast" + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_linear(self, compute_unit, backend): class LinearModule(torch.nn.Module): def __init__(self): super().__init__() @@ -48,18 +93,58 @@ def __init__(self): def forward(self, arg): return self.linear(arg) - model = LinearModule() - model.eval() - - self.run_compare_torch( - [(3, 3)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + _, coreml_model, _, _, _, _ = self.run_compare_torch( + [(3, 3)], + LinearModule(), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_add(self): + mil_program = coreml_model._mil_program + linear = mil_program.functions["main"].find_ops(op_type="linear")[0] + + debug_handle = linear.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] + assert isinstance(debug_handle, int) + + debug_handle_to_ops_mapping = mil_program.construct_debug_handle_to_ops_mapping() + assert debug_handle_to_ops_mapping.keys() == {debug_handle} + + ops = debug_handle_to_ops_mapping[debug_handle] + index_linear = 0 + indices_const = () + indices_cast = () + if backend[1] == "fp32": + assert len(ops) == 3 + index_linear = 2 + indices_const = (0, 1) + else: + # 
fp16 introduces additional io casts + # each cast introduces 1 const to store destination dtype + assert len(ops) == 7 + index_linear = 4 + indices_const = (0, 1, 2, 5) + indices_cast = (3, 6) + assert ops[index_linear] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + {"Type": "Operation", "Operation_Type": "linear", "Output": linear.outputs[0].name}, + ] + for index_const_cast in indices_const + indices_cast: + assert ops[index_const_cast][:-1] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + ] + for index_const in indices_const: + assert ops[index_const][-1]["Operation_Type"] == "const" + for index_cast in indices_cast: + assert ops[index_cast][-1]["Operation_Type"] == "cast" + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_add(self, compute_unit, backend): class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - def forward(self, x, y): z = x + y z = z + x @@ -67,35 +152,135 @@ def forward(self, x, y): z = z + z return z - model = AddModule() - model.eval() - - self.run_compare_torch( - [(1,), (1,)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + _, coreml_model, _, _, _, _ = self.run_compare_torch( + [(1,), (1,)], + AddModule(), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_add_mul(self): + mil_program = coreml_model._mil_program + adds = mil_program.functions["main"].find_ops(op_type="add") + + debug_handles = [add.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] for add in adds] + for debug_handle in debug_handles: + assert isinstance(debug_handle, int) + + debug_handle_to_ops_mapping = mil_program.construct_debug_handle_to_ops_mapping() + assert debug_handle_to_ops_mapping.keys() == set(debug_handles) + + for add_index, debug_handle in enumerate(debug_handles): + add = adds[add_index] + ops = debug_handle_to_ops_mapping[debug_handle] + index_add = 0 + indices_const = () + indices_cast = () + if backend[1] == "fp32": + assert len(ops) == 1 + index_add = 0 + else: + # fp16 introduces additional io casts + # each cast introduces 1 const to store destination dtype + ADD_INDEX_TO_NUM_OPS = {0: 5, 1: 1, 2: 1, 3: 3} + ADD_INDEX_TO_OP_INDEX = {0: -1, 1: 0, 2: 0, 3: 0} + assert len(ops) == ADD_INDEX_TO_NUM_OPS[add_index] + index_add = ADD_INDEX_TO_OP_INDEX[add_index] + if add_index == 0: + indices_const = (0, 1) + indices_cast = (2, 3) + elif add_index == 3: + indices_const = (1,) + indices_cast = (2,) + assert ops[index_add] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + {"Type": "Operation", "Operation_Type": "add", "Output": add.outputs[0].name}, + ] + for index_const_cast in indices_const + indices_cast: + assert ops[index_const_cast][:-1] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + ] + for index_const in indices_const: + assert ops[index_const][-1]["Operation_Type"] == "const" + for index_cast in indices_cast: + assert ops[index_cast][-1]["Operation_Type"] == "cast" + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_add_mul(self, compute_unit, backend): class AddMulModule(torch.nn.Module): - def __init__(self): - super().__init__() - def forward(self, a, x, b): y = torch.mm(a, x) z = torch.add(y, b) return z - model = AddMulModule() - model.eval() - - self.run_compare_torch( + _, coreml_model, _, _, _, _ = 
self.run_compare_torch( [(2, 2), (2, 2), (2, 2)], - model, + AddMulModule(), + compute_unit=compute_unit, + backend=backend, frontend=TorchFrontend.EXIR, - backend=("mlprogram", "fp16"), ) - def test_softmax(self): - class LinearModule(torch.nn.Module): + mil_program = coreml_model._mil_program + matmul_or_add = {} + for op_type in ("matmul", "add"): + matmul_or_add[op_type] = mil_program.functions["main"].find_ops(op_type=op_type)[0] + + debug_handle = { + k: v.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] for k, v in matmul_or_add.items() + } + for v in debug_handle.values(): + assert isinstance(v, int) + + debug_handle_to_ops_mapping = mil_program.construct_debug_handle_to_ops_mapping() + assert debug_handle_to_ops_mapping.keys() == set(debug_handle.values()) + + ops = {} + for op_type in ("matmul", "add"): + ops[op_type] = debug_handle_to_ops_mapping[debug_handle[op_type]] + index = {"matmul": 0, "add": 0} + indices_const = {"matmul": (), "add": ()} + indices_cast = {"matmul": (), "add": ()} + if backend[1] == "fp32": + assert len(ops["matmul"]) == 3 and len(ops["add"]) == 1 + index = {"matmul": 2, "add": 0} + indices_const["matmul"] = (0, 1) + else: + # fp16 introduces additional io casts + # each cast introduces 1 const to store destination dtype + assert len(ops["matmul"]) == 7 and len(ops["add"]) == 5 + index = {"matmul": 6, "add": 2} + indices_const = {"matmul": (0, 1, 2, 3), "add": (0, 3)} + indices_cast = {"matmul": (4, 5), "add": (1, 4)} + for op_type in ("matmul", "add"): + assert ops[op_type][index[op_type]] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + { + "Type": "Operation", + "Operation_Type": op_type, + "Output": matmul_or_add[op_type].outputs[0].name, + }, + ] + for index_const_cast in indices_const[op_type] + indices_cast[op_type]: + assert ops[op_type][index_const_cast][:-1] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + ] + for index_const in indices_const[op_type]: + assert ops[op_type][index_const][-1]["Operation_Type"] == "const" + for index_cast in indices_cast[op_type]: + assert ops[op_type][index_cast][-1]["Operation_Type"] == "cast" + + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_softmax(self, compute_unit, backend): + class SoftmaxModule(torch.nn.Module): def __init__(self): super().__init__() self.softmax = torch.nn.Softmax() @@ -103,33 +288,80 @@ def __init__(self): def forward(self, x): return self.softmax(x) - model = LinearModule() - model.eval() - - self.run_compare_torch( - [(2, 2)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + _, coreml_model, _, _, _, _ = self.run_compare_torch( + [(2, 2)], + SoftmaxModule(), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - @pytest.mark.xfail(reason="numerical error") - def test_deeplab_v3(self): - model = torchvision.models.segmentation.deeplabv3_resnet50( - weights=torchvision.models.segmentation.deeplabv3.DeepLabV3_ResNet50_Weights.DEFAULT - ) - model.eval() + mil_program = coreml_model._mil_program + softmax = mil_program.functions["main"].find_ops(op_type="softmax")[0] + + debug_handle = softmax.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] + assert isinstance(debug_handle, int) + + debug_handle_to_ops_mapping = mil_program.construct_debug_handle_to_ops_mapping() + assert debug_handle_to_ops_mapping.keys() == {debug_handle} + + ops = debug_handle_to_ops_mapping[debug_handle] + index_softmax = 0 + indices_const = () + 
indices_cast = () + if backend[1] == "fp32": + assert len(ops) == 2 + index_softmax = 1 + indices_const = (0,) + else: + # fp16 introduces additional io casts + # each cast introduces 1 const to store destination dtype + assert len(ops) == 6 + index_softmax = 3 + indices_const = (0, 1, 4) + indices_cast = (2, 5) + assert ops[index_softmax] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + {"Type": "Operation", "Operation_Type": "softmax", "Output": softmax.outputs[0].name}, + ] + for index_const_cast in indices_const + indices_cast: + assert ops[index_const_cast][:-1] == [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + ] + for index_const in indices_const: + assert ops[index_const][-1]["Operation_Type"] == "const" + for index_cast in indices_cast: + assert ops[index_cast][-1]["Operation_Type"] == "cast" + @pytest.mark.xfail(reason="numerical error") + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_deeplab_v3(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.segmentation.deeplabv3_resnet50( + weights=torchvision.models.segmentation.deeplabv3.DeepLabV3_ResNet50_Weights.DEFAULT + ), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_edsr(self): - model = torchsr.models.edsr_r16f64(2, True) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_edsr(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchsr.models.edsr_r16f64(2, True), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_emformer_transcribe(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_emformer_transcribe(self, compute_unit, backend): class EmformerRnntTranscriberExample(torch.nn.Module): """ This is a wrapper for validating transcriber for the Emformer RNN-T architecture. @@ -145,14 +377,19 @@ def __init__(self) -> None: def forward(self, sources, source_lengths): return self.rnnt.transcribe(sources, source_lengths) - model = EmformerRnntTranscriberExample() - model.eval() + if backend[0] == "neuralnetwork": + pytest.xfail("rdar://125514139 emformer transcribe fails on neuralnetwork") self.run_compare_torch( - [(1, 128, 80), (128,)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 128, 80), (128,)], + EmformerRnntTranscriberExample(), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_emformer_predict(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_emformer_predict(self, compute_unit, backend): class EmformerRnntPredictorExample(torch.nn.Module): """ This is a wrapper for validating predictor for the Emformer RNN-T architecture. 
@@ -168,19 +405,18 @@ def __init__(self) -> None: def forward(self, targets, target_lengths): return self.rnnt.predict(targets, target_lengths, None) - model = EmformerRnntPredictorExample() - model.eval() - self.run_compare_torch( [torch.zeros([1, 128], dtype=int), torch.tensor([128], dtype=int)], - model, + EmformerRnntPredictorExample(), input_as_shape=False, + compute_unit=compute_unit, + backend=backend, frontend=TorchFrontend.EXIR, - backend=("mlprogram", "fp16"), ) @pytest.mark.xfail(reason="numerical error") - def test_emformer_join(self): + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_emformer_join(self, compute_unit, backend): class EmformerRnntJoinerExample(torch.nn.Module): """ This is a wrapper for validating joiner for the Emformer RNN-T architecture. @@ -196,96 +432,112 @@ def __init__(self) -> None: def forward(self, source_encodings, source_lengths, target_encodings, target_lengths): return self.rnnt.join(source_encodings, source_lengths, target_encodings, target_lengths) - model = EmformerRnntJoinerExample() - model.eval() - self.run_compare_torch( [(1, 128, 1024), (128,), (1, 128, 1024), (128,)], - model, + EmformerRnntJoinerExample(), + compute_unit=compute_unit, + backend=backend, frontend=TorchFrontend.EXIR, - backend=("mlprogram", "fp16"), ) - # TODO: add llama2 - - def test_mobilebert(self): - model = transformers.MobileBertModel.from_pretrained( - "google/mobilebert-uncased", return_dict=False - ) - model.eval() + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_mobilebert(self, compute_unit, backend): + if backend[1] == "fp16": + pytest.skip("Mobile Bert overflows fp16") tokenizer = transformers.AutoTokenizer.from_pretrained("google/mobilebert-uncased") token = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"] self.run_compare_torch( token, - model, + transformers.MobileBertModel.from_pretrained( + "google/mobilebert-uncased", return_dict=False + ), input_as_shape=False, + compute_unit=compute_unit, + backend=backend, frontend=TorchFrontend.EXIR, - backend=("mlprogram", "fp32"), rtol=0.005, ) - def test_mobilenet_v2(self): - model = torchvision.models.mobilenet_v2(weights=torchvision.models.mobilenetv2.MobileNet_V2_Weights.DEFAULT) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_mobilenet_v2(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.mobilenet_v2( + weights=torchvision.models.mobilenetv2.MobileNet_V2_Weights.DEFAULT + ), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_mobilenet_v3(self): - model = torchvision.models.mobilenet_v3_small(pretrained=True) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_mobilenet_v3(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.mobilenet_v3_small(pretrained=True), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_vit(self): - model = torchvision.models.vit_b_16(weights="IMAGENET1K_V1") - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, 
backends)) + def test_vit(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.vit_b_16(weights="IMAGENET1K_V1"), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_wav2letter(self): - model = torchaudio.models.Wav2Letter(num_classes=4096) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_wav2letter(self, compute_unit, backend): self.run_compare_torch( - [(10, 1, 700)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(10, 1, 700)], + torchaudio.models.Wav2Letter(num_classes=4096), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_inception_v3(self): - model = torchvision.models.inception_v3(weights="IMAGENET1K_V1") - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_inception_v3(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.inception_v3(weights="IMAGENET1K_V1"), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_inception_v4(self): - model = timm.models.inception_v4(pretrained=True) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_inception_v4(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 299, 299)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 299, 299)], + timm.models.inception_v4(pretrained=True), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_resnet18(self): - model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_resnet18(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) - def test_resnet50(self): - model = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1) - model.eval() - + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_resnet50(self, compute_unit, backend): self.run_compare_torch( - [(1, 3, 224, 224)], model, frontend=TorchFrontend.EXIR, backend=("mlprogram", "fp16") + [(1, 3, 224, 224)], + torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1), + compute_unit=compute_unit, + backend=backend, + frontend=TorchFrontend.EXIR, ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py index d50bfe1d5..77a45f0d0 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py @@ -20,10 +20,11 @@ ) from coremltools.converters.mil.frontend.torch.test.testing_utils import _copy_input_data from 
coremltools.converters.mil.frontend.torch.torch_op_registry import ( - TorchOpsRegistry, _TORCH_OPS_REGISTRY, + TorchOpsRegistry, register_torch_op, ) +from coremltools.converters.mil.mil.types.symbolic import any_symbolic from coremltools.converters.mil.testing_reqs import backends from coremltools.converters.mil.testing_utils import ( assert_cast_ops_count, @@ -131,19 +132,48 @@ def test_source_dialect_metadata(torch_model, backend): assert mlmodel.user_defined_metadata[_METADATA_SOURCE_DIALECT] == "TorchScript" - @pytest.mark.skipif(not _HAS_EXECUTORCH, reason=MSG_EXECUTORCH_NOT_FOUND) class TestEXIRValidation: @staticmethod - @pytest.mark.parametrize( - "backend", - backends, - ) + @pytest.mark.parametrize("backend", backends) + def test_fp16_io(torch_model, backend): # TODO (rdar://115845792): Handle fp16 IO dtypes + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.linear = torch.nn.Linear(10, 20, dtype=torch.float16) + + def forward(self, x): + return self.linear(x) + + model = TestModule() + model.eval() + + shape = (1, 10) + example_inputs = (torch.rand(*shape, dtype=torch.float16),) + exir_program_aten = torch.export.export(model, example_inputs) + exir_program_edge = executorch.exir.to_edge(exir_program_aten).exported_program() + + # Default deployment target is iOS14 for neuralnetwork and iOS15 for mlprogram, + # both are too old to support fp16 io + with pytest.raises( + ValueError, match=r"To use fp16 input, please set minimum deployment target to iOS16\+" + ): + ct.convert(exir_program_edge, convert_to=backend[0]) + + # fp16 io should work fine for iOS16+ + if backend[0] == "mlprogram": + ct.convert( + exir_program_edge, + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS16, + ) + + @staticmethod + @pytest.mark.parametrize("backend", backends) def test_inputs( torch_model, backend ): # TODO: rdar://115845792 ([Executorch] Handle user provided inputs/outputs in the convert API) - - shape = (1, 10) + shape = (2, 10) exir_program_aten = torch.export.export(torch_model, (torch.rand(*shape),)) exir_program_edge = executorch.exir.to_edge(exir_program_aten).exported_program() @@ -157,15 +187,11 @@ def test_inputs( ) @staticmethod - @pytest.mark.parametrize( - "backend", - backends, - ) + @pytest.mark.parametrize("backend", backends) def test_outputs( torch_model, backend ): # TODO: rdar://115845792 ([Executorch] Handle user provided inputs/outputs in the convert API) - - shape = (1, 10) + shape = (3, 10) exir_program_aten = torch.export.export(torch_model, (torch.rand(*shape),)) exir_program_edge = executorch.exir.to_edge(exir_program_aten).exported_program() @@ -179,12 +205,9 @@ def test_outputs( ) @staticmethod - @pytest.mark.parametrize( - "backend", - backends, - ) + @pytest.mark.parametrize("backend", backends) def test_source_dialect_metadata(torch_model, backend): - shape = (1, 10) + shape = (4, 10) exir_program_aten = torch.export.export(torch_model, (torch.rand(*shape),)) exir_program_edge = executorch.exir.to_edge(exir_program_aten).exported_program() @@ -198,6 +221,7 @@ def test_source_dialect_metadata(torch_model, backend): assert mlmodel.user_defined_metadata[_METADATA_SOURCE_DIALECT] == "TorchExport::EDGE" + @pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) class TestTorchOpsRegistry: @staticmethod @@ -1239,7 +1263,7 @@ def forward(self, x): def rank4_input_model(): class Model(torch.nn.Module): def forward(self, x): - return x + 5.5 + return x + 5.0 example_input = torch.randint(0, 100, 
(1, 3, 10, 20), dtype=torch.float32) return torch.jit.trace(Model().eval(), example_input) @@ -1674,6 +1698,35 @@ def test_color_output(self, rank4_input_model, float32_input_model_add_op): assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.BGR) verify_prediction(mlmodel) + # check mlprogram can have dynamic shape image output + shape = ct.Shape((1, 3, ct.RangeDim(5, 10), ct.RangeDim(5, 10))) + mlmodel = ct.convert( + rank4_input_model, + inputs=[ct.TensorType(shape=shape, dtype=np.float32)], + outputs=[ct.ImageType(name="output_image", color_layout=ct.colorlayout.RGB)], + minimum_deployment_target=ct.target.macOS13, + ) + assert_ops_in_mil_program(mlmodel, expected_op_list=["cast", "add", "cast"]) + assert_spec_output_image_type(mlmodel._spec, expected_feature_type=ft.ImageFeatureType.RGB) + assert_prog_input_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert_prog_output_type(mlmodel._mil_program, expected_dtype_str="fp32") + assert any_symbolic(mlmodel._mil_program.functions["main"].outputs[0].shape) + verify_prediction(mlmodel) + + # Test output image numerical + sample_input = np.random.randint(low=0, high=200, size=(1, 3, 10, 10)).astype(np.float32) + model_output_pil_image = mlmodel.predict({"x": sample_input})["output_image"] + assert isinstance(model_output_pil_image, Image.Image) + assert model_output_pil_image.mode == "RGBA" + model_output_as_numpy = np.array(model_output_pil_image)[:, :, :3] # last A channel is 255 + model_output_as_numpy = np.transpose(model_output_as_numpy, axes=[2, 0, 1]) + reference_output = rank4_input_model(torch.from_numpy(sample_input)).detach().numpy() + reference_output = np.squeeze(reference_output) + np.testing.assert_allclose(reference_output, model_output_as_numpy, rtol=1e-2, atol=1e-2) + + a_channel = np.array(model_output_pil_image)[:, :, 3].flatten() + assert np.all(a_channel == 255) + def test_grayscale_output(self, rank4_grayscale_input_model): with pytest.raises(TypeError, match="float16 dtype for outputs is only supported for deployment target >= iOS16/macOS13"): ct.convert(rank4_grayscale_input_model, @@ -1811,15 +1864,24 @@ def test_grayscale_fp16_input_image(self, rank4_grayscale_input_model): reference_output = rank4_grayscale_input_model(torch.from_numpy(sample_input.astype(np.float32))).detach().numpy() np.testing.assert_allclose(reference_output, model_output, rtol=1e-2, atol=1e-2) - def test_grayscale_output_image(self, rank4_grayscale_input_model): - mlmodel = ct.convert(rank4_grayscale_input_model, - inputs=[ct.TensorType(name="input", - shape=(1, 1, 10, 20))], - outputs=[ct.ImageType(name="output_image", - color_layout=ct.colorlayout.GRAYSCALE)], - minimum_deployment_target=ct.target.macOS13, - compute_precision=ct.precision.FLOAT32, - ) + @pytest.mark.parametrize( + "dynamic_shape", + [True, False], + ) + def test_grayscale_output_image(self, rank4_grayscale_input_model, dynamic_shape): + + if dynamic_shape: + shape = ct.Shape((1, 1, ct.RangeDim(5, 10), ct.RangeDim(5, 20))) + else: + shape = (1, 1, 10, 20) + + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.TensorType(name="input", shape=shape)], + outputs=[ct.ImageType(name="output_image", color_layout=ct.colorlayout.GRAYSCALE)], + minimum_deployment_target=ct.target.macOS13, + compute_precision=ct.precision.FLOAT32, + ) sample_input = np.random.randint(low=0, high=200, size=(1, 1, 10, 20)).astype(np.float32) model_output_pil_image = mlmodel.predict({"input": sample_input})['output_image'] assert 
isinstance(model_output_pil_image, Image.Image) @@ -1829,15 +1891,27 @@ def test_grayscale_output_image(self, rank4_grayscale_input_model): reference_output = np.squeeze(reference_output) np.testing.assert_allclose(reference_output, model_output_as_numpy, rtol=1e-2, atol=1e-2) - def test_grayscale_fp16_output_image(self, rank4_grayscale_input_model): - mlmodel = ct.convert(rank4_grayscale_input_model, - inputs=[ct.TensorType(name="input", - shape=(1, 1, 10, 20))], - outputs=[ct.ImageType(name="output_image", - color_layout=ct.colorlayout.GRAYSCALE_FLOAT16)], - minimum_deployment_target=ct.target.macOS13, - compute_precision=ct.precision.FLOAT32, - ) + @pytest.mark.parametrize( + "dynamic_shape", + [True, False], + ) + def test_grayscale_fp16_output_image(self, rank4_grayscale_input_model, dynamic_shape): + + if dynamic_shape: + shape = ct.Shape((1, 1, ct.RangeDim(5, 10), ct.RangeDim(5, 20))) + else: + shape = (1, 1, 10, 20) + + mlmodel = ct.convert( + rank4_grayscale_input_model, + inputs=[ct.TensorType(name="input", shape=shape)], + outputs=[ + ct.ImageType(name="output_image", color_layout=ct.colorlayout.GRAYSCALE_FLOAT16) + ], + minimum_deployment_target=ct.target.macOS13, + compute_precision=ct.precision.FLOAT32, + ) + sample_input = np.random.randint(low=0, high=200, size=(1, 1, 10, 20)).astype(np.float32) model_output_pil_image = mlmodel.predict({"input": sample_input})['output_image'] assert isinstance(model_output_pil_image, Image.Image) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py index 6bacb5ad7..e49783d00 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py @@ -1570,6 +1570,20 @@ def test_convolution2d( bias, groups=1, ): + if ( + backend == ('neuralnetwork', 'fp32') and + padding == 1 and + stride == 2 and + height == 7 and + width == 5 and + in_channels == 3 and + out_channels == 3 and + kernel_size == 2 and + dilation == 3 and + not bias + ): + pytest.xfail("rdar://121954894: Conv2d starts to fail") + if padding == "same" and stride != 1: return model = nn.Conv2d( @@ -3034,9 +3048,48 @@ def forward(self, x, y): else: raise ValueError("Unsupported mode: {mode}".format(mode=mode)) - model = TestModel() self.run_compare_torch( - input_shapes, model, backend=backend, compute_unit=compute_unit + input_shapes, TestModel(), backend=backend, compute_unit=compute_unit + ) + + @pytest.mark.parametrize( + "compute_unit, backend, input_shapes, mode, xdtype, ydtype", + itertools.product( + compute_units, + backends, + [ + [(2, 5, 7, 3), (2, 5, 7, 3)], + [(3, 2, 9), (3, 2, 9)], + [(1, 2, 3), (1,)], + [(1,), (2, 5, 6, 7)], + [(1, 2, 1), (3, 4, 2, 5)], + ], + ["minimum", "maximum"], + (torch.float16, torch.float32), + (torch.float16, torch.float32), + ), + ) + def test_minimum_maximum_mixed_precision( + self, compute_unit, backend, input_shapes, mode, xdtype, ydtype + ): + class TestModel(torch.nn.Module): + def forward(self, x, y): + a = x.to(xdtype) + b = y.to(ydtype) + if mode == "minimum": + return torch.minimum(a, b) + elif mode == "maximum": + return torch.maximum(a, b) + else: + raise ValueError("Unsupported mode: {mode}".format(mode=mode)) + + self.run_compare_torch( + input_shapes, + TestModel(), + compute_unit=compute_unit, + backend=backend, + rtol=1e-6 if xdtype == ydtype and xdtype == torch.float32 else 1e-3, + atol=1e-6 if xdtype == ydtype and xdtype == torch.float32 else 1e-3, ) class 
TestAMaxAMin(TorchBaseTest): @@ -4824,6 +4877,9 @@ class TestEinsum(TorchBaseTest): ), ) def test_binary_einsum(self, compute_unit, backend, equation, reverse_input_order, dynamic): + if dynamic and backend[0] == "mlprogram" and ct.utils._macos_version() > (14, 2): + pytest.xfail("rdar://120386990 (Einsum Model Failed)") + class TestBinaryEinsum(nn.Module): def forward(self, x, y): return torch.einsum(equation, x, y) @@ -4976,11 +5032,41 @@ def test_squeeze(self, compute_unit, backend, rank_and_axis): else: input_shape[0] = 1 input_shape = tuple(input_shape) - model = ModuleWrapper( - function=torch.squeeze, kwargs={"dim": axis} if axis else {} - ) + model = ModuleWrapper(function=torch.squeeze, kwargs={"dim": axis} if axis else {}) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) + + @pytest.mark.parametrize( + "compute_unit, backend, dynamic, dim", + itertools.product(compute_units, backends, [True, False], [None, 0, 2, (1,), (1, 2)]), + ) + def test_squeeze_non_single_element_dim(self, compute_unit, backend, dynamic, dim): + if backend[0] == "neuralnetwork": + pytest.skip("neuralnetwork backend doesn't support squeeze a not-1 dimension") + if dynamic and compute_unit == ct.ComputeUnit.CPU_ONLY: + pytest.skip("CPU behaves differently from PyTorch for dropping dynamic dim.") + if compute_unit == ct.ComputeUnit.CPU_ONLY and dim in {0, (1,), (1, 2)}: + pytest.xfail("CPU failed non-single-dim squeeze (rdar://124555262)") + + input_shape = (2, 3, 1) + model = ModuleWrapper(function=torch.squeeze, kwargs=None if dim is None else {"dim": dim}) + if dynamic: + converter_input_type = [ + ct.TensorType( + shape=( + ct.RangeDim(upper_bound=10, default=2), + ct.RangeDim(upper_bound=10, default=3), + ct.RangeDim(upper_bound=10, default=1), + ) + ), + ] + else: + converter_input_type = None self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, + model, + backend=backend, + compute_unit=compute_unit, + converter_input_type=converter_input_type, ) @@ -5144,8 +5230,43 @@ def test_gather_along_axis(self, compute_unit, backend, rank_and_axis): function=torch.gather, kwargs={"dim": axis, "index": torch.from_numpy(indices)}, ) + self.run_compare_torch([params_shape], model, backend=backend, compute_unit=compute_unit) + + @pytest.mark.parametrize( + "compute_unit, backend, input_enumerated_shape", + itertools.product(compute_units, backends, (True, False)), + ) + def test_gather_enumerated_shape(self, compute_unit, backend, input_enumerated_shape): + axis = 0 + params_shape = (2, 3, 4) + indices_shape = (3, 3, 4) + + class Model(nn.Module): + def forward(self, x, index): + return torch.gather(x, axis, index) + + input_data = [torch.rand(params_shape), torch.randint(0, params_shape[axis], indices_shape)] + # Each model is only allowed for one input feature with enumerated shape. 
+ if input_enumerated_shape: + converter_input_type = [ + ct.TensorType(shape=ct.EnumeratedShapes(shapes=[(2, 3, 4), (3, 4, 5)])), + ct.TensorType(shape=(3, 3, 4), dtype=np.int32), + ] + else: + converter_input_type = [ + ct.TensorType(shape=(2, 3, 4)), + ct.TensorType( + shape=ct.EnumeratedShapes(shapes=[(3, 3, 4), (4, 3, 4)]), dtype=np.int32 + ), + ] self.run_compare_torch( - [params_shape], model, backend=backend, compute_unit=compute_unit + input_data, + Model(), + input_as_shape=False, + converter_input_type=converter_input_type, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=ct.target.iOS17, ) def test_gather_along_axis_invalid_indices(self): @@ -5157,6 +5278,42 @@ def test_gather_along_axis_invalid_indices(self): with pytest.raises(RuntimeError, match="index 2 is out of bounds"): torch.gather(data, 1, torch.tensor([[0, 0], [2, 0]])) + @pytest.mark.parametrize( + "compute_unit, backend, dynamic", + itertools.product(compute_units, backends, [True, False]), + ) + def test_gather_nd_int16_indices(self, compute_unit, backend, dynamic): + """Test the indices access in torch model which gets lowered to gather_nd.""" + B, C, H, W, T = 1, 24, 64, 64, 32 + data = torch.rand(B, C, H, W) + time = (torch.rand(1, T) * (C - 1)).to(torch.int) + + class DynamicModel(torch.nn.Module): + def forward(self, data, time): + return data[torch.arange(B).unsqueeze(1), time, :, :] + + class StaticModel(torch.nn.Module): + def forward(self, data): + return data[torch.arange(B).unsqueeze(1), time, :, :] + + torch_model = DynamicModel() if dynamic else StaticModel() + input_data = (data, time) if dynamic else data + converter_input_type = [ct.TensorType(shape=data.shape)] + if dynamic: + converter_input_type.append(ct.TensorType(shape=time.shape, dtype=np.int32)) + + mlmodel = self.run_compare_torch( + input_data, + torch_model, + input_as_shape=False, + converter_input_type=converter_input_type, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=ct.target.iOS17, + )[1] + gather_op = mlmodel._mil_program.find_ops(op_type="gather_nd")[0] + assert gather_op.indices.dtype == types.int16 if dynamic else types.uint16 + class TestActivation(TorchBaseTest): @staticmethod @@ -6886,6 +7043,39 @@ def forward(self, x): ) +class TestUnique(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, x, return_inverse, return_counts", + itertools.product( + compute_units, + backends, + ( + [1, 2, 3, 2, 2, 3, 99, -1, 1], + [[1, 2, 3, 100], [3, 2, 99, 1]], + ), + (True, False), + (True, False), + ) + ) + def test(self, compute_unit, backend, x, return_inverse, return_counts): + class Model(nn.Module): + def forward(self, x): + return torch.unique( + x, return_inverse=return_inverse, return_counts=return_counts + ) + + if backend[0] == 'neuralnetwork': + pytest.xfail("This op is only supported on mlprogram backend.") + + self.run_compare_torch( + torch.Tensor(x), + Model(), + input_as_shape=False, + backend=backend, + compute_unit=compute_unit, + ) + + class TestFlip(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank_dim", @@ -7105,6 +7295,40 @@ def forward(self, cond, x, y): input_as_shape=False, ) + @pytest.mark.parametrize( + "compute_unit, backend, shapes, xdtype, ydtype", + itertools.product( + compute_units, + backends, + [ + [(1, 2), (1, 2), (1, 1)], + [(1, 2, 3), (1, 2, 1), (1, 1, 3)], + ], + (torch.float16, torch.float32), + (torch.float16, torch.float32), + ), + ) + def test_where_mixed_precision(self, compute_unit, backend, shapes, 
xdtype, ydtype): + class WhereModel(nn.Module): + def forward(self, cond, x, y): + a = x.to(xdtype) + b = y.to(ydtype) + return torch.where(cond, a, b) + + cond_shape, x_shape, y_shape = shapes + cond = torch.rand(*cond_shape) > 0.5 + inputs = [cond, torch.rand(*x_shape), torch.rand(*y_shape)] + + self.run_compare_torch( + inputs, + WhereModel(), + compute_unit=compute_unit, + backend=backend, + input_as_shape=False, + rtol=1e-6 if xdtype == ydtype and xdtype == torch.float32 else 1e-3, + atol=1e-6 if xdtype == ydtype and xdtype == torch.float32 else 1e-3, + ) + @pytest.mark.parametrize( "compute_unit, backend, shape", itertools.product(compute_units, backends, COMMON_SHAPES + [(10,)]), @@ -10213,14 +10437,13 @@ def test_different_input_ranks_no_mask( }, ) - res = self.run_compare_torch( + return self.run_compare_torch( [input_shape] * 3, model, backend=backend, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, - ) - return res[1] + )[1] @pytest.mark.parametrize( "compute_unit, backend, seq_lengths, include_heads", diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py index bd51b2af1..d41de1e77 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py @@ -90,6 +90,8 @@ def run_compare_torch( rtol=1e-05, input_as_shape=True, minimum_deployment_target=ct.target.iOS17, + compute_unit=ct.ComputeUnit.CPU_ONLY, + converter=ct.convert, ): # TODO(rdar://108472419): properly design a random input if input_as_shape: @@ -103,8 +105,9 @@ def run_compare_torch( input_as_shape=False, backend=("mlprogram", "fp32"), use_scripting=False, - compute_unit=ct.ComputeUnit.CPU_ONLY, + compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, + converter=converter, ) @@ -399,7 +402,7 @@ def forward(self, x): input_shape = [(3, 5)] res = self.run_compare_torch(input_shape, model) prog = res[1]._mil_program - assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize", "matmul"] + assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize", "linear"] @pytest.mark.skipif(not _HAS_TORCH_VISION, reason=MSG_TORCH_VISION_NOT_FOUND) diff --git a/coremltools/converters/mil/frontend/torch/test/testing_utils.py b/coremltools/converters/mil/frontend/torch/test/testing_utils.py index 918be4003..65f03cc36 100644 --- a/coremltools/converters/mil/frontend/torch/test/testing_utils.py +++ b/coremltools/converters/mil/frontend/torch/test/testing_utils.py @@ -80,9 +80,15 @@ def convert_to_coreml_inputs(input_description, inputs): return coreml_inputs -def convert_to_mlmodel(model_spec, tensor_inputs, backend=("neuralnetwork", "fp32"), - converter_input_type=None, compute_unit=ct.ComputeUnit.CPU_ONLY, - minimum_deployment_target=None): +def convert_to_mlmodel( + model_spec, + tensor_inputs, + backend=("neuralnetwork", "fp32"), + converter_input_type=None, + compute_unit=ct.ComputeUnit.CPU_ONLY, + minimum_deployment_target=None, + converter=ct.convert, +): def _convert_to_inputtype(inputs): if isinstance(inputs, list): return [_convert_to_inputtype(x) for x in inputs] @@ -106,9 +112,15 @@ def _convert_to_inputtype(inputs): inputs = None outputs = None - return ct_convert(model_spec, inputs=inputs, convert_to=backend, - source="pytorch", compute_units=compute_unit, - minimum_deployment_target=minimum_deployment_target) + return 
ct_convert( + model_spec, + inputs=inputs, + convert_to=backend, + source="pytorch", + compute_units=compute_unit, + minimum_deployment_target=minimum_deployment_target, + converter=converter, + ) def generate_input_data(input_size, rand_range=(0, 1), torch_device=torch.device("cpu")): @@ -162,6 +174,7 @@ def convert_and_compare( converter_input_type=None, compute_unit=ct.ComputeUnit.CPU_ONLY, minimum_deployment_target=None, + converter=ct.convert, ): """ If expected results is not set, it will by default @@ -175,6 +188,9 @@ def convert_and_compare( torch_model = torch.jit.load(model_spec) else: torch_model = model_spec + if _HAS_TORCH_EXPORT_API: + if isinstance(torch_model, ExportedProgram): + torch_model = torch_model.module() if not isinstance(input_data, (list, tuple)): input_data = [input_data] @@ -183,10 +199,15 @@ def convert_and_compare( torch_input = _copy_input_data(input_data) expected_results = torch_model(*torch_input) expected_results = flatten_and_detach_torch_results(expected_results) - mlmodel = convert_to_mlmodel(model_spec, input_data, backend=backend, - converter_input_type=converter_input_type, - compute_unit=compute_unit, - minimum_deployment_target=minimum_deployment_target,) + mlmodel = convert_to_mlmodel( + model_spec, + input_data, + backend=backend, + converter_input_type=converter_input_type, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + converter=converter, + ) coreml_inputs = convert_to_coreml_inputs(mlmodel.input_description, input_data) @@ -236,6 +257,7 @@ def run_compare_torch( minimum_deployment_target=None, torch_device=torch.device("cpu"), frontend=TorchFrontend.TORCHSCRIPT, + converter=ct.convert, ): """ Traces a model and runs a numerical test. @@ -286,6 +308,7 @@ def run_compare_torch( converter_input_type=converter_input_type, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, + converter=converter, ) return model_spec, mlmodel, coreml_inputs, coreml_results, \ diff --git a/coremltools/converters/mil/frontend/torch/torch_op_registry.py b/coremltools/converters/mil/frontend/torch/torch_op_registry.py index ec8b07547..b542b5bfd 100644 --- a/coremltools/converters/mil/frontend/torch/torch_op_registry.py +++ b/coremltools/converters/mil/frontend/torch/torch_op_registry.py @@ -3,7 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from typing import Callable, List +from typing import Callable import torch @@ -129,7 +129,7 @@ def func_wrapper(func): return func_wrapper(_func) -def is_torch_fx_node_supported(torch_fx_node: torch.fx.Node) -> bool: +def is_torch_fx_node_supported(torch_fx_node: "torch.fx.Node") -> bool: # There are many types of torch fx node: # 1. call_function # 2. 
call_module diff --git a/coremltools/converters/mil/frontend/torch/torchir_passes.py b/coremltools/converters/mil/frontend/torch/torchir_passes.py index 77d73c471..cf784cdf0 100644 --- a/coremltools/converters/mil/frontend/torch/torchir_passes.py +++ b/coremltools/converters/mil/frontend/torch/torchir_passes.py @@ -2,14 +2,16 @@ # # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + from collections import OrderedDict, defaultdict +from typing import Dict, Optional from coremltools import _logger as logger from .internal_graph import InternalTorchIRGraph, InternalTorchIRNode -def generate_tensor_assignment_ops(graph): +def generate_tensor_assignment_ops(graph: InternalTorchIRGraph) -> None: """ This graph pass handles inplace tensor assignments, specifically it handles: `torch.Tensor.copy_` and `torch.Tensor.fill_`. There are many other inplace tensor @@ -174,6 +176,7 @@ def _construct_nodes_to_fuse_inputs(nodes_to_fuse): outputs=outputs, kind=kind, blocks=[], + model_hierarchy=node.model_hierarchy, ) graph.nodes[i] = tensor_assign_node @@ -183,7 +186,50 @@ def _construct_nodes_to_fuse_inputs(nodes_to_fuse): graph.outputs[idx] = _get_updated_name(output, updated_tensor_count, out_alias) -def remove_getattr_nodes(graph): +def populate_native_const_model_hierarchy(graph: InternalTorchIRGraph) -> None: + """ + Torchscript doesn't capture the model hierarchy of those python native consts. + For instance: + + class Submodule(torch.nn.Module): + def forward(self, x): + x = x + 0.9 + x = x * 0.9 + return torch.relu(x) + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.submodule_1 = Submodule() + + def forward(self, x): + return self.submodule_1(x) + + The two ``0.9`` constants don't have the scope of Submodule. + In this graph pass, we make the model hierarchy of such constants inherited from + their child ops. + """ + + cached_model_hierarchy = {} + child_ops = defaultdict(list) + + for node in graph.nodes: + for b in node.blocks: + populate_native_const_model_hierarchy(b) + + for node in graph.nodes: + cached_model_hierarchy[node.name] = node.model_hierarchy + for val in node.inputs: + child_ops[val].append(node.name) + + for node in graph.nodes: + if node.kind != "constant": + continue + if node.model_hierarchy == "" and len(child_ops[node.name]) == 1: + node.model_hierarchy = cached_model_hierarchy[child_ops[node.name][0]] + + +def remove_getattr_nodes(graph: InternalTorchIRGraph) -> None: """ Remove the getattr nodes in the graph """ @@ -210,7 +256,9 @@ def remove_getattr_nodes(graph): graph.nodes = new_nodes -def transform_inplace_ops(graph, name_remap_dict=None): +def transform_inplace_ops( + graph: InternalTorchIRGraph, name_remap_dict: Optional[Dict[str, str]] = None +) -> None: # As we modify ops, we'll need to remap symbols. if name_remap_dict is None: @@ -272,7 +320,7 @@ def transform_inplace_ops(graph, name_remap_dict=None): graph.outputs[idx] = v -def flatten_graph_input_values(graph): +def flatten_graph_input_values(graph: InternalTorchIRGraph) -> None: """CoreML can't handle nested iterables of tensors, so we flatten the inputs of any graph that expects them. 
""" @@ -317,7 +365,7 @@ def flatten_graph_input_values(graph): graph.nodes = all_new_nodes + graph.nodes -def flatten_graph_output_values(graph): +def flatten_graph_output_values(graph: InternalTorchIRGraph) -> None: """ CoreML can't handle nested iterables of tensors, so we flatten the outputs of any graph that produces them. diff --git a/coremltools/converters/mil/frontend/torch/utils.py b/coremltools/converters/mil/frontend/torch/utils.py index 11f11bd10..62837fe5e 100644 --- a/coremltools/converters/mil/frontend/torch/utils.py +++ b/coremltools/converters/mil/frontend/torch/utils.py @@ -10,9 +10,17 @@ from coremltools.converters.mil.mil import types -# Some ops will receive a dtype input as an integer -# which maps to a torch dtype. The below mapping was found by -# converting test models with different dtypes passed to ones. +# NOTE [represent torch dtype by integer] +# In TorchScript, some ops will receive a dtype input as an integer which maps to a torch dtype. +# The below mapping was found by converting test models with different dtypes passed to ones. +# There is one modification to original torch mapping, though, due to CoreML lacks 64-bit dtype +# When mapping from torch dtype to integer number, we map +# * int64 to int32's number +# * float64 to float32's number +# When mapping from integer number back to torch dtype, we map +# * int64's number to int32 +# * float64's number to float32 +# TODO(https://github.com/apple/coremltools/issues/2153): This is confusing... we should refactor NUM_TO_TORCH_DTYPE = { 0: torch.uint8, 1: torch.int8, @@ -31,24 +39,30 @@ TORCH_DTYPE_TO_NUM = { dtype: val for val, dtype in NUM_TO_TORCH_DTYPE.items() } +TORCH_DTYPE_TO_NUM[torch.int64] = TORCH_DTYPE_TO_NUM[torch.int32] +TORCH_DTYPE_TO_NUM[torch.float64] = TORCH_DTYPE_TO_NUM[torch.float32] -NUMPY_DTYPE_TO_TORCH_NUM = { - np.uint8: 0, - np.int8: 1, - np.int16: 2, - np.int32: 3, - np.int64: 4, - np.float16: 5, - np.float32: 6, - np.float64: 7, - bool: 11, +NUM_TO_NUMPY_DTYPE = { + 0: np.uint8, + 1: np.int8, + 2: np.int16, + 3: np.int32, + 4: np.int32, + 5: np.float16, + 6: np.float32, + 7: np.float32, + 11: bool, } -NUM_TO_NUMPY_DTYPE = { - val: dtype for dtype, val in NUMPY_DTYPE_TO_TORCH_NUM.items() +NUMPY_DTYPE_TO_TORCH_NUM = { + dtype: val for val, dtype in NUM_TO_NUMPY_DTYPE.items() } +NUMPY_DTYPE_TO_TORCH_NUM[np.int64] = NUMPY_DTYPE_TO_TORCH_NUM[np.int32] +NUMPY_DTYPE_TO_TORCH_NUM[np.float64] = NUMPY_DTYPE_TO_TORCH_NUM[np.float32] NUM_TO_DTYPE_STRING = { + 0: "uint8", + 1: "int8", 2: "int16", 3: "int32", 4: "int32", @@ -59,10 +73,12 @@ } TYPE_TO_DTYPE_STRING = { - types.bool: "bool", + types.uint8: "uint8", + types.int8: "int8", + types.int32: "int32", types.fp16: "fp16", types.fp32: "fp32", - types.int32: "int32", + types.bool: "bool", } TORCH_QTYPE_TO_NP_TYPE = { diff --git a/coremltools/converters/mil/mil/__init__.py b/coremltools/converters/mil/mil/__init__.py index 2ec248b9e..96ac3a8f7 100644 --- a/coremltools/converters/mil/mil/__init__.py +++ b/coremltools/converters/mil/mil/__init__.py @@ -20,7 +20,6 @@ ) from .operation import Operation, mil_list, precondition from .program import ( - InputType, Placeholder, Program, Symbol, diff --git a/coremltools/converters/mil/mil/block.py b/coremltools/converters/mil/mil/block.py index 3191911b5..9c5e88209 100644 --- a/coremltools/converters/mil/mil/block.py +++ b/coremltools/converters/mil/mil/block.py @@ -5,15 +5,18 @@ import copy from collections import Counter, OrderedDict -from typing import Tuple +from typing import List, Optional, Set, 
Tuple, Union
from coremltools import _OPSET
from coremltools import _logger as logger
from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target
+from coremltools.converters.mil.input_types import InputType
from . import SPACES, types
from .operation import Operation
+from .scope import SCOPE_STACK, VALID_OPS_TO_COPY_SCOPE_INFO, ScopeSource, add_graph_pass_scope
from .types.symbolic import is_symbolic, k_used_symbols
+from .utils import CacheDoublyLinkedList
from .var import ComplexVar, InternalVar, Var
from .visitors.dot_visitor import DotVisitor
@@ -21,6 +24,7 @@
BLOCK_STACK = []
DEBUG = False
+
def curr_block():
    if len(BLOCK_STACK) == 0:
        raise ValueError("Must call Builder inside an Function" + " or Block")
@@ -50,6 +54,8 @@ class Block:
        "operations",
        "_internal_vars",
        "outer_op",
+        "cache_operations",
+        "_essential_scope_sources",
    ]
    counter = 0
@@ -108,7 +114,7 @@ def __init__(self, block_inputs=None, outer_op=None, name=None):
        self.name = Block._get_new_name()
        # list[Operation]. Topologically sorted.
-        self.operations = []
+        self.operations = CacheDoublyLinkedList()
        # Must be set before self.validate()
        self.outer_op = outer_op
@@ -124,22 +130,99 @@ def __init__(self, block_inputs=None, outer_op=None, name=None):
        # (infinite recursion). They must be considered as always visible.
        self._internal_vars = set()
+        # List[ScopeSource]. During graph passes, these scope sources cannot be missing.
+        self._essential_scope_sources = []
+
        if self.outer_op is None and not isinstance(self, Function):
            msg = "Block {} is not Function and thus outer_op cannot be None"
            raise ValueError(msg.format(self.name))
        self.validate()
-    def validate(self):
+    def _add_essential_scope_source(
+        self, scope_source: Union[ScopeSource, List[ScopeSource]]
+    ) -> None:
+        """
+        Add essential scope sources to self._essential_scope_sources.
+        When self.validate() is called, we make sure that no source info is missing.
+        """
+        if not isinstance(scope_source, list):
+            scope_source = [scope_source]
+
+        for source in scope_source:
+            if source in self._essential_scope_sources:
+                raise ValueError(f"{source} already exist in _essential_scope_sources.")
+            self._essential_scope_sources.append(source)
+
+    def _check_has_scope_info(self) -> None:
+        """
+        Check no ops in the function are missing scope information.
+        """
+
+        def _check_has_scope_info_block(block: Block):
+            for op in block.operations:
+                for b in op.blocks:
+                    _check_has_scope_info_block(b)
+                for scope in self._essential_scope_sources:
+                    if scope not in op.scopes or len(op.scopes[scope]) == 0:
+                        raise ValueError(
+                            f"op {op.name} with scopes {op.scopes} is missing essential scopes {scope}."
+                        )
+
+        _check_has_scope_info_block(self)
+
+    def _check_vars_visibility_in_block(
+        self, visible_vars_from_outer_block: Optional[Set[Var]] = None
+    ):
+        """
+        This util does a one-pass, program-wide check of var visibility:
+        each input of an op must appear before the op in the sequential order.
+
+        For debugging, if you want to pinpoint the operation that caused the
+        invalid program state, set DEBUG=True, and it will be caught by the ``is_var_visible_in_block`` util.
+        """
+        if visible_vars_from_outer_block is None:
+            visible_vars_from_outer_block = set()
+        block_inputs = list(self.inputs.values()) if isinstance(self, Function) else self.inputs
+        visible_vars_in_block = set(block_inputs)
+
+        for op in self.operations:
+            for b in op.blocks:
+                b._check_vars_visibility_in_block(
+                    visible_vars_from_outer_block=visible_vars_from_outer_block.union(
+                        visible_vars_in_block
+                    )
+                )
+            for val in op.get_flattened_inputs():
+                if (
+                    val not in self._internal_vars
+                    and val not in visible_vars_in_block
+                    and val not in visible_vars_from_outer_block
+                ):
+                    raise ValueError(f"Var {val} not visible in the block {self.name}.")
+            for out_var in op.outputs:
+                visible_vars_in_block.add(out_var)
+
+    def validate(
+        self,
+        force_validate: Optional[bool] = False,
+        check_essential_scope: Optional[bool] = False,
+    ) -> None:
        """
        Basic validation to protect against some invalid state.
+        If force_validate is False, the validation is done only if the global variable DEBUG=True.
        """
-        if not DEBUG:
+        if not DEBUG and not force_validate:
            return
+        # Check vars visibility
+        if isinstance(self, Function):
+            self._check_vars_visibility_in_block()
+
+        # Other validations
        for op in self.operations:
            for b in op.blocks:
-                b.validate()
+                b.validate(force_validate=force_validate)
            if op.outputs is None:
                raise InvalidBlockStateError()
@@ -191,6 +274,20 @@ def validate(self):
                    msg = "Var {} should be output of block {}: {}"
                    raise ValueError(msg.format(ov.name, b.name, b))
+        # check that internal vars are consistent with self._internal_vars
+        internal_var_in_block = set()
+        for op in self.operations:
+            for v in op.internal_inputs.values():
+                internal_var_in_block.add(v)
+        if not internal_var_in_block == self._internal_vars:
+            raise ValueError(
+                "internal vars in the block are not consistent with self._internal_vars."
+            )
+
+        # check that essential scope info is not missing
+        if check_essential_scope:
+            self._check_has_scope_info()
+
    def remove_inputs(self, curr_input_vars):
        """
        curr_input_vars: list[Var], whose elements must be in
@@ -236,7 +333,7 @@ def inputs(self):
    def outputs(self):
        return self._outputs
-    def is_var_visible_in_block(self, var, upto_op_with_id=None):
+    def is_var_visible_in_block(self, var: Var, upto_op: Optional[Operation] = None):
        """
        Checks if a var is visible to ops starting from id=`upto_op_with_id`
        inside the block.
@@ -248,33 +345,60 @@ def is_var_visible_in_block(self, var, upto_op_with_id=None):
        If upto_op_with_id is None, outputs of all operations inside the block
        are visible to that block.
+
+        For debugging:
+        - By default (DEBUG=False), this util is guarded by the flag in the calling code and does not run.
+        - By setting DEBUG=True, this util is triggered in multiple places in the code base,
+          so users can pinpoint the exact place where the converter makes an invalid operation.
+          Beware that the converter can be slow in debug mode, since this util makes the overall
+          conversion time explode to O(N^2) in the average case.
        """
+        if not DEBUG:
+            # Only in debug mode is there a chance that self.operations is a list when executing this function.
+            assert isinstance(
+                self.operations, CacheDoublyLinkedList
+            ), "operations must be type of CacheDoublyLinkedList."
if var in self._internal_vars: return True - inputs = self.function_inputs if isinstance(self, Function) else self.inputs + inputs = list(self.inputs.values()) if isinstance(self, Function) else self.inputs if var in inputs: return True - idx = len(self.operations) if upto_op_with_id is None else upto_op_with_id - - for i in range(idx-1, -1, -1): - op_outputs = self.operations[i].outputs - if op_outputs is not None and var in op_outputs: + if upto_op is None: + if var.op in self.operations: return True + else: + if isinstance(self.operations, list): + # This could only happen in debug mode + assert DEBUG is True, "block.operations can only be type of list in debug mode." + idx = self.find_op_id_in_block(upto_op) + for i in range(idx - 1, -1, -1): + if var.op is self.operations[i]: + return True + else: + cursor = self.operations._get_node_from_op(upto_op).prev + while cursor is not None: + if cursor.op is var.op: + return True + cursor = cursor.prev if self.outer_op is not None: enclosing_block = self.outer_op.enclosing_block - outer_op_id = enclosing_block.find_op_id_in_block(self.outer_op) - if enclosing_block.is_var_visible_in_block(var, upto_op_with_id=outer_op_id): + if enclosing_block.is_var_visible_in_block(var, upto_op=self.outer_op): return True return False - def find_op_id_in_block(self, target_op): + def find_op_id_in_block(self, target_op: Operation) -> int: + if len(self.operations) > 0 and target_op == self.operations[-1]: + return len(self.operations) - 1 + + op_list = self.operations if isinstance(self.operations, list) else list(self.operations) + try: - idx = self.operations.index(target_op) + idx = op_list.index(target_op) except ValueError: raise ValueError("Op {} not found in {}: {}".format(target_op.name, self.name, self)) return idx @@ -287,13 +411,16 @@ def set_outputs(self, outputs): raise ValueError("Outputs must be list of Vars") self.validate() - for ov in outputs: - if not self.is_var_visible_in_block(ov): - msg = ( - "Var {} is not visible in block {} and thus cannot " - + "be a block output.\n{}" - ) - raise ValueError(msg.format(ov.name, self.name, self)) + + # check var visibility in debug mode + if DEBUG: + for ov in outputs: + if not self.is_var_visible_in_block(ov): + msg = ( + "Var {} is not visible in block {} and thus cannot " + + "be a block output.\n{}" + ) + raise ValueError(msg.format(ov.name, self.name, self)) # For duplicate vars in self._outputs, only remove block once. for ov in set(self._outputs): @@ -317,7 +444,7 @@ def __exit__(self, type, value, traceback): global BLOCK_STACK BLOCK_STACK = BLOCK_STACK[:-1] - def _insert_op_before(self, new_op, before_op=None): + def _insert_op_before(self, new_op: Operation, before_op: Optional[Operation] = None): """ A private API used by builder. Please use `builder.YOUR_OP(...,before_op)`. 
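# Illustrative sketch (not part of the patch): callers do not invoke _insert_op_before directly;
# a graph pass builds the new op through the builder and passes `before_op`, e.g.
# (`x` and `existing_op` are hypothetical Var/Operation objects):
from coremltools.converters.mil import Builder as mb

new_var = mb.relu(x=x, before_op=existing_op)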
@@ -351,42 +478,91 @@ def _insert_op_before(self, new_op, before_op=None):
        """
        self.validate()
-        idx = len(self.operations) if before_op is None else self.find_op_id_in_block(before_op)
-
-        # check inputs are visible
-        for k, v in new_op.inputs.items():
-            if not isinstance(v, (Var, tuple)):
-                continue
-            vs = [v] if isinstance(v, Var) else v
-            for v in vs:
-                if not self.is_var_visible_in_block(v, upto_op_with_id=idx):
-                    before_op_name = before_op.name if before_op is not None else "None"
-                    msg = "Op '{}' input {}={} is not in scope of {} before {}"
-                    raise ValueError(msg.format(new_op.name, k, v.name, self.name, before_op_name))
+        if isinstance(self.operations, CacheDoublyLinkedList):
+            self.operations.insert_op_before(new_op, before_op)
+            return
-        # add new_op
        if before_op is None:
            self.operations.append(new_op)
-        else:
-            self.operations.insert(idx, new_op)
+            return
+
+        # check input visibility in debug mode
+        if DEBUG:
+            for k, v in new_op.inputs.items():
+                if not isinstance(v, (Var, tuple)):
+                    continue
+                vs = [v] if isinstance(v, Var) else v
+                for v in vs:
+                    if not self.is_var_visible_in_block(v, upto_op=before_op):
+                        before_op_name = before_op.name if before_op is not None else "None"
+                        msg = "Op '{}' input {}={} is not in scope of {} before {}"
+                        raise ValueError(
+                            msg.format(new_op.name, k, v.name, self.name, before_op_name)
+                        )
+
+        idx = self.find_op_id_in_block(before_op)
+        self.operations.insert(idx, new_op)
    def _replace_var(
        self,
-        old_var,
-        new_var,
-        start=0,
-        end_id=-1,
-        no_check_var_types=False,
+        old_var: Var,
+        new_var: Var,
+        anchor_op: Optional[Operation] = None,
+        end_op: Optional[Operation] = None,
+        no_check_var_types: Optional[bool] = False,
    ):
        """
        Helper function for replace_uses_of_var_after_op
        """
+        self._copy_metadata(old_var, new_var)
+        self._copy_scope_info(old_var, new_var)
+
        num_ops_affected = 0
-        if end_id == -1:
-            op_list = self.operations[start:]
+        # If we start checking right after the old_var, we can reduce the time
+        # complexity hugely, by only checking the child_ops, without iterating
+        # through the whole program.
+        # This fix reduces the overall time from O(N) -> O(1).
+        replace_vars_right_after_old_var = (
+            end_op is None
+            and len(self.operations) > 0
+            and anchor_op is not None
+            and anchor_op is old_var.op
+        )
+
+        # We should only compute start_idx and end_idx once if needed.
+        start_idx = end_idx = None
+
+        if replace_vars_right_after_old_var:
+            op_list = list(old_var.child_ops)
        else:
-            op_list = self.operations[start : end_id + 1]
+            if isinstance(self.operations, list):
+                start_idx = self.find_op_id_in_block(anchor_op) + 1 if anchor_op is not None else 0
+                end_idx = (
+                    self.find_op_id_in_block(end_op)
+                    if end_op is not None
+                    else len(self.operations) - 1
+                )
+                op_list = self.operations[start_idx : end_idx + 1]
+            else:
+                assert isinstance(
+                    self.operations, CacheDoublyLinkedList
+                ), f"Expect operations be type of CacheDoublyLinkedList. Got {type(self.operations)}."
+                if len(self.operations) == 0 and anchor_op is not None:
+                    raise ValueError(f"anchor op {anchor_op} not in the block.")
+
+                start_node = (
+                    self.operations.start
+                    if anchor_op is None
+                    else self.operations._get_node_from_op(anchor_op).next
+                )
+                cursor = start_node
+                op_list = []
+                while cursor is not None:
+                    op_list.append(cursor.op)
+                    if cursor.op is end_op:
+                        break
+                    cursor = cursor.next
        for op in op_list:
            new_inputs = {}
@@ -409,7 +585,43 @@ def _replace_var(
            for b in op.blocks:
                num_ops_affected += b._replace_var(old_var, new_var)
-        if end_id != -1 and old_var.op not in op_list:
+        # Replace the consuming blocks' outputs.
+        # It is important to use a list copy here,
+        # since replace_block_output_var is going to change the consuming_blocks.
+        # Note that there are some expensive index queries in the following implementation,
+        # but overall it won't affect the time complexity too much,
+        # since we can assume the number of block outputs in a program is a constant.
+        # As a result, the amortized time complexity will not blow up.
+        for b in list(old_var.consuming_blocks):
+            outer_op = b.outer_op
+
+            if outer_op is not None:
+                # Query the start and end index if needed
+                if start_idx is None:
+                    start_idx = (
+                        self.find_op_id_in_block(anchor_op) + 1 if anchor_op is not None else 0
+                    )
+                if end_idx is None:
+                    end_idx = (
+                        self.find_op_id_in_block(end_op)
+                        if end_op is not None
+                        else len(self.operations) - 1
+                    )
+
+                op_to_idx = {}
+                while outer_op is not None:
+                    block = outer_op.enclosing_block
+                    if block is self:
+                        if len(op_to_idx) == 0:
+                            for idx, op in enumerate(self.operations):
+                                op_to_idx[op] = idx
+                        op_idx = op_to_idx[outer_op]
+                        if op_idx >= start_idx and op_idx <= end_idx:
+                            b.replace_block_output_var(old_var, new_var)
+                        break
+                    outer_op = block.outer_op
+
+        if end_op is not None and old_var.op not in op_list:
            return num_ops_affected
        if old_var in self._block_inputs:
@@ -420,7 +632,6 @@ def _replace_var(
        # If old_var is block's output, replace as well.
        self.replace_block_output_var(old_var, new_var)
-
        return num_ops_affected
    def replace_block_output_var(
@@ -451,12 +662,11 @@ def replace_block_output_var(
    def try_replace_uses_of_var_after_op(
        self,
-        anchor_op,
-        old_var,
-        new_var,
-        end_op=None,
-        no_check_var_types=False,
-        no_check_var_visibility=False,
+        anchor_op: Operation,
+        old_var: Var,
+        new_var: Var,
+        end_op: Optional[Operation] = None,
+        no_check_var_types: Optional[bool] = False,
    ):
        """
        :param anchor_op: Operation
@@ -464,8 +674,7 @@ def try_replace_uses_of_var_after_op(
        :param new_var: Var
        :param end_op: Operation
        :param no_check_var_types: bool
-        :param no_check_var_visibility: bool
-        :return: True if the old_var can be replaced by new_var. False otherwise.
+        :return: True if the old_var can be replaced by new_var. False otherwise.
        This helper function guards the replace_uses_of_var_after_op function,
        by first checking if the old_var could be replaced by the new_var.
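# Illustrative sketch (not part of the patch): the typical pattern a graph pass uses with this
# helper -- try to redirect all uses of an op's output to a replacement var, then drop the old
# op. `block`, `old_op`, and `new_var` are hypothetical.
if block.try_replace_uses_of_var_after_op(
    anchor_op=old_op,
    old_var=old_op.outputs[0],
    new_var=new_var,
):
    block.remove_ops([old_op])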
@@ -482,19 +691,65 @@ def try_replace_uses_of_var_after_op( old_var=old_var, new_var=new_var, no_check_var_types=no_check_var_types, - no_check_var_visibility=no_check_var_visibility, ) return True + @staticmethod + def _copy_scope_info(src: Var, dst: Var) -> None: + """ + Populate meta data from old var (src) to new var (dst) + """ + curr_scopes = SCOPE_STACK.get_curr_scopes() + + if ScopeSource.COREMLTOOLS_GRAPH_PASS in curr_scopes: + + if src.op in VALID_OPS_TO_COPY_SCOPE_INFO[-1]: + return + + elif dst.op in VALID_OPS_TO_COPY_SCOPE_INFO[-1]: + op = dst.op + assert op is not None, "new_var cannot be a placeholder output" + VALID_OPS_TO_COPY_SCOPE_INFO[-1].remove(op) + + # If old_var is a placeholder output, we assign defaults values to essential scope source + old_scopes = src.scopes + if len(old_scopes) == 0: + essential_scope_sources = op.enclosing_block._essential_scope_sources + for val in essential_scope_sources: + res = None + if val == ScopeSource.TORCHSCRIPT_MODULE_TYPE: + res = ["__COREML__::TORCHSCRIPT_PLACEHOLDER"] + elif val == ScopeSource.TORCHSCRIPT_MODULE_NAME: + res = [f"__COREML__::TORCHSCRIPT_PLACEHOLDER_{src.name}"] + elif val == ScopeSource.EXIR_DEBUG_HANDLE: + res = [None] + else: + raise ValueError(f"No default placeholder info for {val}.") + old_scopes[val] = res + + dst.scopes = add_graph_pass_scope(old_scopes, dst.scopes) + + for input in op.inputs.values(): + if not isinstance(input, (list, tuple)): + input = [input] + for i in input: + Block._copy_scope_info(src, i) + + @staticmethod + def _copy_metadata(old_var: Var, new_var: Var) -> None: + """ + Populate meta data from old var to new var + """ + return + def replace_uses_of_var_after_op( self, - anchor_op, - old_var, - new_var, - no_check_var_visibility=False, - end_op=None, - no_check_var_types=False, - force_replace=False, + anchor_op: Operation, + old_var: Var, + new_var: Var, + end_op: Optional[Operation] = None, + no_check_var_types: Optional[bool] = False, + force_replace: Optional[bool] = False, ): """ Replace all uses of `old_var` with `new_var` after `anchor_op`, @@ -508,9 +763,6 @@ def replace_uses_of_var_after_op( `end_op` is None, all occurrences of `old_var` are replaced in the block starting from the op just after `anchor_op` - no_check_var_visibility: True to disable the check ensuring new_var is visible - (visibility requirement depends on anchor_op). - no_check_var_types: An error will be raised if the type of new_var is not same as the old_var, unless `no_check_var_types` is set to True. Normally type inference is re-invoked for all the child ops of `old_var` after updating it to `new_var`. However, @@ -589,13 +841,10 @@ def replace_uses_of_var_after_op( ).format(old_var, new_var, err_var) raise ValueError(msg) - start = self.find_op_id_in_block(anchor_op) + 1 if anchor_op is not None else 0 - end_id = self.find_op_id_in_block(end_op) if end_op is not None else -1 - - if not no_check_var_visibility: + # It is expensive to check the var visibility, and it should only be done while debugging. 
+ if DEBUG: self.validate() - idx = start if anchor_op is not None else len(self.operations) visibility_error_msg = ( "new_var '{}' is not visible in block '{}' at or before " + "anchor_op '{}'" @@ -603,51 +852,47 @@ def replace_uses_of_var_after_op( anchor_op_name = "None" if anchor_op is None else anchor_op.name if isinstance(new_var, ComplexVar): - # For CompleVar, as it's just a temp wrapper to transit the real and imag data, we + # For ComplexVar, as it's just a temp wrapper to transit the real and imag data, we # check the visibility of its real and imaginary Var instead. - if not self.is_var_visible_in_block(new_var.real, upto_op_with_id=idx): + if not self.is_var_visible_in_block(new_var.real, upto_op=anchor_op): raise ValueError( - visibility_error_msg.format( - new_var.real.name, self.name, anchor_op_name - ) + visibility_error_msg.format(new_var.real.name, self.name, anchor_op_name) ) - if not self.is_var_visible_in_block(new_var.imag, upto_op_with_id=idx): + if not self.is_var_visible_in_block(new_var.imag, upto_op=anchor_op): raise ValueError( - visibility_error_msg.format( - new_var.imag.name, self.name, anchor_op_name - ) + visibility_error_msg.format(new_var.imag.name, self.name, anchor_op_name) ) else: - if not self.is_var_visible_in_block(new_var, upto_op_with_id=idx): + if not self.is_var_visible_in_block(new_var, upto_op=anchor_op): raise ValueError( - visibility_error_msg.format( - new_var.name, self.name, anchor_op_name - ) + visibility_error_msg.format(new_var.name, self.name, anchor_op_name) ) + start = self.find_op_id_in_block(anchor_op) + 1 if anchor_op is not None else 0 + end_id = self.find_op_id_in_block(end_op) if end_op is not None else -1 - if end_id != -1 and end_id < start: - msg = "end_op '{}' comes before the anchor_op '{}'" - raise ValueError(msg.format(end_op.name, anchor_op.name)) + if end_id != -1 and end_id < start: + msg = "end_op '{}' comes before the anchor_op '{}'" + raise ValueError(msg.format(end_op.name, anchor_op.name)) num_ops_affected = self._replace_var( old_var, new_var, - start=start, - end_id=end_id, + anchor_op=anchor_op, + end_op=end_op, no_check_var_types=no_check_var_types, ) logger.debug("Num ops affected in replacing var: {}".format(num_ops_affected)) - def remove_ops(self, existing_ops): + def remove_ops(self, ops_to_remove: List[Operation]): """ - Remove ops in `existing_ops`. + Remove ops in `ops_to_remove`. - Args: existing_ops: List[Operation]. All ops in this list must be pre-existing in the + Args: ops_to_remove: List[Operation]. All ops in this list must be pre-existing in the block. It allows duplicated ops, but duplicated ops will only be removed once. Raises: - ValueError if any `op` in `existing_ops` meets any of following conditions: + ValueError if any `op` in `ops_to_remove` meets any of following conditions: - `op` is not found in the block - any other op in the block uses output Vars of `op` - the output var is block's output @@ -655,99 +900,44 @@ def remove_ops(self, existing_ops): self.validate() # Dedup ops because each op can only be deleted once. - existing_ops_set = set(existing_ops) - existing_ops = list(existing_ops_set) - # Find the idx of each to-be-removed op, and raise errors if any op couldn't be found. 
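# Illustrative sketch (not part of the patch): the expensive visibility validation below is
# gated on the module-level DEBUG flag of this file, so it can be re-enabled when debugging
# a misbehaving graph pass:
from coremltools.converters.mil.mil import block as _block

_block.DEBUG = True  # turns the is_var_visible_in_block checks back on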
- idxs = [-1] * len(existing_ops) - for i, op in enumerate(self.operations): - if op in existing_ops_set: - idxs[existing_ops.index(op)] = i - if -1 in idxs: - not_found = [] - for i, op in zip(idxs, existing_ops): - if i == -1: - not_found.append(op.name) - raise ValueError( - "Ops {} not found in block {}".format(not_found, self.name) - ) - - # Remove ops in reverse topological order - pairs = list(zip(idxs, existing_ops)) - pairs.sort(key=lambda x: x[0], reverse=True) + ops_to_remove_set = set(ops_to_remove) + ops_to_remove = list(ops_to_remove_set) - for idx, op in pairs: + for op in ops_to_remove: for i, v in enumerate(op.outputs): - # Check that no ops depend on op's outputs - if len(v.child_ops) > 0: - child_op_names = [s.name for s in v.child_ops] - msg = ( - "Cannot delete op '{}' with active output at id {}: '{}' " - + "used by ops {}" - ) - raise ValueError(msg.format(op.name, i, v.name, child_op_names)) # Check that the output Var isn't block's output if v in self._outputs: - msg = ( - "cannot delete op {} with output {}: {} " - + "that's block {}'s output" + raise ValueError( + f"cannot delete op {op.name} with output {i}: {v.name} that's block {self.name}'s output." ) - raise ValueError(msg.format(op.name, i, v.name, self.name)) for b in op.blocks: b.set_outputs([]) b.remove_ops(b.operations) - # Remove the op (in reverse topological order) - self.operations.pop(idx) - op.enclosing_block = None - - for v in op.inputs.values(): - if isinstance(v, (tuple, list)): - for vv in v: - vv.remove_child_op(op) - else: - v.remove_child_op(op) - - def operations_for_vars(self, end_vs): - """ - Inputs: - - end_vs: list[Operation]. - - Return: + self.operations.remove(op) - list[Operation] which are subset of self.operations that are ancestors - of `end_vs`. Also do recursion into nested blocks. - """ - used_vars = set(end_vs) - used_ops = [] - for op in reversed(self.operations): - # if none of op's output is used, delete op - if not set(op.outputs).intersection(used_vars): - continue - - used_ops.append(op) # append in reverse topological order + op.enclosing_block = None - # recursively search for nested blocks - ops_to_check = [] - for b in op.blocks: - ops_to_check += b.operations_for_vars(b.outputs) - ops_to_check.append(op) + for v in op.get_flattened_inputs(): + v.remove_child_op(op) - # mark used vars - for op_to_check in ops_to_check: - # mark all op's inputs to used - for _, input_var in op_to_check.inputs.items(): - if isinstance(input_var, (tuple, list)): - used_vars.update(list(input_var)) - else: - used_vars.add(input_var) + # Remove InternalVar from self._internal_vars + for v in op.internal_inputs.values(): + self._internal_vars.remove(v) - return used_ops[::-1] + # In the end, we check no ops depend on removed op's outputs + for op in ops_to_remove: + for i, v in enumerate(op.outputs): + if len(v.child_ops) > 0: + child_op_names = [s.name for s in v.child_ops] + raise ValueError( + f"Cannot delete op '{op.name}' with active output at id {i}: '{v.name}' used by ops {child_op_names}." 
+ ) def _propagate_nonreplaceable_vars(self): def propagate_nonreplaceable_vars_block(block): - for op in list(block.operations): + for op in block.operations: for b in op.blocks: propagate_nonreplaceable_vars_block(b) if op.outputs is None: @@ -757,7 +947,7 @@ def propagate_nonreplaceable_vars_block(block): o._set_nonreplaceable_vars_upstream() propagate_nonreplaceable_vars_block(self) - def indented_str(self, indent=None): + def indented_str(self, indent: Optional[str] = None, print_attr: Optional[bool] = False) -> str: if indent is None: indent = "" s = ( @@ -768,7 +958,7 @@ def indented_str(self, indent=None): ) s += ") {\n" for op in self.operations: - s += op.indented_str(indent + SPACES * 1) + s += op.indented_str(indent + SPACES * 1, print_attr=print_attr) s += indent + "} -> (" if self._outputs is not None: s += ", ".join(["%" + v.name for v in self._outputs]) @@ -842,17 +1032,18 @@ def __init__(self, inputs, opset_version=None): """ self.placeholder_inputs = inputs self.opset_version = opset_version + self.output_types = None + self.input_types = [] # str -> Var self._input_dict = OrderedDict() for k, v in self.placeholder_inputs.items(): v.set_name(k) # set to user input name self._input_dict[k] = v.outputs[0] - self.function_inputs = tuple(self._input_dict.values()) global k_used_symbols global k_num_internal_syms - for inp in self.function_inputs: + for inp in self._input_dict.values(): if types.is_tensor(inp.dtype): shapes = inp.dtype.get_shape() for s in shapes: @@ -884,7 +1075,9 @@ def __repr__(self): def __str__(self): return self.to_str("function") - def to_str(self, func_name="function"): + def to_str( + self, func_name: Optional[str] = "function", print_attr: Optional[bool] = False + ) -> str: func_name = func_name + "[{}]".format(_OPSET[self.opset_version]) if len(self._input_dict) == 0: s = func_name + "()" @@ -893,9 +1086,10 @@ def to_str(self, func_name="function"): s = func_name + "(" + str(inputs[0][1]) for in_name, ph in inputs[1:]: s += ",\n" + " " * (len(func_name) + 1) + str(ph) - s += ") {\n" - s += self.indented_str(SPACES) - s += "}\n" + s += ")" + s += " {\n" + s += self.indented_str(SPACES, print_attr=print_attr) + s += "}\n" return s def get_max_opset_version_and_op(self) -> Tuple[_target, Operation]: @@ -909,7 +1103,7 @@ def get_max_opset_version_and_op(self) -> Tuple[_target, Operation]: def update_max_opset_version_block(block): nonlocal max_opset_version nonlocal op_with_max_opset_version - for op in list(block.operations): + for op in block.operations: for b in op.blocks: update_max_opset_version_block(b) if not hasattr(op, "_op_variants") or not isinstance(op._op_variants, dict): @@ -920,3 +1114,25 @@ def update_max_opset_version_block(block): update_max_opset_version_block(self) return max_opset_version, op_with_max_opset_version + + def set_output_types(self, outputs: Optional[List[InputType]] = None) -> None: + """ + Set the user defined output type for a function. + Note: the common::update_output_dtypes graph pass takes this information, + and changes the function output signature accordingly. 
+ """ + if outputs is not None: + if not ( + isinstance(outputs, list) and all([isinstance(out, InputType) for out in outputs]) + ): + raise TypeError( + "main outputs should be a list of type ct.TensorType or ct.ImageType" + ) + self.output_types = outputs + + def set_input_types(self, input_types: List[InputType]): + if not isinstance(input_types, tuple): + raise ValueError("main inputs should be tuple of TensorType or ImageType") + elif not all([isinstance(inp, InputType) for inp in input_types]): + raise ValueError("main inputs should be tuple of InputSpec") + self.input_types = input_types diff --git a/coremltools/converters/mil/mil/builder.py b/coremltools/converters/mil/mil/builder.py index f53bbd1fb..95c74890c 100644 --- a/coremltools/converters/mil/mil/builder.py +++ b/coremltools/converters/mil/mil/builder.py @@ -5,17 +5,25 @@ import numbers from collections import defaultdict -from typing import Callable, List, Optional +from typing import Any, Callable, List, Optional, Tuple, Type import numpy as np from coremltools import _logger as logger +from coremltools.converters.mil import mil from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.mil.types.symbolic import any_symbolic from .block import Function, curr_block from .input_type import InternalInputType, ListOrTensorInputType, TensorInputType, TupleInputType -from .program import Placeholder, Program +from .program import Placeholder +from .scope import ( + SCOPE_STACK, + VALID_OPS_TO_COPY_SCOPE_INFO, + ScopeContextManger, + ScopeInfo, + ScopeSource, +) from .var import InternalVar, Var @@ -131,6 +139,8 @@ def _create_vars(cls, input_spec, op_name, before_op, new_var_name = op_name + "_" + k if isinstance(in_type, TupleInputType): var = [] + if not isinstance(val, (list, tuple)): + raise ValueError(f"Invalid type {type(val)} for TupleInputType param.") for i, v in enumerate(val): if isinstance(v, Var): var.append(v) @@ -165,8 +175,16 @@ def _add_op(cls, op_cls, **kwargs): op_name=kwargs["name"], before_op=before_op, candidate_kv=kwargs)) kwargs["enclosing_block"] = curr_block() + + # Add scope information + current_scopes = SCOPE_STACK.get_curr_scopes() + kwargs["scopes"] = current_scopes new_op = op_cls(**kwargs) + # We record if the op is created under graph pass + if len(current_scopes) == 1 and ScopeSource.COREMLTOOLS_GRAPH_PASS in current_scopes: + VALID_OPS_TO_COPY_SCOPE_INFO[-1].add(new_op) + # Initialize optional input Vars if it wasn't in kwargs default_inputs = new_op.default_inputs() # Shallow copy list inputs to ensure op inputs are immutable @@ -187,8 +205,13 @@ def _add_op(cls, op_cls, **kwargs): return new_op.outputs @staticmethod - def placeholder(shape, dtype=None, allow_rank0_input=False): - return Placeholder(shape, dtype, allow_rank0_input=allow_rank0_input) + def placeholder( + shape: Tuple[Any], + dtype: Optional[Type] = None, + allow_rank0_input: Optional[bool] = False, + name: Optional[str] = None, + ) -> Placeholder: + return Placeholder(shape, dtype, allow_rank0_input=allow_rank0_input, name=name) @staticmethod def TensorSpec(shape, dtype=None): @@ -294,7 +317,65 @@ def program( """ def wrapper(main_block): function = Builder._create_function(main_block, input_specs, opset_version) - program = Program() + program = mil.Program() program.add_function(function_name, function) return program return wrapper + + @staticmethod + def scope( + *scopes: List[ScopeInfo], + ) -> ScopeContextManger: + """ + The ``mb.scope`` creates a context manager, 
which makes the operations created within it have the corresponding scope information. + + Parameters + ---------- + scopes: Optional[List[ScopeInfo]] (Optional) + * A list of ScopeInfo under the context manager. + * The source in each ScopeInfo cannot be duplicated. + * If not provided, this context manager has no effect. + + Examples + -------- + Here is an example of creating a scope for torchscript module hierarchy with type and name information. + + .. sourcecode:: python + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ): + return mb.add(x=x, y=4.3, name="add_1") + + + In the above example, the "add_1" op will have two scope attributes, for torchscript module type and name: + * TORCHSCRIPT_MODULE_TYPE: ["Module1"] + * TORCHSCRIPT_MODULE_NAME: ["module_1"] + + Here is an example of creating nested scopes: + + .. sourcecode:: python + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ): + x = mb.add(x=x, y=4.3, name="add_1") + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module2"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_2"]), + ): + return mb.add(x=x, y=3.2, name="add_2") + + In the above example, the "add_1" op would have a scope attribute: + * TORCHSCRIPT_MODULE_TYPE: ["Module1"] + + while the "add_2" op would have scope attributes: + * TORCHSCRIPT_MODULE_TYPE: ["Module1", "Module2"] + * TORCHSCRIPT_MODULE_NAME: ["module_2"] + """ + return ScopeContextManger(*scopes) diff --git a/coremltools/converters/mil/mil/operation.py b/coremltools/converters/mil/mil/operation.py index 050b1a494..8f3536f06 100644 --- a/coremltools/converters/mil/mil/operation.py +++ b/coremltools/converters/mil/mil/operation.py @@ -3,7 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from typing import Any, Dict, Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -167,6 +167,7 @@ def __init__(self, **kwargs): self._input_vars = {} self.blocks = [] self.enclosing_block = kwargs["enclosing_block"] + self.scopes = kwargs["scopes"] # Initialize inputs as object attributes (all None) for k in self._input_types.keys(): @@ -205,10 +206,10 @@ def _check_expected_inputs(self, kwargs): "value", "version", "before_op", - "no_check_var_visibility", # no_check_var_visibility==True to deviate from SSA "no_check_var_types", # no_check_var_types==True to force set inputs, even if type does not match with earlier ones "enclosing_block", + "scopes", ] for k in kwargs.keys(): if k not in non_attributes and k not in self._input_types: @@ -541,6 +542,13 @@ def inputs(self): if not isinstance(v, InternalVar) and v is not None } + @property + def internal_inputs(self) -> Dict[str, InternalVar]: + """ + Get internal var inputs of an op.
+ """ + return {k: v for k, v in self._input_vars.items() if isinstance(v, InternalVar)} + @property def outputs(self): return self._output_vars @@ -584,23 +592,26 @@ def var_to_str(v): return "%" + v.name - def indented_str(self, indent=""): + def indented_str(self, indent: Optional[str] = "", print_attr: Optional[bool] = False) -> str: if self.op_type == "const": return "" s = indent if self.outputs is not None: s += ", ".join([str(o) for o in self.outputs]) - s += " = " + self.op_type + "(" - s += ", ".join( - [ - k + "=" + Operation.var_to_str(self.inputs[k]) - for k in self._input_types.keys() - if k in self.inputs and not is_internal_input(k) - ] - ) + + if print_attr: + attr = "[" + for k, v in self.scopes.items(): + attr += f"{k}: {v}, " + attr = attr[:-2] + "]" + else: + attr = "" + + s += " = " + self.op_type + attr + "(" + s += ", ".join([k + "=" + Operation.var_to_str(v) for k, v in self.inputs.items()]) s += ', name="{}")\n'.format(self.name) for b in self.blocks: - s += b.indented_str(indent=indent + SPACES) + s += b.indented_str(indent=indent + SPACES, print_attr=print_attr) return s def __repr__(self): diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py b/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py index f858ae46c..40e5d3126 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/control_flow.py @@ -143,13 +143,13 @@ class Const(Operation): Parameters ---------- + val: const<\*,T> (Required) + mode: immediate_value, file_value (Optional) * Determines how the constant value is stored in the internal MIL format. * For large constants such as convolution weights, use ``file_value``. * For smaller-size constants such as values of a stride, use ``immediate_value``. 
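As a rough sketch of what the `print_attr` flag added to `indented_str`/`to_str` surfaces (assuming `prog` is a program built with the `mb.scope` example above; the exact rendering of the attribute keys depends on how `ScopeSource` stringifies, so the commented output is approximate):

    # Print the main function with per-op scope attributes included.
    print(prog.functions["main"].to_str("main", print_attr=True))
    # e.g. ... %add_1 = add[TORCHSCRIPT_MODULE_TYPE: ['Module1']](x=%x, ..., name="add_1") ...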
- val: const<\*,T> (Required) - Returns ------- const<\*,T> @@ -355,7 +355,7 @@ def _check_equal_value(val1, val2): @staticmethod def _clean_up_child_ops(block): - for op in list(block.operations): + for op in block.operations: for b in op.blocks: while_loop._clean_up_child_ops(b) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py b/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py index 5e45864c7..3a5804df5 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/linear.py @@ -224,10 +224,10 @@ def type_inference(self): def value_inference(self): x = self.x.val if self.transpose_x.val: - x = np.transpose(x) + x = np.swapaxes(x, -1, -2) y = self.y.val if self.transpose_y.val: - y = np.transpose(y) + y = np.swapaxes(y, -1, -2) return np.matmul(x, y) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/scatter_gather.py b/coremltools/converters/mil/mil/ops/defs/iOS15/scatter_gather.py index 6650c4438..216bb3e19 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/scatter_gather.py @@ -6,15 +6,11 @@ import numpy as np from coremltools.converters.mil.mil import Operation, types -from coremltools.converters.mil.mil.input_type import (DefaultInputs, - InputSpec, - TensorInputType) -from coremltools.converters.mil.mil.operation import (SYMBOL, VALUE, - precondition) +from coremltools.converters.mil.mil.input_type import DefaultInputs, InputSpec, TensorInputType +from coremltools.converters.mil.mil.operation import SYMBOL, VALUE, precondition from coremltools.converters.mil.mil.ops.defs._op_reqs import register_op from coremltools.converters.mil.mil.ops.defs._utils import compute_gather -from coremltools.converters.mil.mil.types.symbolic import ( - is_compatible_symbolic_vector) +from coremltools.converters.mil.mil.types.symbolic import is_compatible_symbolic_vector, is_symbolic @register_op @@ -78,7 +74,7 @@ class gather(Operation): indices=TensorInputType(type_domain=types.int32), axis=TensorInputType(const=True, optional=True, type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -96,11 +92,8 @@ def value_inference(self): # only allow x to be symbolic. indices cannot. 
return None return compute_gather( - params=self.x.sym_val, - indices=self.indices.val, - axis=self.axis.val, - batch_dims=0 - ) + params=self.x.sym_val, indices=self.indices.val, axis=self.axis.val, batch_dims=0 + ) def type_inference(self): out_type = self.x.dtype @@ -204,7 +197,7 @@ class scatter(Operation): axis=TensorInputType(const=True, optional=True, type_domain=types.int32), mode=TensorInputType(const=True, optional=True, type_domain=types.str), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -270,7 +263,7 @@ class gather_along_axis(Operation): indices=TensorInputType(type_domain=types.int32), axis=TensorInputType(const=True, optional=True, type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -308,8 +301,14 @@ def type_inference(self): axis = axis if axis >= 0 else axis + self.x.rank for i in range(self.x.rank): - if i != axis: - assert self.x.shape[i] == self.indices.shape[i] + x_size = self.x.shape[i] + indices_size = self.indices.shape[i] + if i != axis and not is_symbolic(x_size) and not is_symbolic(indices_size): + if x_size != indices_size: + raise AssertionError( + "The input data and indices should have the same size at " + f"axis {i}, but got {x_size} vs {indices_size}" + ) return types.tensor(self.x.dtype, self.indices.shape) @@ -469,7 +468,7 @@ class gather_nd(Operation): x=TensorInputType(type_domain="T"), indices=TensorInputType(type_domain=types.int32), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } @@ -528,7 +527,7 @@ class scatter_nd(Operation): updates=TensorInputType(type_domain="T"), mode=TensorInputType(const=True, optional=True, type_domain=types.str), ) - + type_domains = { "T": (types.fp16, types.fp32, types.int32), } diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py index 9a5340764..a32eeefb3 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_operation.py @@ -365,7 +365,7 @@ class one_hot(Operation): ---------- indices: tensor<[D], i32> (Required) * Tensor, values indicate the locations for each one-hot vector to take the ``on_value``. - one_got_vector_size: i32 (Required) + one_hot_vector_size: i32 (Required) * Indicates the number of returning vectors. axis: const i32 (Optional) * Indicates which dimension to append the new axis. @@ -444,7 +444,7 @@ def type_inference(self): @register_op class pad(Operation): """ - Pad a tensor. + Pads a tensor. Parameters ---------- @@ -570,10 +570,10 @@ class range_1d(Operation): Parameters ---------- - end: (Required) - * The upper limit of the sequence, exclusive. start: (Required) * The start point of the sequence. + end: (Required) + * The upper limit of the sequence, exclusive. step: (Required) * Number that increments ``start``. @@ -1117,6 +1117,10 @@ class split(Operation): * The tensors may be variadic, but the number of tensors must be determined at compile time (i.e. a tuple). + axis: const (Required) + * The dimension along which to concatenate. Must be in the + range ``[-rank(x), rank(x))``. + num_splits: (Optional) If specified, divide ``x`` into ``num_splits`` tensors along ``axis``. Its behavior depends on ``split_sizes``: @@ -1134,10 +1138,6 @@ class split(Operation): * Sizes to split to. The sum of ``split_sizes`` must equal to ``value.shape[axis]``. - axis: const (Required) - * The dimension along which to concatenate. 
Must be in the - range ``[-rank(x), rank(x))``. - Returns ------- Tuple[tensor<\*?, T>] diff --git a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py index 180c79f33..9ced519cd 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py @@ -585,9 +585,9 @@ class slice_by_size(Operation): ---------- x: tensor<*?, T> (Required) * Input tensor. - begin: tensor<[rank(x)], i32> Required + begin: tensor<[rank(x)], i32> (Required) * The begin index for slice. - size: tensor<[rank(x)], i32> Required + size: tensor<[rank(x)], i32> (Required) * The size that is to be sliced. If ``size`` is ``-1``, all the remaining elements starting with "begin" are sliced. @@ -884,6 +884,9 @@ class squeeze(Operation): * Must be at least 1-D. axes: const (Optional) * Axes to squeeze out. + * The behaviour of squeezing non-single dimensions follow PyTorch instead of NumPy, where + it ignores non-single dimensions instead of erroring out. More specifically, if x has + shape (2, 3, 4) and axes is [0, 1], the output will be a tensor with shape (2, 3, 4). * Default to remove all single-dimensions. Returns @@ -923,9 +926,11 @@ def type_inference(self): for i in sorted(axes)[::-1]: # descending order if len(squeezed_shape) <= i: raise ValueError( - "Cannot squeeze dim {} for shape {}".format(i, squeezed_shape) + f"Invalid axis {i} in squeeze. The axis should be smaller than {len(squeezed_shape)}" ) - squeezed_shape.pop(i) + if squeezed_shape[i] == 1: + # Only remove the dim_size=1 dimension. + squeezed_shape.pop(i) return types.tensor(x_type, tuple(squeezed_shape)) if len(squeezed_shape) != 0 else x_type diff --git a/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py b/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py index 899c8e4a3..925ec149d 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS16/constexpr_ops.py @@ -116,20 +116,37 @@ def materialized_val_inference(self): self.quantized_data.val, self.zero_point.val, self.scale.val, self.axis.val ) + def is_all_zeros(self) -> bool: + zero_point = self.promote_rank_to_same_as_quantized_data( + self.zero_point.val, self.quantized_data.val, self.axis.val + ) + return np.all(self.quantized_data.val == zero_point) + @staticmethod - def decompress(quantized_data, zero_point, scale, axis): + def promote_rank_to_same_as_quantized_data( + param: np.ndarray, quantized_data: np.ndarray, axis: int + ) -> np.ndarray: + """ + Promote param (i.e. 
zero point or scale) rank to same as quantized data, + so subtraction or multiplication can happen properly on the specified axis + """ + if len(param.shape) == 0: + return np.reshape(param, np.ones(len(quantized_data.shape), np.int32)) + else: + axes = [i for i in range(len(quantized_data.shape)) if i != axis] + return np.expand_dims(param, axis=tuple(axes)) + @staticmethod + def decompress( + quantized_data: np.ndarray, zero_point: np.ndarray, scale: np.ndarray, axis: int + ) -> np.ndarray: axis = axis if axis >= 0 else axis + len(quantized_data.shape) - - def rank_promoted_to_same_as_quantized_data(param): - if len(param.shape) == 0: - return np.reshape(param, np.ones(len(quantized_data.shape), np.int32)) - else: - axes = [i for i in range(len(quantized_data.shape)) if i != axis] - return np.expand_dims(param, axis=tuple(axes)) - - sc = rank_promoted_to_same_as_quantized_data(scale) - zp = rank_promoted_to_same_as_quantized_data(zero_point) + sc = constexpr_affine_dequantize.promote_rank_to_same_as_quantized_data( + scale, quantized_data, axis + ) + zp = constexpr_affine_dequantize.promote_rank_to_same_as_quantized_data( + zero_point, quantized_data, axis + ) val = sc * (quantized_data.astype(np.float32) - zp.astype(np.float32)) return val.astype(scale.dtype) @@ -296,8 +313,8 @@ class constexpr_sparse_to_dense(Operation): shape: const tensor (Required) - Notes - ----- + Notes + ----- * Any data is packed and read in a row-major order. * ``mask`` contains ``M`` bytes, where ``M = ceil( product(shape) / 8)``. That is, each bit field corresponds to one element in the output tensor. @@ -311,7 +328,7 @@ class constexpr_sparse_to_dense(Operation): .. sourcecode:: python shape = (5,) => M = 1 bytes - + MSB LSB | | mask = |x x x 0 1 1 0 0 | <== packed elements diff --git a/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py b/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py index 0e635cea9..6b0c87ffd 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS17/quantization_ops.py @@ -93,12 +93,12 @@ class quantize(Operation): ``input.shape[axis]``; that is, equal to ``3``. - This is broadcasted to ``(1, 3, 1, 1)``. - axis: const tensor (Optional) - output_dtype: const tensor (Required) * This parameter can take ``"uint8"``, ``"int8"`` as values. * The ``output_dtype`` value must match the ``zero_point`` dtype. 
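A small, self-contained NumPy sketch of the rank promotion that `promote_rank_to_same_as_quantized_data` performs for a per-channel parameter (the shapes and values here are illustrative only):

    import numpy as np

    quantized_data = np.zeros((2, 3, 4), dtype=np.int8)
    scale = np.array([0.5, 0.25, 0.125], dtype=np.float32)  # per-channel scale on axis=1
    # Expand scale on every axis except the quantization axis so it broadcasts.
    axes = [i for i in range(quantized_data.ndim) if i != 1]
    promoted = np.expand_dims(scale, axis=tuple(axes))
    assert promoted.shape == (1, 3, 1)  # broadcasts against (2, 3, 4)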
+ axis: const tensor (Optional) + Returns ------- tensor @@ -224,8 +224,21 @@ def type_inference(self): _check_scale_zp_shapes(self.input, self.scale, self.zero_point, self.axis) return types.tensor(self.scale.dtype, self.input.shape) - @precondition(allow=VALUE) - def value_inference(self): + def can_materialize_val(self) -> bool: + if self.input.val is None: + return False + if self.scale.val is None: + return False + if self.zero_point is not None and self.zero_point.val is None: + return False + if self.axis is not None and self.axis.val is None: + return False + return True + + def materialized_val_inference(self) -> np.ndarray: + if not self.can_materialize_val(): + return None + quantized_data = self.input.val if self.zero_point is not None: zero_point = self.zero_point.val diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py index a7ff49a81..c80f90ddb 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_activation.py @@ -912,7 +912,8 @@ def test_value_inference(self, input_size): def prog(): return mb.softmax(x=x, axis=axis) - op = list(prog.functions.values())[0].operations[2] + ops = list(prog.functions.values())[0].operations + op = list(ops)[2] assert op.op_type == "softmax" np.testing.assert_allclose( op.value_inference(), diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py index a8f0232a2..bcb8278a0 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_control_flow.py @@ -331,7 +331,7 @@ def cond(res, bx): def test_builder_to_backend_nested(self, compute_unit, backend): if backend.backend == "neuralnetwork": pytest.xfail( - "rdar://96862073 (test_control_folw::TestWhileLoop::test_builder_to_backend_nested failing on nnv1)" + "rdar://96862073 (test_control_flow::TestWhileLoop::test_builder_to_backend_nested failing on nnv1)" ) input_placeholders = { diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py index 8817aa59b..57010f313 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_conv.py @@ -436,6 +436,21 @@ def test_builder_to_backend_stress( config, x_weight_dtype, ): + if ( + backend.backend == 'neuralnetwork' and + conv_dim == "conv2d" and + config == { + "padding": (1, 1, 1), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "symbolic": True, + } + ): + pytest.xfail("rdar://121954894: Conv2d starts to fail") + padding = config["padding"] DHWKdKhKw = config["DHWKdKhKw"] stride = config["stride"] @@ -623,6 +638,20 @@ def test_builder_to_backend_stress_weights_input( conv_dim, config, ): + if ( + conv_dim == "conv2d" and + config == { + 'padding': (1, 1, 1), + 'DHWKdKhKw': (5, 5, 5, 2, 2, 2), + 'stride': (2, 2, 2), + 'dilation': (2, 1, 1), + 'has_bias': True, + 'groups': 1, + 'symbolic': True, + } + ): + pytest.xfail("rdar://121954894: Conv2d starts to fail") + padding = config["padding"] DHWKdKhKw = config["DHWKdKhKw"] stride = config["stride"] diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py 
b/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py index 8bdae53ac..040257d8f 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_elementwise_unary.py @@ -10,10 +10,9 @@ import scipy from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, get_new_symbol, types +from coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.mil.ops.tests.iOS14 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder -from coremltools.converters.mil.mil.types.symbolic import is_compatible_symbolic_vector from coremltools.converters.mil.testing_reqs import compute_units from coremltools.converters.mil.testing_utils import ssa_fn @@ -514,19 +513,35 @@ def test_builder_threshold_eval(self): np.testing.assert_allclose(expected_outputs, v.val, atol=1e-04, rtol=1e-05) - def test_cast_with_symbolic_value(self): - input_shape = [get_new_symbol(), 1] - input_placeholders = { - "x": mb.placeholder(shape=input_shape), - } + @pytest.mark.parametrize( + "backend, dtype", + itertools.product( + backends, + ["bool", "int32", "fp16", "fp32"], + ), + ) + def test_cast_with_symbolic_value(self, backend, dtype): + s1 = get_new_symbol() - def build(x): + @mb.program( + input_specs=[mb.TensorSpec(shape=(s1, 1))], + opset_version=backend.opset_version, + ) + def prog(x): shape = mb.shape(x=x) - return mb.cast(x=shape, dtype="int32") - - with Function(input_placeholders) as ssa_func: - output_vars = build(**ssa_func.inputs) - assert is_compatible_symbolic_vector(output_vars.sym_val, [get_new_symbol(), 1]) + out = mb.cast(x=shape, dtype=dtype) + assert out.val is None + sym_val = out.sym_val + if dtype == "bool": + assert sym_val.tolist() == [s1, True] + elif dtype == "int32": + assert sym_val.tolist() == [s1, 1] + elif dtype == "fp16": + assert sym_val.tolist() == [s1, np.float16(1.0)] + else: + assert dtype == "fp32" + assert sym_val.tolist() == [s1, np.float32(1.0)] + return out @staticmethod def _test_builder_to_backend_stress_with_epsilon( @@ -634,6 +649,7 @@ def prog(): return mb.erf(x=x) ops = list(prog.functions.values())[0].operations + ops = list(ops) assert len(ops) == 2 assert ops[0].op_type == "const" erf_op = ops[1] diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py index 0da0d8320..41b3d1d39 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_linear.py @@ -261,6 +261,28 @@ def build(x): backend=backend, ) + @pytest.mark.parametrize( + "compute_unit, backend", + itertools.product(compute_units, backends), + ) + def test_builder_transpose_y(self, compute_unit, backend): + x_val = np.random.rand(3, 2, 7, 16) + y_val = np.random.rand(3, 2, 5, 16) + + def build(x): + return mb.matmul(x=x, y=y_val, transpose_x=False, transpose_y=True) + + expected_output = np.matmul(x_val, np.transpose(y_val, (0, 1, 3, 2))) + run_compare_builder( + build, + input_placeholders={"x": mb.placeholder(shape=x_val.shape)}, + input_values={"x": x_val}, + expected_output_types=expected_output.shape + (types.fp32,), + expected_outputs=expected_output, + compute_unit=compute_unit, + backend=backend, + ) + class TestEinsum: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py 
b/coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py index 60ecf6b7c..b5e144c99 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_reduction.py @@ -335,7 +335,8 @@ def test_reduce_log_sum_exp_value_inference(self, input_size): def prog(): return mb.reduce_log_sum_exp(x=x, axes=(axis,)) - op = list(prog.functions.values())[0].operations[3] + ops = list(prog.functions.values())[0].operations + op = list(ops)[3] assert op.op_type == "reduce_log_sum_exp" np.testing.assert_allclose( op.value_inference(), scipy.special.logsumexp(x, axis=axis), atol=1e-04, rtol=1e-05 diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py index 8c7ef8374..7c3dd2913 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_scatter_gather.py @@ -11,7 +11,7 @@ import coremltools as ct from coremltools._deps import _HAS_TF_2, MSG_TF2_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.mil.ops.tests.iOS14 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import ( mark_api_breaking, @@ -488,6 +488,31 @@ def prog(x): opset_version=backend.opset_version, )(prog) + @staticmethod + def test_gather_value_inference_on_symbolic_input(): + + s1, s2 = get_new_symbol(), get_new_symbol() + + @mb.program( + input_specs=[mb.TensorSpec(shape=(2, 3, s1, s2, 5))], + ) + def prog(x): + shape = mb.shape(x=x) + gather_1 = mb.gather(x=shape, indices=0, axis=0) + gather_2 = mb.gather(x=shape, indices=[0, 1], axis=0) + gather_3 = mb.gather(x=shape, indices=[1, 2, 3], axis=0) + + # Test value inference + assert gather_1.val == 2 + assert gather_1.sym_val == 2 + + assert gather_2.val.tolist() == [2, 3] + assert gather_2.sym_val.tolist() == [2, 3] + + assert gather_3.val is None + assert gather_3.sym_val.tolist() == [3, s1, s2] + + return x class TestGatherAlongAxis: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py index 1a1d31fc3..6ae83e6aa 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS14/test_tensor_transformation.py @@ -1268,6 +1268,63 @@ def prog(): assert const.val[0, 0, 0] == 112 return x + @staticmethod + def test_squeeze_invalid_axis(): + with pytest.raises( + ValueError, match="Invalid axis 3 in squeeze. 
The axis should be smaller than 3" + ): + + @mb.program() + def prog(): + const = mb.const(val=[[[2, 3], [4, 5]]]) + x = mb.squeeze(x=const, axes=(3,)) + return x + + @pytest.mark.parametrize( + "compute_unit, backend, is_symbolic", + itertools.product( + compute_units, + backends, + (True, False), + ), + ) + def test_non_single_element_dim(self, compute_unit, backend, is_symbolic): + if backend.backend == "neuralnetwork": + pytest.skip("neuralnetwork backend doesn't support squeeze a not-1 dimension") + if compute_unit == ct.ComputeUnit.CPU_ONLY: + pytest.xfail("CPU failed non-single-dim squeeze (rdar://124555262)") + + x = np.arange(2 * 3 * 4, dtype=np.int32).reshape(2, 3, 4) + input_shape = ( + [get_new_symbol(), get_new_symbol(), get_new_symbol()] if is_symbolic else x.shape + ) + input_placeholders = {"x": mb.placeholder(shape=input_shape)} + input_values = {"x": x} + + def build(x): + return [ + mb.squeeze(x=x, axes=(-1,)), + mb.squeeze(x=x, axes=(-2, 0)), + mb.squeeze(x=x, axes=(0, 1, 2)), + mb.squeeze(x=x), + ] + + # The symbolic dim won't be squeezed, so it doesn't affect the output. + expected_output_types = [tuple(input_shape) + (types.int32,)] * 4 + expected_outputs = [x] * 4 + run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + inputs=construct_inputs_from_placeholders(input_placeholders, 10) + if backend.backend == "mlprogram" + else None, + compute_unit=compute_unit, + backend=backend, + ) + class TestTranspose: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py index efd157257..24358362d 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py @@ -8,6 +8,7 @@ import numpy as np import pytest +import coremltools as ct from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import types @@ -59,6 +60,40 @@ def build(x): prog = mlmodel._mil_program assert "constexpr_affine_dequantize" in get_op_types_in_program(prog) + def test_is_all_zeros(self): + @mb.program(opset_version=ct.target.iOS16) + def prog_0_scalar(): + return mb.constexpr_affine_dequantize( + quantized_data=np.array([[0, 0, 0], [0, 0, 0]]).astype(np.int8), + zero_point=np.int8(0), + scale=np.float32(1.2), + axis=0, + ) + + assert prog_0_scalar.find_ops(op_type="constexpr_affine_dequantize")[0].is_all_zeros() + + @mb.program(opset_version=ct.target.iOS16) + def prog_0_vector(): + return mb.constexpr_affine_dequantize( + quantized_data=np.array([[1, 2, 3], [1, 2, 3]]).astype(np.uint8), + zero_point=np.uint8([1, 2, 3]), + scale=np.float32(2), + axis=1, + ) + + assert prog_0_vector.find_ops(op_type="constexpr_affine_dequantize")[0].is_all_zeros() + + @mb.program(opset_version=ct.target.iOS16) + def prog_none0(): + return mb.constexpr_affine_dequantize( + quantized_data=np.array([[1, 2, 3], [1, 2, 3]]).astype(np.uint8), + zero_point=np.uint8([1, 2]), + scale=np.float32(2), + axis=0, + ) + + assert not prog_none0.find_ops(op_type="constexpr_affine_dequantize")[0].is_all_zeros() + @ssa_fn def test_builder_eval(self): # scalar zero-point & scalar scale @@ -350,6 +385,45 @@ def build(x): prog = mlmodel._mil_program assert "constexpr_lut_to_dense" in get_op_types_in_program(prog) + @pytest.mark.parametrize("backend", backends) + def 
test_shape_of_constexpr_is_replaceable(self, backend): + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + lut_data = np.array( + [ + -19.0, + 4.0, + 0.0, + -1.0, + 1.0, + 3.0, + 5.0, + -8.0, + 19, + 13, + 42, + 4.5, + 5.4, + 2.0, + -6, + -7, + ] + ).astype(np.float32) + indices = np.array([212, 21]).astype(np.uint8) + shape = np.array([4, 1]).astype(np.uint32) + y = mb.constexpr_lut_to_dense(lut=lut_data, indices=indices, shape=shape) + shape = mb.shape(x=y) + assert len(shape.nonreplaceable_vars_upstream) == 0 + gather = mb.gather( + x=shape, + indices=[ + 0, + ], + axis=0, + ) + assert len(gather.nonreplaceable_vars_upstream) == 0 + return gather + @ssa_fn def test_builder_eval(self): v = mb.constexpr_lut_to_dense( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py index 7d6fa1fbb..640e35df4 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_scatter_gather.py @@ -24,29 +24,40 @@ class TestGather: @pytest.mark.parametrize( - "compute_unit, backend, x_dtype, indices_dtype", + "compute_unit, backend, x_dtype, indices_dtype, indices_dynamic", itertools.product( compute_units, backends, [np.float32, np.float16, np.int32], [np.int32, np.int16, np.uint16], + [True, False], ), ) - def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): + def test_builder_to_backend_smoke( + self, compute_unit, backend, x_dtype, indices_dtype, indices_dynamic + ): x = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=x_dtype) indices = np.array([[[1, 0], [0, 1]], [[1, 0], [0, 0]]], dtype=indices_dtype) builtin_x_dtype = types.numpy_type_to_builtin_type(x_dtype) - input_placeholders = { - "x": mb.placeholder(shape=x.shape, dtype=builtin_x_dtype), - "indices": mb.placeholder( + input_placeholders = {"x": mb.placeholder(shape=x.shape, dtype=builtin_x_dtype)} + input_values = {"x": x} + if indices_dynamic: + input_placeholders["indices"] = mb.placeholder( shape=indices.shape, dtype=types.numpy_type_to_builtin_type(indices_dtype) - ), - } + ) + input_values["indices"] = indices - input_values = {"x": x, "indices": indices} + def build_dynamic(x, indices): + return [ + mb.gather(x=x, indices=indices, axis=1, batch_dims=0), + mb.gather(x=x, indices=indices, axis=1, batch_dims=1), + mb.gather(x=x, indices=indices, axis=2, batch_dims=0), + mb.gather(x=x, indices=indices, axis=2, batch_dims=1), + mb.gather(x=x, indices=indices, axis=2, batch_dims=2), + ] - def build(x, indices): + def build_static(x): return [ mb.gather(x=x, indices=indices, axis=1, batch_dims=0), mb.gather(x=x, indices=indices, axis=1, batch_dims=1), @@ -55,6 +66,8 @@ def build(x, indices): mb.gather(x=x, indices=indices, axis=2, batch_dims=2), ] + build = build_dynamic if indices_dynamic else build_static + expected_output_types = [ (2, 2, 2, 2, 3, builtin_x_dtype), (2, 2, 2, 3, builtin_x_dtype), diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py index 65f68b116..b6f67b427 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_conv.py @@ -8,6 +8,7 @@ import numpy as np import pytest +import coremltools as ct from coremltools._deps import _HAS_TORCH, MSG_TORCH_NOT_FOUND from 
coremltools.converters.mil.mil.ops.tests.iOS14.test_conv import TestConv as _TestConvIos14 from coremltools.converters.mil.mil.ops.tests.iOS14.test_conv import ( @@ -87,6 +88,25 @@ def test_builder_to_backend_stress( config, x_weight_dtype, ): + if ( + backend.backend == "mlprogram" + and backend.precision == "fp16" + and backend.opset_version == ct.target.iOS17 + and conv_dim == "conv2d" + and config + == { + "padding": (1, 1, 1), + "DHWKdKhKw": (5, 5, 5, 2, 2, 2), + "stride": (2, 2, 2), + "dilation": (2, 1, 1), + "has_bias": True, + "groups": 1, + "symbolic": True, + } + and x_weight_dtype == (np.float32, np.float16) + ): + pytest.xfail("rdar://124260627 ([CI] Two tests are random failing on CI)") + super().test_builder_to_backend_stress( compute_unit, backend, conv_dim, config, x_weight_dtype ) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py index a41ac776f..57e6ec90b 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_elementwise_unary.py @@ -11,7 +11,7 @@ import coremltools as ct from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import types +from coremltools.converters.mil.mil import get_new_symbol, types from coremltools.converters.mil.mil.ops.tests.iOS14.test_elementwise_unary import ( TestElementwiseUnary as _TestElementwiseUnary_iOS14, ) @@ -63,6 +63,28 @@ def prog(): cast_op = main_func.find_ops(op_type="cast")[0] np.testing.assert_allclose(expected_res, cast_op.outputs[0].val, atol=1e-04, rtol=1e-05) + @pytest.mark.parametrize( + "backend, dtype", + itertools.product( + backends, + ["int8", "uint8", "int16", "uint16"], + ), + ) + def test_cast_with_symbolic_value_iOS17(self, backend, dtype): + s1 = get_new_symbol() + + @mb.program( + input_specs=[mb.TensorSpec(shape=(s1, 1))], + opset_version=backend.opset_version, + ) + def prog(x): + shape = mb.shape(x=x) + out = mb.cast(x=shape, dtype=dtype) + assert out.val is None + sym_val = out.sym_val + assert sym_val.tolist() == [s1, 1] + return out + @pytest.mark.parametrize( "compute_unit, backend, src_dtype, dst_dtype", itertools.product( diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py index ce014a3a1..d2e687521 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_linear.py @@ -13,8 +13,11 @@ from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.ops.tests.iOS17 import backends from coremltools.converters.mil.mil.ops.tests.testing_utils import run_compare_builder -from coremltools.converters.mil.mil.types import builtin_to_string, nptype_from_builtin -from coremltools.converters.mil.mil.types.type_mapping import numpy_type_to_builtin_type +from coremltools.converters.mil.mil.types import ( + builtin_to_string, + nptype_from_builtin, + numpy_type_to_builtin_type, +) from coremltools.converters.mil.testing_reqs import compute_units diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py index 77b890b7e..cae1135c5 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_quantization.py @@ -313,7 +313,11 @@ def 
test_builder_eval_scalar_params(self): zero_point=np.uint8(1), scale=np.float32(2), ) - np.testing.assert_allclose(np.float32([[0, 2, 4], [0, 2, 4]]), v.val) + assert v.val is None + np.testing.assert_allclose( + np.float32([[0, 2, 4], [0, 2, 4]]), + v.op.materialized_val_inference(), + ) @ssa_fn def test_builder_eval_vector_params(self): @@ -323,8 +327,10 @@ def test_builder_eval_vector_params(self): scale=np.array([1, 2]).astype(np.float32), axis=3, ) + assert v.val is None np.testing.assert_allclose( - np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32), v.val + np.array([1, 2, 3, 4]).reshape(1, 1, 2, 2).astype(np.float32), + v.op.materialized_val_inference(), ) @ssa_fn @@ -333,7 +339,11 @@ def test_builder_eval_no_zero_point(self): input=np.array([[0, 1, 2], [0, 1, 2]]).astype(np.int8), scale=np.float32(2), ) - np.testing.assert_allclose(np.float32([[0, 2, 4], [0, 2, 4]]), v.val) + assert v.val is None + np.testing.assert_allclose( + np.float32([[0, 2, 4], [0, 2, 4]]), + v.op.materialized_val_inference(), + ) @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_smoke_builder_to_backend_dequantize_per_tensor(self, compute_unit, backend): diff --git a/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py b/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py index 6d5d0cdeb..b4b4edca5 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS17/test_scatter_gather.py @@ -249,16 +249,21 @@ def build_dynamic(data, indices, updates): class TestGather(_TestGatherIOS16): @pytest.mark.parametrize( - "compute_unit, backend, x_dtype, indices_dtype", + "compute_unit, backend, x_dtype, indices_dtype, indices_dynamic", itertools.product( compute_units, backends, [np.float32, np.float16, np.int32, np.int16, np.uint16, np.int8, np.uint8], [np.int32, np.int16, np.uint16, np.int8, np.uint8], + [True, False], ), ) - def test_builder_to_backend_smoke(self, compute_unit, backend, x_dtype, indices_dtype): - super().test_builder_to_backend_smoke(compute_unit, backend, x_dtype, indices_dtype) + def test_builder_to_backend_smoke( + self, compute_unit, backend, x_dtype, indices_dtype, indices_dynamic + ): + super().test_builder_to_backend_smoke( + compute_unit, backend, x_dtype, indices_dtype, indices_dynamic + ) @pytest.mark.parametrize( "backend, indices_val, validate_indices", @@ -291,6 +296,25 @@ def prog(x): opset_version=backend.opset_version, )(prog) + @pytest.mark.parametrize( + "backend, indices_val", + itertools.product(backends, [0, 1]), + ) + def test_builder_scalar_indices(self, backend, indices_val): + @mb.program(input_specs=[], opset_version=backend.opset_version) + def prog(): + params = np.array([1, 2, 3, 4], dtype=np.int32) + indices = np.array(indices_val, dtype=np.int32) + res = mb.gather( + x=params, indices=indices_val, axis=0, batch_dims=0, validate_indices=False + ) + return res + + main_func = prog.functions["main"] + gather_op = main_func.find_ops(op_type="gather")[0] + assert gather_op.outputs[0].val == 1 if indices_val == 0 else 2 + assert gather_op.outputs[0].dtype == types.int32 + class TestGatherAlongAxis: @pytest.mark.parametrize( diff --git a/coremltools/converters/mil/mil/ops/tests/testing_utils.py b/coremltools/converters/mil/mil/ops/tests/testing_utils.py index 4fa5b93ad..2bce2e551 100644 --- a/coremltools/converters/mil/mil/ops/tests/testing_utils.py +++ 
b/coremltools/converters/mil/mil/ops/tests/testing_utils.py @@ -10,8 +10,9 @@ import coremltools as ct from coremltools import _logger as logger +from coremltools.converters.mil import mil from coremltools.converters.mil.input_types import TensorType -from coremltools.converters.mil.mil import Function, Placeholder, Program +from coremltools.converters.mil.mil import Function, Placeholder from coremltools.converters.mil.mil.passes.pass_pipeline import PassPipeline from coremltools.converters.mil.mil.types.symbolic import is_symbolic from coremltools.converters.mil.testing_reqs import BackendConfig @@ -119,7 +120,7 @@ def run_compare_builder( if expected_outputs is not None and not isinstance(expected_outputs, list): expected_outputs = [expected_outputs] - prog = Program() + prog = mil.Program() with Function(input_placeholders, opset_version=minimum_deployment_target) as ssa_func: output_vars = build(**ssa_func.inputs) if isinstance(output_vars, tuple): diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py b/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py index 32aeb2f5f..5442e9710 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/__init__.py @@ -7,6 +7,7 @@ from .const_elimination import const_elimination from .dead_code_elimination import dead_code_elimination from .dedup_op_and_var_names import dedup_op_and_var_names +from .expand_dynamic_linear import expand_dynamic_linear from .fuse_reduce_mean import fuse_reduce_mean from .loop_invariant_elimination import loop_invariant_elimination from .noop_elimination import noop_elimination diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py b/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py index 5875fed55..020ab294c 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py @@ -65,14 +65,13 @@ def remove_duplicate_ops( for duplicate in unique2duplicates[unique]: if duplicate in block.outputs: continue - op = duplicate.op block.replace_uses_of_var_after_op( - anchor_op=op, + anchor_op=duplicate.op, old_var=duplicate, new_var=unique, force_replace=force_replace, ) - block.remove_ops([op]) + block.remove_ops([duplicate.op]) @block_context_manager def _constant_deduplication_block(self, block: Block) -> None: diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/dead_code_elimination.py b/coremltools/converters/mil/mil/passes/defs/cleanup/dead_code_elimination.py index bbe6578eb..b7aa1f6ff 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/dead_code_elimination.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/dead_code_elimination.py @@ -7,6 +7,7 @@ from coremltools import _logger as logger from coremltools.converters.mil.mil import Program from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass +from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass @@ -48,6 +49,7 @@ def apply(self, prog: Program): self._dead_code_elimination_block(f) @staticmethod + @block_context_manager def _dead_code_elimination_block(block): used_vars = set() ops_to_remove = list() diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/dedup_op_and_var_names.py 
b/coremltools/converters/mil/mil/passes/defs/cleanup/dedup_op_and_var_names.py index f20675521..639a756af 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/dedup_op_and_var_names.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/dedup_op_and_var_names.py @@ -38,7 +38,7 @@ def prog(x): def apply(self, prog): for func in prog.functions.values(): # Handle function input/outputs as they cannot be changed (to maintain user interface) - inputs = list(func.function_inputs) + inputs = list(func.inputs.values()) io_vars = set(inputs + func.outputs) self._ensure_unique_var_names(io_vars) seen_var_names = set([v.name for v in io_vars]) diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/expand_dynamic_linear.py b/coremltools/converters/mil/mil/passes/defs/cleanup/expand_dynamic_linear.py new file mode 100644 index 000000000..55ca9df71 --- /dev/null +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/expand_dynamic_linear.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np + +from coremltools.converters.mil.mil import Block +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Operation, Program, Var +from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass +from coremltools.converters.mil.mil.passes.helper import block_context_manager +from coremltools.converters.mil.mil.passes.pass_registry import register_pass + + +@register_pass(namespace="common") +class expand_dynamic_linear(AbstractGraphPass): + """ + ``Linear`` requires const or constexpr ``weight`` and ``bias``. In op translation, + we ambitiously prefer ``linear`` whenever possible, i.e. translate to ``linear`` + when operand is descendant of const, since such operand may be folded / fused into + const or constexpr later on by graph passes. 
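As a rough before/after sketch of the expansion this pass performs (MIL pseudo-IR in comments; variable names are illustrative):

    # before:  %y = linear(x=%x, weight=%w, bias=%b)     # %w produced by a non-const op
    # after:   %t = matmul(x=%x, y=%w, transpose_y=True)
    #          %y = add(x=%t, y=%b)                      # the add is skipped when the bias is known to be all zeros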
+ + If such const folding / constexpr fusion did not happen, this pass would clean up + those too ambitious ``linear``s by replacing them with ``matmul``s + """ + + def apply(self, prog: Program) -> None: + for f in prog.functions.values(): + self._expand_dynamic_linear_block(f) + + @block_context_manager + def _expand_dynamic_linear_block(self, block: Block) -> None: + # use shallow copy to hide changes on block.operations during the loop, + # since we do not need to deal with the newly expanded matmul + add ops + for op in list(block.operations): + for b in op.blocks: + self._expand_dynamic_linear_block(b) + + if op.op_type == "linear": + self._try_expand_dynamic_linear(op, block) + + @staticmethod + def _is_operand_static(var: Var) -> bool: + if var is None: + return True + + op = var.op + if op is None: + return False + + op_type = op.op_type + return op_type == "const" or op_type.startswith("constexpr_") + + def _try_expand_dynamic_linear(self, op: Operation, block: Block) -> None: + assert op.op_type == "linear", "Should only apply to linear op" + + is_weight_static = self._is_operand_static(op.weight) + is_bias_static = self._is_operand_static(op.bias) + + if is_weight_static: + if is_bias_static: + # static weight and bias, linear is good + return + else: + # static weight with dynamic bias, so linear for weight matmul + add for bias add + matmul = mb.linear(x=op.x, weight=op.weight, before_op=op) + add = mb.add(x=matmul, y=op.bias, before_op=op, name=op.name) + block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=add, + ) + op.remove_from_block() + else: + # dynamic weight, have to expand to at least matmul + result = mb.matmul(x=op.x, y=op.weight, transpose_y=True, before_op=op) + # static bias, try skipping add if all zero + if is_bias_static: + force_replace = False + # if no bias provided, default to 0, can skip + # if bias provided, need to inspect its value + if op.bias is not None: + bias_op = op.bias.op + bias_op_type = bias_op.op_type + if bias_op_type == "const": + is_nonzero_bias = np.any(op.bias.val != 0) + else: + if bias_op_type == "constexpr_affine_dequantize": + is_nonzero_bias = not bias_op.is_all_zeros() + # cowardly treat other types of compressed bias as if nonzero + else: + is_nonzero_bias = True + # For such a compressed all-zero bias, if we skip add, then + # the result (matmul output) would only descend from weight but not bias, + # i.e. 
need to force replacing descendant of bias + if not is_nonzero_bias: + force_replace = True + if is_nonzero_bias: + result = mb.add(x=result, y=op.bias, before_op=op, name=op.name) + block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=result, + force_replace=force_replace, + ) + op.remove_from_block() + # dynamic bias, have to further expand to matmul + add + else: + result = mb.add(x=result, y=op.bias, before_op=op, name=op.name) + block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=result, + ) + op.remove_from_block() diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/fuse_reduce_mean.py b/coremltools/converters/mil/mil/passes/defs/cleanup/fuse_reduce_mean.py index 58f913538..33a1b89a7 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/fuse_reduce_mean.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/fuse_reduce_mean.py @@ -108,8 +108,11 @@ def _try_to_transform(reduce_sum_op, block): @block_context_manager def _fuse_reduce_mean_block(self, block): - fusion_status = False - for i, op in enumerate(list(block.operations)): + fusion_occurred = False + for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -119,8 +122,6 @@ def _fuse_reduce_mean_block(self, block): # start pattern match if mul op is encountered if op.op_type == "reduce_sum": - fusion_status = self._try_to_transform(op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, block): + fusion_occurred = True + return fusion_occurred diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/loop_invariant_elimination.py b/coremltools/converters/mil/mil/passes/defs/cleanup/loop_invariant_elimination.py index 774c6b208..f25945fdb 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/loop_invariant_elimination.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/loop_invariant_elimination.py @@ -68,11 +68,8 @@ def _detect_loop_invariants(while_op): # this block output is a var from outside of the block enclosing_block = while_op.enclosing_block - while_op_id = enclosing_block.find_op_id_in_block(while_op) - output_from_outside_of_block = ( - True - if enclosing_block.is_var_visible_in_block(vx_out, upto_op_with_id=while_op_id) - else False + output_from_outside_of_block = enclosing_block.is_var_visible_in_block( + vx_out, upto_op=while_op ) if return_input_as_output or output_from_outside_of_block: loop_invariant_ids.append(i) diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py b/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py index 8aee02bbf..21f9fb00f 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/noop_elimination.py @@ -85,10 +85,8 @@ def has_all_elements_equal_to(var, value): if has_all_elements_equal_to(op.x, x): input_var = op.y - input_op = input_var.op elif has_all_elements_equal_to(op.y, y): input_var = op.x - input_op = input_var.op else: return False @@ -100,7 +98,7 @@ def has_all_elements_equal_to(var, value): return False if op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=input_op, + anchor_op=op, old_var=op.outputs[0], new_var=input_var, ): @@ -132,13 +130,10 @@ def remove_slice_by_index(op): if any([x < 0 for x in stride]): return 
False - input_var = op.x - input_op = input_var.op - if op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=input_op, + anchor_op=op, old_var=op.outputs[0], - new_var=input_var, + new_var=op.x, ): op.enclosing_block.remove_ops([op]) return True @@ -151,13 +146,10 @@ def remove_same_shape(op): if input_shape != output_shape: return False - input_var = op.x - input_op = input_var.op - if op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=input_op, + anchor_op=op, old_var=op.outputs[0], - new_var=input_var, + new_var=op.x, ): op.enclosing_block.remove_ops([op]) return True @@ -167,13 +159,10 @@ def remove_linear(op): if op.alpha.val != 1 or op.beta.val != 0: return False - input_var = op.x - input_op = input_var.op - if op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=input_op, + anchor_op=op, old_var=op.outputs[0], - new_var=input_var, + new_var=op.x, ): op.enclosing_block.remove_ops([op]) return True @@ -185,13 +174,10 @@ def remove_transpose(op): if (perm != sorted_perm).any(): return False - input_var = op.x - input_op = input_var.op - if op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=input_op, + anchor_op=op, old_var=op.outputs[0], - new_var=input_var, + new_var=op.x, ): op.enclosing_block.remove_ops([op]) return True @@ -218,7 +204,6 @@ def remove_transpose(op): "crop": remove_same_shape, "linear_activation": remove_linear, } - # abort if op output is a block output if op.outputs[0] in op.enclosing_block.outputs: return None @@ -234,7 +219,11 @@ def remove_transpose(op): @block_context_manager def _noop_elimination_block_wrapper(self, block): def _noop_elimination_block(block): + status = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -243,12 +232,9 @@ def _noop_elimination_block(block): continue remove_fn = noop_elimination._match_pattern(op) - if remove_fn is not None: - status = remove_fn(op) - # has to break as the downstream iterator is affected. 
- if status: - return status - return False + if remove_fn is not None and remove_fn(op): + status = True + return status block_changed = True while block_changed: diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py b/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py index a266dd1c7..c35e5404f 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/remove_redundant_ops.py @@ -4,10 +4,11 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import collections +from typing import Dict, List import numpy as np -from coremltools.converters.mil.mil import Var +from coremltools.converters.mil.mil import Block, Operation, Var from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass @@ -54,7 +55,14 @@ class remove_redundant_ops(AbstractGraphPass): _NON_REDUNDANT_OPS = tuple() + def __init__(self): + self._num_of_visited_ops: int = ( + 0 # Testing purpose, making sure the algorithm performs in O(N) + ) + self._ops_order: Dict[Block, Dict[Operation, int]] = {} + def apply(self, prog): + self._num_of_visited_ops = 0 for f in prog.functions.values(): self._remove_redundant_ops_in_block_wrapper(f) @@ -69,21 +77,20 @@ def _is_op_eligible_to_be_removed(op): else: return True - @staticmethod - def _get_candidate_ops_list(prospective_ops_list): + def _get_candidate_ops_list(self, prospective_ops_list: List[Operation]) -> List[Operation]: od = collections.OrderedDict() - enclosing_block = [op.enclosing_block for op in prospective_ops_list] - if len(set(enclosing_block)) > 1: # all candidate ops must belong to the same block + enclosing_blocks = [op.enclosing_block for op in prospective_ops_list] + if len(set(enclosing_blocks)) > 1: # all candidate ops must belong to the same block return [] for op in prospective_ops_list: if remove_redundant_ops._is_op_eligible_to_be_removed(op): - od[op] = enclosing_block[0].operations.index(op) + od[op] = self._ops_order[enclosing_blocks[0]][op] + # Sort the ops according to their index of appearing in block.operations, which is # topologically sorted return [x[0] for x in sorted(od.items(), key=lambda t: t[1])] - @staticmethod - def _get_candidate_ops_lists_from_var(var): + def _get_candidate_ops_lists_from_var(self, var: Var) -> List[List[Operation]]: """ Return a list of lists. Each element is a list of a subset of the child ops of var, which satisfies the following conditions: @@ -103,7 +110,7 @@ def _get_candidate_ops_lists_from_var(var): for v in op_types_to_ops.values(): if len(v) > 1: - candidate_ops_list = remove_redundant_ops._get_candidate_ops_list(v) + candidate_ops_list = self._get_candidate_ops_list(v) if len(candidate_ops_list) > 1: candidate_ops_lists.append(candidate_ops_list) @@ -184,6 +191,9 @@ def _try_to_remove_ops(candidate_ops_list): first_op = candidate_ops_list[0] block = first_op.enclosing_block + if block is None: + return False + # currently, we only consider the cases when the op has 1 output. # The replace var logic below only handles the single output case. 
if len(first_op.outputs) > 1: @@ -191,6 +201,8 @@ ops_to_remove = [] for op in candidate_ops_list[1:]: + if op.enclosing_block is None: + continue if op.outputs[0] not in block.outputs: # to make sure we don't remove an output op if remove_redundant_ops._are_ops_identical(first_op, op): ops_to_remove.append(op) @@ -212,25 +224,44 @@ block.remove_ops(ops_removed) return True - @staticmethod - def _try_to_transform(parent_var): + def _try_to_transform(self, parent_var: Var) -> bool: """ scan the children ops to parent_var, to find and remove identical ops, if any. Returns True, if successful in finding such redundant ops. """ - candidate_ops_lists = remove_redundant_ops._get_candidate_ops_lists_from_var(parent_var) + candidate_ops_lists = self._get_candidate_ops_lists_from_var(parent_var) block_changed = False for ops_list in candidate_ops_lists: # Iterate through the child ops list, to make sure that we check all possible combinations. for idx in range(len(ops_list)): if remove_redundant_ops._try_to_remove_ops(ops_list[idx:]): + # We should not break right away, so that we can keep + # the time complexity low. block_changed = True - break + return block_changed @block_context_manager def _remove_redundant_ops_in_block_wrapper(self, block): + def _cache_topological_order_of_ops_in_block(block: Block): + if block in self._ops_order: + return + + self._ops_order[block] = {} + for i, op in enumerate(block.operations): + for b in op.blocks: + _cache_topological_order_of_ops_in_block(b) + self._ops_order[block][op] = i + def _remove_redundant_ops_in_block(block): + # cache the topological order of the ops, + # so that we do not need to query the index every single time. + # Note that the transformation in this particular graph pass + # is going to preserve the topological order. That is the + # reason why we can build the cache at the very beginning. + _cache_topological_order_of_ops_in_block(block) + + # iterate over the block inputs if isinstance(block.inputs, dict): block_input_var_list = list(block.inputs.values()) elif isinstance(block.inputs, (list, tuple)): @@ -238,17 +269,19 @@ def _remove_redundant_ops_in_block(block): else: raise ValueError("Unrecognized type of block.inputs, its neither a list nor dict.") - # iterate over the block inputs for input_var in block_input_var_list: if len(input_var.child_ops) > 1: self._try_to_transform(input_var) # iterate over the ops in the block graph_updated = False - for op in block.operations: + for op in list(block.operations): + if op.op_type == "const": continue + self._num_of_visited_ops += 1 + for b in op.blocks: block_changed = True while block_changed: @@ -257,12 +290,13 @@ def _remove_redundant_ops_in_block(block): if len(op.outputs) > 0 and len(op.outputs[0].child_ops) > 1: # currently, we only check the first output of the op # this can be extended, if required, to check for other outputs. - graph_updated = self._try_to_transform(op.outputs[0]) - # has to break as the downstream iterator is affected. - if graph_updated: - return graph_updated + if self._try_to_transform(op.outputs[0]): + # we don't need to break right away, in order to + # keep the time complexity low. 
+ graph_updated = True return graph_updated block_changed = True while block_changed: + self._ops_order = {} block_changed = _remove_redundant_ops_in_block(block) diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/topological_reorder.py b/coremltools/converters/mil/mil/passes/defs/cleanup/topological_reorder.py index afbc88ee7..6e12a2c27 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/topological_reorder.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/topological_reorder.py @@ -8,6 +8,7 @@ from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass +from coremltools.converters.mil.mil.utils import CacheDoublyLinkedList @register_pass(namespace="common") @@ -67,7 +68,7 @@ class topological_reorder(AbstractGraphPass): """ def apply(self, prog): - for f_name, f in prog.functions.items(): + for f in prog.functions.values(): self._move_operations_to_the_end_block(f, ["cast", "transpose"]) @staticmethod @@ -84,9 +85,10 @@ def _move_operations_to_the_end_block(block, op_type_to_move): # - set[Var]: Set of vars consumed in block (or returned as block output) # first_use maps var to (index, op) representing the first op in block.operation that consumes this var. + block.operations = list(block.operations) first_use = {} # var -> op ops_to_remove = [] # list of ops to be deleted at the end of pass - for index, op in enumerate(reversed(block.operations[:])): + for op in reversed(block.operations): current_op = op if op.op_type in op_type_to_move: @@ -118,7 +120,7 @@ def _move_operations_to_the_end_block(block, op_type_to_move): for old_output_var, new_output_var in zip(op.outputs, new_var): block.replace_uses_of_var_after_op( - anchor_op=None, old_var=old_output_var, new_var=new_output_var + anchor_op=op, old_var=old_output_var, new_var=new_output_var ) # Collect input vars from sub-block if present @@ -161,6 +163,7 @@ def _move_operations_to_the_end_block(block, op_type_to_move): first_use[v] = current_op # Remove ops that are reordered + block.operations = CacheDoublyLinkedList(block.operations) block.remove_ops(ops_to_remove) # Returns set of vars consumed in current block diff --git a/coremltools/converters/mil/mil/passes/defs/lower_complex_dialect_ops.py b/coremltools/converters/mil/mil/passes/defs/lower_complex_dialect_ops.py index ed36d87f3..c78665870 100644 --- a/coremltools/converters/mil/mil/passes/defs/lower_complex_dialect_ops.py +++ b/coremltools/converters/mil/mil/passes/defs/lower_complex_dialect_ops.py @@ -162,8 +162,10 @@ def _calculate_dft_matrix( weight matrix consisting of only the first (n_fft // 2 + 1) values. """ n_fft = mb.cast(x=n_fft, dtype="fp32", before_op=before_op) - half = mb.floor_div(x=n_fft, y=2., before_op=before_op) - half = mb.add(x=half, y=1., before_op=before_op) + + if onesided: + half = mb.floor_div(x=n_fft, y=2.0, before_op=before_op) + half = mb.add(x=half, y=1.0, before_op=before_op) tmp_x = mb.range_1d(start=0.0, end=(half if onesided else n_fft), step=1.0, before_op=before_op) tmp_y = mb.range_1d(start=0.0, end=n_fft, step=1.0, before_op=before_op) @@ -171,15 +173,15 @@ def _calculate_dft_matrix( # Use MIL ops to calculate base = torch.outer(tmp, tmp) * (2 * torch.pi / N). 
tmp_x = mb.reshape(x=tmp_x, shape=[-1, 1], before_op=before_op) tmp_y = mb.reshape(x=tmp_y, shape=[1, -1], before_op=before_op) - + base = mb.matmul(x=tmp_x, y=tmp_y, before_op=before_op) base = mb.mul(x=base, y=2 * np.pi, before_op=before_op) base = mb.real_div(x=base, y=n_fft, before_op=before_op) - + # Get real part and imaginary part separately. cos_base = mb.cos(x=base, before_op=before_op) sin_base = mb.sin(x=base, before_op=before_op) - + return cos_base, sin_base def _fft_1d( @@ -233,7 +235,7 @@ def _fft_1d( N = transposed_input_real.shape[0] reshaped_input_real = mb.reshape(x=transposed_input_real, shape=[N, -1], before_op=before_op) reshaped_input_imag = mb.reshape(x=transposed_input_imag, shape=[N, -1], before_op=before_op) - + N = mb.cast(x=N, dtype="fp32", before_op=before_op) cos_base, sin_base = _calculate_dft_matrix(N, onesided=False, before_op=before_op) @@ -342,7 +344,7 @@ def _stft( n_fft, onesided=is_onesided, before_op=before_op) - + # create a window of centered 1s of the requested size if win_length: n_left = (n_fft.val - win_length.val) // 2 @@ -352,7 +354,7 @@ def _stft( if not window: window = mb.fill(shape=(win_length.val,), value=1., before_op=before_op) right = mb.fill(shape=(n_right,), value=0., before_op=before_op) - + # concatenate window = mb.concat(values=(left, window, right), axis=0, before_op=before_op) @@ -602,10 +604,10 @@ def _lower_complex_stft(op: Operation): raise ValueError("Onesided is only valid for real inputs") real, imag = _stft( - op.input.real if is_complex else op.input, - op.input.imag if is_complex else None, + op.input.real if is_complex else op.input, + op.input.imag if is_complex else None, op.n_fft, op.hop_length, op.win_length, op.window, op.normalized, op.onesided, before_op=op) - + return _wrap_complex_output(op.outputs[0], real, imag) @@ -637,16 +639,8 @@ def _match_and_replace_dialect_op(block, op): @block_context_manager def _lower_complex_dialect_ops_in_block(block): - def help_lower_complex_dialect_ops(block): - for op in list(block.operations): - if _match_and_replace_dialect_op(block, op): - return True - return False - - block_changed = True - while block_changed: - block_changed = help_lower_complex_dialect_ops(block) - + for op in list(block.operations): + _match_and_replace_dialect_op(block, op) @register_pass(namespace="common") class lower_complex_dialect_ops(AbstractGraphPass): diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_activation.py b/coremltools/converters/mil/mil/passes/defs/optimize_activation.py index 84f8c44ec..fb0e5991f 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_activation.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_activation.py @@ -154,19 +154,21 @@ def _try_to_transform(op, block): def _fuse_gelu_exact_block(self, block): fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: block_changed = self._fuse_gelu_exact_block(b) + if len(op.blocks) > 0: # This op can't be real_div or mul continue if op.op_type in ["mul", "real_div"]: - fusion_occurred = self._try_to_transform(op, block) - # has to break as the downstream iterator is affected. 
- if fusion_occurred: - return fusion_occurred + if self._try_to_transform(op, block): + fusion_occurred = True return fusion_occurred @@ -179,21 +181,21 @@ class fuse_gelu_tanh_approximation(AbstractGraphPass): The implementation of this pass uses the generic graph pattern matching and transform algorithm implemented in ``coremltools.converters.mil.experimental.passes.generic_pass_infrastructure`` and documented in ``coremltools/converters/mil/experimental/passes/readme.md``. - + `Graph for` ``get_gelu_pattern1()`` - + ``y = x * (0.5 * (tanh(((.0447)x^3 + x ) * sqrt(2/pi)) + 1))`` .. code-block:: - [...] -----> pow (3) ----> mul (.044715) ---> add -----> mul (sqrt(2/pi)) ---> tanh ----> add (1) ----> mul (0.5) -----> mul ---> [...] - | ^ ^ - | | | - |------------------------------------------------------------------------------------------------------------------------ + [...] -----> pow (3) ----> mul (.044715) ---> add -----> mul (sqrt(2/pi)) ---> tanh ----> add (1) ----> mul (0.5) -----> mul ---> [...] + | ^ ^ + | | | + |------------------------------------------------------------------------------------------------------------------------ `Graph for` ``get_gelu_pattern2()`` - + ``y = (0.5 * x) * (tanh(((.0447)x^3 + x ) * sqrt(2/pi)) + 1)`` .. code-block:: @@ -311,7 +313,7 @@ def get_gelu_pattern2(): | ^ ^ | | | |--------------------------------------------------------- - + """ @mb.program( @@ -339,7 +341,7 @@ class fuse_leaky_relu(AbstractGraphPass): Detect the ``mul`` ---> ``max`` pattern than can be mapped to ``leaky_relu``. `In code form - Input` - + .. code-block:: %2 = const(value = alpha) # where 0 <= alpha <= 1 @@ -348,14 +350,14 @@ class fuse_leaky_relu(AbstractGraphPass): `In code form - Output` - + .. code-block:: %4 = leaky_relu(x=%1, alpha=%2) `In graphical form - Input graph` - + .. code-block:: const (val = alpha) @@ -366,7 +368,7 @@ class fuse_leaky_relu(AbstractGraphPass): `In graphical form - Output graph` - + .. code-block:: input --------> leaky_relu ---------> output @@ -423,22 +425,24 @@ def _try_to_transform(mul_op, block): @block_context_manager def _fuse_leaky_relu_block(self, block): - fusion_status = False - for i, op in enumerate(list(block.operations)): + fusion_occurred = False + for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: block_changed = self._fuse_leaky_relu_block(b) + if len(op.blocks) > 0: continue # start pattern match if mul op is encountered if op.op_type == "mul": - fusion_status = self._try_to_transform(op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, block): + fusion_occurred = True + return fusion_occurred class FusePreluPattern1: @@ -557,7 +561,7 @@ def transform_pattern(pattern): def get_prelu_pattern(): """ ``x1 = transpose(perm=(0,2,3,1))(x)`` - + ``y = a * relu(-1 * x1) + relu(x1)`` When ``x`` is rank 4, and ``a`` is of shape (``C,)``, ``(1, C)``, ``(1,1,C)``, or ``(1,1,1,C)``, @@ -585,7 +589,7 @@ class fuse_prelu(AbstractGraphPass): """ Detect the following patterns that can be mapped to a ``prelu`` op. 
Essentially, the ``prelu`` op can be broken down into the following ops: - + ``y = a * relu(-1 * x) + relu(x)`` `Pattern 1` diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_conv.py b/coremltools/converters/mil/mil/passes/defs/optimize_conv.py index 0dbbd0236..eecedf485 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_conv.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_conv.py @@ -125,7 +125,11 @@ def apply(self, prog): @block_context_manager def _compose_conv1d_block(self, block: Block): def help_compose_conv1d_block(block: Block) -> bool: + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: self._compose_conv1d_block(b) @@ -141,10 +145,9 @@ def help_compose_conv1d_block(block: Block) -> bool: # try pattern `expand_dim` -> `transpose` -> `conv2d` -> `transpose` -> `squeeze` if self._try_match_and_transform_pattern_channel_last(op, block): - # has to break as the downstream iterator is affected - return True + fusion_occurred = True - return False + return fusion_occurred block_changed = True while block_changed: @@ -168,9 +171,11 @@ def _try_match_and_transform_pattern(self, expand_op: Operation, block: Block) - return False squeeze_op = conv_op.outputs[0].child_ops[0] - # abort composition if not squeezing the dummy height + # Abort composition if not squeezing the dummy height (the extended dim_size=1 dimension) if squeeze_op.axes.rank != 1 or squeeze_op.axes.val[0] not in (-2, 2): return False + elif squeeze_op.x.shape[squeeze_op.axes.val[0]] != 1: + return False # everything looks good return self._try_apply_transform(expand_op, conv_op, squeeze_op, block) @@ -498,6 +503,9 @@ def _match_pattern(op): fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -508,10 +516,8 @@ def _match_pattern(op): bn_op = _match_pattern(op) if bn_op is not None: - fusion_occurred = self._try_to_transform(op, bn_op) - # has to break as the downstream iterator is affected. - if fusion_occurred: - return fusion_occurred + if self._try_to_transform(op, bn_op): + fusion_occurred = True return fusion_occurred @@ -796,8 +802,11 @@ def _try_to_transform(conv_op, add_op): @block_context_manager def _fuse_conv_bias_block(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -809,17 +818,14 @@ def _fuse_conv_bias_block(self, block): # pattern 1 : conv + add/sub add_op = self._match_pattern(op) if add_op is not None: - fusion_status = self._try_to_transform(op, add_op) - # has to break as the downstream iterator is affected. 
- if fusion_status: - return fusion_status + if self._try_to_transform(op, add_op): + fusion_occurred = True # pattern 2 : conv + transpose + add/sub - fusion_status = self._try_to_transform_transpose_pattern(op, block) - if fusion_status: - return fusion_status + elif self._try_to_transform_transpose_pattern(op, block): + fusion_occurred = True - return fusion_status + return fusion_occurred @register_pass(namespace="common") @@ -1005,6 +1011,9 @@ def _match_pattern(op): fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -1016,10 +1025,9 @@ def _match_pattern(op): scale_op = _match_pattern(op) if scale_op is not None: - fusion_occurred = self._try_to_transform(op, scale_op) - # has to break as the downstream iterator is affected. - if fusion_occurred: - return fusion_occurred + if self._try_to_transform(op, scale_op): + fusion_occurred = True + return fusion_occurred @@ -1125,8 +1133,11 @@ def _compute_new_pad_values(transpose_op): @block_context_manager def _pad_conv_connect_block(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -1137,8 +1148,6 @@ def _pad_conv_connect_block(self, block): transpose_ops = self._match_pattern(op) if transpose_ops is not None: - fusion_status = self._try_to_transform(op, transpose_ops, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, transpose_ops, block): + fusion_occurred = True + return fusion_occurred diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py b/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py index f0be448e4..72633392c 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py @@ -170,6 +170,7 @@ def try_to_transform_const_scalar_cond(select_op: Operation, cond_val: np.ndarra result_shape = broadcast_shapes(a.shape, b.shape) # cannot simply replace with a or b if broadcasting if x.shape != result_shape: + x.op.enclosing_block.remove_ops([x.op]) return None return x @@ -332,8 +333,11 @@ def _check_shape(arr): @block_context_manager def _fuse_elementwise_to_batchnorm_block(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -344,11 +348,10 @@ def _fuse_elementwise_to_batchnorm_block(self, block): add_op = self._match_pattern(op) if add_op is not None: - fusion_status = self._try_to_transform(op, add_op, block) - # has to break as the downstream iterator is affected. 
- if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, add_op, block): + fusion_occurred = True + + return fusion_occurred @register_pass(namespace="common") @@ -475,6 +478,9 @@ def _try_to_transform(op, block): def _rank0_expand_dims_swap(self, block): fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -484,8 +490,6 @@ def _rank0_expand_dims_swap(self, block): continue if op.op_type in ["add", "sub", "mul", "real_div", "floor_div"]: - fusion_occurred = self._try_to_transform(op, block) - # has to break as the downstream iterator is affected. - if fusion_occurred: - return fusion_occurred + if self._try_to_transform(op, block): + fusion_occurred = True return fusion_occurred diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_linear.py b/coremltools/converters/mil/mil/passes/defs/optimize_linear.py index e59103a5d..d37084b03 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_linear.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_linear.py @@ -5,8 +5,9 @@ import numpy as np +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Program +from coremltools.converters.mil.mil import Operation, Program, Var from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass @@ -130,6 +131,9 @@ def _find_candicate_op(op): fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -140,10 +144,8 @@ def _find_candicate_op(op): add_or_sub_op = _find_candicate_op(op) if add_or_sub_op is not None: - fusion_occurred = self._try_to_transform(op, add_or_sub_op, block) - # has to break as the downstream iterator is affected. - if fusion_occurred: - return fusion_occurred + if self._try_to_transform(op, add_or_sub_op, block): + fusion_occurred = True return fusion_occurred @@ -290,8 +292,11 @@ def _try_to_transform(self, matmul_op, add_op, block): @block_context_manager def _fuse_matmul_weight_bias_block(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -303,8 +308,114 @@ def _fuse_matmul_weight_bias_block(self, block): add_op = self._find_candidate_op(op) if add_op is not None: - fusion_status = self._try_to_transform(op, add_op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, add_op, block): + fusion_occurred = True + return fusion_occurred + + +@register_pass(namespace="common") +class fuse_transpose_matmul(AbstractGraphPass): + """ + Fuse ``transpose + matmul`` to ``matmul`` if possible, + since ``matmul`` has args ``transpose_x`` and ``transpose_y`` to transpose last 2 dims + + .. 
code-block:: + + Positive example: + Input graph: + transpose(x=x, perm=(1, 0)) -| + |-> matmul(x=transposed_x, y=transposed_y) + transpose(x=y, perm=(1, 0)) -| + + Output graph: + matmul(x=x, y=y, transpose_x=True, transpose_y=True) + + Negative example: + Input graph: + transpose(x=x, perm=(1, 0, 2)) -| + |-> matmul(x=transposed_x, y=transposed_y) + transpose(x=y, perm=(1, 0, 2)) -| + + Output graph: + Same as input graph, nothing changes + """ + + def apply(self, prog: Program) -> None: + for f in prog.functions.values(): + self._fuse_transpose_matmul_block(f) + + @block_context_manager + def _fuse_transpose_matmul_block(self, block: Block) -> None: + # use shallow copy to hide changes on block.operations during the loop, + # since we only attempt fusion when reaching a matmul, which does not affect downstream ops + for op in list(block.operations): + for b in op.blocks: + self._fuse_transpose_matmul_block(b) + + if op.op_type == "matmul": + self._try_fuse_transpose_matmul(op, block) + + @staticmethod + def is_transposed_and_fusable_to_matmul(x: Var) -> bool: + """ + 1. check if x is transposed + 2. check if x is transposed in the last 2 dimensions, + since the transpose arg in matmul only transposes the last 2 dimensions + """ + + # x is not transposed, False + if x.op is None or x.op.op_type != "transpose": + return False + + rank = x.rank + # if transposing a rank < 2 tensor, it is a noop and will be eliminated by noop_elimination + if rank < 2: + return False + + # canonicalize the input permutation to compare with last-2-dim permutation below + perm = x.op.perm.val + perm[np.where(perm < 0)] += rank + perm[-2:] -= rank + + # permuting only last 2 dims should look like (0, 1, ..., -1, -2) + perm_only_last_2_dims = np.arange(rank) + perm_only_last_2_dims[-2] = -1 + perm_only_last_2_dims[-1] = -2 + + return np.all(perm == perm_only_last_2_dims) + + def _try_fuse_transpose_matmul(self, op: Operation, block: Block) -> None: + assert op.op_type == "matmul" + + x = op.x + y = op.y + transpose_x = False if op.transpose_x is None else op.transpose_x.val + transpose_y = False if op.transpose_y is None else op.transpose_y.val + + is_x_transposed_and_fusable_to_matmul = self.is_transposed_and_fusable_to_matmul(x) + is_y_transposed_and_fusable_to_matmul = self.is_transposed_and_fusable_to_matmul(y) + # if neither x nor y is transposed and fusable with matmul, there is nothing we need to do + if not is_x_transposed_and_fusable_to_matmul and not is_y_transposed_and_fusable_to_matmul: + return + + if is_x_transposed_and_fusable_to_matmul: + x = x.op.x + transpose_x = not transpose_x + if is_y_transposed_and_fusable_to_matmul: + y = y.op.x + transpose_y = not transpose_y + + fused_transpose_matmul = mb.matmul( + x=x, + y=y, + transpose_x=transpose_x, + transpose_y=transpose_y, + before_op=op, + name=op.name, + ) + block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=op.outputs[0], + new_var=fused_transpose_matmul, + ) + op.remove_from_block() diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_normalization.py b/coremltools/converters/mil/mil/passes/defs/optimize_normalization.py index 30e430bb6..15bfb3c19 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_normalization.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_normalization.py @@ -27,7 +27,7 @@ class fuse_layernorm_or_instancenorm(AbstractGraphPass): are ``instance_norm``. Pattern 5 is ``layer_norm``. You can find these patterns in the methods for this class in the source code. 
To quickly view the source code, click the **[source]** button at the end of the class definition. - + """ _DEBUG = False # set to true to plot the block before and after the transformation @@ -93,13 +93,13 @@ def _check_child_op_types( """ Returns ``True`` for child op types matching ``child_op_types``, otherwise returns ``False``. - Parameters - ---------- + Parameters + ---------- param op : Current op. - + param child_op_type : Expected child op type. - + param check_order : Ensure child in given order, defaults to ``True``. """ if op is None or len(op.outputs) != 1: @@ -120,13 +120,13 @@ def _try_get_child_op_type( """ Returns child op if type matches, otherwise returns ``None``. - Parameters - ---------- + Parameters + ---------- param op : Current op. - + param child_op_type : Expected child op type. - + param index : Child op index. """ if op is None: @@ -185,8 +185,18 @@ def _try_apply_transform( if rank == 4 and negative_axes == [-3]: is_layernorm = (gamma_var is None and beta_var is None) or (gamma_rank == 1 and beta_rank == 1) - gamma_var = gamma_var.val if gamma_var else None - beta_var = beta_var.val if beta_var else None + + if gamma_var: + ops_to_remove.append(gamma_var.op) + gamma_var = gamma_var.val + else: + gamma_var = None + + if beta_var: + ops_to_remove.append(beta_var.op) + beta_var = beta_var.val + else: + beta_var = None if rank == 4 and (negative_axes == [-2, -1] or negative_axes == [-3, -2]): if ( @@ -219,6 +229,7 @@ def _try_apply_transform( name=out_name + "_instancenorm" if is_require_rank4_transpose else out_name, before_op=end_op, ) + ops_to_remove.extend([gamma_var.op, beta_var.op]) else: # is_layernorm x = mb.layer_norm( x=x if is_require_rank4_transpose else reduce_op.x, @@ -251,7 +262,7 @@ def _try_match_and_transform_pattern_1(self, reduce_op, block) -> bool: ``y = gamma * (x - mean) / sqrt(variance + epsilon) + beta`` ``y = x * [gamma * rsqrt(variance + eps)] + (beta - mean * [gamma * rsqrt(variance + eps)])`` - + .. code-block:: x --> reduce_mean --> sub --> square --> reduce_mean --> add(epsilon) --> rsqrt @@ -282,8 +293,8 @@ def _try_match_and_transform_pattern_1(self, reduce_op, block) -> bool: It is ``layer_norm`` if all of the following are true: - ``axes`` is either ``[-1]``, ``[-1, -2]``, or ``[-1, -2, -3]``, and so on. - ``rank`` of ``gamma`` and ``beta`` is equal to the length of the ``axes``. - - """ + + """ ops_to_remove = [] root_var = reduce_op.x @@ -398,18 +409,18 @@ def _try_match_and_transform_pattern_1(self, reduce_op, block) -> bool: def _try_match_and_transform_pattern_2(self, reduce_op, block) -> bool: """ Identify the pattern: - + ``y = (x - mean) / pow(variance + epsilon) * gamma + beta`` This pattern corresponds to, and should be fused as, ``instance_norm``. - + All of the following conditions must be satisfied: - + 1. ``input`` is rank 4 tensor. 2. ``reduce`` operates on spatial dimensions ``axes=[-2, -1]``, or ``axes=[-3, -2]`` (a channel first to channel last transpose would be inserted in such cases). 3. ``gamma`` and ``beta`` are both shape ``(C,)`` after ``squeeze``, where ``C`` is number of channels. - + .. code-block:: |----> sub -----| const (0.5) @@ -523,14 +534,14 @@ def _try_match_and_transform_pattern_3(self, reduce_op, block) -> bool: Detect ``InstanceNorm`` pattern in TensorFlow-Addons. This pattern corresponds to, and should be fused as, ``instance_norm``. - + All of the following conditions must be satisfied: - + 1. ``input`` is rank 4 tensor. 2. 
``reduce`` operates on spatial dimensions ``axes=[-2, -1]``, or ``axes=[-3, -2]`` (a channel first to channel last transpose would be inserted in such cases). 3. ``gamma`` and ``beta`` are absent. Default values for ``gamma`` and ``beta`` would be used. - + .. code-block:: |-------------------------------------------------| @@ -661,18 +672,18 @@ def _try_match_and_transform_pattern_3(self, reduce_op, block) -> bool: def _try_match_and_transform_pattern_4(self, reduce_op: Operation, block: Block) -> bool: """ Identify the pattern: - + ``y = x * [gamma * rsqrt(variance + eps)] + (beta - mean * [gamma * rsqrt(variance + eps)])`` This pattern corresponds to, and should be fused as, ``instance_norm``. - + All of the following conditions must be satisfied: - + 1. ``input`` is rank 4 tensor. 2. ``reduce`` operates on spatial dimensions ``axes=[-2, -1]`` or ``axes=[-3, -2]`` (a channel first to channel last transpose would be inserted in such cases). 3. ``gamma`` and ``beta`` are both shape ``(C,)`` after ``squeeze``, where ``C`` is number of channels. - + .. code-block:: |-----------| @@ -704,8 +715,17 @@ def _try_match_and_transform_pattern_4(self, reduce_op: Operation, block: Block) # check that root_var feeds into exactly 4 ops if len(root_var.child_ops) != 4: return False - if root_var.op is not None and not self._check_child_op_types( - root_var.op, child_op_types=["mul", "mul", "reduce_sum", "mul"] + + if ( + root_var.op is not None + and not self._check_child_op_types( + root_var.op, child_op_types=["mul", "mul", "reduce_sum", "mul"] + ) + and not self._check_child_op_types( + # The _check_child_op_types checks for the exact order of the child_ops. + root_var.op, + child_op_types=["mul", "mul", "mul", "reduce_sum"], + ) ): return False @@ -988,6 +1008,15 @@ def _try_match_and_transform_pattern_5(self, reduce_op, block) -> bool: # For simplicity don't handle this edge case. return False + if add_beta_op is None and mul_gamma_op is None: + # Gamma and beta are optional in layer_norm. + pass + elif add_beta_op is None or mul_gamma_op is None: + # If only one of gamma or beta is present, they could + # be folded into the layer_norm op. For simplicity + # don't handle this edge case. + return False + if has_beta_and_gamma: beta_var = add_beta_op.y if add_beta_op.x == mul_op.outputs[0] else add_beta_op.x @@ -1005,27 +1034,20 @@ def _try_match_and_transform_pattern_5(self, reduce_op, block) -> bool: name="_fuse_layernorm_beta" ) - ops_to_remove.append(add_beta_op) - ops_to_remove.append(mul_gamma_op) + ops_to_remove.extend([add_beta_op, mul_gamma_op]) end_op = mul_gamma_op - if add_beta_op is None and mul_gamma_op is None: - # Gamma and beta are optional in layer_norm. - pass - elif add_beta_op is None or mul_gamma_op is None: - # If only one of gamma or beta is present, they could - # be folded into the layer_norm op. For simplicity - # don't handle this edge case. 
- return False - return self._try_apply_transform( reduce_op, block, gamma_var, beta_var, epsilon_var, end_op, ops_to_remove ) @block_context_manager def _fuse_layernorm_or_instancenorm_block(self, block: Block): - fusion_status = False - for i, op in enumerate(list(block.operations)): + fusion_occurred = False + for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -1035,21 +1057,15 @@ def _fuse_layernorm_or_instancenorm_block(self, block: Block): # start pattern match if reduce_mean op is encountered if op.op_type == "reduce_mean": - if fusion_status is False: - fusion_status = self._try_match_and_transform_pattern_1(op, block) - if fusion_status is False: - fusion_status = self._try_match_and_transform_pattern_2(op, block) - if fusion_status is False: - fusion_status = self._try_match_and_transform_pattern_3(op, block) - if fusion_status is False: - fusion_status = self._try_match_and_transform_pattern_5(op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status + if self._try_match_and_transform_pattern_1(op, block): + fusion_occurred = True + elif self._try_match_and_transform_pattern_2(op, block): + fusion_occurred = True + elif self._try_match_and_transform_pattern_3(op, block): + fusion_occurred = True + elif self._try_match_and_transform_pattern_5(op, block): + fusion_occurred = True elif op.op_type == "reduce_sum": - if fusion_status is False: - fusion_status = self._try_match_and_transform_pattern_4(op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_match_and_transform_pattern_4(op, block): + fusion_occurred = True + return fusion_occurred diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py index f1826b259..cd66e421e 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py @@ -3,15 +3,15 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from typing import Tuple +from typing import List, Set, Tuple import numpy as np -import coremltools.converters.mil.mil.types as types from coremltools.converters.mil._deployment_compatibility import AvailableTarget +from coremltools.converters.mil.frontend import _utils from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Operation, Var +from coremltools.converters.mil.mil import Operation, Var, types from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.helper import ( _check_child_op_type, @@ -22,11 +22,11 @@ @register_pass(namespace="common") -class merge_tensorwise_affine_dequantize_with_consecutive_ops(AbstractGraphPass): +class merge_affine_dequantize_with_consecutive_ops(AbstractGraphPass): """ This graph pass does const folding to a chain of supported ops starts with a - tensor-wise ``constexpr_affine_dequantize`` op. i.e., both ``scale`` and - ``zero_point`` are scalar (rank 0). + ``constexpr_affine_dequantize`` op. 
More op types are supported when quantization + is tensor-wise, and only a subset is supported when it is channel-wise. For example: Input graph: @@ -45,43 +45,48 @@ class merge_tensorwise_affine_dequantize_with_consecutive_ops(AbstractGraphPass) --> constexpr_affine_dequantize -> reshape -> out_2 """ - SUPPORTED_OPS = [ + SUPPORTED_OP_TYPES_PER_TENSOR = { "transpose", "reshape", "expand_dims", "squeeze", - ] + } + SUPPORTED_OP_TYPES_PER_CHANNEL = {"transpose"} + assert SUPPORTED_OP_TYPES_PER_CHANNEL.issubset( + SUPPORTED_OP_TYPES_PER_TENSOR + ), "If an op can merge with channel-wise quantization, then it must also be able to merge with tensor-wise quantization" def apply(self, prog): for f in prog.functions.values(): block_changed = True while block_changed: - block_changed = self.merge_tensorwise_affine_dequantize_with_consecutive_ops_block( - f - ) + block_changed = self.merge_affine_dequantize_with_consecutive_ops_block(f) @block_context_manager - def merge_tensorwise_affine_dequantize_with_consecutive_ops_block(self, block): - fusion_status = False + def merge_affine_dequantize_with_consecutive_ops_block(self, block: Block): + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: - block_changed = ( - self.merge_tensorwise_affine_dequantize_with_consecutive_ops_block(b) - ) + block_changed = self.merge_affine_dequantize_with_consecutive_ops_block(b) if op.op_type != "constexpr_affine_dequantize": continue - fusion_status = self._try_to_transform(op, block) - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, block): + fusion_occurred = True + return fusion_occurred @staticmethod - def _apply_equivalent_transform(val, op): - if op.op_type not in merge_tensorwise_affine_dequantize_with_consecutive_ops.SUPPORTED_OPS: + def _apply_equivalent_transform(val: np.ndarray, op: Operation) -> np.ndarray: + if ( + op.op_type + not in merge_affine_dequantize_with_consecutive_ops.SUPPORTED_OP_TYPES_PER_TENSOR + ): raise ValueError(f"unsupported op_type {op.op_type}") if op.op_type == "transpose": @@ -97,15 +102,9 @@ def _apply_equivalent_transform(val, op): return np.squeeze(val, axis=tuple(op.axes.val.tolist())) @staticmethod - def _try_to_transform(op, block): - # first check if it is tensorwise quantization - if op.scale.rank != 0 or op.zero_point.rank != 0: - return False - - # first check if quantized_data only feeds into a single op - if len(op.quantized_data.child_ops) != 1: - return False - + def search_for_ops_to_fold( + op: Operation, block: Block, supported_op_types: Set[str] + ) -> List[Operation]: # traverse the graph to get a chain of applicable ops to fold ops_to_fold = [] cursor = op @@ -113,32 +112,82 @@ def _try_to_transform(op, block): prev_cursor = cursor if cursor.outputs[0] in block.outputs: break - for val in merge_tensorwise_affine_dequantize_with_consecutive_ops.SUPPORTED_OPS: - if _check_child_op_type(cursor, val): + for supported_op_type in supported_op_types: + if _check_child_op_type(cursor, supported_op_type): ops_to_fold.append(cursor.outputs[0].child_ops[0]) cursor = ops_to_fold[-1] break if prev_cursor == cursor: break + return ops_to_fold + + @staticmethod + def _try_to_transform_per_tensor(op: Operation, block: Block) -> bool: + assert ( + op.scale.rank == 0 and op.zero_point.rank == 0 + ), "The _try_to_transform_per_tensor method should only be used for per-tensor dequantization case" + + ops_to_fold = 
merge_affine_dequantize_with_consecutive_ops.search_for_ops_to_fold( + op, block, merge_affine_dequantize_with_consecutive_ops.SUPPORTED_OP_TYPES_PER_TENSOR + ) + if len(ops_to_fold) == 0: + return False + + # do the same transformation on the source quantized data + cursor = op.quantized_data.val + for op_to_fold in ops_to_fold: + cursor = merge_affine_dequantize_with_consecutive_ops._apply_equivalent_transform( + cursor, op_to_fold + ) + + # after transformation, we create a new constexpr_affine_dequantize op and do the replacement + new_var = _utils._construct_constexpr_affine_op( + cursor, + op.zero_point, + op.scale, + op.axis, + name=ops_to_fold[-1].outputs[0].name, + before_op=ops_to_fold[-1], + ) + block.replace_uses_of_var_after_op( + anchor_op=ops_to_fold[-1], + old_var=ops_to_fold[-1].outputs[0], + new_var=new_var, + force_replace=True, + ) + block.remove_ops([op] + ops_to_fold) + return True + + @staticmethod + def _try_to_transform_per_channel(op: Operation, block: Block) -> bool: + scale = op.scale + zero_point = op.zero_point + # positively canonicalize axis for easier manipulation later on + axis = op.axis.val if op.axis.val >= 0 else op.axis.val + op.quantized_data.rank + ops_to_fold = merge_affine_dequantize_with_consecutive_ops.search_for_ops_to_fold( + op, + block, + merge_affine_dequantize_with_consecutive_ops.SUPPORTED_OP_TYPES_PER_CHANNEL, + ) if len(ops_to_fold) == 0: return False # do the same transformation on the source quantized data cursor = op.quantized_data.val - for val in ops_to_fold: - cursor = ( - merge_tensorwise_affine_dequantize_with_consecutive_ops._apply_equivalent_transform( - cursor, val - ) + for op_to_fold in ops_to_fold: + cursor = merge_affine_dequantize_with_consecutive_ops._apply_equivalent_transform( + cursor, op_to_fold ) + if op_to_fold.op_type == "transpose": + axis = np.where(op_to_fold.perm.val == axis)[0][0] # after transformation, we create a new constexpr_affine_dequantize op and do the replacement new_var = mb.constexpr_affine_dequantize( quantized_data=cursor, - zero_point=op.zero_point, - scale=op.scale, - axis=op.axis, + zero_point=zero_point, + scale=scale, + axis=axis, name=ops_to_fold[-1].outputs[0].name, before_op=ops_to_fold[-1], ) @@ -151,6 +200,17 @@ def _try_to_transform(op, block): block.remove_ops([op] + ops_to_fold) return True + def _try_to_transform(self, op: Operation, block: Block) -> bool: + # make sure quantized_data only feeds into a single op + if len(op.quantized_data.child_ops) != 1: + return False + + if op.scale.rank == 0 and op.zero_point.rank == 0: + return self._try_to_transform_per_tensor(op, block) + else: + return self._try_to_transform_per_channel(op, block) + + @register_pass(namespace="common") class int_op_canonicalization(AbstractGraphPass): """ @@ -315,7 +375,11 @@ def apply(self, prog): @block_context_manager def _nullify_redundant_quantization_zero_point_block(self, block: Block): def apply_block(block: Block) -> bool: + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: self._nullify_redundant_quantization_zero_point_block(b) @@ -325,9 +389,9 @@ def apply_block(block: Block) -> bool: # has to break as the downstream iterator is affected if self.try_transform_zp128_quantize_dequantize(op): - return True + fusion_occurred = True - return False + return fusion_occurred need_transformation = True while need_transformation: @@ -507,15 +571,18 @@ def apply(self, prog): @block_context_manager def 
_dequantize_quantize_pair_elimination_block(self, block): def apply_block(block: Block) -> bool: + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: self._dequantize_quantize_pair_elimination_block(b) # has to break as the downstream iterator is affected if self.try_dequantize_quantize_pair_elimination(op): - return True - - return False + fusion_occurred = True + return fusion_occurred need_transformation = True while need_transformation: @@ -869,7 +936,7 @@ def apply_block(block): apply_block(f) def is_valid_op(self, op): - return op.op_type == "dequantize" and op.outputs[0].val is not None + return op.op_type == "dequantize" and op.can_materialize_val() def transform_op(self, op): quantized_data = op.input.val @@ -882,22 +949,15 @@ def transform_op(self, op): else: zero_point = np.int8(0) if op.input.dtype == types.int8 else np.uint8(0) - # In dequantize semantics, axis may be None: - # when scale is a scalar, axis is None - # - # In constexpr_affine_dequantize semantics, None axis is not allowed; - # since axis is not referred to when scale is a scalar, we pass a dummy - axis = 0 - if op.axis is not None: - axis = op.axis.val + axis = None if op.axis is None else op.axis.val - new_var = mb.constexpr_affine_dequantize( - quantized_data=quantized_data, - zero_point=zero_point, - scale=scale, - axis=axis, - before_op=op, + new_var = _utils._construct_constexpr_affine_op( + quantized_data, + zero_point, + scale, + axis, name=op.name + "_affine_dequantized", + before_op=op, ) block = op.enclosing_block diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py b/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py index aab66dd7b..76be08a6f 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_repeat_ops.py @@ -103,11 +103,14 @@ def _replace_ops(block, padding_op, child_padding_op, final_pad): @block_context_manager def _merge_padding_block(self, block): + fusion_happens = False for op in list(block.operations): - result = self._match_pattern(block, op) - if result: - return True - return False + if op.enclosing_block is None: + continue + + if self._match_pattern(block, op): + fusion_happens = True + return fusion_happens @register_pass(namespace="common") class merge_consecutive_transposes(AbstractGraphPass): @@ -142,7 +145,9 @@ def _replace_ops(block, transpose_op, child_transpose_op): new_perm = [perm[i] for i in child_transpose_op.perm.val] x = mb.transpose(x=transpose_op.x, perm=new_perm, before_op=transpose_op) if transpose_op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=transpose_op, old_var=child_transpose_op.outputs[0], new_var=x, + anchor_op=child_transpose_op, + old_var=child_transpose_op.outputs[0], + new_var=x, ): block.remove_ops([transpose_op, child_transpose_op]) return True @@ -151,10 +156,13 @@ def _replace_ops(block, transpose_op, child_transpose_op): @block_context_manager def _merge_transposes_in_block(self, block): def help_merge_transpose_ops(block): + fusion_happens = False for op in list(block.operations): + if op.enclosing_block is None: + continue if self._match_and_replace_pattern(block, op): - return True - return False + fusion_happens = True + return fusion_happens block_changed = True while block_changed: @@ -189,7 +197,7 @@ def _match_and_replace_pattern(self, block, relu_op): @staticmethod def _replace_ops(block, relu_op, child_relu_op): if 
relu_op.enclosing_block.try_replace_uses_of_var_after_op( - anchor_op=relu_op, old_var=child_relu_op.outputs[0], new_var=relu_op.outputs[0] + anchor_op=child_relu_op, old_var=child_relu_op.outputs[0], new_var=relu_op.outputs[0] ): block.remove_ops([child_relu_op]) return True @@ -198,10 +206,13 @@ def _replace_ops(block, relu_op, child_relu_op): @block_context_manager def _merge_relus_in_block(self, block): def help_merge_relu_ops(block): + fusion_happens = False for op in list(block.operations): + if op.enclosing_block is None: + continue if self._match_and_replace_pattern(block, op): - return True - return False + fusion_happens = True + return fusion_happens block_changed = True while block_changed: @@ -259,7 +270,11 @@ def _match_pattern(reshape_op): @block_context_manager def _merge_consecutive_reshapes_block(self, block): def help_merge_consecutive_reshapes_block(block): - for op in block.operations: + fusion_happens = False + for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -285,9 +300,9 @@ def help_merge_consecutive_reshapes_block(block): new_var=reshape_out, ) reshape_ops[-1].enclosing_block.remove_ops(reshape_ops) - return True + fusion_happens = True - return False + return fusion_happens block_changed = True while block_changed: @@ -333,11 +348,11 @@ class cast_optimization(AbstractGraphPass): This is a non-algebraic translation which assumes that the upcasting doesn't change the user's intent. (1) Example for redundant ``cast`` op removal: - .. sourcecode:: python + .. code-block:: Input graph: input(fp16) -> cast(dtype="fp16") -> relu -> out - + Output graph: input -> relu -> out @@ -345,11 +360,11 @@ class cast_optimization(AbstractGraphPass): Hence, it can be removed. (2) Example for two ``cast`` ops fusion: - .. sourcecode:: python + .. code-block:: Input graph: input(int8) -> cast(dtype="fp16") -> cast(dtype="fp32") -> out - + Output graph: input(int8) -> cast(dtype="fp32") -> out @@ -357,11 +372,11 @@ class cast_optimization(AbstractGraphPass): so the fusion is allowed. (3) Negative example for two ``cast`` ops fusion: - .. sourcecode:: python + .. code-block:: Input graph: input(fp32) -> cast(dtype="bool") -> cast(dtype="fp16") -> out - + Output graph: Same as input graph. @@ -370,11 +385,11 @@ class cast_optimization(AbstractGraphPass): If we fuse them, the output would be in the range and resolution of ``fp16`` instead. (4) Another Negative example for two ``cast`` ops fusion: - .. sourcecode:: python + .. code-block:: Input graph: input(int32) -> cast(dtype="int8") -> cast(dtype="uint8") -> out - + Output graph: Same as input graph. @@ -389,7 +404,10 @@ class cast_optimization(AbstractGraphPass): For more examples, please see the unittests that start with prefix ``TestCastOptimization`` in ``test_passes.py``. """ + _num_of_visited_ops = 0 # Testing purpose, making sure the algorithm performs in O(N) + def apply(self, prog): + self._num_of_visited_ops = 0 for f in prog.functions.values(): self._fuse_or_cancel_consecutive_casts_block_wrapper(f) @@ -507,6 +525,8 @@ def _fuse_cast_ops(self, cast_ops: List[Operation], reuse_input_var: bool = Fals def _try_to_transform(self, root_op, cast_ops_across_blocks): block = root_op.enclosing_block + if block is None: + return False # Scenario: Redundant cast when source and destination dtype are same. 
if root_op.op_type == "cast" and root_op.x.is_tensor_or_scalar_of(dtype=root_op.dtype.val): @@ -554,18 +574,27 @@ def _fuse_casts_ops_across_blocks(self, block: Block, ops_to_fused: Tuple[Operat def _fuse_or_cancel_consecutive_casts_block_wrapper(self, block): def _fuse_or_cancel_consecutive_casts_block(block, cast_ops_across_blocks): # We first make sure all the inner blocks are optimized - # It is important to do it separately in the very beginning, to ensure the last step of optimization cast ops across the block boundary is correct. - for i, op in enumerate(list(block.operations)): + # It is important to do it separately in the very beginning, to ensure the last step of optimization cast ops across the block boundary is correct. + for op in block.operations: for b in op.blocks: self._fuse_or_cancel_consecutive_casts_block_wrapper(b) - for i, op in enumerate(list(block.operations)): + fusion_happens = False + for op in list(block.operations): + self._num_of_visited_ops += 1 # start pattern match if cast op is encountered if op.op_type == "cast": if self._try_to_transform(op, cast_ops_across_blocks): - # has to break as the downstream iterator is affected. - return True - return False + # It is important not to exit the loop right away when a fusion happens, + # in order to keep the time complexity low. + # For instance, given a program of the pattern: + # relu -> relu -> cast -> cast -> cast, + # the three cast ops can be fused into a single cast op in one shot. + # On the other hand, if we break the loop right away, the + # two relu ops will be visited 3 times, which makes the overall + # time complexity O(N^2). + fusion_happens = True + return fusion_happens block_changed = True cast_ops_across_blocks = defaultdict(set) @@ -1784,7 +1813,7 @@ def _reduce_transposes_block(block): which is simpler to do when all the ops in the block are free of sub blocks. The case of transpose fusion with sub-block containing ops needs to be handled with more care and test cases. """ - for op in list(block.operations): + for op in block.operations: if len(op.blocks) > 0: return diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py b/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py index 4466a09b5..bd3d9d62a 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_tensor_operation.py @@ -48,8 +48,11 @@ def apply(self, prog): @block_context_manager def fuse_squeeze_expand_dims_block(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -60,11 +63,9 @@ def fuse_squeeze_expand_dims_block(self, block): squeeze_op = self._match_pattern(op) if squeeze_op is not None: - fusion_status = self._try_to_transform(squeeze_op, block) - # has to break as the downstream iterator is affected. 
- if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(squeeze_op, block): + fusion_occurred = True + return fusion_occurred @staticmethod def _match_pattern(op): @@ -268,8 +269,11 @@ def _get_prod(start, end, arr, skip_indices): @block_context_manager def expand_high_rank_reshape_and_transpose_block(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -279,11 +283,9 @@ def expand_high_rank_reshape_and_transpose_block(self, block): ops = self._match_pattern(op) if ops is not None: - fusion_status = self._try_to_transform(ops, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(ops, block): + fusion_occurred = True + return fusion_occurred @register_pass(namespace="common") class concat_to_pixel_shuffle(AbstractGraphPass): @@ -545,8 +547,11 @@ def _try_to_transform(concat_op, add_op, block): @block_context_manager def _fuse_concat_interleave(self, block): - fusion_status = False + fusion_occurred = False for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -556,11 +561,9 @@ def _fuse_concat_interleave(self, block): concat_op = self._match_pattern(op) if concat_op is not None: - fusion_status = self._try_to_transform(op, concat_op, block) - # has to break as the downstream iterator is affected. - if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, concat_op, block): + fusion_occurred = True + return fusion_occurred @register_pass(namespace="common") @@ -653,8 +656,11 @@ def _try_to_transform(onehot_op, block): @block_context_manager def _fuse_onehot_matmul_to_gather_block(self, block): - fusion_status = False - for i, op in enumerate(list(block.operations)): + fusion_occurred = False + for op in list(block.operations): + if op.enclosing_block is None: + continue + for b in op.blocks: block_changed = True while block_changed: @@ -665,11 +671,9 @@ def _fuse_onehot_matmul_to_gather_block(self, block): # start pattern match if one_hot op is encountered if op.op_type == "one_hot": - fusion_status = self._try_to_transform(op, block) - # has to break as the downstream iterator is affected. 
- if fusion_status: - return fusion_status - return fusion_status + if self._try_to_transform(op, block): + fusion_occurred = True + return fusion_occurred @register_pass(namespace="common") diff --git a/coremltools/converters/mil/mil/passes/defs/preprocess.py b/coremltools/converters/mil/mil/passes/defs/preprocess.py index 3f9ea7b1a..e8dd6f899 100644 --- a/coremltools/converters/mil/mil/passes/defs/preprocess.py +++ b/coremltools/converters/mil/mil/passes/defs/preprocess.py @@ -9,9 +9,11 @@ from coremltools import _logger as logger from coremltools.converters.mil.input_types import EnumeratedShapes, ImageType, Shape +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, types +from coremltools.converters.mil.mil import Function, Program, types from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass +from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass @@ -48,7 +50,7 @@ def _transform_to_channel_first(shape): else: return shape[:-3] + [shape[-1]] + shape[-3:-1] - main_input_types = list(prog.main_input_types) + main_input_types = list(prog.functions["main"].input_types) for idx, input_type in enumerate(main_input_types): if isinstance(input_type, ImageType) and not input_type.channel_first: name = input_type.name @@ -88,9 +90,6 @@ def _transform_to_channel_first(shape): # Update Function input var prog.functions["main"]._input_dict[name] = placeholder_op.outputs[0] - prog.functions["main"].function_inputs = tuple( - prog.functions["main"]._input_dict.values() - ) # Add transpose into graph (Transpose from NCHW back to NHWC) curr_block = prog.functions["main"] @@ -108,7 +107,7 @@ def _transform_to_channel_first(shape): curr_block.replace_uses_of_var_after_op( anchor_op=None, old_var=old_var, new_var=new_input ) - prog.main_input_types = tuple(main_input_types) + prog.functions["main"].input_types = tuple(main_input_types) class NameSanitizer: @@ -311,26 +310,30 @@ def apply(self, prog): prog.functions["main"], sanitizer_vars, sanitizer_ops, - prog.main_input_types, + prog.functions["main"].input_types, sanitize_model_inputs_outputs_only=True, ) +# TODO: rdar://122845072 ([Infra] Refactor the transform_function_signatures, adjust_io_to_supported_types and update_output_dtypes using a shared graph pass) @register_pass(namespace="common") class update_output_dtypes(AbstractGraphPass): """ - Update the dtypes of output vars of the main block to match the dtypes - provided in ``prog.main_output_types``, which in turn is populated by the - ``outputs`` argument provided by the user in the ``coremltools.convert()`` API. - This graph pass assumes that the list of outputs in ``prog.main_output_types`` (if not ``None``), + Update the dtypes of output vars of each function block to match the dtypes + provided in ``function.output_types``. The output types for the main function + is populated by the ``outputs`` argument provided by the user in the ``coremltools.convert()`` API. + This graph pass assumes that the list of outputs in ``function.output_types`` (if not ``None``), are in the same order as the output vars. 
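For context, the ``outputs`` argument mentioned in the docstring above is what populates the main function's ``output_types``. An illustrative user-level sketch (the traced torch model and shapes are placeholders, not part of this diff):

```python
import numpy as np
import torch
import coremltools as ct

traced = torch.jit.trace(torch.nn.ReLU().eval(), torch.rand(1, 3, 8, 8))  # placeholder model
mlmodel = ct.convert(
    traced,
    inputs=[ct.TensorType(shape=(1, 3, 8, 8))],
    outputs=[ct.TensorType(dtype=np.float16)],  # ends up in functions["main"].output_types
    minimum_deployment_target=ct.target.iOS16,  # fp16 I/O needs iOS16+
)
```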
""" - def apply(self, prog): - user_provided_output_types = prog.main_output_types - main_func = prog.functions["main"] - output_vars = main_func.outputs - input_vars = list(main_func.inputs.values()) + @block_context_manager + def adjust_function_output_types(self, func: Function) -> None: + """ + Adjust output dtypes for a pymil function. + """ + user_provided_output_types = func.output_types + output_vars = func.outputs + input_vars = list(func.inputs.values()) if user_provided_output_types is None or len(user_provided_output_types) == 0: return if len(output_vars) != len(user_provided_output_types): @@ -367,11 +370,15 @@ def apply(self, prog): output_var.set_name( output_var_name + "_type_" + types.builtin_to_string(output_var.dtype) ) - with main_func: - output_var = mb.cast( - x=output_var, dtype=types.builtin_to_string(required_output_dtype) - ) - output_var.set_name(output_var_name) - new_outputs.append(output_var) + new_output_var = mb.cast( + x=output_var, dtype=types.builtin_to_string(required_output_dtype) + ) + new_output_var.set_name(output_var_name) + Block._copy_scope_info(output_var, new_output_var) + new_outputs.append(new_output_var) + + func.set_outputs(new_outputs) - main_func.set_outputs(new_outputs) + def apply(self, prog: Program): + for func in prog.functions.values(): + self.adjust_function_output_types(func) diff --git a/coremltools/converters/mil/mil/passes/defs/quantization.py b/coremltools/converters/mil/mil/passes/defs/quantization.py index fab3e1656..6c016a586 100644 --- a/coremltools/converters/mil/mil/passes/defs/quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/quantization.py @@ -5,13 +5,15 @@ from abc import abstractmethod from enum import Enum as _Enum -from typing import Set, Text +from typing import Dict, Set, Text, Tuple import numpy as np from coremltools.converters.mil._deployment_compatibility import AvailableTarget +from coremltools.converters.mil.input_types import TensorType +from coremltools.converters.mil.mil import Block from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Operation, types +from coremltools.converters.mil.mil import Function, Operation, Var, types from coremltools.converters.mil.mil.block import is_current_opset_version_compatible_with from coremltools.converters.mil.mil.ops.registry import SSAOpRegistry from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass @@ -19,6 +21,7 @@ from coremltools.converters.mil.mil.passes.pass_registry import register_pass from coremltools.converters.mil.mil.program import Program from coremltools.converters.mil.mil.types.symbolic import is_symbolic +from coremltools.converters.mil.mil.types.type_mapping import string_to_builtin class ComputePrecision(_Enum): @@ -47,6 +50,14 @@ def __init__(self, op_selector=None): ) self.op_selector = op_selector + # Var that feeds into multiple ops will be cast once and cached into this dict + # For reference: Checkout test_single_input_to_multiple_operations in `TestFP16CastTransform`. 
+ # Note that, we make it a stack of dict to keep tracking the blocks + self._cache_vars = [] + + def current_cache_vars(self) -> Set[Var]: + return self._cache_vars[-1] + def apply(self, prog): """ Walks over each operation in the graph and performs following two steps, @@ -68,6 +79,7 @@ def apply(self, prog): @block_context_manager def apply_block(block): + self._cache_vars.append({}) for op in list(block.operations): for b in op.blocks: apply_block(b) @@ -80,6 +92,7 @@ def apply_block(block): need_transform = op.op_type not in getattr(self, "skip_ops_by_type", set()) if need_transform: self.transform_op(op) + self._cache_vars.pop() for f in prog.functions.values(): apply_block(f) @@ -135,10 +148,6 @@ class CastTypeQuantization(AbstractQuantizationPass): def __init__(self, op_selector=None): super().__init__(op_selector=op_selector) - # Var that feeds into multiple ops will be cast once and cached into this dict - # For reference: Checkout test_single_input_to_multiple_operations in `TestFP16CastTransform`. - self.cache_vars = {} - @property @abstractmethod def origin_dtype(self) -> str: @@ -151,6 +160,91 @@ def target_dtype(self) -> str: """Target dtype, such as fp16.""" raise NotImplementedError("target_dtype must be specified in subclass.") + # TODO: rdar://122845072 ([Infra] Refactor the transform_function_signatures, adjust_io_to_supported_types and update_output_dtypes using a shared graph pass) + @block_context_manager + def transform_function_signatures(self, func: Function) -> None: + """ + This utility transform a function input / output signatures from the original_dtype to + the target_dtype. + + For instance, in the add_fp16_cast class, this member function transforms the following + function: + + function(%input(fp32)) { + block0() { + % var_1 = op_1(x=%input) + ... + % output(fp32) = ... + } -> (%output) + } + + into: + + function(%input(fp16)) { + block0() { + # input_cast = cast(x=input, dtype="fp32") + % var_1 = op_1(x=%input_cast) + ... + % output(fp32) = ... + } -> (%output) + } + + and function.output_types is set to [TensorType(dtype=types.fp16)], + in which will be used in common::update_output_dtypes to upgrade the function output dtype accordingly. 
+ + """ + # reset input signatures + old_func_inputs = func.inputs + new_func_inputs = {} + cache_vars = {} + + # cast the new input into the original dtype + for k, v in old_func_inputs.items(): + if v.is_tensor_or_scalar_of(self.origin_dtype): + new_input = mb.placeholder( + shape=v.shape, + dtype=string_to_builtin(self.target_dtype), + name=v.name, + ).outputs[0] + + if v in func.outputs: + new_outputs = [] + for val in func.outputs: + new_outputs.append(new_input if val == v else val) + func.set_outputs(new_outputs) + + new_func_inputs[k] = new_input + cast_input = mb.cast( + x=new_input, + dtype=self.origin_dtype, + before_op=func.operations[0] if len(func.operations) > 0 else None, + ) + cache_vars[k] = cast_input + else: + new_func_inputs[k] = v + cache_vars[k] = v + + # replace the use of the old input vars with the new cast var + for k, v in old_func_inputs.items(): + func.replace_uses_of_var_after_op( + anchor_op=None, + old_var=v, + new_var=cache_vars[k], + ) + func._input_dict = new_func_inputs + + # reset output signatures + if func.output_types is None: + output_types = [TensorType(dtype=v.dtype) for v in func.outputs] + else: + output_types = func.output_types + + for idx, v in enumerate(output_types): + if v.dtype == string_to_builtin(self.origin_dtype): + output_types[idx] = TensorType(dtype=string_to_builtin(self.target_dtype)) + + func.output_types = output_types + def should_cast_parameter(self, op: Operation, param_name: str) -> bool: """ Determines if a param of an op should be cast to target_dtype. @@ -167,6 +261,13 @@ def should_cast_parameter(self, op: Operation, param_name: str) -> bool: return True + def _get_casted_outputs(self, op: Operation, casted_inputs: Dict[str, Var]) -> Tuple[Var]: + """ + Given an op and casted_inputs, this utility returns the new resulting outputs. 
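A tiny aside on the ``getattr`` dispatch used by ``_get_casted_outputs``: looking an op type up on the builder is equivalent to calling the builder method directly, which is what lets the casted inputs be re-fed through the same op type generically. Illustrative only:

```python
from coremltools.converters.mil.mil import Builder as mb

@mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))])
def prog(x):
    y1 = mb.relu(x=x)
    y2 = getattr(mb, "relu")(x=x)  # builds the same kind of op as the line above
    return y1, y2
```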
+ """ + return getattr(mb, op.op_type)(**casted_inputs) + + def transform_op(self, op) -> None: """Transform the input(s)/output(s) dtypes of the op.""" block = op.enclosing_block @@ -190,18 +291,23 @@ def transform_op(self, op) -> None: casted_var_name = f"{var.name}_to_{self.target_dtype}" if ( len(var._child_ops) > 1 - and casted_var_name in self.cache_vars - and (block.is_var_visible_in_block(self.cache_vars[casted_var_name])) + and casted_var_name in self.current_cache_vars() ): - casted_inputs[param][i] = self.cache_vars[casted_var_name] + casted_inputs[param][i] = self.current_cache_vars()[casted_var_name] else: - x = mb.cast(x=var, dtype=self.target_dtype, name=casted_var_name, before_op=op) + x = mb.cast( + x=var, + dtype=self.target_dtype, + name=casted_var_name, + before_op=op, + ) if self.target_dtype == "fp16": self._check_underflow_to_zero(x, var) + Block._copy_metadata(var, x) casted_inputs[param][i] = x if len(var._child_ops) > 1: - self.cache_vars[casted_var_name] = casted_inputs[param][i] + self.current_cache_vars()[casted_var_name] = casted_inputs[param][i] if not is_list_input: casted_inputs[param] = casted_inputs[param][0] @@ -210,7 +316,7 @@ def transform_op(self, op) -> None: casted_inputs.update({k: v for k, v in op.inputs.items() if k not in casted_inputs}) casted_inputs["name"] = f"{op.name}_cast_{self.target_dtype}" casted_inputs["before_op"] = op - quant_output = getattr(mb, op.op_type)(**casted_inputs) + quant_output = self._get_casted_outputs(op, casted_inputs) if not isinstance(quant_output, (list, tuple)): quant_output = [quant_output] @@ -232,6 +338,7 @@ def transform_op(self, op) -> None: force_replace=True, ) else: + op.enclosing_block.replace_uses_of_var_after_op( anchor_op=op, old_var=old_output_var, @@ -401,17 +508,20 @@ def skip_ops_by_type(self, criteria: Text): @register_pass(namespace="common") class add_int16_cast(CastTypeQuantization): """ - This transform does the following, for each op that supports int16: - - For each input of dtype int32 which actually supports int16, inject a "cast" op to change it - to int16 dtype. - - For each output of dtype int16, inject a "cast" op to change it back to int32. - It's mainly for int16 op ANE residency. + This transform does the following, for each op that supports int16/uint16: + - For each input of dtype int32 which supports int16/uint16, inject a "cast" op to change it + to int16/uint16 dtype. + - For each output of dtype int16/uint16, inject a "cast" op to change it back to int32. + Notice that the cast will not be inserted if the const value is out of int16/uint16 range. """ # Ops that prefer int16 params. _PREFER_INT16_OPS: Set[str] = {"gather", "gather_along_axis", "gather_nd"} def __init__(self, op_selector=None): super().__init__(op_selector=op_selector) + # Use variable instead of hard-coded "int16" because the target dtype could be uint16 + # depending on if the param is non-negative const and within uint16 range. 
+ self._target_dtype: str = "int16" @property def origin_dtype(self) -> str: @@ -419,38 +529,56 @@ def origin_dtype(self) -> str: @property def target_dtype(self) -> str: - return "int16" + return self._target_dtype - @staticmethod - def int16_overflow(op: Operation) -> bool: + @target_dtype.setter + def target_dtype(self, target_dtype: str): + if target_dtype not in {"int16", "uint16"}: + raise ValueError("The target_dtype in add_int16_cast must be int16 or uint16") + self._target_dtype = target_dtype + + def should_cast_parameter(self, op: Operation, param_name: str) -> bool: """ - Determines if any of the op's input will overflow when represented by int16. Constants with - values more than np.iinfo(np.int16).max or less than np.iinfo(np.int16).min overflows in int16. + Determine if a parameter should be cast or not. + If should be cast, determine whether to use int16 or uint16. """ _INT16_MAX = np.iinfo(np.int16).max _INT16_MIN = np.iinfo(np.int16).min - for _, inputs in op.inputs.items(): - is_list_input = isinstance(inputs, (list, tuple)) - if not is_list_input: - inputs = [inputs] - for var in inputs: - if var.val is not None and var.is_tensor_or_scalar_of(dtype="int32"): - if np.any(var.val > _INT16_MAX) or np.any(var.val < _INT16_MIN): - return True + _UINT16_MAX = np.iinfo(np.uint16).max + _UINT16_MIN = np.iinfo(np.uint16).min - # In `gather` and `gather_along_axis`, if the dim size of x is larger than int16 upperbound, - # the dynamic indices could overflow. - if ( - op.op_type in {"gather", "gather_along_axis"} - and op.indices.val is None - and op.x.shape is not None - ): - dim_size = op.x.shape[op.axis.val] - if not is_symbolic(dim_size) and dim_size > _INT16_MAX: - return True + input_var = op.inputs[param_name] + if not input_var.is_tensor_or_scalar_of(dtype="int32"): + return False - return False + input_op = input_var.op + if input_op is not None and input_op.op_type == "const": + if ( + input_op.outputs[0].val.min() >= _UINT16_MIN + and input_op.outputs[0].val.max() <= _UINT16_MAX + ): + self._target_dtype = "uint16" + elif ( + input_op.outputs[0].val.min() >= _INT16_MIN + and input_op.outputs[0].val.max() <= _INT16_MAX + ): + self._target_dtype = "int16" + else: + return False + + # In `gather` and `gather_along_axis`, if the dim size of x is larger than int16 + # upperbound, the dynamic indices could overflow, so it shouldn't be cast. 
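The range check sketched below restates the dtype choice described in the comments above (the helper name is hypothetical, not part of the pass): non-negative consts that fit in uint16 prefer ``uint16``, values within the signed range prefer ``int16``, and anything outside 16-bit range is left uncast:

```python
import numpy as np

def choose_16bit_dtype(values: np.ndarray):
    """Hypothetical helper mirroring the should_cast_parameter logic above."""
    if values.min() >= np.iinfo(np.uint16).min and values.max() <= np.iinfo(np.uint16).max:
        return "uint16"
    if values.min() >= np.iinfo(np.int16).min and values.max() <= np.iinfo(np.int16).max:
        return "int16"
    return None  # out of 16-bit range: do not cast

assert choose_16bit_dtype(np.array([0, 40000])) == "uint16"
assert choose_16bit_dtype(np.array([-5, 100])) == "int16"
assert choose_16bit_dtype(np.array([70000])) is None
```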
+ if op.op_type in {"gather", "gather_along_axis"} and param_name == "indices": + if op.indices.val is None and op.x.shape is not None: + dim_size = op.x.shape[op.axis.val] + if not is_symbolic(dim_size) and dim_size > _INT16_MAX: + return False + + if not super().should_cast_parameter(op, param_name): + return False + + return True def is_valid_op(self, op: Operation) -> bool: - """Determines if op is valid for int16 casting.""" - return op.op_type in self._PREFER_INT16_OPS and not self.int16_overflow(op) + """Determines if op is valid for int16/uint16 casting.""" + return op.op_type in self._PREFER_INT16_OPS diff --git a/coremltools/converters/mil/mil/passes/graph_pass.py b/coremltools/converters/mil/mil/passes/graph_pass.py index 6a39628a9..ffa61801b 100644 --- a/coremltools/converters/mil/mil/passes/graph_pass.py +++ b/coremltools/converters/mil/mil/passes/graph_pass.py @@ -7,6 +7,8 @@ from typing import Callable, List, Optional, Text, Union from coremltools.converters.mil import Operation, Program +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource class PassOption: @@ -48,7 +50,10 @@ class AbstractGraphPass(ABC): def __call__(self, prog: Program): if not prog.skip_all_passes: - self.apply(prog) + # we use the scope context manager to populate the graph pass information to the ops + # constructed by the pass. + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=[str(self)])): + self.apply(prog) def __str__(self): return type(self).__name__ diff --git a/coremltools/converters/mil/mil/passes/helper.py b/coremltools/converters/mil/mil/passes/helper.py index 1bf1e70c3..72dbde9b3 100644 --- a/coremltools/converters/mil/mil/passes/helper.py +++ b/coremltools/converters/mil/mil/passes/helper.py @@ -3,13 +3,14 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from typing import List +from typing import Callable, List, Optional import numpy as np from coremltools.converters.mil.mil import Block, Operation from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass + class classproperty(property): """ A decorator class that allow us to have a class-level property @@ -17,7 +18,8 @@ class classproperty(property): def __get__(self, owner, cls): return self.fget(cls) -def block_context_manager(func): + +def block_context_manager(_func: Optional[Callable] = None): """ This decorator executes a function under the context manager `with block`. For instance, given a function `func` with an input block and other arguments: @@ -44,6 +46,7 @@ def func(block, *args): since when the code exit `block`, an expensive _propagate_nonreplaceable_vars() is invoked. The decorator reduces the amount of calling `with block` overally. """ + def wrapper(*args): # Make it compatible with class method. if isinstance(args[0], AbstractGraphPass): @@ -56,8 +59,10 @@ def wrapper(*args): "The function decorated with block_context_manager must have a Block " "type argument as the first input." 
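For reference, a hedged usage sketch of the decorator being reworked above: decorating a pass helper whose first argument is a ``Block`` runs the whole body under a single ``with block:`` context, which is how the graph passes in this diff use it (the helper body here is a placeholder):

```python
from coremltools.converters.mil.mil.passes.helper import block_context_manager

@block_context_manager
def _rewrite_block(block):               # first argument must be a Block
    for op in list(block.operations):
        ...                               # pattern-match and rewrite; already inside `with block:`
```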
) + with block: - return func(*args) + return _func(*args) + return wrapper diff --git a/coremltools/converters/mil/mil/passes/pass_pipeline.py b/coremltools/converters/mil/mil/passes/pass_pipeline.py index eaf8c8bd9..caa1f7d4e 100644 --- a/coremltools/converters/mil/mil/passes/pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/pass_pipeline.py @@ -83,6 +83,7 @@ "common::merge_consecutive_relus", "common::merge_consecutive_reshapes", "common::merge_consecutive_transposes", + "common::fuse_transpose_matmul", # "expand_high_rank_reshape_and_transpose" must come after "common::merge_consecutive_transposes" "common::expand_high_rank_reshape_and_transpose", "common::reduce_transposes", @@ -93,6 +94,8 @@ "common::remove_redundant_ops", "common::add_fp16_cast", # Will be removed if compute precision is not FP16. "common::add_int16_cast", # Will be removed if compute precision is not FP16. + "common::update_output_dtypes", # Must run again after `add_fp16_cast` and `add_int16_cast`. + "common::const_elimination", "common::dead_code_elimination", # always end with dce ] @@ -103,8 +106,12 @@ "common::dead_code_elimination", # must follow cast_optimization "common::const_elimination", "common::const_deduplication", # after all consts have been settled - "common::dead_code_elimination", # come before merge_tensorwise_affine_dequantize_with_consecutive_ops - "common::merge_tensorwise_affine_dequantize_with_consecutive_ops", # after const_deduplication and dead_code_elimination + "common::dead_code_elimination", # come before merge_affine_dequantize_with_consecutive_ops + "common::merge_affine_dequantize_with_consecutive_ops", # after const_deduplication and dead_code_elimination + "common::expand_dynamic_linear", # if weight or bias were not merged into constexpr, then expand linear to matmul + add + "common::fuse_transpose_matmul", # there might be left over transpose that got created in hoping to use linear, but now can be fused back with matmul + "common::dead_code_elimination", # fused transposes become orphans thus can be elimianted + "common::const_deduplication", # additional consts may be introduced during merging dequantize and expanding linear "common::loop_invariant_elimination", "common::noop_elimination", "common::dedup_op_and_var_names", @@ -250,6 +257,7 @@ class PassPipeline: ) """ + # TODO: rdar://121242189 ([Infra] Have a better way to handle predefined pass pipeline) _PIPELINE_NAME_TO_PASSES = { "default": _COMMON_PASSES + _CLEANUP_PASSES, "cleanup": _CLEANUP_PASSES, @@ -453,8 +461,23 @@ def apply_pipeline(prog: Program, pass_pipeline: PassPipeline): f"The graph pass options for {pass_name} is set to {pass_options}. " f"It will change the pass behavior. Make sure the option is intended." ) + if pass_name.startswith("experimental::"): + logger.warning( + f"The graph pass {pass_name} is under experimental development, " + f"and the API could be changed in the future." 
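Illustrative only (not part of this diff): the pipeline whose pass ordering is adjusted above is the same object users can customize and hand back to the converter; the removed pass name and the traced model below are placeholders based on the public docs:

```python
import coremltools as ct

pipeline = ct.PassPipeline.DEFAULT
pipeline.remove_passes({"common::fuse_conv_batchnorm"})      # drop a pass by name
mlmodel = ct.convert(traced_model, pass_pipeline=pipeline)   # traced_model: placeholder source model
```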
+ ) graph_pass = PASS_REGISTRY[pass_name] graph_pass.set_options(pass_options) - graph_pass(prog) - prog.validate() + + try: + graph_pass(prog) + except Exception as e: + logger.error( + f"\n\nERROR - '{pass_name}' graph pass produces the following error:\n" + ) + raise e # re-raise exception + + # After dead code elimination, we should check if the program misses any essential scope info + check_essential_scope = pass_name == "common::dead_code_elimination" + prog.validate(check_essential_scope=check_essential_scope) logger.debug(f"Program after {pass_pipeline} pipeline:\n{prog}") diff --git a/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py b/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py new file mode 100644 index 000000000..6269a95fb --- /dev/null +++ b/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py @@ -0,0 +1,2469 @@ +# Copyright (c) 2024, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy +import itertools +import unittest + +import numpy as np +import pytest +from mock import patch + +import coremltools as ct +from coremltools.converters.mil import mil +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil import Function, Symbol, get_new_symbol, types +from coremltools.converters.mil.mil.passes.defs.cleanup import topological_reorder +from coremltools.converters.mil.mil.passes.defs.cleanup.remove_redundant_ops import ( + remove_redundant_ops, +) +from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY +from coremltools.converters.mil.testing_utils import ( + apply_pass_and_basic_check, + assert_model_is_valid, + assert_op_count_match, + assert_same_output_names, + get_op_names_in_program, + get_op_types_in_program, +) + +from .test_passes import _VALIDATE_MODEL, CONSTEXPR_FUNCS, CONSTEXPR_OPS + + +class TestConstDeduplication: + def test_const_deduplication(self): + BATCH_DIM = 5 + SEQUENCE_LENGTH = 4 + ENCODING_DIM = 256 + EMBEDDING_DIM = 128 + weight = np.random.rand(EMBEDDING_DIM, ENCODING_DIM) + bias = np.random.rand(EMBEDDING_DIM) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + ] + ) + def prog(q, k): + q_e = mb.linear(x=q, weight=weight, bias=bias) + k_e = mb.linear(x=k, weight=weight, bias=bias) + attention = mb.matmul(x=q_e, y=k_e, transpose_y=True) + return attention + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=6, op="const") + assert_op_count_match(prog, expect=4, op="const") + + @pytest.mark.parametrize( + "constexpr_op", + CONSTEXPR_OPS, + ) + def test_constexpr_deduplication(self, constexpr_op): + BATCH_DIM = 5 + SEQUENCE_LENGTH = 4 + ENCODING_DIM = 256 + EMBEDDING_DIM = 128 + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + ] + ) + def prog(q, k): + weight_q = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM, ENCODING_DIM), seed=19) + weight_k = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM, ENCODING_DIM), seed=19) + bias_q = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM,), seed=29) + bias_k = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM,), seed=29) + q_e = mb.linear(x=q, 
weight=weight_q, bias=bias_q) + k_e = mb.linear(x=k, weight=weight_k, bias=bias_k) + attention = mb.matmul(x=q_e, y=k_e, transpose_y=True) + return attention + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=4, op=constexpr_op) + assert_op_count_match(prog, expect=2, op=constexpr_op) + + def test_const_deduplication_as_outputs(self): + """ + If the duplicated constants are block outputs, we should not remove them. + """ + # case 1: + # const_2 can be eliminated since it is not block output + const = np.random.rand(40, 20, 30) + + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=( + 40, + 20, + 30, + ) + ) + ] + ) + def prog(x): + const_1 = mb.const(val=const, name="const_1") + const_2 = mb.const(val=const, name="const_2") + x = mb.relu(x=x) + x = mb.add(x=x, y=const_2) + return x, const_1 + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=2, op="const") + assert_op_count_match(prog, expect=1, op="const") + assert prog.functions["main"].outputs[1].name == "const_1" + + # case 2: + # const_2 can not be eliminated since it is a block output + const = np.random.rand(40, 20, 30) + + @mb.program( + input_specs=[ + mb.TensorSpec( + shape=( + 40, + 20, + 30, + ) + ) + ] + ) + def prog(x): + const_1 = mb.const(val=const, name="const_1") + const_2 = mb.const(val=const, name="const_2") + x = mb.relu(x=x) + x = mb.add(x=x, y=const_2) + return x, const_1, const_2 + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=2, op="const") + assert_op_count_match(prog, expect=2, op="const") + assert prog.functions["main"].outputs[1].name == "const_1" + assert prog.functions["main"].outputs[2].name == "const_2" + + @pytest.mark.skip("rdar://109374995 consts are not shared across blocks") + def test_const_deduplication_multiple_blocks(self): + weight = np.random.rand(5, 3, 2, 2) + + @mb.program(input_specs=[mb.TensorSpec(shape=(4, 3, 8, 8))]) + def prog(x): + def _true_fn(): + return mb.conv(x=x, weight=weight, pad_type="valid") + + def _false_fn(): + y = mb.mul(x=x, y=2.0) + return mb.conv(x=y, weight=weight, pad_type="valid") + + x_gt_0_tensor = mb.greater(x=x, y=0.0) + x_gt_0 = mb.slice_by_index(x=x_gt_0_tensor, begin=(0, 0, 0, 0), end=(1, 1, 1, 1)) + return mb.cond(pred=x_gt_0, _true_fn=_true_fn, _false_fn=_false_fn) + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") + assert_op_count_match(prev_prog, expect=8, op="const") + assert_op_count_match(prog, expect=6, op="const") + + +class TestConstElimination: + def test_const_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + a = np.random.rand(2, 4).astype(np.float32) + double_a = mb.add(x=a, y=a) + return mb.add(x=x, y=double_a) + + assert_op_count_match(prog, expect=2, op="const") + prev_prog = copy.deepcopy(prog) + PASS_REGISTRY["common::const_elimination"](prog) + assert_same_output_names(prev_prog, prog) + assert_op_count_match(prog, expect=3, op="const") + + if _VALIDATE_MODEL: + assert_model_is_valid(prog, {"x": (2, 4)}) + + def test_const_elimination_nonreplaceable(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + a = np.random.rand(2, 4).astype(np.float16) + constexpr_a = mb.constexpr_cast(source_val=a, output_dtype="fp32") + double_a = mb.add(x=constexpr_a, y=a.astype(np.float32)) + return mb.add(x=x, y=double_a) + 
+ prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_elimination") + assert get_op_types_in_program(prev_prog) == ["constexpr_cast", "add", "add"] + # Not fold into const because the upstream constexpr_cast op is non-replaceable. + assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "add"] + + def test_force_const_eliminate_nonreplaceable_ops(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3,), dtype=types.int32)]) + def prog(x): + a = np.random.rand(2, 3, 5).astype(np.float16) + constexpr_a = mb.constexpr_cast(source_val=a, output_dtype="fp32") + double_a = mb.add(x=constexpr_a, y=a.astype(np.float32)) + a_shape = mb.shape(x=double_a) + return mb.add(x=x, y=a_shape) + + assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "shape", "add"] + + apply_pass_and_basic_check(prog, "common::const_elimination") + # still fold shape into const regardless the non-replaceable upstream + # constexpr_cast op, since it only provides a shape + assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "add"] + + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + # constexpr_cast(a) and add(a, a) no longer contributes to output, + # so they should get dead code eliminated + assert get_op_types_in_program(prog) == ["add"] + + def test_force_const_eliminate_nonreplaceable_ops_case_2(self): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1,), dtype=types.int32), + mb.TensorSpec(shape=(2,), dtype=types.int32), + ], + opset_version=ct.target.iOS17, + ) + def prog(x, y): + a = np.random.rand(2, 3, 5).astype(np.float16) + constexpr_a = mb.constexpr_cast(source_val=a, output_dtype="fp32") + + reshape_shape = mb.concat(values=[y, [5]], axis=0) + reshape = mb.reshape(x=constexpr_a, shape=reshape_shape) + a_shape = mb.shape(x=reshape) + a_shape_int16 = mb.cast(x=a_shape, dtype="int16") + + # Even though the gather ops has constexpr_cast op as upstream, + # it can still be removed by const elimination. + gather = mb.gather( + x=a_shape, + indices=[ + 2, + ], + axis=0, + ) + gather_int32 = mb.cast(x=gather, dtype="int32") + return mb.add(x=x, y=gather) + + assert get_op_types_in_program(prog) == [ + "constexpr_cast", + "concat", + "reshape", + "shape", + "cast", + "gather", + "cast", + "add", + ] + + apply_pass_and_basic_check(prog, "common::const_elimination") + # still const-folding gather into const regardless the non-replaceable upstream + # constexpr_cast op, since it only provides the meta data (shape) + assert get_op_types_in_program(prog) == [ + "constexpr_cast", + "concat", + "reshape", + "shape", + "cast", + "add", + ] + + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prog) == ["add"] + + @patch( + "coremltools.converters.mil.mil.passes.defs.cleanup.const_elimination._skip_const_by_size", + 1000, + ) + def test_const_elimination_larger_than_threshold(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + # Construct a 10 x 10 matrix (100 elements) which is smaller than the threshold (1000). + tmp = mb.range_1d(start=0, end=10, step=1) + tmp_x = mb.reshape(x=tmp, shape=[-1, 1]) + tmp_y = mb.reshape(x=tmp, shape=[1, -1]) + return mb.matmul(x=tmp_x, y=tmp_y) + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog_large_const_size(x): + # Construct a 100 x 100 matrix (10000 elements) which is larger than the threshold (1000). 
+ tmp = mb.range_1d(start=0, end=100, step=1) + tmp_x = mb.reshape(x=tmp, shape=[-1, 1]) + tmp_y = mb.reshape(x=tmp, shape=[1, -1]) + return mb.matmul(x=tmp_x, y=tmp_y) + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_elimination") + assert get_op_types_in_program(prev_prog) == [ + "range_1d", + "reshape", + "reshape", + "matmul", + ] + # All ops (range_1d, reshape, matmul) constructing that 10x10 matrix is folded into a const. + assert get_op_types_in_program(prog) == [] + + prev_prog_large_const_size, _, _ = apply_pass_and_basic_check( + prog_large_const_size, "common::const_elimination" + ) + assert get_op_types_in_program(prev_prog_large_const_size) == [ + "range_1d", + "reshape", + "reshape", + "matmul", + ] + # The matmul op constructing the large matrix is kept due to size larger than threshold. + assert get_op_types_in_program(prog_large_const_size) == ["matmul"] + + +class TestDeadCodeElimination: + def test_dead_code_elimination(self): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(2, 4)), + mb.TensorSpec(shape=(2, 4)), + ] + ) + def program0(x, y): + # following three unused op should be eliminated + a = mb.const(val=np.zeros(shape=(1,))) + b = mb.const(val=np.zeros(shape=(1,))) + _ = mb.add(x=a, y=b) + return mb.add(x=x, y=y) + + assert_op_count_match(program0, expect=4) + prev_prog = copy.deepcopy(program0) + PASS_REGISTRY["common::dead_code_elimination"](program0) + assert_same_output_names(prev_prog, program0) + assert_op_count_match(program0, expect=1) + + if _VALIDATE_MODEL: + assert_model_is_valid(program0, {"x": (2, 4), "y": (2, 4)}) + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def program1(x): + weights_val = np.random.rand(4, 2).T.astype(np.float32) + weights = mb.const(val=weights_val) + bias_val = np.random.rand(2).astype(np.float32) + bias = mb.const(val=bias_val) + + # unused op and its inputs should be eliminated + weights_for_matmul = mb.transpose(x=weights, perm=[1, 0]) + mb.matmul(x=x, y=weights_for_matmul) + + return mb.linear(x=x, weight=weights, bias=bias) + + assert_op_count_match(program1, expect=8) + prev_prog = copy.deepcopy(program1) + PASS_REGISTRY["common::dead_code_elimination"](program1) + assert_same_output_names(prev_prog, program1) + assert_op_count_match(program1, expect=3) + + if _VALIDATE_MODEL: + assert_model_is_valid(program1, {"x": (2, 4)}) + + +class TestDedupOpAndVarNames(unittest.TestCase): + def test_unchanged(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + x = mb.reshape(x=x, shape=(1, 8), name="reshape") + return x + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") + + self.assertEqual(get_op_types_in_program(prev_prog), ["reshape"]) + self.assertEqual(get_op_names_in_program(prev_prog), ["reshape"]) + + self.assertEqual(get_op_types_in_program(prog), ["reshape"]) + self.assertEqual(get_op_names_in_program(prog), ["reshape"]) + + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (1, 8)}, + ) + + def test_op_name_duplicated_once(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.cast(x=x, dtype="fp16", name="castop") + x = mb.cast(x=x, dtype="fp32", name="castop") + x = mb.square(x=x, name="square_last") + return x + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") + + self.assertEqual(get_op_types_in_program(prev_prog), ["cast", "cast", "square"]) + 
self.assertEqual(get_op_names_in_program(prev_prog), ["castop", "castop", "square_last"]) + + self.assertEqual(get_op_types_in_program(prog), ["cast", "cast", "square"]) + self.assertEqual(get_op_names_in_program(prog), ["castop", "castop_1", "square_last"]) + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={block.outputs[0].name: (10, 20)}, + ) + + def test_op_name_duplicated_many(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.cast(x=x, dtype="fp16", name="castop") + x = mb.cast(x=x, dtype="fp16", name="castop") + x = mb.cast(x=x, dtype="int32", name="castop_2") + x = mb.cast(x=x, dtype="fp16", name="castop") + x = mb.cast(x=x, dtype="fp32", name="castop_2") + x = mb.square(x=x, name="square") + return x + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") + + self.assertEqual( + get_op_types_in_program(prev_prog), ["cast", "cast", "cast", "cast", "cast", "square"] + ) + self.assertEqual( + get_op_names_in_program(prev_prog), + ["castop", "castop", "castop_2", "castop", "castop_2", "square"], + ) + + self.assertEqual( + get_op_types_in_program(prog), ["cast", "cast", "cast", "cast", "cast", "square"] + ) + self.assertEqual( + get_op_names_in_program(prog), + ["castop", "castop_1", "castop_2", "castop_3", "castop_2_1", "square"], + ) + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={block.outputs[0].name: (10, 20)}, + ) + + def test_input_name_shadow(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + # op name "x" results in output var name "x", which shadows prog + # input var name "x" + x = mb.transpose(x=x, perm=[1, 0], name="x") + x = mb.relu(x=x, name="relu") + return x + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") + self.assertEqual(get_op_types_in_program(prev_prog), ["transpose", "relu"]) + self.assertEqual(get_op_names_in_program(prev_prog), ["x", "relu"]) + + self.assertEqual(get_op_types_in_program(prog), ["transpose", "relu"]) + self.assertEqual(get_op_names_in_program(prog), ["x", "relu"]) + + op = prog["main"].find_ops(op_type="transpose")[0] + self.assertEqual("x_1", op.outputs[0].name) + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={block.outputs[0].name: (20, 10)}, + ) + + def test_nested_block(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) + def prog(x): + def true_fn(): + # returns var with name x shadows input 'x' + return mb.add(x=x, y=1.0, name="x") + + def false_fn(): + # two ops with name "x" + return mb.add(x=x, y=-1.0, name="x") + + pred = mb.equal(x=mb.squeeze(x=x), y=1.0) + return mb.cond(pred=pred, _true_fn=true_fn, _false_fn=false_fn) + + cond_op = prog.functions["main"].operations[-1] + assert cond_op.blocks[0].outputs[0].name == "x" + assert cond_op.blocks[1].outputs[0].name == "x" + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") + cond_op = prog.functions["main"].operations[-1] + assert cond_op.blocks[0].outputs[0].name == "x_1" + assert cond_op.blocks[1].outputs[0].name == "x_2" + + assert_model_is_valid( + prog, + {"x": (1,)}, + expected_output_shapes={block.outputs[0].name: (1,)}, + ) + + +class TestExpandDynamicLinear: + def test_keep_static_weight_static_bias(self): + X_SHAPE = (2, 5) + WEIGHT_SHAPE = (3, X_SHAPE[-1]) + + bias_shape = (WEIGHT_SHAPE[0],) + output_shape = (X_SHAPE[0], WEIGHT_SHAPE[0]) + + quantized_weight = np.random.randint(-127, 
128, WEIGHT_SHAPE, np.int8) + quantized_bias = np.random.randint(-127, 128, bias_shape, np.int8) + + @mb.program( + input_specs=[mb.TensorSpec(shape=X_SHAPE)], + opset_version=ct.target.iOS16, + ) + def prog(x): + weight = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + bias = mb.constexpr_affine_dequantize( + quantized_data=quantized_bias, + scale=4.5, + zero_point=np.int8(6), + axis=0, + ) + return mb.linear(x=x, weight=weight, bias=bias) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_dynamic_linear") + assert get_op_types_in_program(prev_prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "linear", + ] + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + assert_model_is_valid( + prog, + {"x": X_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS16, + ) + + def test_expand_static_weight_dynamic_bias(self): + X_SHAPE = (2, 5) + WEIGHT_SHAPE = (3, X_SHAPE[-1]) + + bias_shape = (WEIGHT_SHAPE[0],) + output_shape = (X_SHAPE[0], WEIGHT_SHAPE[0]) + + weight = np.random.rand(*WEIGHT_SHAPE) + quantized_bias = np.random.randint(-127, 128, bias_shape, np.int8) + + @mb.program( + input_specs=[mb.TensorSpec(shape=X_SHAPE)], + opset_version=ct.target.iOS16, + ) + def prog(x): + bias = mb.constexpr_affine_dequantize( + quantized_data=quantized_bias, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + screwed_bias = mb.exp(x=bias) + return mb.linear(x=x, weight=weight, bias=screwed_bias) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_dynamic_linear") + assert get_op_types_in_program(prev_prog) == [ + "constexpr_affine_dequantize", + "exp", + "linear", + ] + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "exp", + "linear", + "add", + ] + assert_model_is_valid( + prog, + {"x": X_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS16, + ) + + def test_expand_dynamic_weight_static_zero_bias(self): + X_SHAPE = (2, 5) + WEIGHT_SHAPE = (3, X_SHAPE[-1]) + + output_shape = (X_SHAPE[0], WEIGHT_SHAPE[0]) + + quantized_weight = np.random.randint(-127, 128, WEIGHT_SHAPE, np.int8) + + @mb.program( + input_specs=[mb.TensorSpec(shape=X_SHAPE)], + opset_version=ct.target.iOS16, + ) + def prog(x): + weight = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + screwed_weight = mb.exp(x=weight) + return mb.linear(x=x, weight=screwed_weight) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_dynamic_linear") + assert get_op_types_in_program(prev_prog) == [ + "constexpr_affine_dequantize", + "exp", + "linear", + ] + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "exp", + "matmul", + ] + assert_model_is_valid( + prog, + {"x": X_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS16, + ) + + def test_expand_dynamic_weight_static_compressed_zero_bias(self): + X_SHAPE = (2, 5) + WEIGHT_SHAPE = (3, X_SHAPE[-1]) + + bias_shape = (WEIGHT_SHAPE[0],) + output_shape = (X_SHAPE[0], WEIGHT_SHAPE[0]) + + quantized_weight = np.random.randint(-127, 128, WEIGHT_SHAPE, np.int8) + quantized_bias = np.random.randint(-127, 128, 
bias_shape, np.int8) + + @mb.program( + input_specs=[mb.TensorSpec(shape=X_SHAPE)], + opset_version=ct.target.iOS16, + ) + def prog(x): + weight = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + bias = mb.constexpr_affine_dequantize( + quantized_data=quantized_bias, + scale=np.random.rand(*bias_shape), + zero_point=quantized_bias, + axis=0, + ) + screwed_weight = mb.exp(x=weight) + return mb.linear(x=x, weight=screwed_weight, bias=bias) + + original_prog, _, _ = apply_pass_and_basic_check(prog, "common::expand_dynamic_linear") + expanded_prog, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(original_prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "exp", + "linear", + ] + assert get_op_types_in_program(expanded_prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "exp", + "matmul", + ] + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "exp", + "matmul", + ] + + assert_model_is_valid( + prog, + {"x": X_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS16, + ) + + def test_expand_dynamic_weight_static_nonzero_bias(self): + X_SHAPE = (2, 5) + WEIGHT_SHAPE = (3, X_SHAPE[-1]) + + bias_shape = (WEIGHT_SHAPE[0],) + output_shape = (X_SHAPE[0], WEIGHT_SHAPE[0]) + + quantized_weight = np.random.randint(-127, 128, WEIGHT_SHAPE, np.int8) + bias = np.random.rand(*bias_shape) + + @mb.program( + input_specs=[mb.TensorSpec(shape=X_SHAPE)], + opset_version=ct.target.iOS16, + ) + def prog(x): + weight = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + screwed_weight = mb.exp(x=weight) + return mb.linear(x=x, weight=screwed_weight, bias=bias) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_dynamic_linear") + assert get_op_types_in_program(prev_prog) == [ + "constexpr_affine_dequantize", + "exp", + "linear", + ] + assert get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "exp", + "matmul", + "add", + ] + assert_model_is_valid( + prog, + {"x": X_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS16, + ) + + def test_expand_dynamic_weight_dynamic_bias(self): + X_SHAPE = (2, 5) + WEIGHT_SHAPE = (3, X_SHAPE[-1]) + + bias_shape = (WEIGHT_SHAPE[0],) + output_shape = (X_SHAPE[0], WEIGHT_SHAPE[0]) + + quantized_weight = np.random.randint(-127, 128, WEIGHT_SHAPE, np.int8) + quantized_bias = np.random.randint(-127, 128, bias_shape, np.int8) + + @mb.program( + input_specs=[mb.TensorSpec(shape=X_SHAPE)], + opset_version=ct.target.iOS16, + ) + def prog(x): + weight = mb.constexpr_affine_dequantize( + quantized_data=quantized_weight, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + bias = mb.constexpr_affine_dequantize( + quantized_data=quantized_bias, + scale=1.2, + zero_point=np.int8(3), + axis=0, + ) + screwed_weight = mb.exp(x=weight) + screwed_bias = mb.exp(x=bias) + return mb.linear(x=x, weight=screwed_weight, bias=screwed_bias) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::expand_dynamic_linear") + assert get_op_types_in_program(prev_prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "exp", + "exp", + "linear", + ] + assert 
get_op_types_in_program(prog) == [ + "constexpr_affine_dequantize", + "constexpr_affine_dequantize", + "exp", + "exp", + "matmul", + "add", + ] + assert_model_is_valid( + prog, + {"x": X_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=("mlprogram", "fp16"), + minimum_deployment_target=ct.target.iOS16, + ) + + +class TestReduceMeanFusion: + def test_valid_pattern1(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) + x1 = mb.mul(x=1.0 / 30, y=x1) + return x1 + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") + assert get_op_types_in_program(prev_prog) == ["reduce_sum", "mul"] + assert get_op_types_in_program(prog) == ["reduce_mean"] + assert_model_is_valid( + prog, + {"x": (3, 5, 6)}, + expected_output_shapes={block.outputs[0].name: (3, 1, 1)}, + ) + + def test_valid_pattern2(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(4, 5))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[0], keep_dims=False) + x1 = mb.real_div(x=x1, y=4.0) + return x1 + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") + assert get_op_types_in_program(prev_prog) == ["reduce_sum", "real_div"] + assert get_op_types_in_program(prog) == ["reduce_mean"] + assert_model_is_valid( + prog, + {"x": (4, 5)}, + expected_output_shapes={block.outputs[0].name: (5,)}, + ) + + def test_invalid_pattern1(self): + """ + The mul does not correspond to "1/count" + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) + x1 = mb.mul(x=5.0, y=x1) + return x1 + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") + assert get_op_types_in_program(prog) == ["reduce_sum", "mul"] + + def test_invalid_pattern2(self): + """ + The div does not correspond to "count" + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) + x1 = mb.real_div(x=x1, y=31.0) + return x1 + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") + assert get_op_types_in_program(prog) == ["reduce_sum", "real_div"] + + def test_invalid_pattern3(self): + """ + One of the reduction dim is symbolic + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, get_new_symbol(), 6))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) + x1 = mb.real_div(x=x1, y=30.0) + return x1 + + pass_name = "common::fuse_reduce_mean" + PASS_REGISTRY[pass_name](prog) + assert get_op_types_in_program(prog) == ["reduce_sum", "real_div"] + + def test_invalid_pattern4(self): + """ + output of reduce_sum is model output + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) + y1 = mb.real_div(x=x1, y=30.0) + return y1, x1 + + pass_name = "common::fuse_reduce_mean" + PASS_REGISTRY[pass_name](prog) + assert get_op_types_in_program(prog) == ["reduce_sum", "real_div"] + + def test_invalid_pattern5(self): + """ + output of reduce_sum is feeding into another op + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) + def prog(x): + x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) + y1 = mb.real_div(x=x1, y=30.0) + y2 = mb.mul(x=x1, y=10.0) + y3 = mb.add(x=y1, y=y2) + return y3 + + pass_name = "common::fuse_reduce_mean" + 
PASS_REGISTRY[pass_name](prog) + assert get_op_types_in_program(prog) == ["reduce_sum", "real_div", "mul", "add"] + + +class TestLoopInvariantElimination: + def test_loop_invariant_elimination1(self): + """ + Invariant pattern: Block input vars are returned as block output vars. + """ + + def body(a, b): + return mb.add(x=a, y=b), b + + def cond(a, b): + a_mean = mb.reduce_mean(x=a, axes=[0, 1]) + b_mean = mb.reduce_mean(x=b, axes=[0, 1]) + return mb.less(x=a_mean, y=b_mean) + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 2)), + mb.TensorSpec(shape=(1, 2)), + ] + ) + def prog(a, b): + # b is loop invariant + return mb.while_loop(_cond=cond, _body=body, loop_vars=(a, b)) + + while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] + assert len(while_op.blocks[0].inputs) == 2 + assert len(while_op.outputs) == 2 + assert len(while_op.loop_vars) == 2 + assert while_op.blocks[0].inputs[0].name == "a_x0" + assert while_op.blocks[0].inputs[1].name == "b_x0" + + prev_prog = copy.deepcopy(prog) + PASS_REGISTRY["common::loop_invariant_elimination"](prog) + assert_same_output_names(prev_prog, prog) + + while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] + assert len(while_op.blocks[0].inputs) == 1 + assert len(while_op.outputs) == 1 + assert len(while_op.loop_vars) == 1 + assert while_op.blocks[0].inputs[0].name == "a_x0" + + if _VALIDATE_MODEL: + assert_model_is_valid(prog, {"a": (1, 2), "b": (1, 2)}) + + def test_loop_invariant_elimination2(self): + """ + Invariant pattern: Block outputs var from outside of the block + """ + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1, 2)), + mb.TensorSpec(shape=(1, 2)), + ] + ) + def prog(a, b): + def body(a, bx): + return mb.add(x=a, y=b), b + + def cond(a, bx): + a_mean = mb.reduce_mean(x=a, axes=[0, 1]) + b_mean = mb.reduce_mean(x=bx, axes=[0, 1]) + return mb.less(x=a_mean, y=b_mean) + + # b is loop invariant + return mb.while_loop(_cond=cond, _body=body, loop_vars=(a, b)) + + while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] + assert len(while_op.blocks[0].inputs) == 2 + assert len(while_op.outputs) == 2 + assert len(while_op.loop_vars) == 2 + assert while_op.blocks[0].inputs[0].name == "a_x0" + assert while_op.blocks[0].inputs[1].name == "b_x0" + + prev_prog = copy.deepcopy(prog) + PASS_REGISTRY["common::loop_invariant_elimination"](prog) + assert_same_output_names(prev_prog, prog) + + while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] + assert len(while_op.blocks[0].inputs) == 1 + assert len(while_op.outputs) == 1 + assert len(while_op.loop_vars) == 1 + assert while_op.blocks[0].inputs[0].name == "a_x0" + + if _VALIDATE_MODEL: + assert_model_is_valid(prog, {"a": (1, 2), "b": (1, 2)}) + + +class TestNoopElimination: + @pytest.mark.parametrize("is_block_output", ((True, False))) + def test_identity(self, is_block_output): + """ + Input graph: + + input -> identity -> (add 1.0 if not is_block_output) -> output + + Output graph: + + if is_block_output: + input -> identity -> output + else: + input -> add 1.0 -> output + """ + SHAPE = (2, 3) + + @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) + def prog(x): + y = mb.identity(x=x) + if not is_block_output: + y = mb.add(x=y, y=1.0) + return y + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + if is_block_output: + assert get_op_types_in_program(prev_prog) == ["identity"] + assert get_op_types_in_program(prog) == ["identity"] + else: + assert get_op_types_in_program(prev_prog) == 
["identity", "add"] + assert get_op_types_in_program(prog) == ["add"] + + output_name = block.outputs[0].name + assert_model_is_valid( + prog, + {"x": SHAPE}, + expected_output_shapes={output_name: SHAPE}, + ) + + @pytest.mark.parametrize( + "op_type, pos, val", + itertools.product( + ["add", "mul", "floor_div", "pow", "real_div", "sub"], + ["x", "y"], + [0.0, 1.0, [0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]], + ), + ) + def test_elementwise_elimination(self, op_type, pos, val): + if "div" in op_type and np.prod(val) == 0: + return + if "pow" in op_type and (val != 0 or val != 1): + return + + test_op = getattr(mb, op_type) + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + if pos == "x": + r1 = test_op(x=val, y=x) + else: + r1 = test_op(x=x, y=val) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + original_program = [op_type, "relu"] + new_program = original_program + if op_type in {"add"}: + if val == 0.0 or val == [0.0, 0.0, 0.0, 0.0]: + new_program = ["relu"] + elif op_type in {"mul"}: + if val == 1.0 or val == [1.0, 1.0, 1.0, 1.0]: + new_program = ["relu"] + elif op_type in {"real_div"}: + if pos == "y" and (val == 1.0 or val == [1.0, 1.0, 1.0, 1.0]): + new_program = ["relu"] + elif op_type in {"pow", "floor_div"}: + if pos == "y" and (val == 1.0 or val == [1.0, 1.0, 1.0, 1.0]): + new_program = ["relu"] + elif op_type in {"sub"}: + if pos == "y" and (val == 0.0 or val == [0.0, 0.0, 0.0, 0.0]): + new_program = ["relu"] + + assert get_op_types_in_program(prev_prog) == original_program + assert get_op_types_in_program(prog) == new_program + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_elementwise_broadcast(self): + @mb.program(input_specs=[mb.TensorSpec(shape=[4])]) + def prog(x): + r1 = mb.add(x=x, y=[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + original_program = ["add", "relu"] + + assert get_op_types_in_program(prev_prog) == original_program + assert get_op_types_in_program(prog) == original_program + assert_model_is_valid( + prog, + {"x": [4]}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_elementwise_elimination_fill(self): + """ + When fill layer with dynamic shape is fed to elementwise-binary operation, + even though the tensor can't be materialized at conversion time but no-op + elimination can still be performed based on fill-value + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, get_new_symbol()))]) + def prog(x): + shape = mb.shape(x=x) + y = mb.fill(value=0.0, shape=shape) + x = mb.add(x=x, y=y) + return mb.relu(x=x) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["shape", "fill", "add", "relu"] + assert get_op_types_in_program(prog) == ["shape", "fill", "relu"] + + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == ["relu"] + + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_reshape_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.reshape(x=x, shape=[1, 8]) + mb.reshape(x=r1, shape=[1, 8]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = 
apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["reshape", "reshape", "relu"] + assert get_op_types_in_program(prog) == ["reshape", "relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (1, 8)}, + ) + + def test_oneway_split_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.split(x=x, num_splits=1, axis=-1) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["split", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_full_split_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.split(x=x, split_sizes=[4], axis=-1) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["split", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_slicebysize_full_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.slice_by_size(x=x, begin=[0, 0], size=[2, 4]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["slice_by_size", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_slicebysize_to_end_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.slice_by_size(x=x, begin=[0, 0], size=[-1, -1]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["slice_by_size", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_slicebyindex_full_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.slice_by_index(x=x, begin=[0, 0], end=[2, 4]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["slice_by_index", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_slicebyindex_negative_stride(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.slice_by_index( + x=x, + begin=[0, 0], + end=[0, 0], + stride=[1, -1], + begin_mask=[True, True], + end_mask=[True, True], + ) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["slice_by_index", "relu"] + assert get_op_types_in_program(prog) == ["slice_by_index", "relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 
4)}, + ) + + @pytest.mark.parametrize( + "begin_mask, end_mask", + itertools.product( + itertools.product([True, False], [True, False]), + itertools.product([True, False], [True, False]), + ), + ) + def test_slicebyindex_mask_elimination(self, begin_mask, end_mask): + @mb.program(input_specs=[mb.TensorSpec(shape=(4, 4))]) + def prog(x): + begin = [1, 1] + end = [1, 1] + for i in range(2): + if not begin_mask[i]: + begin[i] = 0 + if not end_mask[i]: + end[i] = 4 + r1 = mb.slice_by_index( + x=x, begin=begin, end=end, begin_mask=begin_mask, end_mask=end_mask + ) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["slice_by_index", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (4, 4)}, + expected_output_shapes={block.outputs[0].name: (4, 4)}, + ) + + def test_pad_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.pad(x=x, pad=[0, 0, 0, 0]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["pad", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_keep_pad(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.pad(x=x, pad=[4, 4, 2, 2]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["pad", "relu"] + assert get_op_types_in_program(prog) == ["pad", "relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (10, 8)}, + ) + + def test_tile_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.tile(x=x, reps=[1, 1]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["tile", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_keep_tile(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.tile(x=x, reps=[2, 2]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["tile", "relu"] + assert get_op_types_in_program(prog) == ["tile", "relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (4, 8)}, + ) + + def test_upsample_nearest_neighbor_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) + def prog(x): + r1 = mb.upsample_nearest_neighbor(x=x) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["upsample_nearest_neighbor", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (3, 2, 4)}, + expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, + ) + + def test_upsample_bilinear_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) + def prog(x): + 
r1 = mb.upsample_bilinear(x=x) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["upsample_bilinear", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (3, 2, 4)}, + expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, + ) + + def test_resize_bilinear_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) + def prog(x): + r1 = mb.resize_bilinear(x=x, target_size_height=2, target_size_width=4) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["resize_bilinear", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (3, 2, 4)}, + expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, + ) + + def test_crop_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) + def prog(x): + r1 = mb.crop(x=x, crop_height=[0, 0], crop_width=[0, 0]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["crop", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (3, 2, 4)}, + expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, + ) + + def test_linear_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + r1 = mb.linear_activation(x=x, alpha=1.0, beta=0.0) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["linear_activation", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 4)}, + ) + + def test_transpose_elimination(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 4))]) + def prog(x): + r1 = mb.transpose(x=x, perm=[0, 1, 2]) + return mb.relu(x=r1) + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") + assert get_op_types_in_program(prev_prog) == ["transpose", "relu"] + assert get_op_types_in_program(prog) == ["relu"] + assert_model_is_valid( + prog, + {"x": (2, 3, 4)}, + expected_output_shapes={block.outputs[0].name: (2, 3, 4)}, + ) + + +class TestRemoveRedundantOps: + def test_redundant_ops_just_after_input_valid_pattern_1(self): + """ + Input graph: + input----->transpose(perm=[0, 2, 1])--->add---> add ---> out + | ^ ^ + | | | + |---->transpose(perm=[0, 2, 1])---- | + | | + | | + |---->transpose(perm=[0, 2, 1])------------ + + Output graph: + input----->transpose(perm=[0, 2, 1])--->add---> add ----> out + | ^ ^ + | | | + |------------- | + | | + |-------------------- + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.transpose(x=x, perm=[0, 2, 1]) + x2 = mb.transpose(x=x, perm=[0, 2, 1]) + x3 = mb.transpose(x=x, perm=[0, 2, 1]) + z = mb.add(x=x1, y=x2) + z = mb.add(x=z, y=x3) + return z + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "transpose", + "transpose", + "transpose", + "add", + "add", + ] + assert get_op_types_in_program(prog) == ["transpose", "add", "add"] + assert_model_is_valid( + prog, + {"x": (2, 3, 
5)}, + expected_output_shapes={block.outputs[0].name: (2, 5, 3)}, + ) + + def test_redundant_ops_just_after_input_valid_pattern_2(self): + """ + Input graph: + input----->leaky_relu(alpha=0.3)--->add---> add ---> out + | ^ ^ + | | | + |----->leaky_relu(alpha=0.3)--- | + | | + | | + |---->leaky_relu(alpha=0.3)------------ + + Output graph: + input--------->leaky_relu(alpha=0.3)--->add---> add ----> out + | ^ ^ + | | | + |------------- | + | | + |--------------------- + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.leaky_relu(x=x, alpha=0.3) + x2 = mb.leaky_relu(x=x, alpha=0.3) + x3 = mb.leaky_relu(x=x, alpha=0.3) + z = mb.add(x=x1, y=x2) + z = mb.add(x=z, y=x3) + return z + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "leaky_relu", + "leaky_relu", + "leaky_relu", + "add", + "add", + ] + assert get_op_types_in_program(prog) == ["leaky_relu", "add", "add"] + assert_model_is_valid( + prog, + {"x": (2, 3, 5)}, + expected_output_shapes={block.outputs[0].name: (2, 3, 5)}, + ) + + def test_redundant_ops_just_after_input_valid_pattern_3(self): + """ + Input graph: + input----->leaky_relu(alpha=0.4)--->add---> add ---> out + | ^ ^ + | | | + |----->leaky_relu(alpha=0.3)--- | + | | + | | + |---->leaky_relu(alpha=0.3)------------ + + Output graph: + input----->leaky_relu(alpha=0.4)--->add---> add ---> out + | ^ ^ + | | | + |----->leaky_relu(alpha=0.3)---------- + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.leaky_relu(x=x, alpha=0.4) + x2 = mb.leaky_relu(x=x, alpha=0.3) + x3 = mb.leaky_relu(x=x, alpha=0.3) + z = mb.add(x=x1, y=x2) + z = mb.add(x=z, y=x3) + return z + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "leaky_relu", + "leaky_relu", + "leaky_relu", + "add", + "add", + ] + assert get_op_types_in_program(prog) == ["leaky_relu", "leaky_relu", "add", "add"] + + leaky_relu_ops = block.find_ops(op_type="leaky_relu") + assert leaky_relu_ops[0].alpha.val == np.float32(0.4) + assert leaky_relu_ops[1].alpha.val == np.float32(0.3) + + def test_redundant_ops_just_after_input_invalid_pattern_1(self): + """ + input----->transpose(perm=[0, 2, 1])---> reshape(shape=[-1]) -----> add ---> out + | ^ + | | + |---->transpose(perm=[1, 0, 2])----> reshape(shape=[-1])------ + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.transpose(x=x, perm=[0, 2, 1]) + x2 = mb.transpose(x=x, perm=[1, 0, 2]) + x1 = mb.reshape(x=x1, shape=[-1]) + x2 = mb.reshape(x=x2, shape=[-1]) + z = mb.add(x=x1, y=x2) + return z + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "transpose", + "transpose", + "reshape", + "reshape", + "add", + ] + assert get_op_types_in_program(prog) == [ + "transpose", + "transpose", + "reshape", + "reshape", + "add", + ] + assert_model_is_valid( + prog, + {"x": (2, 3, 5)}, + expected_output_shapes={block.outputs[0].name: (30,)}, + ) + + def test_redundant_ops_just_after_input_invalid_pattern_2(self): + """ + input----->leaky_relu(alpha=0.3) -----> add ---> out + | ^ + | | + |---->leaky_relu(alpha=0.4)------- + + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.leaky_relu(x=x, alpha=0.3) + x2 = mb.leaky_relu(x=x, alpha=0.4) + z = mb.add(x=x1, y=x2) + return z + + 
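# The two leaky_relu ops use different alpha values (0.3 vs 0.4), so they are not redundant and the pass must keep both of them. +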
prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == ["leaky_relu", "leaky_relu", "add"] + assert get_op_types_in_program(prog) == ["leaky_relu", "leaky_relu", "add"] + assert_model_is_valid( + prog, + {"x": (2, 3, 5)}, + expected_output_shapes={block.outputs[0].name: (2, 3, 5)}, + ) + + def test_redundant_ops_just_after_input_invalid_pattern_3(self): + """ + test case, when inputs of 1 op is a subset of the inputs of the other op + + input----->layer_norm1 -----> add ---> out + | ^ + | | + |---->layer_norm2------- + + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 3, 2))]) + def prog(x): + x1 = mb.layer_norm(x=x, axes=[2], epsilon=1e-4) + gamma_val = np.array([1.0, 1.0], dtype=np.float32) + beta_val = np.array([1.0, 0.0], dtype=np.float32) + x2 = mb.layer_norm(x=x, axes=[2], epsilon=1e-4, gamma=gamma_val, beta=beta_val) + z = mb.add(x=x1, y=x2) + return z + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == ["layer_norm", "layer_norm", "add"] + assert get_op_types_in_program(prog) == ["layer_norm", "layer_norm", "add"] + assert_model_is_valid( + prog, + {"x": (1, 3, 2)}, + expected_output_shapes={block.outputs[0].name: (1, 3, 2)}, + ) + + @staticmethod + def _make_repeated_conv_prog(redundant_conv=True, out_channel=2): + prog = mil.Program() + func_inputs = {"x": mb.placeholder(shape=[1, 4, 5, 5])} + with Function(func_inputs) as ssa_fun: + x = ssa_fun.inputs["x"] + x = mb.relu(x=x) + W = np.random.rand(out_channel, 4, 3, 3) + if redundant_conv: + bias = np.random.rand(out_channel) + x1 = mb.conv(x=x, weight=W, bias=bias, pad_type="same", strides=[1, 1]) + x2 = mb.conv(x=x, weight=W, bias=bias, pad_type="same", strides=[1, 1]) + else: + x1 = mb.conv( + x=x, weight=W, bias=np.random.rand(out_channel), pad_type="same", strides=[1, 1] + ) + x2 = mb.conv( + x=x, weight=W, bias=np.random.rand(out_channel), pad_type="same", strides=[1, 1] + ) + x1 = mb.relu(x=x1) + x2 = mb.relu(x=x2) + x1 = mb.avg_pool(x=x1, kernel_sizes=[2, 2], strides=[1, 1], pad_type="same") + z = mb.concat(values=(x1, x2), axis=-3) + ssa_fun.set_outputs([z]) + prog.add_function("main", ssa_fun) + return prog + + def test_redundant_ops_inside_graph_valid_pattern(self): + """ + Input graph: + input--> relu--------->conv------>relu----> pool ---> concat ---> out + | ^ + | | + |---->conv---->relu---------------------------- + + Output graph: + input-> relu--->conv------>relu----> pool ---> concat ---> out + | ^ + | | + |------------------- + """ + prog = self._make_repeated_conv_prog(redundant_conv=True) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "relu", + "conv", + "conv", + "relu", + "relu", + "avg_pool", + "concat", + ] + assert get_op_types_in_program(prog) == ["relu", "conv", "relu", "avg_pool", "concat"] + assert_model_is_valid( + prog, + {"x": (1, 4, 5, 5)}, + expected_output_shapes={block.outputs[0].name: (1, 4, 5, 5)}, + ) + + def test_redundant_ops_inside_graph_with_large_const(self): + """ + For the large constants, they need to be deduplicated by the const_deduplication first. + This test is making sure the converter is not doing any "brutal force" comparison. 
+ + Input graph: + input--> relu--------->conv------>relu----> pool ---> concat ---> out + | ^ + | | + |---->conv---->relu---------------------------- + + Output graph: + input-> relu--->conv------>relu----> pool ---> concat ---> out + | ^ + | | + |------------------- + """ + # The remove_redundant_ops is not doing brutal force array comparison + prog = self._make_repeated_conv_prog(redundant_conv=True, out_channel=10) + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + ops_in_prev_prog = [ + "relu", + "conv", + "conv", + "relu", + "relu", + "avg_pool", + "concat", + ] + assert get_op_types_in_program(prev_prog) == ops_in_prev_prog + assert get_op_types_in_program(prog) == ops_in_prev_prog + + # We need to first run the const_deduplication pass. + prog = self._make_repeated_conv_prog(redundant_conv=True, out_channel=10) + _, _, block = apply_pass_and_basic_check(prog, "common::const_deduplication") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + _, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + + assert get_op_types_in_program(prog) == ["relu", "conv", "relu", "avg_pool", "concat"] + assert_model_is_valid( + prog, + {"x": (1, 4, 5, 5)}, + expected_output_shapes={block.outputs[0].name: (1, 20, 5, 5)}, + ) + + def test_redundant_ops_inside_graph_invalid_pattern(self): + """ + input--->relu--------->conv1------>relu----> pool ---> concat ---> out + | ^ + | | + |---->conv2---->relu--------------------------- + """ + prog = self._make_repeated_conv_prog(redundant_conv=False) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == [ + "relu", + "conv", + "conv", + "relu", + "relu", + "avg_pool", + "concat", + ] + assert get_op_types_in_program(prog) == [ + "relu", + "conv", + "conv", + "relu", + "relu", + "avg_pool", + "concat", + ] + assert_model_is_valid( + prog, + {"x": (1, 4, 5, 5)}, + expected_output_shapes={block.outputs[0].name: (1, 4, 5, 5)}, + ) + + def test_redundant_op_as_output_valid_pattern_1(self): + """ + Input graph: + input--------->relu------> out1 + | + | + |---->relu---->tanh---> out2 + + Output graph: + input--------->relu------> out1 + | + | + |---->tanh---> out2 + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.relu(x=x) + x2 = mb.relu(x=x) + return x1, mb.tanh(x=x2) + + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") + assert get_op_types_in_program(prev_prog) == ["relu", "relu", "tanh"] + assert get_op_types_in_program(prog) == ["relu", "tanh"] + assert_model_is_valid( + prog, + {"x": (2, 3, 5)}, + expected_output_shapes={ + block.outputs[0].name: (2, 3, 5), + block.outputs[1].name: (2, 3, 5), + }, + ) + + def test_redundant_op_as_output_invalid_pattern_1(self): + """ + Input graph: + input--------->relu------> out1 + | + | + |---->relu---> out2 + + "common::remove_redundant_ops" pass does not remove ops if their outputs + are block outputs. 
+ """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x1 = mb.relu(x=x) + x2 = mb.relu(x=x) + return x1, x2 + + prev_prog, _, block = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == ["relu", "relu"] + assert get_op_types_in_program(prog) == ["relu", "relu"] + assert_model_is_valid( + prog, + {"x": (2, 3, 5)}, + expected_output_shapes={ + block.outputs[0].name: (2, 3, 5), + block.outputs[1].name: (2, 3, 5), + }, + ) + + def test_cond_block_program(self): + """ + - Test identical ops within different blocks are not removed. The "relu" op inside true and + false blocks are not removed since they are in different blocks. + - Test ops that have blocks inside them are not removed. There are two cond ops here, + with identical inputs but they are not removed, since they are ops that have nested block + inside them. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) + def prog(x): + x1 = mb.cast(x=x, dtype="bool") + + def true_fn(): + x = mb.shape(x=x1) + x = mb.cast(x=x, dtype="fp32") + return mb.add(x=x, y=1.0) + + def false_fn(): + x = mb.shape(x=x1) + x = mb.cast(x=x, dtype="fp32") + return mb.add(x=x, y=-1.0) + + z1 = mb.cond(pred=x1, _true_fn=true_fn, _false_fn=false_fn) + z2 = mb.cond(pred=x1, _true_fn=true_fn, _false_fn=false_fn) + z = mb.add(x=z1, y=z2) + return z + + prev_prog, _, block = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == ["cast", "cond", "cond", "add"] + assert get_op_types_in_program(prog) == ["cast", "cond", "cond", "add"] + cond_op = prog.find_ops(op_type="cond")[0] + assert cond_op.blocks[0].operations[0].op_type == "shape" + assert cond_op.blocks[1].operations[0].op_type == "shape" + assert_model_is_valid( + prog, + {"x": (1,)}, + expected_output_shapes={block.outputs[0].name: (1,)}, + ) + + def test_concat_op_pattern(self): + """ + Input graph: + ---------------> concat ------> log ------> out1 + | ^ + | | + input--------->relu------> concat ------> relu----> out2 + | ^ | + | | | + |---->tanh-------------------- + + Output graph: + |------>log ------> out1 + | + | + input--------->relu------> concat ------> relu----> out2 + | ^ + | | + |---->tanh--------- + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 5))]) + def prog(x): + x1 = mb.relu(x=x) + x2 = mb.tanh(x=x) + c1 = mb.concat(values=(x1, x2), axis=0) + c2 = mb.concat(values=(x1, x2), axis=0) + z1 = mb.log(x=c1) + z2 = mb.relu(x=c2) + return z1, z2 + + prev_prog, _, block = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == [ + "relu", + "tanh", + "concat", + "concat", + "log", + "relu", + ] + assert get_op_types_in_program(prog) == ["relu", "tanh", "concat", "log", "relu"] + assert_model_is_valid( + prog, + {"x": (10, 5)}, + expected_output_shapes={block.outputs[0].name: (20, 5), block.outputs[1].name: (20, 5)}, + ) + + def test_multiple_redundant_child_ops_pattern(self): + """ + Input graph + + input -------------> reshape ----------> add ---------> out1 + | ^ + | | + |-------> reshape --------------- + | + |------> slice_by_size-----> add ----------> out2 + | ^ + | | + |------> slice_by_size ------- + + Output graph + + input -------------> reshape ----------> add ------------> out1 + | | ^ + | | | + | |--------- + | + |------> slice_by_size----------> add -----------------> out2 + | ^ + | | + |--------------------- + + """ + + 
@mb.program(input_specs=[mb.TensorSpec(shape=(10, 5, 4))]) + def prog(x): + x1 = mb.reshape(x=x, shape=[5, 2, -1]) + x2 = mb.reshape(x=x, shape=[5, 2, -1]) + x3 = mb.slice_by_size(x=x, begin=[0, 0, 1], size=[2, 4, 3]) + x4 = mb.slice_by_size(x=x, begin=[0, 0, 1], size=[2, 4, 3]) + z1 = mb.add(x=x1, y=x2) + z2 = mb.add(x=x3, y=x4) + return z1, z2 + + prev_prog, _, block = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == [ + "reshape", + "reshape", + "slice_by_size", + "slice_by_size", + "add", + "add", + ] + assert get_op_types_in_program(prog) == ["reshape", "slice_by_size", "add", "add"] + assert_model_is_valid( + prog, + {"x": (10, 5, 4)}, + expected_output_shapes={ + block.outputs[0].name: (5, 2, 20), + block.outputs[1].name: (2, 4, 3), + }, + ) + + def test_random_distribution_op_invalid_pattern(self): + """ + Identical random ops are not removed + + input----->cast---->random_uniform------> add ---> out + | ^ + | | + |---->random_uniform------------ + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3,))]) + def prog(shape): + shape = mb.cast(x=shape, dtype="int32") + x1 = mb.random_uniform(shape=shape, low=0.0, high=1.0, seed=11) + x2 = mb.random_uniform(shape=shape, low=0.0, high=1.0, seed=11) + return mb.add(x=x1, y=x2) + + prev_prog, _, block = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == [ + "cast", + "random_uniform", + "random_uniform", + "add", + ] + assert get_op_types_in_program(prog) == ["cast", "random_uniform", "random_uniform", "add"] + + def test_nonreplaceable_vars(self): + """ + Nonreplaceable vars shouldn't be removed, e.g. palettized weights + + const_1----->add---->add_1------| + | | + input add---->output + | | + const_2----->add---->add_2------| + """ + + def _constexpr_lut_to_dense(): + lut_data = np.array( + [-19.0, 4.0, 0.0, -1.0, 1.0, 3.0, 5.0, -8.0, 19, 13, 42, 4.5, 5.4, 2.0, -6, -7] + ).astype(np.float32) + indices = np.array([212, 21]).astype(np.uint8) + shape = np.array([4, 1]).astype(np.uint32) + return mb.constexpr_lut_to_dense(lut=lut_data, indices=indices, shape=shape) + + @mb.program(input_specs=[mb.TensorSpec(shape=(4, 1))]) + def prog(x): + constexpr_1 = _constexpr_lut_to_dense() + constexpr_2 = _constexpr_lut_to_dense() + c = mb.add(x=constexpr_1, y=x) + d = mb.add(x=constexpr_2, y=x) + return mb.add(x=c, y=d) + + prev_prog, _, _ = apply_pass_and_basic_check( + prog, + "common::remove_redundant_ops", + ) + assert get_op_types_in_program(prev_prog) == get_op_types_in_program(prog) + + def test_redundant_ops_time_complexity(self): + """ + Test the graph pass doesn't re-run right away after detecting a redundant pattern, + in order to keep time complexity low. + + In this example, a program with 26 ops is first traversed, and 5 relu ops are removed. + At the time of second traversal, there are only 21 remaining ops. + As the result, the total ops of visited is 26 + 21 = 47. 
+ """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x = mb.cos(x=x) + for i in range(5): + x1 = mb.relu(x=x) + x2 = mb.relu(x=x) + z = mb.add(x=x1, y=x2) + z = mb.add(x=z, y=x2) + x = mb.sin(x=x) + return x + + graph_pass = remove_redundant_ops() + graph_pass.apply(prog) + + assert get_op_types_in_program(prog) == ["cos"] + ["relu", "add", "add", "sin"] * 5 + assert graph_pass._num_of_visited_ops == 47 + + def test_redundant_ops_time_complexity_pattern_2(self): + """ + Test the graph pass doesn't re-run right away after detecting a redundant pattern, + in order to keep time complexity low. + + In this example, there are three groups of identical leaky_relu ops can be removed, + and the algorithm should be run in the fashion that only goes through the + program twice. As the result, the total ops visited is: + + 8 + (8 - 3) = 13 + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) + def prog(x): + x = mb.cos(x=x) + x1 = mb.leaky_relu(x=x, alpha=0.2) + x2 = mb.leaky_relu(x=x, alpha=0.2) + x3 = mb.leaky_relu(x=x, alpha=0.3) + x4 = mb.leaky_relu(x=x, alpha=0.3) + x5 = mb.leaky_relu(x=x, alpha=0.4) + x6 = mb.leaky_relu(x=x, alpha=0.4) + return mb.sin(x=x6) + + graph_pass = remove_redundant_ops() + graph_pass.apply(prog) + + assert get_op_types_in_program(prog) == ["cos"] + ["leaky_relu"] * 3 + ["sin"] + assert graph_pass._num_of_visited_ops == 13 + + +class TestRemoveSymbolicReshape: + def test_remove_symbolic_reshape(self): + sym_b = Symbol("s0") + original_shape = (sym_b, Symbol("s1"), 2) + reshape_name = "reshape" + + @mb.program(input_specs=[mb.TensorSpec(shape=(sym_b, 4))]) + def prog(x): + # const cannot represent symbolic values. Use _const_symbolic + shape = mb._const_symbolic(val=original_shape) + return mb.reshape(x=x, shape=shape, name=reshape_name) + + reshape_op = prog.find_ops(prefix=reshape_name, op_type="reshape", exactly_one=True)[0] + shape_var = reshape_op.shape + reshaped_var = reshape_op.outputs[0] + assert np.all(shape_var.sym_val == original_shape) + assert np.all(reshaped_var.shape == (sym_b, 2, 2)) + + # Note: we cannot deepcopy prog with symbol. 
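+ # Instead of deep-copying and calling assert_same_output_names, capture the output names and compare them after the pass.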
+ prev_outputs = [o.name for o in prog["main"].outputs] + PASS_REGISTRY["common::remove_symbolic_reshape"](prog) + curr_outputs = [o.name for o in prog["main"].outputs] + assert curr_outputs == prev_outputs + + reshape_op = prog.find_ops(prefix=reshape_name, op_type="reshape", exactly_one=True)[0] + shape_var = reshape_op.shape + reshaped_var = reshape_op.outputs[0] + # shape param cannot be symbolic after the pass + assert np.all(shape_var.sym_val == (-1, 2, 2)) + # output shape is still symbolic + assert np.all(reshaped_var.shape == (sym_b, 2, 2)) + + if _VALIDATE_MODEL: + assert_model_is_valid(prog, {"x": (3, 4)}) + + +class TestTopologicalReorder: + def test_move_sink_casts_to_the_end(self): + """ + Input graph: + x (input) ---> square ---> cast (output) + | + | -----------> log ------> cast (output) + | + | -----------> relu -----> cast ----> relu (output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.cast(x=x, dtype="fp16") + x1 = mb.square(x=x) + x2 = mb.cast(x=x1, dtype="fp32") + x3 = mb.log(x=x) + x4 = mb.cast(x=x3, dtype="fp32") + x5 = mb.relu(x=x) + x6 = mb.cast(x=x5, dtype="fp32") + x7 = mb.relu(x=x6) + return x2, x4, x7 + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "cast", + "log", + "cast", + "relu", + "cast", + "relu", + ] + + apply_pass_and_basic_check(prog, "common::topological_reorder") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "log", + "relu", + "cast", + "relu", + "cast", + "cast", + ] + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={ + block.outputs[0].name: (10, 20), + block.outputs[1].name: (10, 20), + block.outputs[2].name: (10, 20), + }, + ) + + def test_move_sink_cast_transpose_to_the_end(self): + """ + Input graph: + x (input) ---> square ---> transpose ---> cast (output) + | + | -----------> log ------> transpose ---> cast (output) + | + | -----------> relu -----> cast ----> relu (output) + | + | -----------> relu (output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.cast(x=x, dtype="fp16") + x1 = mb.square(x=x) + x1_t = mb.transpose(x=x1, perm=[1, 0]) + x2 = mb.cast(x=x1_t, dtype="fp32") + x3 = mb.log(x=x) + x3_t = mb.transpose(x=x3, perm=[1, 0]) + x4 = mb.cast(x=x3_t, dtype="fp32") + x5 = mb.relu(x=x) + x6 = mb.cast(x=x5, dtype="fp32") + x7 = mb.relu(x=x6) + x8 = mb.relu(x=x) + return x2, x4, x7, x8 + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "transpose", + "cast", + "log", + "transpose", + "cast", + "relu", + "cast", + "relu", + "relu", + ] + + apply_pass_and_basic_check(prog, "common::topological_reorder") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "log", + "relu", + "cast", + "relu", + "relu", + "transpose", + "cast", + "transpose", + "cast", + ] + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={ + block.outputs[0].name: (20, 10), + block.outputs[1].name: (20, 10), + block.outputs[2].name: (10, 20), + block.outputs[3].name: (10, 20), + }, + ) + + def test_move_multiple_uses_overlapping(self): + """ + Input graph: + x (input) ---> cast ---> cast (output) + | + |-------> transpose ---> transpose (output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x1 = mb.cast(x=x, dtype="fp16") + x2 = mb.cast(x=x1, 
dtype="fp32") + x3 = mb.transpose(x=x1, perm=[1, 0]) + x4 = mb.transpose(x=x3, perm=[1, 0]) + return x2, x4 + + assert get_op_types_in_program(prog) == ["cast", "cast", "transpose", "transpose"] + + apply_pass_and_basic_check(prog, "common::topological_reorder") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == ["cast", "transpose", "transpose", "cast"] + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={ + block.outputs[0].name: (10, 20), + block.outputs[1].name: (10, 20), + }, + ) + + def test_move_split_to_first_use(self): + """ + Input graph: + x (input) ---> split ---> square ---> add (output) + | | | + | | --------------------| + | + | -----------> square --------------> relu (output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + s1, s2 = mb.split(x=x, num_splits=2, axis=0) + x2 = mb.square(x=x) + x3 = mb.relu(x=x2) + s1_1 = mb.square(x=s1) + s3 = mb.add(x=s1_1, y=s2) + return x3, s3 + + assert get_op_types_in_program(prog) == ["split", "square", "relu", "square", "add"] + + block = prog.functions["main"] + # Reorder `split` op to test op with multiple output case + topological_reorder._move_operations_to_the_end_block(block, ["split"]) + assert get_op_types_in_program(prog) == ["square", "relu", "split", "square", "add"] + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={ + block.outputs[0].name: (10, 20), + block.outputs[1].name: (5, 20), + }, + ) + + def test_move_transpose_before_subblock(self): + """ + Input graph: + x (input) ---> cast ---> transpose ---> cast (output) + | + | -----------> square ------> transpose (x1_t) ---> cast (output) + | + | -----------> squeeze ----> equal ----> squeeze + | + (true) <--- / \ ---> (false) + | | + | /<-(x1_t)->\ | + add <-/ \--> add + |---------> | <---------| + | + add ---> cast (output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.cast(x=x, dtype="fp16") + x1 = mb.square(x=x) + x1_t = mb.transpose(x=x1, perm=[1, 0]) + + def true_fn(): + return mb.add(x=x1_t, y=np.float16(1), name="x2") + + def false_fn(): + return mb.add(x=x1_t, y=np.float16(2), name="x2") + + is_one = mb.equal(x=mb.squeeze(x=x), y=np.float16(1.0)) + pred = mb.squeeze(x=is_one) + x3 = mb.cond(pred=pred, _true_fn=true_fn, _false_fn=false_fn) + x4 = mb.add(x=x1_t, y=x3) + x5 = mb.cast(x=x4, dtype="fp32") + return x5 + + apply_pass_and_basic_check(prog, "common::topological_reorder") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "squeeze", + "equal", + "squeeze", + "transpose", + "cond", + "add", + "cast", + ] + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={block.outputs[0].name: (20, 10)}, + ) + + def test_cast_transpose_already_at_the_end(self): + """ + Input graph: + x (input) ---> square ---> transpose ---> cast (output) + | + | -----------> log ------> transpose ---> cast (output) + | + | -----------> relu -----> cast ----> relu (output) + | + | -----------> relu (output) + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + def prog(x): + x = mb.cast(x=x, dtype="fp16") + x1 = mb.square(x=x) + x3 = mb.log(x=x) + x5 = mb.relu(x=x) + x6 = mb.cast(x=x5, dtype="fp32") + x7 = mb.relu(x=x6) + x8 = mb.relu(x=x) + x1_t = mb.transpose(x=x1, perm=[1, 0]) + x2 = mb.cast(x=x1_t, dtype="fp32") + x3_t = 
mb.transpose(x=x3, perm=[1, 0]) + x4 = mb.cast(x=x3_t, dtype="fp32") + return x2, x4, x7, x8 + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "log", + "relu", + "cast", + "relu", + "relu", + "transpose", + "cast", + "transpose", + "cast", + ] + + apply_pass_and_basic_check(prog, "common::topological_reorder") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == [ + "cast", + "square", + "log", + "relu", + "cast", + "relu", + "relu", + "transpose", + "cast", + "transpose", + "cast", + ] + + assert_model_is_valid( + prog, + {"x": (10, 20)}, + expected_output_shapes={ + block.outputs[0].name: (20, 10), + block.outputs[1].name: (20, 10), + block.outputs[2].name: (10, 20), + block.outputs[3].name: (10, 20), + }, + ) diff --git a/coremltools/converters/mil/mil/passes/tests/test_lower_complex_dialect_ops.py b/coremltools/converters/mil/mil/passes/tests/test_lower_complex_dialect_ops.py index cf6b5f409..e43809fbd 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_lower_complex_dialect_ops.py +++ b/coremltools/converters/mil/mil/passes/tests/test_lower_complex_dialect_ops.py @@ -3,12 +3,17 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import copy + import numpy as np import pytest -from coremltools import ComputeUnit +from coremltools import ComputeUnit from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil.passes.defs.lower_complex_dialect_ops import _calculate_dft_matrix +from coremltools.converters.mil.mil.passes.defs.lower_complex_dialect_ops import ( + _calculate_dft_matrix, +) +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource from coremltools.converters.mil.testing_utils import ( apply_pass_and_basic_check, assert_model_is_valid, @@ -38,6 +43,33 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (1, 2, 3)}, ) + def test_lower_fft_with_scope(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 3))]) + def prog(x): + with mb.scope(ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["m1"])): + fft_res = mb.complex_fft(data=x) + with mb.scope(ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["m2"])): + return mb.complex_real(data=fft_res) + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + apply_pass_and_basic_check( + prog, + "common::lower_complex_dialect_ops", + skip_essential_scope_check=True, # this graph pass introduces two subgraphs, while only one of them is used. 
+ ) + apply_pass_and_basic_check( + prog, + "common::dead_code_elimination", + ) + + # since the _replace_var is operated on the output of complex_real, so the scope info should be "m2" + block = prog.functions["main"] + for op in block.operations: + assert op.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["m2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["lower_complex_dialect_ops"], + } + def test_lower_fft(self): @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 3))]) def prog(x): @@ -45,13 +77,34 @@ def prog(x): real_data = mb.complex_real(data=fft_res) return real_data - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::lower_complex_dialect_ops") + # Test the apply_pass_and_basic_check utils has the ability to catch errors regarding incomplete scope information + with pytest.raises( + ValueError, match="is missing essential scopes ScopeSource.TORCHSCRIPT_MODULE_TYPE" + ): + prev_prog, _, block = apply_pass_and_basic_check( + copy.deepcopy(prog), + "common::lower_complex_dialect_ops", + ) + + prev_prog, _, block = apply_pass_and_basic_check( + prog, + "common::lower_complex_dialect_ops", + skip_essential_scope_check=True, # this graph pass introduces two subgraphs, while only one of them is used. + ) assert get_op_types_in_program(prev_prog) == ["complex_fft", "complex_real"] after_pass_op_types_set = set(get_op_types_in_program(prog)) # Verifies that the complex dialect ops got lowered to core ops. assert "complex_fft" not in after_pass_op_types_set assert "complex_real" not in after_pass_op_types_set + apply_pass_and_basic_check( + prog, + "common::dead_code_elimination", + ) + # Verifies that the complex dialect ops got lowered to core ops. + assert "complex_fft" not in after_pass_op_types_set + assert "complex_real" not in after_pass_op_types_set + inputs = {"x": (1, 2, 3)} assert_model_is_valid( prog, @@ -59,10 +112,7 @@ def prog(x): expected_output_shapes={block.outputs[0].name: (1, 2, 3)}, ) - @pytest.mark.parametrize( - "onesided", - [True, False] - ) + @pytest.mark.parametrize("onesided", [True, False]) def test_calculate_dft_matrix(self, onesided): expected_C = np.zeros((16, 16)) expected_S = np.zeros((16, 16)) @@ -71,7 +121,7 @@ def test_calculate_dft_matrix(self, onesided): for k in range(16): expected_C[k, :] = np.cos(2 * np.pi * k * _range / 16) expected_S[k, :] = np.sin(2 * np.pi * k * _range / 16) - + if onesided: expected_C = expected_C[:9] expected_S = expected_S[:9] @@ -80,7 +130,9 @@ def test_calculate_dft_matrix(self, onesided): def prog(x): return _calculate_dft_matrix(x, onesided=onesided) - model = ct_convert(program=prog, convert_to=("neuralnetwork", "fp32"), compute_units=ComputeUnit.CPU_ONLY) + model = ct_convert( + program=prog, convert_to=("neuralnetwork", "fp32"), compute_units=ComputeUnit.CPU_ONLY + ) p = model.predict({"x": np.array([16.0])}) cos_matrix, sin_matrix = p["cos_0"], p["sin_0"] diff --git a/coremltools/converters/mil/mil/passes/tests/test_optimize_linear_passes.py b/coremltools/converters/mil/mil/passes/tests/test_optimize_linear_passes.py new file mode 100644 index 000000000..9a90104a3 --- /dev/null +++ b/coremltools/converters/mil/mil/passes/tests/test_optimize_linear_passes.py @@ -0,0 +1,324 @@ +# Copyright (c) 2024, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy +import itertools + +import numpy as np +import pytest + +from coremltools.converters.mil.mil import Builder as mb +from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY +from coremltools.converters.mil.testing_reqs import backends +from coremltools.converters.mil.testing_utils import ( + apply_pass_and_basic_check, + assert_model_is_valid, + assert_op_count_match, + assert_same_output_names, + get_op_types_in_program, +) + +from .test_passes import _VALIDATE_MODEL + + +class TestFuseLinearBias: + @staticmethod + def _apply_transform(inputs, func, is_first_input, has_bias): + """ + Utility function to test the weight/bias transform function in linear bias fusion pass. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 4))]) + def prog(x): + + if has_bias: + linear = mb.linear( + x=x, + weight=inputs["linear_weight"], + bias=inputs["linear_bias"], + ) + else: + linear = mb.linear( + x=x, + weight=inputs["linear_weight"], + ) + + if is_first_input: + kwargs = { + "x": linear, + "y": inputs["bias"], + } + else: + kwargs = { + "x": inputs["bias"], + "y": linear, + } + + x = func(**kwargs) + return x + + apply_pass_and_basic_check( + prog, + "common::fuse_linear_bias", + ) + + # get the updated weight from the prog + linear_op = [] + for op in prog["main"].operations: + if op.op_type == "const": + continue + linear_op.append(op) + assert len(linear_op) == 1, "should only have one linear layer." + + return linear_op[0].weight.val, linear_op[0].bias.val + + @pytest.mark.parametrize( + "op_type, is_first_input, has_bias, broadcast", + itertools.product( + ["add", "sub"], + [True, False], + [True, False], + [True, False], + ), + ) + def test_transform_linear(self, op_type, is_first_input, has_bias, broadcast): + """ + Test the weight / bias transform function in the linear bias fusion pass + """ + weight = np.reshape(np.arange(8), (2, 4)).astype(np.float32) + linear_bias = ( + np.array([1, 2]).astype(np.float32) if has_bias else np.array([0, 0]).astype(np.float32) + ) + bias = np.array([3, 4]).astype(np.float32) + if broadcast: + bias = np.reshape(bias, (1, 2)) + + inputs = { + "linear_weight": weight, + "linear_bias": linear_bias, + "bias": bias, + } + + if op_type == "add": + func = mb.add + elif op_type == "sub": + func = mb.sub + + new_weight, new_bias = self._apply_transform( + inputs, + func, + is_first_input, + has_bias, + ) + if broadcast: + bias = np.reshape(bias, (2,)) + + if op_type == "sub" and not is_first_input: + expected_weight = -weight + else: + expected_weight = weight + + if op_type == "sub": + if is_first_input: + expected_bias = linear_bias - bias + else: + expected_bias = bias - linear_bias + else: + expected_bias = linear_bias + bias + + np.testing.assert_almost_equal(new_weight, expected_weight) + np.testing.assert_almost_equal(new_bias, expected_bias) + + @pytest.mark.parametrize( + "rank, op_type, is_first_input, broadcast, backend", + itertools.product([1, 2, 3], ["add", "sub"], [True, False], [True, False], backends), + ) + def test_linear_bias_fusion(self, rank, op_type, is_first_input, broadcast, backend): + """ + Input graph: + Const + | + V + input -----> linear -----> add/sub ---> out + + Output graph: + input -----> linear ----> out + """ + input_shape = [1, 2, 3] + input_shape = input_shape[-rank:] + input_shape = tuple(input_shape) + + 
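# Build a program in which a constant bias is added to (or subtracted from) the linear output; fuse_linear_bias should fold it into the linear op itself. +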
@mb.program(input_specs=[mb.TensorSpec(shape=input_shape)]) + def prog(x): + linear_weight = np.reshape(np.arange(6), (2, 3)).astype(np.float32) + linear_bias = np.array([1.0, 2.0]) + bias = np.array([3.0, 4.0]) + if broadcast: + if rank >= 2: + bias = np.reshape(bias, (1, 2)) + + x = mb.linear( + x=x, + weight=linear_weight, + bias=linear_bias, + ) + + func = mb.add if op_type == "add" else mb.sub + if is_first_input: + kwargs = { + "x": x, + "y": bias, + } + else: + kwargs = { + "x": bias, + "y": x, + } + x = func(**kwargs) + return x + + prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_linear_bias") + + assert get_op_types_in_program(prev_prog) == ["linear", op_type] + assert get_op_types_in_program(prog) == ["linear"] + + # validate graph pass + output_shape = [1, 2, 2] + output_shape = tuple(output_shape[-rank:]) + assert_model_is_valid( + prog, + {"x": input_shape}, + expected_output_shapes={block.outputs[0].name: output_shape}, + backend=backend, + ) + + +class TestFuseMatmulWeightBias: + def test_fuse_matmul_weight_bias(self): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + weights_val = np.random.rand(2, 4).T.astype(np.float32) + weights = mb.const(val=weights_val) + bias_val = np.random.rand(2).astype(np.float32) + bias = mb.const(val=bias_val) + + matmul = mb.matmul(x=x, y=weights) + return mb.add(x=matmul, y=bias) + + assert_op_count_match(prog, expect=1, op="matmul") + assert_op_count_match(prog, expect=0, op="linear") + prev_prog = copy.deepcopy(prog) + PASS_REGISTRY["common::fuse_matmul_weight_bias"](prog) + assert_same_output_names(prev_prog, prog) + assert_op_count_match(prog, expect=0, op="matmul") + assert_op_count_match(prog, expect=1, op="linear") + + if _VALIDATE_MODEL: + assert_model_is_valid(prog, {"x": (2, 4)}) + + +class TestFuseTransposeMatmul: + def test_fuse_transposes(self): + X_SHAPE = (3, 2) + Y_SHAPE = (5, 2) + + output_shape = (X_SHAPE[0], Y_SHAPE[0]) + + @mb.program(input_specs=[mb.TensorSpec(shape=X_SHAPE), mb.TensorSpec(shape=Y_SHAPE)]) + def prog(x, y): + transposed_x = mb.transpose(x=x, perm=(1, 0)) + transposed_y = mb.transpose(x=y, perm=(1, 0)) + z = mb.matmul(x=transposed_x, y=transposed_y, transpose_x=True, transpose_y=False) + return z + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::fuse_transpose_matmul") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == ["transpose", "transpose", "matmul"] + assert get_op_types_in_program(prog) == ["matmul"] + + matmul = prog.find_ops(op_type="matmul")[0] + assert not matmul.transpose_x.val + assert matmul.transpose_y.val + + assert_model_is_valid( + prog, + {"x": X_SHAPE, "y": Y_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + ) + + def test_fuse_transpose_y(self): + X_SHAPE = (3, 2) + Y_SHAPE = (2, 5) + + output_shape = (X_SHAPE[0], Y_SHAPE[1]) + + @mb.program(input_specs=[mb.TensorSpec(shape=X_SHAPE), mb.TensorSpec(shape=Y_SHAPE)]) + def prog(x, y): + transposed_y = mb.transpose(x=y, perm=(1, 0)) + z = mb.matmul(x=x, y=transposed_y, transpose_y=True) + return z + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::fuse_transpose_matmul") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == ["transpose", "matmul"] + assert get_op_types_in_program(prog) == ["matmul"] + + matmul = prog.find_ops(op_type="matmul")[0] + assert not matmul.transpose_x.val + 
assert not matmul.transpose_y.val + + assert_model_is_valid( + prog, + {"x": X_SHAPE, "y": Y_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + ) + + def test_fuse_transpose_x_but_unfuseable_transpose_y(self): + X_SHAPE = (4, 2, 5, 3) + Y_SHAPE = (4, 5, 2, 7) + + output_shape = (X_SHAPE[0], X_SHAPE[1], X_SHAPE[3], Y_SHAPE[3]) + + @mb.program(input_specs=[mb.TensorSpec(shape=X_SHAPE), mb.TensorSpec(shape=Y_SHAPE)]) + def prog(x, y): + transposed_x = mb.transpose(x=x, perm=(0, 1, 3, 2)) + transposed_y = mb.transpose(x=y, perm=(0, 2, 1, 3)) + z = mb.matmul(x=transposed_x, y=transposed_y) + return z + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::fuse_transpose_matmul") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == ["transpose", "transpose", "matmul"] + assert get_op_types_in_program(prog) == ["transpose", "matmul"] + + assert_model_is_valid( + prog, + {"x": X_SHAPE, "y": Y_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + ) + + def test_unfuseable_transposes(self): + X_SHAPE = (3, 2, 5) + Y_SHAPE = (5, 2, 7) + + output_shape = (X_SHAPE[1], X_SHAPE[0], Y_SHAPE[2]) + + @mb.program(input_specs=[mb.TensorSpec(shape=X_SHAPE), mb.TensorSpec(shape=Y_SHAPE)]) + def prog(x, y): + transposed_x = mb.transpose(x=x, perm=(1, 0, 2)) + transposed_y = mb.transpose(x=y, perm=(1, 0, 2)) + z = mb.matmul(x=transposed_x, y=transposed_y) + return z + + prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::fuse_transpose_matmul") + _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + assert get_op_types_in_program(prev_prog) == ["transpose", "transpose", "matmul"] + assert get_op_types_in_program(prev_prog) == get_op_types_in_program(prog) + + assert_model_is_valid( + prog, + {"x": X_SHAPE, "y": Y_SHAPE}, + expected_output_shapes={block.outputs[0].name: output_shape}, + ) diff --git a/coremltools/converters/mil/mil/passes/tests/test_passes.py b/coremltools/converters/mil/mil/passes/tests/test_passes.py index 98fa61a66..1187cd915 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_passes.py @@ -9,21 +9,22 @@ import numpy as np import pytest -from mock import patch import coremltools as ct import coremltools.optimize as cto from coremltools._deps import _IS_MACOS +from coremltools.converters.mil import mil from coremltools.converters.mil.experimental.passes.generic_pass_infrastructure import ( register_generic_pass, ) from coremltools.converters.mil.mil import Builder as mb -from coremltools.converters.mil.mil import Function, Program, Symbol, get_new_symbol, types +from coremltools.converters.mil.mil import Function, types from coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary import cast as _cast_iOS14 from coremltools.converters.mil.mil.ops.defs.iOS17.elementwise_unary import cast as _cast_iOS17 -from coremltools.converters.mil.mil.passes.defs.cleanup import topological_reorder +from coremltools.converters.mil.mil.passes.defs.optimize_repeat_ops import cast_optimization from coremltools.converters.mil.mil.passes.helper import _check_var_scalar_value from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource from coremltools.converters.mil.mil.types import numpy_type_to_builtin_type from coremltools.converters.mil.mil.types.type_mapping import 
builtin_to_string from coremltools.converters.mil.testing_reqs import backends @@ -32,7 +33,6 @@ assert_model_is_valid, assert_op_count_match, assert_same_output_names, - get_op_names_in_program, get_op_types_in_block, get_op_types_in_program, ) @@ -113,140 +113,6 @@ def _get_constexpr_val(constexpr_var): "constexpr_affine_dequantize", ] -class TestConstDeduplication: - def test_const_deduplication(self): - BATCH_DIM = 5 - SEQUENCE_LENGTH = 4 - ENCODING_DIM = 256 - EMBEDDING_DIM = 128 - weight = np.random.rand(EMBEDDING_DIM, ENCODING_DIM) - bias = np.random.rand(EMBEDDING_DIM) - - @mb.program( - input_specs=[ - mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), - mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), - ] - ) - def prog(q, k): - q_e = mb.linear(x=q, weight=weight, bias=bias) - k_e = mb.linear(x=k, weight=weight, bias=bias) - attention = mb.matmul(x=q_e, y=k_e, transpose_y=True) - return attention - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") - assert_op_count_match(prev_prog, expect=6, op="const") - assert_op_count_match(prog, expect=4, op="const") - - @pytest.mark.parametrize( - "constexpr_op", - CONSTEXPR_OPS, - ) - def test_constexpr_deduplication(self, constexpr_op): - BATCH_DIM = 5 - SEQUENCE_LENGTH = 4 - ENCODING_DIM = 256 - EMBEDDING_DIM = 128 - @mb.program( - input_specs=[ - mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), - mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), - ] - ) - def prog(q, k): - weight_q = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM, ENCODING_DIM), seed=19) - weight_k = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM, ENCODING_DIM), seed=19) - bias_q = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM,), seed=29) - bias_k = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM,), seed=29) - q_e = mb.linear(x=q, weight=weight_q, bias=bias_q) - k_e = mb.linear(x=k, weight=weight_k, bias=bias_k) - attention = mb.matmul(x=q_e, y=k_e, transpose_y=True) - return attention - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") - assert_op_count_match(prev_prog, expect=4, op=constexpr_op) - assert_op_count_match(prog, expect=2, op=constexpr_op) - - def test_const_deduplication_as_outputs(self): - """ - If the duplicated constants are block outputs, we should not remove them. 
- """ - # case 1: - # const_2 can be eliminated since it is not block output - const = np.random.rand(40, 20, 30) - - @mb.program( - input_specs=[ - mb.TensorSpec( - shape=( - 40, - 20, - 30, - ) - ) - ] - ) - def prog(x): - const_1 = mb.const(val=const, name="const_1") - const_2 = mb.const(val=const, name="const_2") - x = mb.relu(x=x) - x = mb.add(x=x, y=const_2) - return x, const_1 - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") - assert_op_count_match(prev_prog, expect=2, op="const") - assert_op_count_match(prog, expect=1, op="const") - assert prog.functions["main"].outputs[1].name == "const_1" - - # case 2: - # const_2 can not be eliminated since it is a block output - const = np.random.rand(40, 20, 30) - - @mb.program( - input_specs=[ - mb.TensorSpec( - shape=( - 40, - 20, - 30, - ) - ) - ] - ) - def prog(x): - const_1 = mb.const(val=const, name="const_1") - const_2 = mb.const(val=const, name="const_2") - x = mb.relu(x=x) - x = mb.add(x=x, y=const_2) - return x, const_1, const_2 - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") - assert_op_count_match(prev_prog, expect=2, op="const") - assert_op_count_match(prog, expect=2, op="const") - assert prog.functions["main"].outputs[1].name == "const_1" - assert prog.functions["main"].outputs[2].name == "const_2" - - @pytest.mark.skip("rdar://109374995 consts are not shared across blocks") - def test_const_deduplication_multiple_blocks(self): - weight = np.random.rand(5, 3, 2, 2) - - @mb.program(input_specs=[mb.TensorSpec(shape=(4, 3, 8, 8))]) - def prog(x): - def _true_fn(): - return mb.conv(x=x, weight=weight, pad_type="valid") - - def _false_fn(): - y = mb.mul(x=x, y=2.0) - return mb.conv(x=y, weight=weight, pad_type="valid") - - x_gt_0_tensor = mb.greater(x=x, y=0.0) - x_gt_0 = mb.slice_by_index(x=x_gt_0_tensor, begin=(0, 0, 0, 0), end=(1, 1, 1, 1)) - return mb.cond(pred=x_gt_0, _true_fn=_true_fn, _false_fn=_false_fn) - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_deduplication") - assert_op_count_match(prev_prog, expect=8, op="const") - assert_op_count_match(prog, expect=6, op="const") - class TestFuseSqueezeExpandDims: @pytest.mark.parametrize( @@ -283,1973 +149,74 @@ def prog(x): # noop_elimination can further remove the identity op apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prog) == ["relu"] - - def test_fuse_squeeze_expand_dims_negative(self): - """ - If squeeze and expand_dims cannot cancel each other, - the graph pass does nothing - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 1, 4, 1, 1))]) - def prog(x): - x = mb.squeeze(x=x, axes=(1, 2)) - x = mb.expand_dims(x=x, axes=(1, 3)) - return mb.relu(x=x) - - apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") - assert get_op_types_in_program(prog) == ["squeeze", "expand_dims", "relu"] - - def test_fuse_squeeze_expand_dims_connected_output(self): - """ - If squeeze is connected to block output, it cannot be removed. - However, the expand_dims can be a block output. - """ - # squeeze connected to output. Nothing happens. - @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) - def prog(x): - squeeze = mb.squeeze(x=x, axes=(0,)) - expand_dims = mb.expand_dims(x=squeeze, axes=(0,)) - return mb.relu(x=expand_dims), squeeze - - apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") - assert get_op_types_in_program(prog) == ["squeeze", "expand_dims", "relu"] - - # expand_dims connected to output. 
Still good to fuse. - @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) - def prog(x): - squeeze = mb.squeeze(x=x, axes=(0,)) - expand_dims = mb.expand_dims(x=squeeze, axes=(0,)) - return mb.relu(x=expand_dims), expand_dims - - apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") - assert get_op_types_in_program(prog) == ["identity", "relu"] - -class TestConstElimination: - def test_const_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - a = np.random.rand(2, 4).astype(np.float32) - double_a = mb.add(x=a, y=a) - return mb.add(x=x, y=double_a) - - assert_op_count_match(prog, expect=2, op="const") - prev_prog = copy.deepcopy(prog) - PASS_REGISTRY["common::const_elimination"](prog) - assert_same_output_names(prev_prog, prog) - assert_op_count_match(prog, expect=3, op="const") - - if _VALIDATE_MODEL: - assert_model_is_valid(prog, {"x": (2, 4)}) - - def test_const_elimination_nonreplaceable(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - a = np.random.rand(2, 4).astype(np.float16) - constexpr_a = mb.constexpr_cast(source_val=a, output_dtype="fp32") - double_a = mb.add(x=constexpr_a, y=a.astype(np.float32)) - return mb.add(x=x, y=double_a) - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_elimination") - assert get_op_types_in_program(prev_prog) == ["constexpr_cast", "add", "add"] - # Not fold into const because the upstream constexpr_cast op is non-replaceable. - assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "add"] - - def test_force_const_eliminate_nonreplaceable_ops(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(3,), dtype=types.int32)]) - def prog(x): - a = np.random.rand(2, 3, 5).astype(np.float16) - constexpr_a = mb.constexpr_cast(source_val=a, output_dtype="fp32") - double_a = mb.add(x=constexpr_a, y=a.astype(np.float32)) - a_shape = mb.shape(x=double_a) - return mb.add(x=x, y=a_shape) - - assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "shape", "add"] - - apply_pass_and_basic_check(prog, "common::const_elimination") - # still fold shape into const regardless the non-replaceable upstream - # constexpr_cast op, since it only provides a shape - assert get_op_types_in_program(prog) == ["constexpr_cast", "add", "add"] - - apply_pass_and_basic_check(prog, "common::dead_code_elimination") - # constexpr_cast(a) and add(a, a) no longer contributes to output, - # so they should get dead code eliminated - assert get_op_types_in_program(prog) == ["add"] - - @patch( - "coremltools.converters.mil.mil.passes.defs.cleanup.const_elimination._skip_const_by_size", - 1000, - ) - def test_const_elimination_larger_than_threshold(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) - def prog(x): - # Construct a 10 x 10 matrix (100 elements) which is smaller than the threshold (1000). - tmp = mb.range_1d(start=0, end=10, step=1) - tmp_x = mb.reshape(x=tmp, shape=[-1, 1]) - tmp_y = mb.reshape(x=tmp, shape=[1, -1]) - return mb.matmul(x=tmp_x, y=tmp_y) - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) - def prog_large_const_size(x): - # Construct a 100 x 100 matrix (10000 elements) which is larger than the threshold (1000). 
- tmp = mb.range_1d(start=0, end=100, step=1) - tmp_x = mb.reshape(x=tmp, shape=[-1, 1]) - tmp_y = mb.reshape(x=tmp, shape=[1, -1]) - return mb.matmul(x=tmp_x, y=tmp_y) - - prev_prog, _, _ = apply_pass_and_basic_check(prog, "common::const_elimination") - assert get_op_types_in_program(prev_prog) == [ - "range_1d", - "reshape", - "reshape", - "matmul", - ] - # All ops (range_1d, reshape, matmul) constructing that 10x10 matrix is folded into a const. - assert get_op_types_in_program(prog) == [] - - prev_prog_large_const_size, _, _ = apply_pass_and_basic_check( - prog_large_const_size, "common::const_elimination" - ) - assert get_op_types_in_program(prev_prog_large_const_size) == [ - "range_1d", - "reshape", - "reshape", - "matmul", - ] - # The matmul op constructing the large matrix is kept due to size larger than threshold. - assert get_op_types_in_program(prog_large_const_size) == ["matmul"] - - -class TestDeadCodeElimination: - def test_dead_code_elimination(self): - @mb.program( - input_specs=[ - mb.TensorSpec(shape=(2, 4)), - mb.TensorSpec(shape=(2, 4)), - ] - ) - def program0(x, y): - # following three unused op should be eliminated - a = mb.const(val=np.zeros(shape=(1,))) - b = mb.const(val=np.zeros(shape=(1,))) - _ = mb.add(x=a, y=b) - return mb.add(x=x, y=y) - - assert_op_count_match(program0, expect=4) - prev_prog = copy.deepcopy(program0) - PASS_REGISTRY["common::dead_code_elimination"](program0) - assert_same_output_names(prev_prog, program0) - assert_op_count_match(program0, expect=1) - - if _VALIDATE_MODEL: - assert_model_is_valid(program0, {"x": (2, 4), "y": (2, 4)}) - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def program1(x): - weights_val = np.random.rand(4, 2).T.astype(np.float32) - weights = mb.const(val=weights_val) - bias_val = np.random.rand(2).astype(np.float32) - bias = mb.const(val=bias_val) - - # unused op and its inputs should be eliminated - weights_for_matmul = mb.transpose(x=weights, perm=[1, 0]) - mb.matmul(x=x, y=weights_for_matmul) - - return mb.linear(x=x, weight=weights, bias=bias) - - assert_op_count_match(program1, expect=8) - prev_prog = copy.deepcopy(program1) - PASS_REGISTRY["common::dead_code_elimination"](program1) - assert_same_output_names(prev_prog, program1) - assert_op_count_match(program1, expect=3) - - if _VALIDATE_MODEL: - assert_model_is_valid(program1, {"x": (2, 4)}) - - -class TestDedupOpAndVarNames(unittest.TestCase): - def test_unchanged(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - x = mb.reshape(x=x, shape=(1, 8), name="reshape") - return x - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") - - self.assertEqual(get_op_types_in_program(prev_prog), ["reshape"]) - self.assertEqual(get_op_names_in_program(prev_prog), ["reshape"]) - - self.assertEqual(get_op_types_in_program(prog), ["reshape"]) - self.assertEqual(get_op_names_in_program(prog), ["reshape"]) - - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (1, 8)}, - ) - - def test_op_name_duplicated_once(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x = mb.cast(x=x, dtype="fp16", name="castop") - x = mb.cast(x=x, dtype="fp32", name="castop") - x = mb.square(x=x, name="square_last") - return x - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") - - self.assertEqual(get_op_types_in_program(prev_prog), ["cast", "cast", "square"]) - 
self.assertEqual(get_op_names_in_program(prev_prog), ["castop", "castop", "square_last"]) - - self.assertEqual(get_op_types_in_program(prog), ["cast", "cast", "square"]) - self.assertEqual(get_op_names_in_program(prog), ["castop", "castop_1", "square_last"]) - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (10, 20)}, - ) - - def test_op_name_duplicated_many(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x = mb.cast(x=x, dtype="fp16", name="castop") - x = mb.cast(x=x, dtype="fp16", name="castop") - x = mb.cast(x=x, dtype="int32", name="castop_2") - x = mb.cast(x=x, dtype="fp16", name="castop") - x = mb.cast(x=x, dtype="fp32", name="castop_2") - x = mb.square(x=x, name="square") - return x - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") - - self.assertEqual( - get_op_types_in_program(prev_prog), ["cast", "cast", "cast", "cast", "cast", "square"] - ) - self.assertEqual( - get_op_names_in_program(prev_prog), - ["castop", "castop", "castop_2", "castop", "castop_2", "square"], - ) - - self.assertEqual( - get_op_types_in_program(prog), ["cast", "cast", "cast", "cast", "cast", "square"] - ) - self.assertEqual( - get_op_names_in_program(prog), - ["castop", "castop_1", "castop_2", "castop_3", "castop_2_1", "square"], - ) - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (10, 20)}, - ) - - def test_input_name_shadow(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - # op name "x" results in output var name "x", which shadows prog - # input var name "x" - x = mb.transpose(x=x, perm=[1, 0], name="x") - x = mb.relu(x=x, name="relu") - return x - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") - self.assertEqual(get_op_types_in_program(prev_prog), ["transpose", "relu"]) - self.assertEqual(get_op_names_in_program(prev_prog), ["x", "relu"]) - - self.assertEqual(get_op_types_in_program(prog), ["transpose", "relu"]) - self.assertEqual(get_op_names_in_program(prog), ["x", "relu"]) - - op = prog["main"].find_ops(op_type="transpose")[0] - self.assertEqual("x_1", op.outputs[0].name) - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (20, 10)}, - ) - - def test_nested_block(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) - def prog(x): - def true_fn(): - # returns var with name x shadows input 'x' - return mb.add(x=x, y=1.0, name="x") - - def false_fn(): - # two ops with name "x" - return mb.add(x=x, y=-1.0, name="x") - - pred = mb.equal(x=mb.squeeze(x=x), y=1.0) - return mb.cond(pred=pred, _true_fn=true_fn, _false_fn=false_fn) - - cond_op = prog.functions["main"].operations[-1] - assert cond_op.blocks[0].outputs[0].name == "x" - assert cond_op.blocks[1].outputs[0].name == "x" - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::dedup_op_and_var_names") - cond_op = prog.functions["main"].operations[-1] - assert cond_op.blocks[0].outputs[0].name == "x_1" - assert cond_op.blocks[1].outputs[0].name == "x_2" - - assert_model_is_valid( - prog, - {"x": (1,)}, - expected_output_shapes={block.outputs[0].name: (1,)}, - ) - - -class TestAddConvTransposeOutputShape: - def test_add_conv_transpose_output_shape(self): - """ - Given: - %1: (1, 5, 39, fp32) = conv_transpose(...) # no output_shape input. 
- - Result: - %2: (3, i32) = const(val=[1,5,39]) - %3: (1, 5, 39, fp32) = conv_transpose(..., output_shape=%2) - """ - N, C_in, C_out, D1 = 1, 3, 5, 20 - - @mb.program(input_specs=[mb.TensorSpec(shape=(N, C_in, D1))]) - def prog(x): - weight = np.random.rand(C_in, C_out, D1).astype(np.float32) - return mb.conv_transpose(x=x, weight=weight) - - prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "common::add_conv_transpose_output_shape" - ) - assert get_op_types_in_program(prev_prog) == ["conv_transpose"] - assert get_op_types_in_program(prog) == ["conv_transpose"] - prev_conv_transpose_op = prev_prog.find_ops(op_type="conv_transpose", exactly_one=True)[0] - conv_transpose_op = prog.find_ops(op_type="conv_transpose", exactly_one=True)[0] - assert np.all(conv_transpose_op.output_shape.val == prev_conv_transpose_op.outputs[0].shape) - - -class TestNoopElimination: - @pytest.mark.parametrize("is_block_output", ((True, False))) - def test_identity(self, is_block_output): - """ - Input graph: - - input -> identity -> (add 1.0 if not is_block_output) -> output - - Output graph: - - if is_block_output: - input -> identity -> output - else: - input -> add 1.0 -> output - """ - SHAPE = (2, 3) - - @mb.program(input_specs=[mb.TensorSpec(shape=SHAPE)]) - def prog(x): - y = mb.identity(x=x) - if not is_block_output: - y = mb.add(x=y, y=1.0) - return y - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - if is_block_output: - assert get_op_types_in_program(prev_prog) == ["identity"] - assert get_op_types_in_program(prog) == ["identity"] - else: - assert get_op_types_in_program(prev_prog) == ["identity", "add"] - assert get_op_types_in_program(prog) == ["add"] - - output_name = block.outputs[0].name - assert_model_is_valid( - prog, - {"x": SHAPE}, - expected_output_shapes={output_name: SHAPE}, - ) - - @pytest.mark.parametrize( - "op_type, pos, val", - itertools.product( - ["add", "mul", "floor_div", "pow", "real_div", "sub"], - ["x", "y"], - [0.0, 1.0, [0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]], - ), - ) - def test_elementwise_elimination(self, op_type, pos, val): - if "div" in op_type and np.prod(val) == 0: - return - if "pow" in op_type and (val != 0 or val != 1): - return - - test_op = getattr(mb, op_type) - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - if pos == "x": - r1 = test_op(x=val, y=x) - else: - r1 = test_op(x=x, y=val) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - original_program = [op_type, "relu"] - new_program = original_program - if op_type in {"add"}: - if val == 0.0 or val == [0.0, 0.0, 0.0, 0.0]: - new_program = ["relu"] - elif op_type in {"mul"}: - if val == 1.0 or val == [1.0, 1.0, 1.0, 1.0]: - new_program = ["relu"] - elif op_type in {"real_div"}: - if pos == "y" and (val == 1.0 or val == [1.0, 1.0, 1.0, 1.0]): - new_program = ["relu"] - elif op_type in {"pow", "floor_div"}: - if pos == "y" and (val == 1.0 or val == [1.0, 1.0, 1.0, 1.0]): - new_program = ["relu"] - elif op_type in {"sub"}: - if pos == "y" and (val == 0.0 or val == [0.0, 0.0, 0.0, 0.0]): - new_program = ["relu"] - - assert get_op_types_in_program(prev_prog) == original_program - assert get_op_types_in_program(prog) == new_program - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_elementwise_broadcast(self): - @mb.program(input_specs=[mb.TensorSpec(shape=[4])]) - def prog(x): - r1 = 
mb.add(x=x, y=[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - original_program = ["add", "relu"] - - assert get_op_types_in_program(prev_prog) == original_program - assert get_op_types_in_program(prog) == original_program - assert_model_is_valid( - prog, - {"x": [4]}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_elementwise_elimination_fill(self): - """ - When fill layer with dynamic shape is fed to elementwise-binary operation, - even though the tensor can't be materialized at conversion time but no-op - elimination can still be performed based on fill-value - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, get_new_symbol()))]) - def prog(x): - shape = mb.shape(x=x) - y = mb.fill(value=0.0, shape=shape) - x = mb.add(x=x, y=y) - return mb.relu(x=x) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["shape", "fill", "add", "relu"] - assert get_op_types_in_program(prog) == ["shape", "fill", "relu"] - - apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - assert get_op_types_in_program(prog) == ["relu"] - - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_reshape_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.reshape(x=x, shape=[1, 8]) - mb.reshape(x=r1, shape=[1, 8]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["reshape", "reshape", "relu"] - assert get_op_types_in_program(prog) == ["reshape", "relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (1, 8)}, - ) - - def test_oneway_split_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.split(x=x, num_splits=1, axis=-1) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["split", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_full_split_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.split(x=x, split_sizes=[4], axis=-1) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["split", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_slicebysize_full_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.slice_by_size(x=x, begin=[0, 0], size=[2, 4]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["slice_by_size", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_slicebysize_to_end_elimination(self): - 
@mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.slice_by_size(x=x, begin=[0, 0], size=[-1, -1]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["slice_by_size", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_slicebyindex_full_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.slice_by_index(x=x, begin=[0, 0], end=[2, 4]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["slice_by_index", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_slicebyindex_negative_stride(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.slice_by_index( - x=x, - begin=[0, 0], - end=[0, 0], - stride=[1, -1], - begin_mask=[True, True], - end_mask=[True, True], - ) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["slice_by_index", "relu"] - assert get_op_types_in_program(prog) == ["slice_by_index", "relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - @pytest.mark.parametrize( - "begin_mask, end_mask", - itertools.product( - itertools.product([True, False], [True, False]), - itertools.product([True, False], [True, False]), - ), - ) - def test_slicebyindex_mask_elimination(self, begin_mask, end_mask): - @mb.program(input_specs=[mb.TensorSpec(shape=(4, 4))]) - def prog(x): - begin = [1, 1] - end = [1, 1] - for i in range(2): - if not begin_mask[i]: - begin[i] = 0 - if not end_mask[i]: - end[i] = 4 - r1 = mb.slice_by_index( - x=x, begin=begin, end=end, begin_mask=begin_mask, end_mask=end_mask - ) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["slice_by_index", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (4, 4)}, - expected_output_shapes={block.outputs[0].name: (4, 4)}, - ) - - def test_pad_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.pad(x=x, pad=[0, 0, 0, 0]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["pad", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_keep_pad(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.pad(x=x, pad=[4, 4, 2, 2]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["pad", "relu"] - assert get_op_types_in_program(prog) == ["pad", "relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (10, 8)}, - ) - - def 
test_tile_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.tile(x=x, reps=[1, 1]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["tile", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_keep_tile(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.tile(x=x, reps=[2, 2]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["tile", "relu"] - assert get_op_types_in_program(prog) == ["tile", "relu"] - assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (4, 8)}, - ) - - def test_upsample_nearest_neighbor_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) - def prog(x): - r1 = mb.upsample_nearest_neighbor(x=x) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["upsample_nearest_neighbor", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (3, 2, 4)}, - expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, - ) - - def test_upsample_bilinear_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) - def prog(x): - r1 = mb.upsample_bilinear(x=x) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["upsample_bilinear", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (3, 2, 4)}, - expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, - ) - - def test_resize_bilinear_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) - def prog(x): - r1 = mb.resize_bilinear(x=x, target_size_height=2, target_size_width=4) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["resize_bilinear", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (3, 2, 4)}, - expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, - ) - - def test_crop_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 2, 4))]) - def prog(x): - r1 = mb.crop(x=x, crop_height=[0, 0], crop_width=[0, 0]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["crop", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (3, 2, 4)}, - expected_output_shapes={block.outputs[0].name: (3, 2, 4)}, - ) - - def test_linear_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) - def prog(x): - r1 = mb.linear_activation(x=x, alpha=1.0, beta=0.0) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["linear_activation", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - 
assert_model_is_valid( - prog, - {"x": (2, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 4)}, - ) - - def test_transpose_elimination(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 4))]) - def prog(x): - r1 = mb.transpose(x=x, perm=[0, 1, 2]) - return mb.relu(x=r1) - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::noop_elimination") - assert get_op_types_in_program(prev_prog) == ["transpose", "relu"] - assert get_op_types_in_program(prog) == ["relu"] - assert_model_is_valid( - prog, - {"x": (2, 3, 4)}, - expected_output_shapes={block.outputs[0].name: (2, 3, 4)}, - ) - - -class TestRemoveSymbolicReshape: - def test_remove_symbolic_reshape(self): - sym_b = Symbol("s0") - original_shape = (sym_b, Symbol("s1"), 2) - reshape_name = "reshape" - - @mb.program(input_specs=[mb.TensorSpec(shape=(sym_b, 4))]) - def prog(x): - # const cannot represent symbolic values. Use _const_symbolic - shape = mb._const_symbolic(val=original_shape) - return mb.reshape(x=x, shape=shape, name=reshape_name) - - reshape_op = prog.find_ops(prefix=reshape_name, op_type="reshape", exactly_one=True)[0] - shape_var = reshape_op.shape - reshaped_var = reshape_op.outputs[0] - assert np.all(shape_var.sym_val == original_shape) - assert np.all(reshaped_var.shape == (sym_b, 2, 2)) - - # Note: we cannot deepcopy prog with symbol. - prev_outputs = [o.name for o in prog["main"].outputs] - PASS_REGISTRY["common::remove_symbolic_reshape"](prog) - curr_outputs = [o.name for o in prog["main"].outputs] - assert curr_outputs == prev_outputs - - reshape_op = prog.find_ops(prefix=reshape_name, op_type="reshape", exactly_one=True)[0] - shape_var = reshape_op.shape - reshaped_var = reshape_op.outputs[0] - # shape param cannot be symbolic after the pass - assert np.all(shape_var.sym_val == (-1, 2, 2)) - # output shape is still symbolic - assert np.all(reshaped_var.shape == (sym_b, 2, 2)) - - if _VALIDATE_MODEL: - assert_model_is_valid(prog, {"x": (3, 4)}) - - -class TestLoopInvariantElimination: - def test_loop_invariant_elimination1(self): - """ - Invariant pattern: Block input vars are returned as block output vars. 
- """ - - def body(a, b): - return mb.add(x=a, y=b), b - - def cond(a, b): - a_mean = mb.reduce_mean(x=a, axes=[0, 1]) - b_mean = mb.reduce_mean(x=b, axes=[0, 1]) - return mb.less(x=a_mean, y=b_mean) - - @mb.program( - input_specs=[ - mb.TensorSpec(shape=(1, 2)), - mb.TensorSpec(shape=(1, 2)), - ] - ) - def prog(a, b): - # b is loop invariant - return mb.while_loop(_cond=cond, _body=body, loop_vars=(a, b)) - - while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] - assert len(while_op.blocks[0].inputs) == 2 - assert len(while_op.outputs) == 2 - assert len(while_op.loop_vars) == 2 - assert while_op.blocks[0].inputs[0].name == "a_x0" - assert while_op.blocks[0].inputs[1].name == "b_x0" - - prev_prog = copy.deepcopy(prog) - PASS_REGISTRY["common::loop_invariant_elimination"](prog) - assert_same_output_names(prev_prog, prog) - - while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] - assert len(while_op.blocks[0].inputs) == 1 - assert len(while_op.outputs) == 1 - assert len(while_op.loop_vars) == 1 - assert while_op.blocks[0].inputs[0].name == "a_x0" - - if _VALIDATE_MODEL: - assert_model_is_valid(prog, {"a": (1, 2), "b": (1, 2)}) - - def test_loop_invariant_elimination2(self): - """ - Invariant pattern: Block outputs var from outside of the block - """ - - @mb.program( - input_specs=[ - mb.TensorSpec(shape=(1, 2)), - mb.TensorSpec(shape=(1, 2)), - ] - ) - def prog(a, b): - def body(a, bx): - return mb.add(x=a, y=b), b - - def cond(a, bx): - a_mean = mb.reduce_mean(x=a, axes=[0, 1]) - b_mean = mb.reduce_mean(x=bx, axes=[0, 1]) - return mb.less(x=a_mean, y=b_mean) - - # b is loop invariant - return mb.while_loop(_cond=cond, _body=body, loop_vars=(a, b)) - - while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] - assert len(while_op.blocks[0].inputs) == 2 - assert len(while_op.outputs) == 2 - assert len(while_op.loop_vars) == 2 - assert while_op.blocks[0].inputs[0].name == "a_x0" - assert while_op.blocks[0].inputs[1].name == "b_x0" - - prev_prog = copy.deepcopy(prog) - PASS_REGISTRY["common::loop_invariant_elimination"](prog) - assert_same_output_names(prev_prog, prog) - - while_op = prog.find_ops(op_type="while_loop", exactly_one=True)[0] - assert len(while_op.blocks[0].inputs) == 1 - assert len(while_op.outputs) == 1 - assert len(while_op.loop_vars) == 1 - assert while_op.blocks[0].inputs[0].name == "a_x0" - - if _VALIDATE_MODEL: - assert_model_is_valid(prog, {"a": (1, 2), "b": (1, 2)}) - - -class TestReduceMeanFusion: - def test_valid_pattern1(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) - x1 = mb.mul(x=1.0 / 30, y=x1) - return x1 - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") - assert get_op_types_in_program(prev_prog) == ["reduce_sum", "mul"] - assert get_op_types_in_program(prog) == ["reduce_mean"] - assert_model_is_valid( - prog, - {"x": (3, 5, 6)}, - expected_output_shapes={block.outputs[0].name: (3, 1, 1)}, - ) - - def test_valid_pattern2(self): - @mb.program(input_specs=[mb.TensorSpec(shape=(4, 5))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[0], keep_dims=False) - x1 = mb.real_div(x=x1, y=4.0) - return x1 - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") - assert get_op_types_in_program(prev_prog) == ["reduce_sum", "real_div"] - assert get_op_types_in_program(prog) == ["reduce_mean"] - assert_model_is_valid( - prog, - {"x": (4, 5)}, - 
expected_output_shapes={block.outputs[0].name: (5,)}, - ) - - def test_invalid_pattern1(self): - """ - The mul does not correspond to "1/count" - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) - x1 = mb.mul(x=5.0, y=x1) - return x1 - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") - assert get_op_types_in_program(prog) == ["reduce_sum", "mul"] - - def test_invalid_pattern2(self): - """ - The div does not correspond to "count" - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) - x1 = mb.real_div(x=x1, y=31.0) - return x1 - - prev_prog, prev_block, block = apply_pass_and_basic_check(prog, "common::fuse_reduce_mean") - assert get_op_types_in_program(prog) == ["reduce_sum", "real_div"] - - def test_invalid_pattern3(self): - """ - One of the reduction dim is symbolic - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3, get_new_symbol(), 6))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) - x1 = mb.real_div(x=x1, y=30.0) - return x1 - - pass_name = "common::fuse_reduce_mean" - PASS_REGISTRY[pass_name](prog) - assert get_op_types_in_program(prog) == ["reduce_sum", "real_div"] - - def test_invalid_pattern4(self): - """ - output of reduce_sum is model output - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) - y1 = mb.real_div(x=x1, y=30.0) - return y1, x1 - - pass_name = "common::fuse_reduce_mean" - PASS_REGISTRY[pass_name](prog) - assert get_op_types_in_program(prog) == ["reduce_sum", "real_div"] - - def test_invalid_pattern5(self): - """ - output of reduce_sum is feeding into another op - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3, 5, 6))]) - def prog(x): - x1 = mb.reduce_sum(x=x, axes=[-1, -2], keep_dims=True) - y1 = mb.real_div(x=x1, y=30.0) - y2 = mb.mul(x=x1, y=10.0) - y3 = mb.add(x=y1, y=y2) - return y3 - - pass_name = "common::fuse_reduce_mean" - PASS_REGISTRY[pass_name](prog) - assert get_op_types_in_program(prog) == ["reduce_sum", "real_div", "mul", "add"] - - -class TestRemoveRedundantOps: - def test_redundant_ops_just_after_input_valid_pattern_1(self): - """ - Input graph: - input----->transpose(perm=[0, 2, 1])--->add---> add ---> out - | ^ ^ - | | | - |---->transpose(perm=[0, 2, 1])---- | - | | - | | - |---->transpose(perm=[0, 2, 1])------------ - - Output graph: - input----->transpose(perm=[0, 2, 1])--->add---> add ----> out - | ^ ^ - | | | - |------------- | - | | - |-------------------- - """ - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.transpose(x=x, perm=[0, 2, 1]) - x2 = mb.transpose(x=x, perm=[0, 2, 1]) - x3 = mb.transpose(x=x, perm=[0, 2, 1]) - z = mb.add(x=x1, y=x2) - z = mb.add(x=z, y=x3) - return z - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == [ - "transpose", - "transpose", - "transpose", - "add", - "add", - ] - assert get_op_types_in_program(prog) == ["transpose", "add", "add"] - assert_model_is_valid( - prog, - {"x": (2, 3, 5)}, - expected_output_shapes={block.outputs[0].name: (2, 5, 3)}, - ) - - def test_redundant_ops_just_after_input_valid_pattern_2(self): - """ - Input graph: - input----->leaky_relu(alpha=0.3)--->add---> add ---> out - | ^ ^ - | | | - |----->leaky_relu(alpha=0.3)--- | 
- | | - | | - |---->leaky_relu(alpha=0.3)------------ - - Output graph: - input--------->leaky_relu(alpha=0.3)--->add---> add ----> out - | ^ ^ - | | | - |------------- | - | | - |--------------------- - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.leaky_relu(x=x, alpha=0.3) - x2 = mb.leaky_relu(x=x, alpha=0.3) - x3 = mb.leaky_relu(x=x, alpha=0.3) - z = mb.add(x=x1, y=x2) - z = mb.add(x=z, y=x3) - return z - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == [ - "leaky_relu", - "leaky_relu", - "leaky_relu", - "add", - "add", - ] - assert get_op_types_in_program(prog) == ["leaky_relu", "add", "add"] - assert_model_is_valid( - prog, - {"x": (2, 3, 5)}, - expected_output_shapes={block.outputs[0].name: (2, 3, 5)}, - ) - - def test_redundant_ops_just_after_input_valid_pattern_3(self): - """ - Input graph: - input----->leaky_relu(alpha=0.4)--->add---> add ---> out - | ^ ^ - | | | - |----->leaky_relu(alpha=0.3)--- | - | | - | | - |---->leaky_relu(alpha=0.3)------------ - - Output graph: - input----->leaky_relu(alpha=0.4)--->add---> add ---> out - | ^ ^ - | | | - |----->leaky_relu(alpha=0.3)---------- - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.leaky_relu(x=x, alpha=0.4) - x2 = mb.leaky_relu(x=x, alpha=0.3) - x3 = mb.leaky_relu(x=x, alpha=0.3) - z = mb.add(x=x1, y=x2) - z = mb.add(x=z, y=x3) - return z - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == [ - "leaky_relu", - "leaky_relu", - "leaky_relu", - "add", - "add", - ] - assert get_op_types_in_program(prog) == ["leaky_relu", "leaky_relu", "add", "add"] - - leaky_relu_ops = block.find_ops(op_type="leaky_relu") - assert leaky_relu_ops[0].alpha.val == np.float32(0.4) - assert leaky_relu_ops[1].alpha.val == np.float32(0.3) - - def test_redundant_ops_just_after_input_invalid_pattern_1(self): - """ - input----->transpose(perm=[0, 2, 1])---> reshape(shape=[-1]) -----> add ---> out - | ^ - | | - |---->transpose(perm=[1, 0, 2])----> reshape(shape=[-1])------ - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.transpose(x=x, perm=[0, 2, 1]) - x2 = mb.transpose(x=x, perm=[1, 0, 2]) - x1 = mb.reshape(x=x1, shape=[-1]) - x2 = mb.reshape(x=x2, shape=[-1]) - z = mb.add(x=x1, y=x2) - return z - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == [ - "transpose", - "transpose", - "reshape", - "reshape", - "add", - ] - assert get_op_types_in_program(prog) == [ - "transpose", - "transpose", - "reshape", - "reshape", - "add", - ] - assert_model_is_valid( - prog, - {"x": (2, 3, 5)}, - expected_output_shapes={block.outputs[0].name: (30,)}, - ) - - def test_redundant_ops_just_after_input_invalid_pattern_2(self): - """ - input----->leaky_relu(alpha=0.3) -----> add ---> out - | ^ - | | - |---->leaky_relu(alpha=0.4)------- - - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.leaky_relu(x=x, alpha=0.3) - x2 = mb.leaky_relu(x=x, alpha=0.4) - z = mb.add(x=x1, y=x2) - return z - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == ["leaky_relu", "leaky_relu", "add"] - assert get_op_types_in_program(prog) == ["leaky_relu", "leaky_relu", "add"] - 
assert_model_is_valid( - prog, - {"x": (2, 3, 5)}, - expected_output_shapes={block.outputs[0].name: (2, 3, 5)}, - ) - - def test_redundant_ops_just_after_input_invalid_pattern_3(self): - """ - test case, when inputs of 1 op is a subset of the inputs of the other op - - input----->layer_norm1 -----> add ---> out - | ^ - | | - |---->layer_norm2------- - - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(1, 3, 2))]) - def prog(x): - x1 = mb.layer_norm(x=x, axes=[2], epsilon=1e-4) - gamma_val = np.array([1.0, 1.0], dtype=np.float32) - beta_val = np.array([1.0, 0.0], dtype=np.float32) - x2 = mb.layer_norm(x=x, axes=[2], epsilon=1e-4, gamma=gamma_val, beta=beta_val) - z = mb.add(x=x1, y=x2) - return z - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == ["layer_norm", "layer_norm", "add"] - assert get_op_types_in_program(prog) == ["layer_norm", "layer_norm", "add"] - assert_model_is_valid( - prog, - {"x": (1, 3, 2)}, - expected_output_shapes={block.outputs[0].name: (1, 3, 2)}, - ) - - @staticmethod - def _make_repeated_conv_prog(redundant_conv=True, out_channel=2): - prog = Program() - func_inputs = {"x": mb.placeholder(shape=[1, 4, 5, 5])} - with Function(func_inputs) as ssa_fun: - x = ssa_fun.inputs["x"] - x = mb.relu(x=x) - W = np.random.rand(out_channel, 4, 3, 3) - if redundant_conv: - bias = np.random.rand(out_channel) - x1 = mb.conv(x=x, weight=W, bias=bias, pad_type="same", strides=[1, 1]) - x2 = mb.conv(x=x, weight=W, bias=bias, pad_type="same", strides=[1, 1]) - else: - x1 = mb.conv( - x=x, weight=W, bias=np.random.rand(out_channel), pad_type="same", strides=[1, 1] - ) - x2 = mb.conv( - x=x, weight=W, bias=np.random.rand(out_channel), pad_type="same", strides=[1, 1] - ) - x1 = mb.relu(x=x1) - x2 = mb.relu(x=x2) - x1 = mb.avg_pool(x=x1, kernel_sizes=[2, 2], strides=[1, 1], pad_type="same") - z = mb.concat(values=(x1, x2), axis=-3) - ssa_fun.set_outputs([z]) - prog.add_function("main", ssa_fun) - return prog - - def test_redundant_ops_inside_graph_valid_pattern(self): - """ - Input graph: - input--> relu--------->conv------>relu----> pool ---> concat ---> out - | ^ - | | - |---->conv---->relu---------------------------- - - Output graph: - input-> relu--->conv------>relu----> pool ---> concat ---> out - | ^ - | | - |------------------- - """ - prog = self._make_repeated_conv_prog(redundant_conv=True) - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == [ - "relu", - "conv", - "conv", - "relu", - "relu", - "avg_pool", - "concat", - ] - assert get_op_types_in_program(prog) == ["relu", "conv", "relu", "avg_pool", "concat"] - assert_model_is_valid( - prog, - {"x": (1, 4, 5, 5)}, - expected_output_shapes={block.outputs[0].name: (1, 4, 5, 5)}, - ) - - def test_redundant_ops_inside_graph_with_large_const(self): - """ - For the large constants, they need to be deduplicated by the const_deduplication first. - This test is making sure the converter is not doing any "brutal force" comparison. 
- - Input graph: - input--> relu--------->conv------>relu----> pool ---> concat ---> out - | ^ - | | - |---->conv---->relu---------------------------- - - Output graph: - input-> relu--->conv------>relu----> pool ---> concat ---> out - | ^ - | | - |------------------- - """ - # The remove_redundant_ops is not doing brutal force array comparison - prog = self._make_repeated_conv_prog(redundant_conv=True, out_channel=10) - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - ops_in_prev_prog = [ - "relu", - "conv", - "conv", - "relu", - "relu", - "avg_pool", - "concat", - ] - assert get_op_types_in_program(prev_prog) == ops_in_prev_prog - assert get_op_types_in_program(prog) == ops_in_prev_prog - - # We need to first run the const_deduplication pass. - prog = self._make_repeated_conv_prog(redundant_conv=True, out_channel=10) - _, _, block = apply_pass_and_basic_check(prog, "common::const_deduplication") - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - _, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - - assert get_op_types_in_program(prog) == ["relu", "conv", "relu", "avg_pool", "concat"] - assert_model_is_valid( - prog, - {"x": (1, 4, 5, 5)}, - expected_output_shapes={block.outputs[0].name: (1, 20, 5, 5)}, - ) - - def test_redundant_ops_inside_graph_invalid_pattern(self): - """ - input--->relu--------->conv1------>relu----> pool ---> concat ---> out - | ^ - | | - |---->conv2---->relu--------------------------- - """ - prog = self._make_repeated_conv_prog(redundant_conv=False) - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == [ - "relu", - "conv", - "conv", - "relu", - "relu", - "avg_pool", - "concat", - ] - assert get_op_types_in_program(prog) == [ - "relu", - "conv", - "conv", - "relu", - "relu", - "avg_pool", - "concat", - ] - assert_model_is_valid( - prog, - {"x": (1, 4, 5, 5)}, - expected_output_shapes={block.outputs[0].name: (1, 4, 5, 5)}, - ) - - def test_redundant_op_as_output_valid_pattern_1(self): - """ - Input graph: - input--------->relu------> out1 - | - | - |---->relu---->tanh---> out2 - - Output graph: - input--------->relu------> out1 - | - | - |---->tanh---> out2 - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.relu(x=x) - x2 = mb.relu(x=x) - return x1, mb.tanh(x=x2) - - prev_prog, _, block = apply_pass_and_basic_check(prog, "common::remove_redundant_ops") - assert get_op_types_in_program(prev_prog) == ["relu", "relu", "tanh"] - assert get_op_types_in_program(prog) == ["relu", "tanh"] - assert_model_is_valid( - prog, - {"x": (2, 3, 5)}, - expected_output_shapes={ - block.outputs[0].name: (2, 3, 5), - block.outputs[1].name: (2, 3, 5), - }, - ) - - def test_redundant_op_as_output_invalid_pattern_1(self): - """ - Input graph: - input--------->relu------> out1 - | - | - |---->relu---> out2 - - "common::remove_redundant_ops" pass does not remove ops if their outputs - are block outputs. 
- """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3, 5))]) - def prog(x): - x1 = mb.relu(x=x) - x2 = mb.relu(x=x) - return x1, x2 - - prev_prog, _, block = apply_pass_and_basic_check( - prog, - "common::remove_redundant_ops", - ) - assert get_op_types_in_program(prev_prog) == ["relu", "relu"] - assert get_op_types_in_program(prog) == ["relu", "relu"] - assert_model_is_valid( - prog, - {"x": (2, 3, 5)}, - expected_output_shapes={ - block.outputs[0].name: (2, 3, 5), - block.outputs[1].name: (2, 3, 5), - }, - ) - - def test_cond_block_program(self): - """ - - Test identical ops within different blocks are not removed. The "relu" op inside true and - false blocks are not removed since they are in different blocks. - - Test ops that have blocks inside them are not removed. There are two cond ops here, - with identical inputs but they are not removed, since they are ops that have nested block - inside them. - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) - def prog(x): - x1 = mb.cast(x=x, dtype="bool") - - def true_fn(): - x = mb.shape(x=x1) - x = mb.cast(x=x, dtype="fp32") - return mb.add(x=x, y=1.0) - - def false_fn(): - x = mb.shape(x=x1) - x = mb.cast(x=x, dtype="fp32") - return mb.add(x=x, y=-1.0) - - z1 = mb.cond(pred=x1, _true_fn=true_fn, _false_fn=false_fn) - z2 = mb.cond(pred=x1, _true_fn=true_fn, _false_fn=false_fn) - z = mb.add(x=z1, y=z2) - return z - - prev_prog, _, block = apply_pass_and_basic_check( - prog, - "common::remove_redundant_ops", - ) - assert get_op_types_in_program(prev_prog) == ["cast", "cond", "cond", "add"] - assert get_op_types_in_program(prog) == ["cast", "cond", "cond", "add"] - cond_op = prog.find_ops(op_type="cond")[0] - assert cond_op.blocks[0].operations[0].op_type == "shape" - assert cond_op.blocks[1].operations[0].op_type == "shape" - assert_model_is_valid( - prog, - {"x": (1,)}, - expected_output_shapes={block.outputs[0].name: (1,)}, - ) - - def test_concat_op_pattern(self): - """ - Input graph: - ---------------> concat ------> log ------> out1 - | ^ - | | - input--------->relu------> concat ------> relu----> out2 - | ^ | - | | | - |---->tanh-------------------- - - Output graph: - |------>log ------> out1 - | - | - input--------->relu------> concat ------> relu----> out2 - | ^ - | | - |---->tanh--------- - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 5))]) - def prog(x): - x1 = mb.relu(x=x) - x2 = mb.tanh(x=x) - c1 = mb.concat(values=(x1, x2), axis=0) - c2 = mb.concat(values=(x1, x2), axis=0) - z1 = mb.log(x=c1) - z2 = mb.relu(x=c2) - return z1, z2 - - prev_prog, _, block = apply_pass_and_basic_check( - prog, - "common::remove_redundant_ops", - ) - assert get_op_types_in_program(prev_prog) == [ - "relu", - "tanh", - "concat", - "concat", - "log", - "relu", - ] - assert get_op_types_in_program(prog) == ["relu", "tanh", "concat", "log", "relu"] - assert_model_is_valid( - prog, - {"x": (10, 5)}, - expected_output_shapes={block.outputs[0].name: (20, 5), block.outputs[1].name: (20, 5)}, - ) - - def test_multiple_redundant_child_ops_pattern(self): - """ - Input graph - - input -------------> reshape ----------> add ---------> out1 - | ^ - | | - |-------> reshape --------------- - | - |------> slice_by_size-----> add ----------> out2 - | ^ - | | - |------> slice_by_size ------- - - Output graph - - input -------------> reshape ----------> add ------------> out1 - | | ^ - | | | - | |--------- - | - |------> slice_by_size----------> add -----------------> out2 - | ^ - | | - |--------------------- - - """ - - 
@mb.program(input_specs=[mb.TensorSpec(shape=(10, 5, 4))]) - def prog(x): - x1 = mb.reshape(x=x, shape=[5, 2, -1]) - x2 = mb.reshape(x=x, shape=[5, 2, -1]) - x3 = mb.slice_by_size(x=x, begin=[0, 0, 1], size=[2, 4, 3]) - x4 = mb.slice_by_size(x=x, begin=[0, 0, 1], size=[2, 4, 3]) - z1 = mb.add(x=x1, y=x2) - z2 = mb.add(x=x3, y=x4) - return z1, z2 - - prev_prog, _, block = apply_pass_and_basic_check( - prog, - "common::remove_redundant_ops", - ) - assert get_op_types_in_program(prev_prog) == [ - "reshape", - "reshape", - "slice_by_size", - "slice_by_size", - "add", - "add", - ] - assert get_op_types_in_program(prog) == ["reshape", "slice_by_size", "add", "add"] - assert_model_is_valid( - prog, - {"x": (10, 5, 4)}, - expected_output_shapes={ - block.outputs[0].name: (5, 2, 20), - block.outputs[1].name: (2, 4, 3), - }, - ) - - def test_random_distribution_op_invalid_pattern(self): - """ - Identical random ops are not removed - - input----->cast---->random_uniform------> add ---> out - | ^ - | | - |---->random_uniform------------ - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(3,))]) - def prog(shape): - shape = mb.cast(x=shape, dtype="int32") - x1 = mb.random_uniform(shape=shape, low=0.0, high=1.0, seed=11) - x2 = mb.random_uniform(shape=shape, low=0.0, high=1.0, seed=11) - return mb.add(x=x1, y=x2) - - prev_prog, _, block = apply_pass_and_basic_check( - prog, - "common::remove_redundant_ops", - ) - assert get_op_types_in_program(prev_prog) == [ - "cast", - "random_uniform", - "random_uniform", - "add", - ] - assert get_op_types_in_program(prog) == ["cast", "random_uniform", "random_uniform", "add"] - - def test_nonreplaceable_vars(self): - """ - Nonreplaceable vars shouldn't be removed, e.g. palettized weights - - const_1----->add---->add_1------| - | | - input add---->output - | | - const_2----->add---->add_2------| - """ - def _constexpr_lut_to_dense(): - lut_data = np.array( - [-19.0, 4.0, 0.0, -1.0, 1.0, 3.0, 5.0, -8.0, 19, 13, 42, 4.5, 5.4, 2.0, -6, -7] - ).astype(np.float32) - indices = np.array([212, 21]).astype(np.uint8) - shape = np.array([4, 1]).astype(np.uint32) - return mb.constexpr_lut_to_dense(lut=lut_data, indices=indices, shape=shape) - - @mb.program(input_specs=[mb.TensorSpec(shape=(4, 1))]) - def prog(x): - constexpr_1 = _constexpr_lut_to_dense() - constexpr_2 = _constexpr_lut_to_dense() - c = mb.add(x=constexpr_1, y=x) - d = mb.add(x=constexpr_2, y=x) - return mb.add(x=c, y=d) - - prev_prog, _, _ = apply_pass_and_basic_check( - prog, - "common::remove_redundant_ops", - ) - assert get_op_types_in_program(prev_prog) == get_op_types_in_program(prog) - - -class TestTopologicalReorder: - def test_move_sink_casts_to_the_end(self): - """ - Input graph: - x (input) ---> square ---> cast (output) - | - | -----------> log ------> cast (output) - | - | -----------> relu -----> cast ----> relu (output) - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x = mb.cast(x=x, dtype="fp16") - x1 = mb.square(x=x) - x2 = mb.cast(x=x1, dtype="fp32") - x3 = mb.log(x=x) - x4 = mb.cast(x=x3, dtype="fp32") - x5 = mb.relu(x=x) - x6 = mb.cast(x=x5, dtype="fp32") - x7 = mb.relu(x=x6) - return x2, x4, x7 - - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "cast", - "log", - "cast", - "relu", - "cast", - "relu", - ] - - apply_pass_and_basic_check(prog, "common::topological_reorder") - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "log", - 
"relu", - "cast", - "relu", - "cast", - "cast", - ] - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={ - block.outputs[0].name: (10, 20), - block.outputs[1].name: (10, 20), - block.outputs[2].name: (10, 20), - }, - ) - - def test_move_sink_cast_transpose_to_the_end(self): - """ - Input graph: - x (input) ---> square ---> transpose ---> cast (output) - | - | -----------> log ------> transpose ---> cast (output) - | - | -----------> relu -----> cast ----> relu (output) - | - | -----------> relu (output) - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x = mb.cast(x=x, dtype="fp16") - x1 = mb.square(x=x) - x1_t = mb.transpose(x=x1, perm=[1, 0]) - x2 = mb.cast(x=x1_t, dtype="fp32") - x3 = mb.log(x=x) - x3_t = mb.transpose(x=x3, perm=[1, 0]) - x4 = mb.cast(x=x3_t, dtype="fp32") - x5 = mb.relu(x=x) - x6 = mb.cast(x=x5, dtype="fp32") - x7 = mb.relu(x=x6) - x8 = mb.relu(x=x) - return x2, x4, x7, x8 - - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "transpose", - "cast", - "log", - "transpose", - "cast", - "relu", - "cast", - "relu", - "relu", - ] - - apply_pass_and_basic_check(prog, "common::topological_reorder") - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "log", - "relu", - "cast", - "relu", - "relu", - "transpose", - "cast", - "transpose", - "cast", - ] - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={ - block.outputs[0].name: (20, 10), - block.outputs[1].name: (20, 10), - block.outputs[2].name: (10, 20), - block.outputs[3].name: (10, 20), - }, - ) - - def test_move_multiple_uses_overlapping(self): - """ - Input graph: - x (input) ---> cast ---> cast (output) - | - |-------> transpose ---> transpose (output) - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - x1 = mb.cast(x=x, dtype="fp16") - x2 = mb.cast(x=x1, dtype="fp32") - x3 = mb.transpose(x=x1, perm=[1, 0]) - x4 = mb.transpose(x=x3, perm=[1, 0]) - return x2, x4 - - assert get_op_types_in_program(prog) == ["cast", "cast", "transpose", "transpose"] - - apply_pass_and_basic_check(prog, "common::topological_reorder") - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - assert get_op_types_in_program(prog) == ["cast", "transpose", "transpose", "cast"] - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={ - block.outputs[0].name: (10, 20), - block.outputs[1].name: (10, 20), - }, - ) - - def test_move_split_to_first_use(self): - """ - Input graph: - x (input) ---> split ---> square ---> add (output) - | | | - | | --------------------| - | - | -----------> square --------------> relu (output) - """ - - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) - def prog(x): - s1, s2 = mb.split(x=x, num_splits=2, axis=0) - x2 = mb.square(x=x) - x3 = mb.relu(x=x2) - s1_1 = mb.square(x=s1) - s3 = mb.add(x=s1_1, y=s2) - return x3, s3 - - assert get_op_types_in_program(prog) == ["split", "square", "relu", "square", "add"] - - block = prog.functions["main"] - # Reorder `split` op to test op with multiple output case - topological_reorder._move_operations_to_the_end_block(block, ["split"]) - assert get_op_types_in_program(prog) == ["square", "relu", "split", "square", "add"] - - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={ - block.outputs[0].name: (10, 20), - block.outputs[1].name: (5, 20), - }, - ) + assert 
get_op_types_in_program(prog) == ["relu"] - def test_move_transpose_before_subblock(self): + def test_fuse_squeeze_expand_dims_negative(self): """ - Input graph: - x (input) ---> cast ---> transpose ---> cast (output) - | - | -----------> square ------> transpose (x1_t) ---> cast (output) - | - | -----------> squeeze ----> equal ----> squeeze - | - (true) <--- / \ ---> (false) - | | - | /<-(x1_t)->\ | - add <-/ \--> add - |---------> | <---------| - | - add ---> cast (output) + If squeeze and expand_dims cannot cancel each other, + the graph pass does nothing """ - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + @mb.program(input_specs=[mb.TensorSpec(shape=(3, 1, 4, 1, 1))]) def prog(x): - x = mb.cast(x=x, dtype="fp16") - x1 = mb.square(x=x) - x1_t = mb.transpose(x=x1, perm=[1, 0]) + x = mb.squeeze(x=x, axes=(1, 2)) + x = mb.expand_dims(x=x, axes=(1, 3)) + return mb.relu(x=x) - def true_fn(): - return mb.add(x=x1_t, y=np.float16(1), name="x2") + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["squeeze", "expand_dims", "relu"] - def false_fn(): - return mb.add(x=x1_t, y=np.float16(2), name="x2") + def test_fuse_squeeze_expand_dims_connected_output(self): + """ + If squeeze is connected to block output, it cannot be removed. + However, the expand_dims can be a block output. + """ + # squeeze connected to output. Nothing happens. + @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) + def prog(x): + squeeze = mb.squeeze(x=x, axes=(0,)) + expand_dims = mb.expand_dims(x=squeeze, axes=(0,)) + return mb.relu(x=expand_dims), squeeze - is_one = mb.equal(x=mb.squeeze(x=x), y=np.float16(1.0)) - pred = mb.squeeze(x=is_one) - x3 = mb.cond(pred=pred, _true_fn=true_fn, _false_fn=false_fn) - x4 = mb.add(x=x1_t, y=x3) - x5 = mb.cast(x=x4, dtype="fp32") - return x5 + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["squeeze", "expand_dims", "relu"] - apply_pass_and_basic_check(prog, "common::topological_reorder") - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") + # expand_dims connected to output. Still good to fuse. + @mb.program(input_specs=[mb.TensorSpec(shape=(1,))]) + def prog(x): + squeeze = mb.squeeze(x=x, axes=(0,)) + expand_dims = mb.expand_dims(x=squeeze, axes=(0,)) + return mb.relu(x=expand_dims), expand_dims - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "squeeze", - "equal", - "squeeze", - "transpose", - "cond", - "add", - "cast", - ] + apply_pass_and_basic_check(prog, "common::fuse_squeeze_expand_dims") + assert get_op_types_in_program(prog) == ["identity", "relu"] - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={block.outputs[0].name: (20, 10)}, - ) - def test_cast_transpose_already_at_the_end(self): +class TestAddConvTransposeOutputShape: + def test_add_conv_transpose_output_shape(self): """ - Input graph: - x (input) ---> square ---> transpose ---> cast (output) - | - | -----------> log ------> transpose ---> cast (output) - | - | -----------> relu -----> cast ----> relu (output) - | - | -----------> relu (output) + Given: + %1: (1, 5, 39, fp32) = conv_transpose(...) # no output_shape input. 
+ + Result: + %2: (3, i32) = const(val=[1,5,39]) + %3: (1, 5, 39, fp32) = conv_transpose(..., output_shape=%2) """ + N, C_in, C_out, D1 = 1, 3, 5, 20 - @mb.program(input_specs=[mb.TensorSpec(shape=(10, 20))]) + @mb.program(input_specs=[mb.TensorSpec(shape=(N, C_in, D1))]) def prog(x): - x = mb.cast(x=x, dtype="fp16") - x1 = mb.square(x=x) - x3 = mb.log(x=x) - x5 = mb.relu(x=x) - x6 = mb.cast(x=x5, dtype="fp32") - x7 = mb.relu(x=x6) - x8 = mb.relu(x=x) - x1_t = mb.transpose(x=x1, perm=[1, 0]) - x2 = mb.cast(x=x1_t, dtype="fp32") - x3_t = mb.transpose(x=x3, perm=[1, 0]) - x4 = mb.cast(x=x3_t, dtype="fp32") - return x2, x4, x7, x8 - - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "log", - "relu", - "cast", - "relu", - "relu", - "transpose", - "cast", - "transpose", - "cast", - ] - - apply_pass_and_basic_check(prog, "common::topological_reorder") - _, _, block = apply_pass_and_basic_check(prog, "common::dead_code_elimination") - - assert get_op_types_in_program(prog) == [ - "cast", - "square", - "log", - "relu", - "cast", - "relu", - "relu", - "transpose", - "cast", - "transpose", - "cast", - ] + weight = np.random.rand(C_in, C_out, D1).astype(np.float32) + return mb.conv_transpose(x=x, weight=weight) - assert_model_is_valid( - prog, - {"x": (10, 20)}, - expected_output_shapes={ - block.outputs[0].name: (20, 10), - block.outputs[1].name: (20, 10), - block.outputs[2].name: (10, 20), - block.outputs[3].name: (10, 20), - }, + prev_prog, prev_block, block = apply_pass_and_basic_check( + prog, "common::add_conv_transpose_output_shape" ) + assert get_op_types_in_program(prev_prog) == ["conv_transpose"] + assert get_op_types_in_program(prog) == ["conv_transpose"] + prev_conv_transpose_op = prev_prog.find_ops(op_type="conv_transpose", exactly_one=True)[0] + conv_transpose_op = prog.find_ops(op_type="conv_transpose", exactly_one=True)[0] + assert np.all(conv_transpose_op.output_shape.val == prev_conv_transpose_op.outputs[0].shape) class TestChildOrdering: @@ -3707,6 +1674,41 @@ class TestCastOptimizationReduendantCastRemoval: """ Test single cast op removal. """ + + def test_time_complexity(self): + """ + This test makes sure that cast_optimization's time complexity is O(N) in most cases. + + In this test case, the program consists of 1000 relu ops followed by 100 cast ops. + + input -> relu -> relu -> ... -> relu -> cast -> cast -> ... -> cast + + The algorithm goes through the first pass to eliminate all cast ops: + + input -> relu -> ... -> relu + + Note that the total number of visited ops is 1000 (relu) + 100 (cast) + 100 (const for the dtype) = 1200. + + Because a fusion happened, the algorithm goes through the program again. + This time, the number of visited ops is 1000 (relu) + 100 (const) = 1100. + + Overall, the number of visited ops is 2300. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1,), dtype=types.fp32)]) + def prog(x): + for _ in range(1000): + x = mb.relu(x=x) + for _ in range(100): + x = mb.cast(x=x, dtype="fp32") + return x + + graph_pass = cast_optimization() + graph_pass.apply(prog) + assert ( + graph_pass._num_of_visited_ops == 2_300 + ) # Please refer to the docstring for how the 2300 is derived.
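A quick back-of-the-envelope check of the 2300 visited-op count described in the docstring above; this is a standalone sketch with illustrative variable names, not part of the test itself:

.. sourcecode:: python

    # First sweep: every relu, every cast, and the const op feeding each cast's
    # dtype is visited once, and all of the casts get eliminated.
    n_relu, n_cast = 1000, 100
    first_sweep = n_relu + 2 * n_cast   # 1200
    # Because a fusion happened, the pass sweeps the program again; the casts are
    # gone, but their const ops remain until dead_code_elimination runs.
    second_sweep = n_relu + n_cast      # 1100
    assert first_sweep + second_sweep == 2_300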
+ def test_remove_redundant_cast_smoke(self): """ Input graph: @@ -4660,6 +2662,7 @@ def _false_fn(): v.dtype.val for v in false_block.find_ops(op_type="cast") ] + class TestConv1dCompositionPasses: @pytest.mark.parametrize( "backend, has_strides, pad_type, has_pad, has_dilations, has_bias", @@ -4690,7 +2693,8 @@ def test_conv1d_composition( if has_strides: conv_kwargs["strides"] = (2, 2) if has_pad: - conv_kwargs["pad"] = (1, 1, 1, 1) + # The pad is specially designed to make sure the output of conv has dim_size=1 at axis 1. + conv_kwargs["pad"] = (0, 0, 1, 1) if pad_type == "custom" else (1, 1, 1, 1) if has_dilations: conv_kwargs["dilations"] = (2, 2) if has_bias: @@ -4896,7 +2900,8 @@ def test_conv1d_channellast_composition( if has_strides: conv_kwargs["strides"] = (2, 2) if has_pad: - conv_kwargs["pad"] = (1, 1, 1, 1) + # The pad is specially designed to make sure the output of conv has dim_size=1 at axis 1. + conv_kwargs["pad"] = (0, 0, 1, 1) if pad_type == "custom" else (1, 1, 1, 1) if has_dilations: conv_kwargs["dilations"] = (2, 2) if has_bias: @@ -4961,7 +2966,8 @@ def test_conv1d_channellast_composotion_dynamic_weight(self, backend): K = 4 strides = (1, 2) - pad = (1, 0, 0, 1) + # The pad is specially designed to make sure the output of conv has dim_size=1 at axis 1. + pad = (0, 0, 0, 1) # MIL convolution with dynamic weights does not support dilations != 1 # see coremltools/coremltools/converters/mil/mil/ops/defs/iOS15/conv.py dilations = (1, 1) @@ -5035,7 +3041,7 @@ def test_conv1d_channellast_bias_fusion(self, backend, has_bias, bias_op_type): K = 4 strides = (1, 2) - pad = (0, 1, 1, 0) + pad = (0, 0, 1, 0) dilations = (1, 2) # infer L_out with pad_type fixed to custom @@ -5414,6 +3420,36 @@ def prog(x): apply_pass_and_basic_check(prog, "common::dead_code_elimination") assert get_op_types_in_program(prog) == ["conv"] + def test_scope_back_propagation(self): + Cin, Cout = 3, 3 + input_shape = (2, Cin, 100, 100) + + @mb.program(input_specs=[mb.TensorSpec(shape=input_shape)]) + def prog(x): + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"])): + x = self.get_conv(x, "conv1") + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"])): + x = self.get_linear(x, "linear1", "add") + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_3"])): + x = self.get_conv(x, "conv2") + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_4"])): + x = self.get_linear(x, "linear2", "add") + return x + + apply_pass_and_basic_check(prog, "common::fuse_conv_bias") + assert get_op_types_in_program(prog) == ["conv", "conv"] + + conv_ops = prog.functions["main"].find_ops(op_type="conv") + assert conv_ops[0].scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2", "fuse_conv_bias"] + } + assert conv_ops[1].scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_4", "fuse_conv_bias"] + } + """ Input graph: Const @@ -7551,7 +5587,9 @@ def prog(x): x4 = mb.add(x=x1, y=x3) return mb.relu(x=x4) - prog.main_input_types = [ct.ImageType(name="x", shape=(10, 20, 30, 3), channel_first=False)] + prog.functions["main"].input_types = [ + ct.ImageType(name="x", shape=(10, 20, 30, 3), channel_first=False) + ] prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "common::image_input_preprocess" ) @@ -7594,7 +5632,7 @@ def test_nn_backend_style_sanitization(self): for the NN backend. 
""" - prog = Program() + prog = mil.Program() func_inputs = {"x/0": mb.placeholder(shape=[2, 3]), "y": mb.placeholder(shape=[2, 3])} with Function(func_inputs) as ssa_fun: x, y = ssa_fun.inputs["x/0"], ssa_fun.inputs["y"] @@ -7654,7 +5692,7 @@ def prog(input): x = mb.square(x=input, name="output_square") return x - prog.set_main_output_types([ct.TensorType(dtype=np.float16)]) + prog.functions["main"].set_output_types([ct.TensorType(dtype=np.float16)]) prev_prog, prev_block, block = apply_pass_and_basic_check( prog, "common::update_output_dtypes", skip_output_type_check=True ) @@ -7691,7 +5729,7 @@ def prog(input): x1, x2 = mb.split(x=input, num_splits=2, axis=1, name="split") return x1, x2 - prog.set_main_output_types([ct.TensorType(), ct.TensorType(dtype=np.float16)]) + prog.functions["main"].set_output_types([ct.TensorType(), ct.TensorType(dtype=np.float16)]) _, _, block = apply_pass_and_basic_check( prog, "common::update_output_dtypes", skip_output_type_check=True ) @@ -7723,7 +5761,7 @@ def test_output_as_input(self, caplog): def prog(input): return input - prog.set_main_output_types([ct.TensorType(dtype=np.float16)]) + prog.functions["main"].set_output_types([ct.TensorType(dtype=np.float16)]) _, _, block = apply_pass_and_basic_check( prog, "common::update_output_dtypes", @@ -8378,7 +6416,6 @@ def prog(x): prog, {"x": shape}, expected_output_shapes={block.outputs[0].name: shape} ) - class TestFuseLinearBias: @staticmethod def _apply_transform(inputs, func, is_first_input, has_bias): @@ -8573,3 +6610,387 @@ def prog(x): if _VALIDATE_MODEL: assert_model_is_valid(prog, {"x": (2, 4)}) + + +class TestGraphPassScopePreservation: + def test_single_pass(self): + """ + Input: + + x + -> relu(torch_scope="module_1") + -> transpose_1(torch_scope="module_1") + -> transpose_2(torch_scope="module_2") + -> output + + Output: + + x + -> relu(torch_scope="module_1") + -> transpose_3( + torch_scope="module_2", + pass_scope="merge_consecutive_transposes" + ) + -> output + + In the above case, the relu op preserves its original scope information. + Since transpose_3 is created by the "merge_consecutive_transposes" pass, the COREMLTOOLS_GRAPH_PASS scope + information will be saved in the op. + Also, the TORCHSCRIPT_MODULE_TYPE scope info of transpose_2 is back propagated to transpose_3, + when the use of output of transpose_2 is replaced by the output of transpose_3. 
+ """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 3, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_1"), + ): + x = mb.relu(x=x) + x = mb.transpose(x=x, perm=[0, 2, 1, 3]) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_2"), + ): + return mb.transpose(x=x, perm=[3, 2, 0, 1]) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + apply_pass_and_basic_check(prog, "common::merge_consecutive_transposes") + assert get_op_types_in_program(prog) == ["relu", "transpose"] + + # the scope info in the relu op is not affected + relu_op = prog.find_ops(op_type="relu")[0] + assert len(relu_op.scopes) == 1 + assert relu_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + + # the new transpose op has the scope information from the graph pass + transpose_op = prog.find_ops(op_type="transpose")[0] + assert len(transpose_op.scopes) == 2 + assert transpose_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "merge_consecutive_transposes" + ] + assert transpose_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + + def test_single_pass_without_creating_new_var(self): + """ + Input: + + x + -> relu(torch_scope="module_1") + -> relu(torch_scope="module_2") + -> relu(torch_scope="module_3") + -> output + + Output: + + x + -> relu(torch_scope="module_1") + -> output + + In the above case, the relu op preserves its original scope information, since the graph pass only reconnects the graph. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 3, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_1"), + ): + x = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_2"), + ): + x = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_3"), + ): + return mb.relu(x=x) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + apply_pass_and_basic_check(prog, "common::merge_consecutive_relus") + assert get_op_types_in_program(prog) == ["relu"] + + # the scope info in the relu op is not affected + relu_op = prog.find_ops(op_type="relu")[0] + assert len(relu_op.scopes) == 1 + assert relu_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + + def test_multiple_passes(self): + """ + In this case, a program goes through two graph passes. + And the resulting program should have scope information from both passes. 
+ """ + shape = (3, 5, 6, 7) + + @mb.program(input_specs=[mb.TensorSpec(shape=shape)]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_1"), + ): + # dummy op + x = mb.relu(x=x) + + # pattern for "merge_consecutive_transposes" + x = mb.transpose(x=x, perm=[0, 2, 1, 3]) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_2"), + ): + x = mb.transpose(x=x, perm=[3, 2, 0, 1]) + + # pattern for "fuse_layernorm_or_instancenorm" + mean0 = mb.reduce_mean(x=x, axes=[2, 3], keep_dims=True) + sub0 = mb.sub(x=x, y=mean0) + sub1 = mb.sub(x=x, y=mean0) + square = mb.square(x=sub0) + mean1 = mb.reduce_mean(x=square, axes=[2, 3], keep_dims=True) + add_eps = mb.add(x=mean1, y=1e-5) # epsilon + pow = mb.pow(x=add_eps, y=0.5) + div = mb.real_div(x=sub1, y=pow) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_3"), + ): + mul_gamma = mb.mul(x=np.random.rand(1, shape[1], 1, 1), y=div) + return mb.add(x=np.random.rand(1, shape[1], 1, 1), y=mul_gamma) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + apply_pass_and_basic_check(prog, "common::fuse_layernorm_or_instancenorm") + apply_pass_and_basic_check(prog, "common::merge_consecutive_transposes") + assert get_op_types_in_program(prog) == ["relu", "transpose", "instance_norm"] + + # the scope info in the relu op is not affected + relu_op = prog.find_ops(op_type="relu")[0] + assert len(relu_op.scopes) == 1 + assert relu_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + + # the new transpose op has the scope information from the graph pass + transpose_op = prog.find_ops(op_type="transpose")[0] + assert len(transpose_op.scopes) == 2 + assert transpose_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "merge_consecutive_transposes" + ] + assert transpose_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + + # the new instance_norm op has the scope information from the graph pass + instance_norm_op = prog.find_ops(op_type="instance_norm")[0] + assert len(instance_norm_op.scopes) == 2 + assert instance_norm_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "fuse_layernorm_or_instancenorm" + ] + assert instance_norm_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_3"] + + def test_fp16_scope_preservation(self): + """ + This test explains step-by-step how the scope information preservation works in the fp32 -> fp16 pass. + + Input graph: + + x(fp32) + -> relu(torch_scope="module_1") + -> sin(torch_scope="module_2") + -> output(fp32) + + (1) "common::add_fp16_cast" + + First, in the add_fp16_cast graph pass, multiple cast ops are injected in the graph: + + x(fp32) + -> cast(dtype="fp16", torch_scope="module_1", pass_scope="add_fp16_cast") + -> relu(torch_scope="module_1", pass_scope="add_fp16_cast") + -> cast(dtype="fp32", torch_scope="module_1", pass_scope="add_fp16_cast") + -> cast(dtype="fp16", torch_scope="module_2", pass_scope="add_fp16_cast") + -> sin(torch_scope="module_2", pass_scope="add_fp16_cast") + -> cast(dtype="fp32, torch_scope="module_2", pass_scope="add_fp16_cast") + -> output + + There are 4 cast ops in the graph who has pass_scope = "add_fp16_cast", which indicates they are added by the "add_fp16_cast" pass. + + Note that, the first cast -> relu -> cast pattern has the same torch scope information as + the original relu(torch_scope="module_1"). 
This is due to the fact that when we replace + the use of the original relu output with the output of the second cast op, the scope information is back propagated. + + The same reasoning applies to why the cast -> sin -> cast pattern has the same torch scope as + the original sin op. + + (2) "common::cast_optimization" + "common::dead_code_elimination" + + After the cleanup, the graph becomes: + + x(fp32) + -> cast( + dtype="fp16", + torch_scope="module_1", + pass_scope="add_fp16_cast" + ) + -> relu( + torch_scope="module_1", + pass_scope="add_fp16_cast" + ) + -> sin( + torch_scope="module_2", + pass_scope="add_fp16_cast" + ) + -> cast( + dtype="fp32", + torch_scope="module_2", + pass_scope="add_fp16_cast" + ) + -> output + + We can see that the fp16 version of relu / sin preserves the original torch scope information. + """ + shape = (3, 5, 6, 7) + + @mb.program(input_specs=[mb.TensorSpec(shape=shape)]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_1"), + ): + x = mb.relu(x=x) + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_2"), + ): + return mb.sin(x=x) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + # fp16 cast pass + apply_pass_and_basic_check(prog, "common::add_fp16_cast") + assert get_op_types_in_program(prog) == ["cast", "relu", "cast", "cast", "sin", "cast"] + + cast_ops = prog.find_ops(op_type="cast") + assert len(cast_ops) == 4 + assert len(cast_ops[0].scopes) == 2 + assert cast_ops[0].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert cast_ops[0].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + assert len(cast_ops[1].scopes) == 2 + assert cast_ops[1].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert cast_ops[1].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + assert len(cast_ops[2].scopes) == 2 + assert cast_ops[2].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "add_fp16_cast", + ] + assert cast_ops[2].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + assert len(cast_ops[3].scopes) == 2 + assert cast_ops[3].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert cast_ops[3].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + + relu_op = prog.find_ops(op_type="relu")[0] + assert len(relu_op.scopes) == 2 + assert relu_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert relu_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + + sin_op = prog.find_ops(op_type="sin")[0] + assert len(sin_op.scopes) == 2 + assert sin_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert sin_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + + # clean up with cast optimization and dead code elimination + apply_pass_and_basic_check(prog, "common::cast_optimization") + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == ["cast", "relu", "sin", "cast"] + cast_ops = prog.find_ops(op_type="cast") + assert len(cast_ops) == 2 + assert len(cast_ops[0].scopes) == 2 + assert cast_ops[0].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert cast_ops[0].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + assert len(cast_ops[1].scopes) == 2 + assert cast_ops[1].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert cast_ops[1].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + + relu_op =
prog.find_ops(op_type="relu")[0] + assert len(relu_op.scopes) == 2 + assert relu_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "add_fp16_cast", + ] + assert relu_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + + sin_op = prog.find_ops(op_type="sin")[0] + assert len(sin_op.scopes) == 2 + assert sin_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["add_fp16_cast"] + assert sin_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + + def test_pass_followed_by_fp16(self): + """ + Input: + + x + -> transpose_1(torch_scope="module_1") + -> transpose_2(torch_scope="module_2") + -> output + + Output: + + x + -> cast( + dtype="fp16", + torch_scope="module_2", + pass_scope=["merge_consecutive_transposes", "add_fp16_cast"] + ) + -> transpose_3_fp16( + torch_scope="module_2", + pass_scope=["merge_consecutive_transposes", "add_fp16_cast"] + ) + -> cast(dtype="fp32", + torch_scope="module_2", + pass_scope=["merge_consecutive_transposes", "add_fp16_cast"] + ) + -> output + + In the above case, two transpose ops first merged into a single transpose op, + and the graph is transformed into fp16. + + Hence, the final transpose op should have scope information from both graph passes. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(1, 2, 3, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_1"), + ): + x = mb.transpose(x=x, perm=[0, 2, 1, 3]) + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="module_2"), + ): + return mb.transpose(x=x, perm=[3, 2, 0, 1]) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + apply_pass_and_basic_check(prog, "common::merge_consecutive_transposes") + apply_pass_and_basic_check(prog, "common::add_fp16_cast") + apply_pass_and_basic_check(prog, "common::cast_optimization") + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == ["cast", "transpose", "cast"] + + cast_ops = prog.find_ops(op_type="cast") + assert len(cast_ops) == 2 + assert len(cast_ops[0].scopes) == 2 + assert cast_ops[0].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "merge_consecutive_transposes", + "add_fp16_cast", + ] + assert cast_ops[0].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_2"] + assert len(cast_ops[1].scopes) == 2 + assert cast_ops[1].scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "merge_consecutive_transposes", + "add_fp16_cast", + ] + assert cast_ops[1].scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_2", + ] + + transpose_op = prog.find_ops(op_type="transpose")[0] + assert len(transpose_op.scopes) == 2 + assert transpose_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "merge_consecutive_transposes", + "add_fp16_cast", + ] + assert transpose_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_2", + ] diff --git a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py index b1cbcf52f..ebcd0d7e5 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py @@ -16,6 +16,7 @@ from coremltools._deps import _HAS_TORCH, _IS_MACOS, MSG_TORCH_NOT_FOUND from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil.passes.defs import quantization +from coremltools.converters.mil.mil.passes.defs.quantization import add_fp16_cast from 
coremltools.converters.mil.mil.types import numpy_type_to_builtin_type from coremltools.converters.mil.testing_utils import ( apply_pass_and_basic_check, @@ -31,7 +32,8 @@ class TestTensorwiseAffineDequantizeConstElimination: - def test_eliminate_transpose(self): + @pytest.mark.parametrize("axis", (None, 0, 1, -1)) + def test_eliminate_transpose(self, axis): """ Input graph: data -> constexpr_affine_dequantize -> transpose @@ -41,21 +43,28 @@ def test_eliminate_transpose(self): where new_data is the value after applying transpose to data """ - quantized_data = np.random.randint(0, 256, (1, 2, 3, 4)).astype(np.int8) + SHAPE = (1, 2, 3, 4) + quantized_data = np.random.randint(0, 256, SHAPE).astype(np.int8) + if axis is None: + axis = 0 # although tensor-wise, constexpr_affine_dequantize requires a (dummy) axis + scale = np.random.rand() + zero_point = np.random.randint(-127, 128, dtype=np.int8) + else: + size = SHAPE[axis] + scale = np.random.rand(size) + zero_point = np.random.randint(-127, 128, size, dtype=np.int8) @mb.program(input_specs=[], opset_version=ct.target.iOS16) def prog(): res = mb.constexpr_affine_dequantize( quantized_data=quantized_data, - axis=0, - scale=8.9, - zero_point=np.int8(34), + axis=axis, + scale=scale, + zero_point=zero_point, ) return mb.transpose(x=res, perm=(2, 0, 1, 3)) - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] @@ -84,9 +93,7 @@ def prog(): ) return mb.reshape(x=res, shape=(3, -1)) - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] @@ -115,9 +122,7 @@ def prog(): ) return mb.expand_dims(x=res, axes=(0, 2, 4)) - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] @@ -147,9 +152,7 @@ def prog(): ) return mb.squeeze(x=res, axes=axis) - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] @@ -182,9 +185,7 @@ def prog(): res = mb.expand_dims(x=res, axes=(0, 2, 4)) return mb.squeeze(x=res, axes=(2,)) - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] new_op = prog.find_ops(op_type="constexpr_affine_dequantize", exactly_one=True)[0] @@ -196,39 +197,6 @@ def prog(): np.testing.assert_array_equal(new_op.quantized_data.val, expected_quantized_data) - def 
test_negative_channel_wise_pattern(self): - """ - If ``constexpr_affine_dequantize`` is not tensor-wise, - the graph is not changed. - """ - quantized_data = np.random.randint(0, 256, (2, 3, 4)).astype(np.int8) - - @mb.program(input_specs=[], opset_version=ct.target.iOS16) - def prog(): - x = mb.constexpr_affine_dequantize( - quantized_data=quantized_data, - axis=0, - scale=[8.9, 6.5], - zero_point=np.int8(34), - ) - y = mb.constexpr_affine_dequantize( - quantized_data=quantized_data, - axis=0, - scale=8.9, - zero_point=np.int8([34, 56]), - ) - return mb.transpose(x=x, perm=(1, 0, 2)), mb.transpose(x=y, perm=(1, 0, 2)) - - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) - assert get_op_types_in_program(prog) == [ - "constexpr_affine_dequantize", - "constexpr_affine_dequantize", - "transpose", - "transpose", - ] - def test_negative_non_linked_list_pattern(self): """ If ``quantized_data`` feeds into multiple ``constexpr_affine_dequantize`` ops, @@ -253,9 +221,7 @@ def prog(): ) return mb.transpose(x=x, perm=(1, 0, 2)), mb.reshape(x=y, shape=(24,)) - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == [ "constexpr_affine_dequantize", "constexpr_affine_dequantize", @@ -282,9 +248,7 @@ def prog(): y = mb.transpose(x=x, perm=(0, 3, 2, 1)) return x, y - apply_pass_and_basic_check( - prog, "common::merge_tensorwise_affine_dequantize_with_consecutive_ops" - ) + apply_pass_and_basic_check(prog, "common::merge_affine_dequantize_with_consecutive_ops") assert get_op_types_in_program(prog) == [ "constexpr_affine_dequantize", "transpose", @@ -1741,10 +1705,11 @@ def prog(): return y assert get_op_types_in_program(prog) == ["dequantize"] + dequantize_op = prog.find_ops(op_type="dequantize")[0] + assert dequantize_op.outputs[0].val is None + assert dequantize_op.can_materialize_val() - prev_prog, prev_block, block = apply_pass_and_basic_check( - prog, "common::dequantize_to_constexpr" - ) + apply_pass_and_basic_check(prog, "common::dequantize_to_constexpr") assert get_op_types_in_program(prog) == ["constexpr_affine_dequantize"] @pytest.mark.parametrize( @@ -2196,20 +2161,150 @@ def prog(x): ) +class TestTransformFunctionSignatures: + @staticmethod + def test_empty(): + """ + Case where the input var is also a block output. 
+ """ + # case 1 + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + return x + + graph_pass = add_fp16_cast() + block = prog.functions["main"] + graph_pass.transform_function_signatures(block) + apply_pass_and_basic_check(prog, "common::dead_code_elimination") + + assert get_op_types_in_program(prog) == [] + assert block.inputs["x"].dtype == types.fp16 + assert len(block.outputs) == 1 + assert block.outputs[0].dtype == types.fp16 + assert block.outputs[0] is block.inputs["x"] + + # case 2 + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + return x, mb.relu(x=x), x, x + + graph_pass = add_fp16_cast() + block = prog.functions["main"] + graph_pass.transform_function_signatures(block) + + assert block.inputs["x"].dtype == types.fp16 + assert len(block.outputs) == 4 + + assert block.outputs[0].dtype == types.fp16 + assert block.outputs[2].dtype == types.fp16 + assert block.outputs[3].dtype == types.fp16 + + assert block.outputs[1].dtype == types.fp32 + + assert block.outputs[0] is block.inputs["x"] + assert block.outputs[2] is block.inputs["x"] + assert block.outputs[3] is block.inputs["x"] + + assert all([x.dtype == types.fp16 for x in block.output_types]) + + assert get_op_types_in_program(prog) == ["cast", "relu"] + cast_op = block.find_ops(op_type="cast")[0] + assert cast_op.dtype.val == "fp32" + + @staticmethod + def test_simple(): + """ + Input graph: + + input(fp32) -> relu -> output + + Output graph: + + input(fp16) -> cast(dtype="fp32") -> relu -> output, + + with function.output_types = [ct.TesorType(dtype=types.fp16)] + + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + return mb.relu(x=x) + + graph_pass = add_fp16_cast() + block = prog.functions["main"] + graph_pass.transform_function_signatures(block) + + assert block.inputs["x"].dtype == types.fp16 + + assert get_op_types_in_program(prog) == ["cast", "relu"] + cast_op = block.find_ops(op_type="cast")[0] + assert cast_op.dtype.val == "fp32" + + assert len(block.outputs) == 1 + assert block.outputs[0].dtype == types.fp32 + + assert len(block.output_types) == 1 + assert block.output_types[0].dtype == types.fp16 + + @staticmethod + def test_simple_2(): + """ + Input graph: + + input(fp32) -> identity -> cast(dtype="int32") -> output_1 + | + .-> output_2 + + Output graph: + + input(fp16) -> cast(dtype="fp32") -> identity -> cast(dtype="int32") -> output_1 + | + .-> output_2, + + with function.output_types = [ct.TesorType(dtype=types.int32), ct.TesorType(dtype=types.fp16)] + + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + x = mb.identity(x=x) + return mb.cast(x=x, dtype="int32"), x + + graph_pass = add_fp16_cast() + block = prog.functions["main"] + graph_pass.transform_function_signatures(block) + + assert block.inputs["x"].dtype == types.fp16 + + assert get_op_types_in_program(prog) == ["cast", "identity", "cast"] + cast_ops = block.find_ops(op_type="cast") + assert cast_ops[0].dtype.val == "fp32" + assert cast_ops[1].dtype.val == "int32" + + assert len(block.outputs) == 2 + assert block.outputs[0].dtype == types.int32 + assert block.outputs[1].dtype == types.fp32 + + assert len(block.output_types) == 2 + assert block.output_types[0].dtype == types.int32 + assert block.output_types[1].dtype == types.fp16 + + class TestInt32CastToInt16: @pytest.mark.parametrize( - "x_dtype, dynamic, opset_version", + "x_dtype, dynamic, has_neg, opset_version", itertools.product( [np.int32, np.float32], [True, False], + [True, False], 
[ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], ), ) - def test_gather_int16_indices(self, x_dtype, dynamic, opset_version): + def test_gather_int16_indices(self, x_dtype, dynamic, has_neg, opset_version): @mb.program(opset_version=opset_version) def prog_static(): params = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype) - indices = np.array([1, 0], dtype=np.int32) + indices = np.array([-2, 0] if has_neg else [1, 0], dtype=np.int32) return mb.gather(x=params, indices=indices, axis=-1) @mb.program( @@ -2241,7 +2336,7 @@ def prog_dynamic(x, indices): assert get_op_types_in_program(prog) == expected_ops indices_cast_op_idx = 1 if x_dtype == np.int32 else 0 cast_op = block.find_ops(op_type="cast")[indices_cast_op_idx] - assert cast_op.dtype.val == "int16" + assert cast_op.dtype.val == ("int16" if has_neg else "uint16") assert len(cast_op.outputs) == 1 assert len(cast_op.outputs[0].child_ops) == 1 assert cast_op.outputs[0].child_ops[0].op_type == "gather" @@ -2255,19 +2350,42 @@ def prog_dynamic(x, indices): rtol=1e-05, ) + def test_gather_int16_scalar_indices(self): + @mb.program(input_specs=[], opset_version=ct.target.iOS17) + def prog_static(): + params = np.array([1, 2, 3, 4], dtype=np.int32) + res = mb.gather(x=params, indices=0, axis=0, batch_dims=0, validate_indices=False) + return res + + @mb.program( + input_specs=[mb.TensorSpec(shape=(4,), dtype=types.int32)], + opset_version=ct.target.iOS17, + ) + def prog_dynamic(x): + return mb.gather(x=x, indices=0, axis=0) + + for prog in (prog_static, prog_dynamic): + assert get_op_types_in_program(prog) == ["gather"] + prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") + expected_ops = ["cast", "cast", "gather", "cast"] + assert get_op_types_in_program(prog) == expected_ops + @pytest.mark.parametrize( - "x_dtype, dynamic, opset_version", + "x_dtype, dynamic, has_neg, opset_version", itertools.product( [np.int32, np.float32], [True, False], + [True, False], [ct.target.iOS15, ct.target.iOS16, ct.target.iOS17], ), ) - def test_gather_along_axis_int16_indices(self, x_dtype, dynamic, opset_version): + def test_gather_along_axis_int16_indices(self, x_dtype, dynamic, has_neg, opset_version): @mb.program(opset_version=opset_version) def prog_static(): params = np.array([[1, 2, 3], [4, 5, 6]], dtype=x_dtype) - indices = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.int32) + indices = np.array( + [[-2, 0, -2], [-2, -2, 0]] if has_neg else [[1, 0, 1], [1, 1, 0]], dtype=np.int32 + ) return mb.gather_along_axis(x=params, indices=indices, axis=-1) @mb.program( @@ -2299,7 +2417,7 @@ def prog_dynamic(x, indices): assert get_op_types_in_program(prog) == expected_ops indices_cast_op_idx = 1 if x_dtype == np.int32 else 0 cast_op = block.find_ops(op_type="cast")[indices_cast_op_idx] - assert cast_op.dtype.val == "int16" + assert cast_op.dtype.val == ("int16" if has_neg else "uint16") assert len(cast_op.outputs) == 1 assert len(cast_op.outputs[0].child_ops) == 1 assert cast_op.outputs[0].child_ops[0].op_type == "gather_along_axis" @@ -2336,17 +2454,25 @@ def prog(x, indices): assert cast_op.dtype.val == "int16" assert cast_op.outputs[0] == block.find_ops(op_type="gather")[0].indices - def test_gather_static_overflow_int16(self): - """Indices cannot be represented by int16 range, don't cast to int16.""" + @pytest.mark.parametrize("overflow_uint16", [True, False]) + def test_gather_static_overflow_int16(self, overflow_uint16): + """Indices cannot be represented by int16 range, but might be represented by uint16.""" + max_index = 65536 if
overflow_uint16 else 32768 @mb.program(opset_version=ct.target.iOS17) def prog(): - params = np.array([[1, 2]] * 32769, dtype=np.float32) - indices = np.array([32768, 0], dtype=np.int32) + params = np.array([[1, 2]] * (max_index + 1), dtype=np.float32) + indices = np.array([max_index, 0], dtype=np.int32) return mb.gather(x=params, indices=indices, axis=0) prev_prog, _, block = apply_pass_and_basic_check(prog, "common::add_int16_cast") - assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + if overflow_uint16: + assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) + else: + assert get_op_types_in_program(prog) == ["cast", "gather"] + cast_op = block.find_ops(op_type="cast")[0] + assert cast_op.dtype.val == "uint16" + assert cast_op.outputs[0] == block.find_ops(op_type="gather")[0].indices @patch( "coremltools.converters.mil.mil.passes.defs.quantization.add_int16_cast._PREFER_INT16_OPS", diff --git a/coremltools/converters/mil/mil/program.py b/coremltools/converters/mil/mil/program.py index 462e88ffc..10deb389d 100644 --- a/coremltools/converters/mil/mil/program.py +++ b/coremltools/converters/mil/mil/program.py @@ -4,14 +4,13 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from collections import defaultdict -from typing import Dict, List +from typing import Dict, List, Optional, Union import numpy as _np import sympy as _sm from coremltools import _logger as logger from coremltools.converters.mil._deployment_compatibility import AvailableTarget as _target -from coremltools.converters.mil.input_types import InputType from coremltools.converters.mil.mil.input_type import InternalInputType from coremltools.converters.mil.mil.ops.helper import _get_version_of_op from coremltools.converters.mil.mil.var import ListVar @@ -19,6 +18,7 @@ from . import types from .block import Function from .operation import Operation +from .scope import ScopeSource from .types.symbolic import k_num_internal_syms, k_used_symbols from .var import Var @@ -28,20 +28,19 @@ class Program: def _get_opset_str_value(op): return f"coremltools.target.{op.name}" - @staticmethod - def _get_supported_dialect_opset() -> List[str]: - """ - Return a list of supported dialect opsets at runtime. - """ - return [] - def __init__(self): - self.main_input_types = [] - self.main_output_types = None self.functions = {} - self.parameters = {} self.skip_all_passes = False + def _add_essential_scope_source( + self, scope_source: Union[ScopeSource, List[ScopeSource]] + ) -> None: + """ + Add essential scope sources to functions. + """ + for func in self.functions.values(): + func._add_essential_scope_source(scope_source) + def _get_dialect_namespaces(self) -> Dict[str, List[Operation]]: """ Return a dict which maps the dialect namespace into a list of corresponding operations. 
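For orientation, here is a minimal usage sketch of the ``_add_essential_scope_source`` helper added in the hunk above, mirroring how the new scope tests call it; the toy program is illustrative, and only the helper and ``validate(check_essential_scope=...)`` come from this diff:

.. sourcecode:: python

    from coremltools.converters.mil.mil import Builder as mb
    from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource

    @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))])
    def prog(x):
        with mb.scope(ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="Module1")):
            return mb.relu(x=x)

    # Mark the torchscript-type scope as essential on every function of the program,
    # so that downstream graph passes are expected to preserve it and
    # prog.validate(check_essential_scope=True) can verify that they did.
    prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE)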
@@ -49,7 +48,7 @@ def _get_dialect_namespaces(self) -> Dict[str, List[Operation]]: res = defaultdict(list) def get_dialect_namespaces_block(block): - for op in list(block.operations): + for op in block.operations: for b in op.blocks: get_dialect_namespaces_block(b) if hasattr(op, "_dialect_namespace"): @@ -72,7 +71,7 @@ def _get_max_opset_version_and_op(self): def _check_ops_version_compatibility(self, max_opset_version): def check_version_compatibility_block(block): - for op in list(block.operations): + for op in block.operations: for b in op.blocks: check_version_compatibility_block(b) if not hasattr(op, "_op_variants") or not isinstance(op._op_variants, dict): @@ -146,10 +145,13 @@ def _check_invalid_tensor_rank_block(block): _check_invalid_tensor_rank_block(b) for o in op.outputs: if not isinstance(o, ListVar) and (o.rank < 0 or o.rank >= 6): - if op.op_type == "const" and len(o.child_ops) == 1 and \ - o.child_ops[0].op_type == "constexpr_lut_to_dense": - # For lut op, the lookup table is allowed to have rank > 5. - continue + if op.op_type == "const" or op.op_type.startswith("constexpr_"): + if all( + child_op.op_type.startswith("constexpr_") + for child_op in o.child_ops + ): + # For const/constexpr op's constexpr output, tensor with rank > 5 is ok. + continue raise ValueError( f'Core ML only supports tensors with rank <= 5. Layer "{op.name}", ' f'with type "{op.op_type}", outputs a rank {o.rank} tensor. ' @@ -207,20 +209,6 @@ def add_function(self, name, ssa_func): def add_parameters(self, name, ssa_val): raise NotImplementedError() - def set_main_input_types(self, inputs): - if not isinstance(inputs, tuple): - raise ValueError("main inputs should be tuple of TensorType or ImageType") - elif not all([isinstance(inp, InputType) for inp in inputs]): - raise ValueError("main inputs should be tuple of InputSpec") - self.main_input_types = inputs - - def set_main_output_types(self, outputs=None): - if outputs is not None: - if not (isinstance(outputs, list) and all([isinstance(out, InputType) for out in outputs])): - raise TypeError("main outputs should be a list of type ct.TensorType or ct.ImageType") - self.main_output_types = outputs - - def find_ops(self, prefix=None, op_type=None, exactly_one=False): """ Return list of ops with name matching `prefix` if specified, and @@ -242,9 +230,51 @@ def find_ops(self, prefix=None, op_type=None, exactly_one=False): raise ValueError(msg.format(found_ops)) return found_ops - def validate(self): + def validate(self, check_essential_scope: Optional[bool] = False) -> None: for f in self.functions.values(): - f.validate() + f.validate(force_validate=True, check_essential_scope=check_essential_scope) + + def construct_debug_handle_to_ops_mapping(self) -> Dict: + """ + For PyMIL program translated from ExecuTorch only: Based on scope info inherited from EXIR, + construct a debug handle to ops mapping. 
The mapping format is something like + { + 1: [ + {"Type": "Program"}, + {"Type": "Function", "Name": "main"}, + {"Type": "Block"}, + {"Type": "Operation", "Operation_Type": "add", "Output": "z"} + ] + } + where `1`, `"main"`, `"add"`, and `"z"` are example values of + the debug handle, function name, operation type, + and output var name (or the name of the first output var, if multiple outputs) + """ + debug_handle_to_ops_mapping = {} + for function_name, function in self.functions.items(): + for operation in function.operations: + # TODO (rdar://115846569): Handle multi-block case from EXIR + if len(operation.blocks) > 0: + raise NotImplementedError("Multi-block case has not been supported yet") + debug_handle = operation.scopes.get(ScopeSource.EXIR_DEBUG_HANDLE) + if debug_handle is None: + continue + debug_handle = debug_handle[0] + if debug_handle not in debug_handle_to_ops_mapping: + debug_handle_to_ops_mapping[debug_handle] = [] + debug_handle_to_ops_mapping[debug_handle].append( + [ + {"Type": "Program"}, + {"Type": "Function", "Name": function_name}, + {"Type": "Block"}, + { + "Type": "Operation", + "Operation_Type": operation.op_type, + "Output": operation.outputs[0].name, + }, + ] + ) + return debug_handle_to_ops_mapping def __getitem__(self, func_name): if func_name not in self.functions: @@ -255,10 +285,11 @@ def __getitem__(self, func_name): def __repr__(self): return self.__str__() - def __str__(self): + def __str__(self, print_attr: Optional[bool] = False) -> str: s = "" for f_name, f in self.functions.items(): - s += f.to_str(f_name) + s += "\n" + s += f.to_str(f_name, print_attr=print_attr) return s diff --git a/coremltools/converters/mil/mil/scope.py b/coremltools/converters/mil/mil/scope.py new file mode 100644 index 000000000..cc65b3f93 --- /dev/null +++ b/coremltools/converters/mil/mil/scope.py @@ -0,0 +1,337 @@ +# Copyright (c) 2024, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import copy +from collections import defaultdict +from enum import Enum +from typing import Dict, List, Union + +from attrs import define, field, validators + + +class ScopeSource(Enum): + """ + Pre-defined scope source enum: + + # Torch script related: + TORCHSCRIPT_MODULE_TYPE: + * Torchscript module type of a scope, which usually corresponds to the submodule object class type. + * If provided as str, it denotes a single scope, and cannot be an empty str. + * Nested scopes are represented by a list of str. + + TORCHSCRIPT_MODULE_NAME: + * Unique torchscript identifier for a scope, which usually corresponds to the submodule object name. + * If provided as str, it denotes a single scope. + * Nested scopes are represented by a list of str. + + # Core ML converter graph passes related: + COREMLTOOLS_GRAPH_PASS: + * This scope traces the graph transformations (graph passes) applied on the program. + * For instance, operations constructed under the "fuse_conv_batchnorm" pass, is going to have + the scopes attribute of ``{COREMLTOOLS_GRAPH_PASS: ["fuse_conv_batchnorm"]}``. + * If the op went through multiple graph pass transformations, it is represetned by a list of str. + For instance: ["fuse_conv_batchnorm", "add_fp16_cast"] means the op is created by "fuse_conv_batchnorm" + and then undergoes "add_fp16_cast". 
+ + # Torch export related: + EXIR_DEBUG_HANDLE: + * The ``debug_handle`` metadata inherited from torch.fx.Node.meta in EXIR + * This metadata enables post-run analysis in ExecuTorch integration + * ExecuTorch uses integer as debug handle. When a MIL op can be traced back to ExecuTorch + (e.g. translated from torch op), we inherit the integer value + * If a MIL op cannot be traced back to ExecuTorch (e.g. created by graph pass), + then we use None to denote "no debug handle" + + + Examples + -------- + Here is an example of torchscript related scope enum: + + .. sourcecode:: python + + class SubModule(torch.nn.Module): + pass + + + class MainModule(torch.nn.Module): + def __init__(self): + self.submodule_1 = SubModule() + + def forward(self, x): + node = self.submodule_1(x) + return node + + + my_model = MainModule() + + when the above model is translated into pymil, the Operation corresponding to ``node`` would have: + + * TORCHSCRIPT_MODULE_TYPE: ["SubModule", ...] + * TORCHSCRIPT_MODULE_NAME: ["submodule_1", ...] + + in their scope attributes. + """ + + TORCHSCRIPT_MODULE_TYPE = 0 + TORCHSCRIPT_MODULE_NAME = 1 + COREMLTOOLS_GRAPH_PASS = 2 + EXIR_DEBUG_HANDLE = 3 + + +class ScopeStack(defaultdict): + """ + A utility class to handle the scope context manager + """ + + def __init__(self): + super().__init__(list) + + def get_curr_scopes(self) -> Dict[ScopeSource, List[str]]: + """ + Returns the current scope information as a dictionary. + """ + res = defaultdict(list) + for key, val in self.items(): + if len(val) == 0: + continue + scope_for_one_source = [] + for v in val: + scope_for_one_source.extend(v.data) + res[key] = scope_for_one_source + return res + + +SCOPE_STACK = ScopeStack() +VALID_OPS_TO_COPY_SCOPE_INFO = [] + + +def add_graph_pass_scope( + src_scopes: Dict[ScopeSource, List[str]], graph_pass_scopes: Dict[ScopeSource, List[str]] +) -> Dict[ScopeSource, List[str]]: + res = {} + """ + Construct a scope by adding graph pass scopes from ``graph_pass_scopes`` to ``src_scopes``. + + The rules are the following: + + (1) We append the COREMLTOOLS_GRAPH_PASS ScopeSource in ``graph_pass_scopes`` to the ``src_scopes``. + This will allow us to keep tracking the history of transformation. + For instance: + + Input: + + src_scopes = { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + graph_pass_scopes = { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2", "pass_3"], + } + + Output: + + res = { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2", "pass_3"], + } + + (2) Only COREMLTOOLS_GRAPH_PASS ScopeSource is allowed in ``graph_pass_scopes``. + + (3) Other ScopeSource will be passed down from ``src_scopes``. + + Input: + + src_scopes = { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + ScScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + graph_pass_scopes = { + ScScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2", "pass_3"], + } + + Output: + + res = { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + ScScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2", "pass_3"], + } + """ + res = defaultdict(list) + for scope_source_key in ScopeSource: + if scope_source_key in graph_pass_scopes: + assert ( + scope_source_key == ScopeSource.COREMLTOOLS_GRAPH_PASS + ), "Only ScopeSource.COREMLTOOLS_GRAPH_PASS is allowed in the graph_pass_scopes." 
+ if ScopeSource.COREMLTOOLS_GRAPH_PASS in src_scopes: + old_graph_pass_data = copy.copy(src_scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS]) + else: + old_graph_pass_data = [] + new_graph_pass_data = copy.copy(graph_pass_scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS]) + res[ScopeSource.COREMLTOOLS_GRAPH_PASS] = old_graph_pass_data + new_graph_pass_data + elif scope_source_key in src_scopes: + res[scope_source_key] = copy.copy(src_scopes[scope_source_key]) + + return res + + +@define +class ScopeInfo: + """ + Parameters + ---------- + source: str + * Source of the scope. For instance, it could be a frontend framework like torchsccript, or a converter graph pass, etc. + * Must be type of ScopeSource Enum. + + data: Union[str, List[str]] + * Scope data. + * It could be type of str or List[str]. + + Examples + -------- + Here are examples of creating a ScopeInfo: + + .. sourcecode:: python + # A scope for a single torchscript module type + scope_info = ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, + data="Module_1", + ) + + # A scope for a two layers torchscript model hierarchy type + scope_info = ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, + data=["Module_1", "Module_2"], + ) + """ + + source: str = field(validator=validators.instance_of(ScopeSource)) + data: Union[str, List[str]] = field(validator=validators.instance_of((str, list))) + + def __attrs_post_init__(self): + # cleanup scope info + if self.source in ( + ScopeSource.TORCHSCRIPT_MODULE_NAME, + ScopeSource.TORCHSCRIPT_MODULE_TYPE, + ScopeSource.COREMLTOOLS_GRAPH_PASS, + ): + if not isinstance(self.data, list): + self.data = [self.data] + for i, val in enumerate(self.data): + if not isinstance(val, str): + raise ValueError( + f"Scope must be type of List[str]. Got element {val} with type {type(val)}." + ) + self.data[i] = val.replace(" ", "") + elif self.source == ScopeSource.EXIR_DEBUG_HANDLE: + if not isinstance(self.data, list): + self.data = [self.data] + for val in self.data: + if val is not None and not isinstance(val, int): + raise ValueError( + f"Scope must be None or type of List[int]. Got element {val} with type {type(val)}." + ) + + if self.source == ScopeSource.COREMLTOOLS_GRAPH_PASS: + if len(self.data) > 1: + raise ValueError( + f"COREMLTOOLS_GRAPH_PASS scope cannot have len > 1. Got {self.data}." + ) + + if self.source == ScopeSource.TORCHSCRIPT_MODULE_TYPE: + if "" in self.data: + raise ValueError( + "TORCHSCRIPT_MODULE_TYPE scope info cannot contains empty string." + ) + + if self.source == ScopeSource.EXIR_DEBUG_HANDLE: + if len(self.data) > 1: + raise ValueError(f"EXIR_DEBUG_HANDLE scope cannot have len > 1. Got {self.data}.") + + +class ScopeContextManger: + def __init__( + self, + *scopes: List[ScopeInfo], + ): + """ + A context manager pushes/pops the scope information, which makes the + operations created within it have the corresponding scope information. + + Parameters + ---------- + scopes: Optional[List[ScopeInfo]] (Optional) + * A list of ScopeInfo under the context manager. + * The source in each ScopeInfo cannot be duplicated. + * If not provided, this context manager does no affects. + + Examples + -------- + Here is an example of creating a scope for torchscript module heirarchy with type and name information. + + .. 
sourcecode:: python + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ): + return mb.add(x=x, y=4.3, name="add_1") + + + In the above example, the "add_1" op will have two scope attributes, for torchscipt module type and name: + * TORCHSCRIPT_MODULE_TYPE: ["Module1"] + * TORCHSCRIPT_MODULE_NAME: ["module_1"] + + Here is an example of creating nested scopes: + + .. sourcecode:: python + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ): + x = mb.add(x=x, y=4.3, name="add_1") + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module2"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_2"]), + ): + return mb.add(x=x, y=3.2, name="add_2") + + In the above example, the "add_1" op would have a scope attribute: + * TORCHSCRIPT_MODULE_TYPE: ["Module1"] + + while the "add_2" op would have scope attributes: + * TORCHSCRIPT_MODULE_TYPE: ["Module1", "Module2"] + * TORCHSCRIPT_MODULE_NAME: ["module_2"] + """ + self.scopes = scopes + # Validate scopes are type of ScopeInfo + for scope in self.scopes: + if not isinstance(scope, ScopeInfo): + raise ValueError( + f"mb.scope only accepts inputs of type ScopeInfo. Got {type(scope)}." + ) + + # validate there is no duplicated scope source + visited_scope_sources = set() + for scope in self.scopes: + if scope.source in visited_scope_sources: + raise ValueError(f"Scope source {scope.source} duplicated.") + visited_scope_sources.add(scope.source) + + def __enter__(self): + for scope in self.scopes: + SCOPE_STACK[scope.source].append(scope) + if scope.source == ScopeSource.COREMLTOOLS_GRAPH_PASS: + VALID_OPS_TO_COPY_SCOPE_INFO.append(set()) + + def __exit__(self, type, value, traceback): + for scope in self.scopes: + SCOPE_STACK[scope.source].pop() + if scope.source == ScopeSource.COREMLTOOLS_GRAPH_PASS: + VALID_OPS_TO_COPY_SCOPE_INFO.pop() diff --git a/coremltools/converters/mil/mil/tests/test_block.py b/coremltools/converters/mil/mil/tests/test_block.py index 0f943c710..70da4f853 100644 --- a/coremltools/converters/mil/mil/tests/test_block.py +++ b/coremltools/converters/mil/mil/tests/test_block.py @@ -10,6 +10,7 @@ from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil.passes.tests.test_passes import CONSTEXPR_FUNCS +from coremltools.converters.mil.mil.utils import CacheDoublyLinkedList from coremltools.converters.mil.testing_utils import ( assert_same_output_names, assert_same_output_shapes, @@ -61,7 +62,7 @@ def prog(x0): print("after:\n{}".format(prog)) assert block.inputs["x0"] == block.find_ops(op_type="log")[0].inputs["x"] assert len(block.operations) == 2 # const op for epsilon + log - assert block.operations[1].op_type == "log" + assert list(block.operations)[1].op_type == "log" assert block.outputs[0] == x1 @@ -492,3 +493,58 @@ def prog(x0, y0): assert block.outputs[0].op.name == "new_output" assert block.outputs[1].op.name == "new_output" assert len(block.outputs[0].consuming_blocks) == 1 + + +class TestCacheDoublyLinkedList: + def test_basic(self): + operations = CacheDoublyLinkedList() + + operations.insert_op_before(1) + assert list(operations) == [1] + + operations.insert_op_before(2, before_op=1) + assert list(operations) == [2, 1] + + 
operations.insert_op_before(3) + assert list(operations) == [2, 1, 3] + + operations.insert_op_before(4, before_op=1) + assert list(operations) == [2, 4, 1, 3] + + operations.remove(2) + assert list(operations) == [4, 1, 3] + + operations.remove(3) + assert list(operations) == [4, 1] + + operations.remove(4) + assert list(operations) == [1] + + node = operations._get_node_from_op(1) + operations.remove(1) + assert list(operations) == [] + assert node.prev is CacheDoublyLinkedList.INVALID_NODE + assert node.next is CacheDoublyLinkedList.INVALID_NODE + + operations.insert_op_before(0) + assert list(operations) == [0] + + operations = CacheDoublyLinkedList([1, 2, 3]) + assert list(operations) == [1, 2, 3] + + operations = CacheDoublyLinkedList([]) + assert list(operations) == [] + + def test_reversed(self): + operations = CacheDoublyLinkedList([1, 2, 3]) + assert list(reversed(operations)) == [3, 2, 1] + + def test_error(self): + operations = CacheDoublyLinkedList([1, 2, 3]) + assert operations[0] == 1 + assert operations[-1] == 3 + # Indexing doubly linked list is super expensive, we need to error out. + with pytest.raises( + ValueError, match="Doubly linked list does not support indexing other than 0, -1." + ): + operations[1] diff --git a/coremltools/converters/mil/mil/tests/test_debug.py b/coremltools/converters/mil/mil/tests/test_debug.py index b6601ccfc..0fa69dbf0 100644 --- a/coremltools/converters/mil/mil/tests/test_debug.py +++ b/coremltools/converters/mil/mil/tests/test_debug.py @@ -35,7 +35,7 @@ def compute_ground_truth_answer(input): square = x * x tanh = np.tanh(square) return {"output_0": square, "output_1":tanh} - + class TestExtractSubModel: def test_extract_submodel_error_handling(self): @@ -48,16 +48,16 @@ def test_extract_submodel_error_handling(self): invalid_outputs = ["output_1", 1] with pytest.raises(ValueError, match="outputs must be a list of str. Got element 1 with type ."): - extract_submodel(mlmodel, outputs=invalid_outputs) + extract_submodel(mlmodel, outputs=invalid_outputs) invalid_outputs = ["output_1", "output_1"] with pytest.raises(ValueError, match="outputs must be a list of unique elements. 
'output_1' occurs 2 times"): extract_submodel(mlmodel, outputs=invalid_outputs) - + invalid_outputs = ["error"] with pytest.raises(ValueError, match="outputs \['error'\] not found in the function."): extract_submodel(mlmodel, outputs=invalid_outputs) - + model_dir = tempfile.TemporaryDirectory() mlmodel_path = os.path.join(model_dir.name, "model.mlmodel") mlmodel.save(mlmodel_path) @@ -72,7 +72,7 @@ def test_extract_submodel_symbolic_input(self): | v mul -> tan -> output_2 - + If x has symbolic shape, then the subgraph mil -> tan should also have symbolic shape """ @mb.program(input_specs=[mb.TensorSpec(shape=(1, get_new_symbol()))]) @@ -85,15 +85,15 @@ def prog(x): model = ct.convert(prog, convert_to="neuralnetwork") submodel = extract_submodel(model, outputs=["tan"], inputs=["mul"]) func = submodel._mil_program.functions["main"] - + input = list(func.inputs.values())[0] assert input.shape[0] == 1 assert is_symbolic(input.shape[1]) - + output = func.outputs[0] assert output.shape[0] == 1 assert is_symbolic(output.shape[1]) - + def test_extract_submodel_complex(self): """ Input graph: @@ -117,7 +117,7 @@ def prog(x, y): Case 1: inputs = None outputs = [sin, mul] - + Output graph: x -> sin ------> output_1 | | @@ -126,12 +126,12 @@ def prog(x, y): """ submodel = extract_submodel(model, outputs=["sin", "mul"]) assert get_op_types_in_program(submodel._mil_program) == ["sin", "add", "mul"] - + """ Case 2: inputs = None outputs = [sin, add] - + Output graph: x -> sin -> output_1 | @@ -140,12 +140,12 @@ def prog(x, y): """ submodel = extract_submodel(model, outputs=["sin", "add"]) assert get_op_types_in_program(submodel._mil_program) == ["sin", "add"] - + """ Case 3: inputs = None outputs = [mul] - + Output graph: x -> sin ----- | | @@ -154,12 +154,12 @@ def prog(x, y): """ submodel = extract_submodel(model, outputs=["mul"]) assert get_op_types_in_program(submodel._mil_program) == ["sin", "add", "mul"] - + """ Case 4: inputs = None outputs = [sin, sub] - + Output graph: x -> sin -> sub -> output_2 | @@ -168,14 +168,13 @@ def prog(x, y): y """ submodel = extract_submodel(model, outputs=["sin", "sub"]) - print(submodel._mil_program) assert get_op_types_in_program(submodel._mil_program) == ["sin", "sub"] - + """ Case 5: inputs = [x, y] outputs = [mul] - + Output graph: x -> sin ----- | | @@ -184,12 +183,12 @@ def prog(x, y): """ submodel = extract_submodel(model, outputs=["mul"], inputs=["x", "y"]) assert get_op_types_in_program(submodel._mil_program) == ["sin", "add", "mul"] - + """ Case 6: inputs = [mul] outputs = [tan] - + mul -> tan -> output_1 """ submodel = extract_submodel(model, outputs=["tan"], inputs=["mul"]) @@ -207,22 +206,22 @@ def prog(x, y): """ submodel = extract_submodel(model, outputs=["sub", "mul"], inputs=["sin", "add"]) assert get_op_types_in_program(submodel._mil_program) == ["sub", "mul"] - + """ Case 8 (Negative): inputs = [sin] outputs = [mul] - + mul not reachable merely through sin """ with pytest.raises(ValueError, match="output mul not reachable from inputs"): submodel = extract_submodel(model, outputs=["mul"], inputs=["sin"]) - + """ Case 9 (Negative): inputs = [mul] outputs = [sin] - + sin not reachable merely through sin """ with pytest.raises(ValueError, match="output sin not reachable from inputs"): @@ -242,7 +241,7 @@ def test_extract_submodel_neuralnetwork(self, compute_unit): # check that the submodel retains the same backend assert submodel.get_spec().WhichOneof("Type") == "neuralNetwork" - + # check that the submodel retains the same compute unit assert 
submodel.compute_unit == compute_unit @@ -286,7 +285,7 @@ def test_extract_submodel_mlprogram(self, compute_unit, store_to_disk): # check that the submodel retains the same backend assert submodel.get_spec().WhichOneof("Type") == "mlProgram" - + # check that the submodel retains the same compute unit assert submodel.compute_unit == compute_unit diff --git a/coremltools/converters/mil/mil/tests/test_programs.py b/coremltools/converters/mil/mil/tests/test_programs.py index 139bf4d62..5230600b6 100644 --- a/coremltools/converters/mil/mil/tests/test_programs.py +++ b/coremltools/converters/mil/mil/tests/test_programs.py @@ -8,9 +8,11 @@ import coremltools as ct from coremltools import _logger as logger +from coremltools.converters.mil import mil from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, Program, types from coremltools.converters.mil.mil.passes.tests.test_passes import CONSTEXPR_FUNCS +from coremltools.converters.mil.mil.scope import ScopeInfo, ScopeSource, add_graph_pass_scope np.random.seed(0) @@ -375,7 +377,7 @@ def func_2(x): def func_3(x): return x - prog = Program() + prog = mil.Program() prog.add_function("func_1", func_1) prog.add_function("func_2", func_2) prog.add_function("func_3", func_3) @@ -397,12 +399,12 @@ def func_2(x): err_msg = "all functions must have the same opset_version." - prog = Program() + prog = mil.Program() prog.add_function("func_1", func_1) with pytest.raises(ValueError, match=err_msg): prog.add_function("func_2", func_2) - prog = Program() + prog = mil.Program() prog.add_function("func_2", func_2) with pytest.raises(ValueError, match=err_msg): prog.add_function("func_1", func_1) @@ -553,21 +555,1422 @@ def prog(x): prog, convert_to="mlprogram", minimum_deployment_target=ct.target.iOS16, + pass_pipeline=ct.PassPipeline.EMPTY, compute_units=ct.ComputeUnit.CPU_ONLY, compute_precision=compute_precision, ) - # If the transpose is removed by optimization passes, the conversion goes through + # If the transpose is removed by graph pass merge_affine_dequantize_with_consecutive_ops, + # the conversion goes through @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) def prog(x): constexpr = CONSTEXPR_FUNCS["constexpr_affine_dequantize"]((4, 3)) constexpr = mb.transpose(x=constexpr, perm=[0, 1]) return mb.linear(x=x, weight=constexpr) - mlmodel = ct.convert( - prog, - convert_to="mlprogram", - minimum_deployment_target=ct.target.iOS16, - compute_units=ct.ComputeUnit.CPU_ONLY, - compute_precision=compute_precision, - ) + mlmodel = ct.convert( + prog, + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS16, + compute_units=ct.ComputeUnit.CPU_ONLY, + compute_precision=compute_precision, + ) + +class TestScope: + @staticmethod + def test_basic_single_TorchScript_scope(): + # single scope with scope_name and scope_type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data="module_1"), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="Module1"), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["module_1"] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["Module1"] + + # single scope with scope_name and scope_type with list type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, 
data=["module_1"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["module_1"] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["Module1"] + + # single scope with scope_type and no scope_name + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op.scopes + + # nested scope in a single mb.scope call. Both scope_name and scope_type provided + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1", "module_2"] + ), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1", "Module2"]), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == [ + "module_1", + "module_2", + ] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "Module1", + "Module2", + ] + + # nested scope in a single mb.scope call. Only scope_type provided + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1", "module_2"] + ), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + ] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op.scopes + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["", ""]), + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1", "module_2"] + ), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["", ""] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + ] + + @staticmethod + def test_basic_nested_TorchScript_scope(): + # nested scope with scope_name and scope_type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data="module_1"), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="Module1"), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data="module_2"), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="Module2"), + ): + x = mb.add(x=x, y=5.4) + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == [ + "module_1", + "module_2", + ] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "Module1", + "Module2", + ] + + add_op_2 = prog.find_ops(op_type="add")[1] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["module_1"] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["Module1"] + + # nested scope with scope_name and scope_type with list type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with 
mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_2"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module2"]), + ): + x = mb.add(x=x, y=5.4) + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == [ + "module_1", + "module_2", + ] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "Module1", + "Module2", + ] + + add_op_2 = prog.find_ops(op_type="add")[1] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["module_1"] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["Module1"] + + # nested scope with scope_name and no scope_type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_2"]), + ): + x = mb.add(x=x, y=5.4) + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op_1.scopes + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + ] + + add_op_2 = prog.find_ops(op_type="add")[1] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op_2.scopes + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["module_1"] + + # nested scope in a nested mb.scope call. Both scope_name and scope_type provided + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1", "module_2"] + ), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1", "Module2"]), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data="module_3"), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="Module3"), + ): + x = mb.add(x=x, y=5.4) + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == [ + "module_1", + "module_2", + "module_3", + ] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "Module1", + "Module2", + "Module3", + ] + + add_op_2 = prog.find_ops(op_type="add")[1] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == [ + "module_1", + "module_2", + ] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "Module1", + "Module2", + ] + + # nested scope in a single mb.scope call. 
Only scope_type provided + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1", "module_2"] + ), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + x = mb.add(x=x, y=5.4) + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op_1.scopes + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + "module_3", + ] + + add_op_2 = prog.find_ops(op_type="add")[1] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op_2.scopes + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + ] + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1", "module_2"] + ), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["", ""]), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + x = mb.add(x=x, y=5.4) + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + "module_3", + ] + assert add_op_1.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["", ""] + + add_op_2 = prog.find_ops(op_type="add")[1] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + ] + assert add_op_2.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["", ""] + + @staticmethod + def test_graph_pass_scope_handling(): + # default list type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.COREMLTOOLS_GRAPH_PASS, + data="pass_1", + ), + ): + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "pass_1", + ] + + # data cannot have len > 1 + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises( + ValueError, match="COREMLTOOLS_GRAPH_PASS scope cannot have len > 1." 
+ ): + with mb.scope( + ScopeInfo( + source=ScopeSource.COREMLTOOLS_GRAPH_PASS, + data=["pass_1", "pass_2"], + ), + ): + return mb.add(x=x, y=0.0) + return x + + # nested graph pass scope is allowed + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.COREMLTOOLS_GRAPH_PASS, + data="pass_1", + ), + ): + with mb.scope( + ScopeInfo( + source=ScopeSource.COREMLTOOLS_GRAPH_PASS, + data="pass_2", + ), + ): + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "pass_1", + "pass_2", + ] + + @staticmethod + def test_EXIR_scope_handling(): + # default list type + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[1])): + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.EXIR_DEBUG_HANDLE] == [1] + + # data cannot have len > 1 + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises(ValueError, match="EXIR_DEBUG_HANDLE scope cannot have len > 1."): + with mb.scope(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[2, 3])): + return mb.add(x=x, y=0.0) + return x + + # nested graph pass scope is allowed + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None])): + with mb.scope(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[0])): + return mb.add(x=x, y=0.0) + + add_op_1 = prog.find_ops(op_type="add")[0] + assert add_op_1.scopes[ScopeSource.EXIR_DEBUG_HANDLE] == [None, 0] + + @staticmethod + def test_invalid_dtype_error_out(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises( + ValueError, + match="Scope must be type of List\[str\]. Got element 9 with type \.", + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["m1", 9]), + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1", "Module2"] + ), + ): + return mb.add(x=x, y=5.4) + + with pytest.raises( + ValueError, + match="Scope must be type of List\[str\]. 
Got element 0 with type \.", + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["m1", "m2"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1", 0]), + ): + return mb.add(x=x, y=5.4) + return x + + @staticmethod + def test_empty_scope(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope(): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert ScopeSource.TORCHSCRIPT_MODULE_TYPE not in add_op.scopes + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op.scopes + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope(): + with mb.scope(): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert ScopeSource.TORCHSCRIPT_MODULE_TYPE not in add_op.scopes + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op.scopes + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope(): + with mb.scope(ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="m1")): + with mb.scope(): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == ["m1"] + assert ScopeSource.TORCHSCRIPT_MODULE_NAME not in add_op.scopes + + + @staticmethod + def test_empty_scope_type_error_out(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises( + ValueError, match="TORCHSCRIPT_MODULE_TYPE scope info cannot contains empty string." + ): + with mb.scope(ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="")): + with mb.scope(): + return mb.add(x=x, y=5.4) + + with pytest.raises( + ValueError, match="TORCHSCRIPT_MODULE_TYPE scope info cannot contains empty string." 
+ ): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, + data=["a", ""], + ) + ): + with mb.scope(): + return mb.add(x=x, y=5.4) + return x + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, + data=["module_1"], + ) + ): + with pytest.raises( + ValueError, + match="TORCHSCRIPT_MODULE_TYPE scope info cannot contains empty string.", + ): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, + data=[""], + ) + ): + return mb.add(x=x, y=5.4) + with pytest.raises( + ValueError, + match="TORCHSCRIPT_MODULE_TYPE scope info cannot contains empty string.", + ): + with mb.scope( + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, + data=["a", "", ""], + ) + ): + return mb.add(x=x, y=5.4) + return x + + @staticmethod + def test_white_space_handling(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=[" module_1 "]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=[" Module1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=[" pass_1"]), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == [ + "module_1", + ] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "Module1", + ] + assert add_op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == [ + "pass_1", + ] + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=[" Module1 ", " "]), + ScopeInfo( + source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=[" module_1 ", " module_2 "] + ), + ): + return mb.add(x=x, y=5.4) + + add_op = prog.find_ops(op_type="add")[0] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] == [ + "module_1", + "module_2", + ] + assert add_op.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME] == ["Module1", ""] + + @staticmethod + def test_duplicated_scope_source_error_out(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises( + ValueError, match="Scope source ScopeSource.TORCHSCRIPT_MODULE_TYPE duplicated." 
+ ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="a1"), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data="a2"), + ): + return mb.add(x=x, y=5.4) + return x + + @staticmethod + def test_check_prog_has_scope_error_out(): + def get_prog(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["Module1"]), + ): + x = mb.add(x=x, y=5.4) + x = mb.relu(x=x, name="invalid_op") + return x + + return prog + + prog = get_prog() + prog._add_essential_scope_source( + [ScopeSource.TORCHSCRIPT_MODULE_TYPE, ScopeSource.TORCHSCRIPT_MODULE_NAME] + ) + with pytest.raises( + ValueError, match="is missing essential scopes ScopeSource.TORCHSCRIPT_MODULE_TYPE" + ): + prog.validate(check_essential_scope=True) + + # If check_essential_scope is not passes, it will not error out + prog.validate() + + # No error if no essential scope source are set + prog = get_prog() + prog.validate(check_essential_scope=True) + + @staticmethod + def test_invalid_scope_source_type(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises(TypeError, match="'source' must be \"): + with mb.scope( + ScopeInfo(source="invalid_source", data="a1"), + ): + return mb.add(x=x, y=5.4) + return x + + @staticmethod + def test_invalid_scope_info_type(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with pytest.raises( + ValueError, + match="mb.scope only accepts inputs of type ScopeInfo. Got \.", + ): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + "invalid", + ): + return mb.add(x=x, y=5.4) + return x + + @staticmethod + def test_scope_setter_immutable(): + """ + When setting the `scopes` property for an op, the value should be deep copied. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ): + x = mb.add(x=x, y=5.4) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_2"]), + ): + y = mb.add(x=x, y=5.4) + + x.scopes = y.scopes + y.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME][0] = "invalid" + assert x.scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME][0] == "module_2" + + return x + + @staticmethod + def test_scopes_for_function_inputs(): + """ + If a var's parent op is a placeholder, we cannot set its scopes. + And its scopes is an empty dictionary. + """ + + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 3))]) + def prog(x): + assert len(x.scopes) == 0 + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ): + y = mb.add(x=x, y=5.4) + + with pytest.raises( + ValueError, + match="Cannot set scopes to a function input var", + ): + x.scopes = y.scopes + + return y + + @staticmethod + def test_add_graph_pass_scope(): + """ + Test the rules of merging two scopes. 
+ """ + # Rule of merging COREMLTOOLS_GRAPH_PASS + old_scopes = { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + new_scopes = { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2", "pass_3"], + } + res = dict(add_graph_pass_scope(old_scopes, new_scopes)) + + assert res == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2", "pass_3"], + } + + # Ensure we make a copy of the list + old_scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS][0] = "invalid" + assert res == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2", "pass_3"], + } + new_scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS][0] = "invalid" + assert res == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2", "pass_3"], + } + + # Another test + old_scopes = { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + } + new_scopes = { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + res = add_graph_pass_scope(old_scopes, new_scopes) + + assert res == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + # Ensure we make a copy of the list + old_scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE][0] = "invalid" + assert res == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + old_scopes[ScopeSource.TORCHSCRIPT_MODULE_NAME][0] = "invalid" + assert res == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + # Test for other scope source + old_scopes = { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["a1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + new_scopes = { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + } + + with pytest.raises( + AssertionError, + match="Only ScopeSource.COREMLTOOLS_GRAPH_PASS is allowed in the graph_pass_scopes.", + ): + add_graph_pass_scope(old_scopes, new_scopes) + + @staticmethod + def test_scope_preservation_when_reconnect_graph(): + """ + If the _replace_var is doing reconnection of the graph, without any new op introduced, + no scope information is going to change. + """ + + def get_prog(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ): + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_2"]), + ): + sin = mb.sin(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + return prog + + # Case 1: No graph pass is involved, and only reconnect graph is done. + # Scope information will not change. + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + } + + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + } + + # Case 2: Even the reconnection happens under graph pass, nothing will change. 
+ prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["dummy_pass"])): + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + } + + # Case 3: old_var and new_var are created under a graph pass, and the reconnection happens under the + # same graph pass. Nothing will change still. + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["dummy_pass"])): + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + + block._replace_var(var_1, var_2) + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + + # Case 4: Ops are created under a graph pass, and the reconnection happens outside the graph pass. + # Nothing happens. + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["dummy_pass"])): + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + + # Case 5: Ops are created under a graph pass 1, and the reconnection happens under graph pass2. + # Nothing happens. + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["dummy_pass"])): + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["dummy_pass_2"])): + block._replace_var(var_1, var_2) + + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["dummy_pass"], + } + + # Case 6. 
old_var and new_var are created under the same graph pass + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + sin = mb.sin(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"])): + block._replace_var(var_1, var_2) + + assert var_1.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + @staticmethod + def test_scope_passdown_when_new_var_created_under_graph_pass(): + """ + If a new_var is created by a graph pass, and the _replace_var happens under the same graph pass, + the scope information from the old_var is passed to new_var. + """ + + def get_prog(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + # This op is newly created by a pass_2 + sin = mb.sin(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + return prog + + # Case 1: _replace_var happens outside the graph pass. Nothing happens + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + # Case 2: new_var created under a pass_2, and _replace_var happens under pass_2. 
Scope info is passed from the old_var + # to the new_var + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + with prog.functions["main"] as block: + op_1, op_2 = list(block.operations) + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + # This op is newly created by a pass_2 + sin = mb.sin(x=block.inputs["x"], before_op=op_2) + block._replace_var(op_1.outputs[0], sin) + + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2"], + } + + # Case 3: new_var created under a pass_2, but _replace_var happens under pass_3. + # Nothing happens. + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_3"])): + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + # Case 4: new_var created under pass_2, and be passed down some scope info, + # so even though _replace_var happens under pass_2 again, nothing happens. 
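A condensed sketch of the pass-down rule the remaining cases exercise (block, old_var, and anchor_op are placeholder names for objects from an existing program, assuming the mb.scope API shown above): a var newly created under a pass scope inherits the replaced var's scopes when _replace_var runs under that same pass; if the new var already carries other scope info, or the replacement runs under a different pass, nothing is copied.

    with block:  # block = prog.functions["main"] of an already-built program
        with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"])):
            new_var = mb.sin(x=block.inputs["x"], before_op=anchor_op)  # created by pass_2
            block._replace_var(old_var, new_var)
    # new_var.scopes now holds old_var's TORCHSCRIPT_MODULE_TYPE entries, and its
    # COREMLTOOLS_GRAPH_PASS list ends with "pass_2".
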
+ @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_2"]), + ): + # This op is newly created by a pass_2, and other scope info already passed down + sin = mb.sin(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"])): + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_2"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + # Case 5: new_var created under pass_2, but the graph pass already finished, + # so even though _replace_var happens under pass_2 again, nothing happens. + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + # This op is newly created by a pass_2, and other scope info already passed down + sin = mb.sin(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"])): + block._replace_var(var_1, var_2) + assert var_1.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2"], + } + + # Case 6: new_var created under nested graph passes scope. And graph pass happens under pass_3. 
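For the nested cases below, a small sketch of how nested COREMLTOOLS_GRAPH_PASS scopes stack (pass names and some_var are illustrative placeholders):

    with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"])):
        with mb.scope(ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_3"])):
            out = mb.relu(x=some_var)
    # out.op.scopes[ScopeSource.COREMLTOOLS_GRAPH_PASS] == ["pass_2", "pass_3"]
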
+ @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_3"]), + ): + sin = mb.sin(x=block.inputs["x"], before_op=ops[1]) + block._replace_var(ops[0].outputs[0], sin) + + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2", "pass_3"], + } + + # Case 7: new_var created under nested graph passes scope. And graph pass happens under pass_2. Nothing will happen in this case, since new_var is created under pass_3. + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_3"]), + ): + sin = mb.sin(x=block.inputs["x"], before_op=ops[1]) + block._replace_var(ops[0].outputs[0], sin) + + ops = list(block.operations) + var_1, var_2 = ops[0].outputs[0], ops[1].outputs[0] + assert var_1.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + assert var_2.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_2", "pass_3"], + } + + @staticmethod + def test_scope_passdown_resursive(): + """ + Test the resursive back propagation when passing down scope info. 
+ """ + # Case 1 + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_3"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + # The subgraph is constructed under pass_2 + y = mb.leaky_relu(x=block.inputs["x"], alpha=0.8, before_op=ops[1]) + y = mb.add(x=y, y=y, before_op=ops[1]) + y = mb.leaky_relu(x=y, alpha=0.4, before_op=ops[1]) + + block._replace_var(ops[0].outputs[0], y) + + ops = list(block.operations) + assert ops[0].outputs[0].scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + add_ops = block.find_ops(op_type="add") + const_ops = block.find_ops(op_type="const") + leaky_relu_ops = block.find_ops(op_type="leaky_relu") + + assert len(add_ops) == 1 + assert len(const_ops) == 2 + assert len(leaky_relu_ops) == 2 + + for op in leaky_relu_ops + add_ops + const_ops: + assert op.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2"], + } + + # Case 2: Test for VALID_OPS_TO_COPY_SCOPE_INFO in the scope back propagation + # The same var cannot be visited twice + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + # The subgraph is constructed under pass_2 + relu = ops[0].outputs[0] + + y = mb.leaky_relu(x=relu, alpha=0.8, before_op=ops[1]) + y = mb.concat(values=[y, y, relu, y], axis=0, before_op=ops[1]) + y1, y2, y3, y4 = mb.split(x=y, axis=0, num_splits=4, before_op=ops[1]) + + block._replace_var(relu, y1, anchor_op=y1.op) + + ops = list(block.operations) + relu_ops = block.find_ops(op_type="relu") + leaky_relu_op = block.find_ops(op_type="leaky_relu")[0] + concat_op = block.find_ops(op_type="concat")[0] + split_op = block.find_ops(op_type="split")[0] + + for op in [leaky_relu_op, concat_op, split_op]: + assert op.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2"], + } + + for op in relu_ops: + assert op.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + # Case 3: Similar to case 2, but the relu op has torch scope. 
+ @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=x) + + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + return mb.relu(x=relu) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_2"]), + ): + # The subgraph is constructed under pass_2 + relu = ops[0].outputs[0] + + y = mb.leaky_relu(x=relu, alpha=0.8, before_op=ops[1]) + y = mb.concat(values=[y, y, relu, y], axis=0, before_op=ops[1]) + y1, y2, y3, y4 = mb.split(x=y, axis=0, num_splits=4, before_op=ops[1]) + + block._replace_var(relu, y1, anchor_op=y1.op) + + ops = list(block.operations) + relu_ops = block.find_ops(op_type="relu") + leaky_relu_op = block.find_ops(op_type="leaky_relu")[0] + concat_op = block.find_ops(op_type="concat")[0] + split_op = block.find_ops(op_type="split")[0] + + for op in [leaky_relu_op, concat_op, split_op]: + assert op.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1", "pass_2"], + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + } + + for op in relu_ops: + assert op.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["module_1"], + } + + @staticmethod + def test_scope_passdown_function_input_var(): + """ + If the old_var is function input var, and then the converter sets some default value for each scope source. + """ + # Case 1: with no essential scope set, no scope information is passed down + def get_prog(): + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=["module_1"]), + ): + return mb.sin(x=x) + return prog + + prog = get_prog() + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=block.inputs["x"], before_op=ops[0]) + block._replace_var(block.inputs["x"], relu) + + assert relu.scopes == { + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + # Case 2: essential scope set to TORCHSCRIPT_MODULE_TYPE + prog = get_prog() + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=block.inputs["x"], before_op=ops[0]) + block._replace_var(block.inputs["x"], relu) + + assert relu.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_TYPE: ["__COREML__::TORCHSCRIPT_PLACEHOLDER"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + # Case 3: essential scope set to TORCHSCRIPT_MODULE_NAME + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=["module_1"]), + ): + return mb.sin(x=x) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_NAME) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + 
ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=block.inputs["x"], before_op=ops[0]) + block._replace_var(block.inputs["x"], relu) + + assert relu.scopes == { + ScopeSource.TORCHSCRIPT_MODULE_NAME: ["__COREML__::TORCHSCRIPT_PLACEHOLDER_x"], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } + + # Case 4: essential scope set to EXIR_DEBUG_HANDLE + @mb.program(input_specs=[mb.TensorSpec(shape=(2, 4))]) + def prog(x): + with mb.scope( + ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[1]), + ): + return mb.sin(x=x) + + prog._add_essential_scope_source(ScopeSource.EXIR_DEBUG_HANDLE) + + block = prog.functions["main"] + ops = list(block.operations) + + with block: + with mb.scope( + ScopeInfo(source=ScopeSource.COREMLTOOLS_GRAPH_PASS, data=["pass_1"]), + ): + # This op is created by pass_1 + relu = mb.relu(x=block.inputs["x"], before_op=ops[0]) + block._replace_var(block.inputs["x"], relu) + + assert relu.scopes == { + ScopeSource.EXIR_DEBUG_HANDLE: [None], + ScopeSource.COREMLTOOLS_GRAPH_PASS: ["pass_1"], + } diff --git a/coremltools/converters/mil/mil/types/type_int.py b/coremltools/converters/mil/mil/types/type_int.py index 2080d5b45..bcecd57a9 100644 --- a/coremltools/converters/mil/mil/types/type_int.py +++ b/coremltools/converters/mil/mil/types/type_int.py @@ -37,9 +37,12 @@ def val(self, v): numpy_type_to_builtin_type) if not isinstance(v, (np.generic, np.ndarray, sm.Basic)): - raise ValueError( - f"types should have value of numpy type or Symbols, got {type(v)} instead" - ) + try: + v = np.array(v) + except Exception: + raise ValueError( + f"types should have value of numpy type or Symbols, got {type(v)} instead" + ) if isinstance(v, sm.Basic): self._val = v diff --git a/coremltools/converters/mil/mil/types/type_mapping.py b/coremltools/converters/mil/mil/types/type_mapping.py index 8cc1d6e65..78e82a18a 100644 --- a/coremltools/converters/mil/mil/types/type_mapping.py +++ b/coremltools/converters/mil/mil/types/type_mapping.py @@ -4,11 +4,13 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from collections import namedtuple +from typing import Optional, Union import numpy as _np import numpy as np import sympy as sm +import coremltools.converters.mil.backend.mil.helper as mil_helper import coremltools.proto.MIL_pb2 as _mil_pm from .get_type_info import get_type_info @@ -196,7 +198,7 @@ def builtin_to_resolution(builtin_type: type): return _TYPES_TO_RESOLUTION[builtin_type] -def builtin_to_range(builtin_type: type): +def builtin_to_range(builtin_type: type) -> RangeTuple: """ Given a builtin type, return its corresponding range. """ @@ -341,9 +343,22 @@ def is_builtin(t): return is_scalar(t) or is_tensor(t) or is_str(t) or is_tuple(t) -# Converts a numpy type to its types equivalent. -# Supports both dtypes and numpy primitive types. -def numpy_type_to_builtin_type(nptype): +def _numpy_dtype_instance_to_builtin_type(np_dtype: np.dtype) -> Optional[type]: + if np_dtype in _NPTYPES_TO_STRINGS: + return string_to_builtin(_NPTYPES_TO_STRINGS[np_dtype]) + return None + + +def numpy_type_to_builtin_type(nptype) -> type: + """ + Converts a numpy type to its builtin `types` equivalent. + Supports Python native types and numpy types. + """ + if isinstance(nptype, np.dtype): + builtin_type = _numpy_dtype_instance_to_builtin_type(nptype) + if builtin_type is not None: + return builtin_type + # If this is a data type object, use the corresponding scalar data type. 
if np.issubclass_(type(nptype), np.dtype): nptype = nptype.type @@ -473,11 +488,14 @@ def is_subtype(type1, type2): return type1 == type2 +def _numpy_val_to_bytes(val: Union[np.ndarray, np.generic]) -> bytes: + return val.tobytes() + def np_val_to_py_type(val): """Convert numpy val to python primitive equivalent. Ex: Given: val = np.array([True, False]) - Returns: [True, False] + Returns: (True, False) Given: val = np.array(32, dtype=np.int32) Returns 32 @@ -485,9 +503,9 @@ def np_val_to_py_type(val): if not isinstance(val, (_np.ndarray, _np.generic)): return val - if val.dtype in (_np.float16, _np.uint8, _np.int8, _np.uint32): - # Serialize to bytes because MIL read them from bytes field (see TensorValue in MIL.proto). - return val.tobytes() + builtin_type = numpy_type_to_builtin_type(val.dtype) + if builtin_type in mil_helper.IMMEDIATE_VALUE_TYPES_IN_BYTES: + return _numpy_val_to_bytes(val) else: if val.dtype in (_np.uint16, _np.int16): # TODO (rdar://111797203): Serialize to byte after MIL changes to read from byte field. diff --git a/coremltools/converters/mil/mil/types/type_tensor.py b/coremltools/converters/mil/mil/types/type_tensor.py index 71f400f18..bee987b1d 100644 --- a/coremltools/converters/mil/mil/types/type_tensor.py +++ b/coremltools/converters/mil/mil/types/type_tensor.py @@ -86,11 +86,13 @@ def val(self): @val.setter def val(self, v): if not isinstance(v, np.ndarray): - raise ValueError( - "tensor should have value of type ndarray, got {} instead".format( - type(v) + try: + v = np.array(v) + except: + raise ValueError( + f"tensor value type should be compatible with type np.ndarray, " + f"got {type(v)} instead" ) - ) v_type = numpy_type_to_builtin_type(v.dtype) promoted_type = promote_types(v_type, primitive) diff --git a/coremltools/converters/mil/mil/utils.py b/coremltools/converters/mil/mil/utils.py new file mode 100644 index 000000000..2377ed70a --- /dev/null +++ b/coremltools/converters/mil/mil/utils.py @@ -0,0 +1,134 @@ +# Copyright (c) 2024, Apple Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +from typing import Dict, List, Optional + +from .operation import Operation + +class OpNode: + """ + A helper node class for the doubly linked list. + It contains an Operation data and pointers to the previous and the next node. + """ + + def __init__(self, op: Operation): + self.op = op + self.next: Optional[OpNode] = None + self.prev: Optional[OpNode] = None + +class CacheDoublyLinkedList: + """ + This array-like data structure is useful to implement pymil's + core program transformations, including: + + 1. Insert an op at a target location (before a target op) + 2. Remove an op from the program + + Given the fact that each op in the list must be unique, a hash table + is maintained in this data structure, and hence the insert / pop can both be performed in O(1). + """ + + INVALID_NODE = OpNode(None) + + def __init__(self, array: Optional[List[Operation]] = None): + self.start: OpNode = None + self.end: OpNode = None + self.op_to_node: Dict[Operation, OpNode] = {} + + if array is not None: + for op in array: + self.insert_op_before(op) + + def insert_op_before(self, new_op: Operation, before_op: Optional[Operation] = None): + """ + Insert an op right before before_op. If before_op is None, + then the new op is appended in the end. 
+ """ + if new_op in self.op_to_node: + raise ValueError(f"{new_op} already exisits.") + + new_node = OpNode(new_op) + + if before_op is None: + # If before op is None, the new node is appended in the end. + if self.start is None: + self.start = self.end = new_node + else: + self.end.next = new_node + new_node.prev = self.end + self.end = new_node + else: + anchor_node = self.op_to_node[before_op] + prev_node = anchor_node.prev + + if prev_node is None: + self.start = new_node + else: + prev_node.next = new_node + + new_node.prev = prev_node + new_node.next = anchor_node + anchor_node.prev = new_node + + self.op_to_node[new_op] = new_node + + def remove(self, op: Operation): + """ + Remove an op from the data structure. + """ + node = self.op_to_node[op] + prev_node, next_node = node.prev, node.next + + # reconnect the linked list + if prev_node is None: + self.start = next_node + else: + prev_node.next = next_node + + if next_node is None: + self.end = prev_node + else: + next_node.prev = prev_node + + node.prev = node.next = self.INVALID_NODE + + # remove op from the cache + del self.op_to_node[op] + + def __getitem__(self, idx: int) -> Operation: + """ + The indexing is expensive in doubly linked list, we should prevent direct access besides [0] and [-1]. + """ + if self.start is None: + raise ValueError("Cannot index an empty list.") + if idx >= len(self): + raise ValueError("Index out of range") + if idx == 0: + return self.start.op + elif idx == -1: + return self.end.op + raise ValueError("Doubly linked list does not support indexing other than 0, -1.") + + def _get_node_from_op(self, op: Operation) -> OpNode: + return self.op_to_node[op] + + def __iter__(self): + cursor = self.start + while cursor is not None: + if cursor is self.INVALID_NODE: + raise ValueError("Invalid iterator on CacheDoublyLinkedList.") + yield cursor.op + cursor = cursor.next + + def __reversed__(self): + cursor = self.end + while cursor is not None: + if cursor is self.INVALID_NODE: + raise ValueError("Invalid iterator on CacheDoublyLinkedList.") + yield cursor.op + cursor = cursor.prev + + def __len__(self) -> int: + return len(self.op_to_node) diff --git a/coremltools/converters/mil/mil/var.py b/coremltools/converters/mil/mil/var.py index d1fc96d6e..0d0756de8 100644 --- a/coremltools/converters/mil/mil/var.py +++ b/coremltools/converters/mil/mil/var.py @@ -3,12 +3,15 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from typing import Optional, Union +import copy +from collections import defaultdict +from typing import Dict, List, Optional, Union from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.types import builtin_to_string from coremltools.converters.mil.mil.types.symbolic import any_symbolic +from .scope import ScopeSource class Var: """ @@ -153,7 +156,11 @@ def _propagate_constness_upstream(var): op = var.op if op is None: return False - if op.op_type.startswith("constexpr_") or var.val is not None: + if ( + op.op_type.startswith("constexpr_") + or (op.op_type == "dequantize" and op.can_materialize_val()) + or var.val is not None + ): return True flattened_inputs = op.get_flattened_inputs() return all([x.is_descendant_of_const for x in flattened_inputs]) @@ -167,6 +174,10 @@ def _set_nonreplaceable_vars_upstream(self): op = self.op if op is None: return + if op.op_type == "shape": + # For the meta data ops, like shape, we stop propogate the 
nonreplaceable_vars. + self.nonreplaceable_vars_upstream = set() + return if Var._is_nonreplaceable_var(self): self.nonreplaceable_vars_upstream = set([self]) else: @@ -283,6 +294,19 @@ def is_tensor_or_scalar_of(self, dtype: Union[str, type]): def __str__(self): return "%" + self.name + ": " + self.shape_str() + self.type_str() + @property + def scopes(self) -> Dict[ScopeSource, List[str]]: + if self.op is None: + # An empty dictionary is returned for function input vars. + return defaultdict(list) + return self.op.scopes + + @scopes.setter + def scopes(self, scopes: Dict[ScopeSource, List[str]]): + if self.op is None: + raise ValueError(f"Cannot set scopes to a function input var {self}.") + self.op.scopes = copy.deepcopy(scopes) + class ListVar(Var): __slots__ = ["_elem_type", "init_length", "dynamic_length"] diff --git a/coremltools/converters/mil/testing_utils.py b/coremltools/converters/mil/testing_utils.py index 791e8d0d8..ee7a34908 100644 --- a/coremltools/converters/mil/testing_utils.py +++ b/coremltools/converters/mil/testing_utils.py @@ -2,13 +2,12 @@ # # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -import itertools - import copy +import itertools import os from functools import partial from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np import pytest @@ -17,10 +16,12 @@ import coremltools as ct import coremltools.models.utils as coremltoolsutils from coremltools._deps import _IS_MACOS +from coremltools.converters.mil import mil from coremltools.converters.mil.mil import Block, Function, Program from coremltools.converters.mil.mil.passes.defs.preprocess import NameSanitizer as _NameSanitizer -from coremltools.converters.mil.mil.passes.defs.quantization import AbstractQuantizationPass +from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY +from coremltools.converters.mil.mil.scope import ScopeSource from coremltools.proto import FeatureTypes_pb2 as ft np.random.seed(10) @@ -289,7 +290,7 @@ def ssa_fn(func): """ def wrapper(*args, **kwargs): - prog = Program() + prog = mil.Program() with Function({}) as ssa_func: func(*args, **kwargs) @@ -489,7 +490,7 @@ def get_core_ml_prediction( Return predictions of the given model. """ minimum_deployment_target = backend.opset_version - program = Program() + program = mil.Program() with Function(input_placeholders, opset_version=minimum_deployment_target) as ssa_func: output_vars = build(**ssa_func.inputs) if isinstance(output_vars, tuple): @@ -509,33 +510,88 @@ def get_core_ml_prediction( return mlmodel.predict(input_values) +def _decorate_prog_with_scope_if_not_present(prog: Program): + """ + For a program without any scope info, we manually add scopes to every op, + in ordere to test that all graph passes can preserve the source scope info. + """ + + def _is_scopes_present_in_program(prog: Program) -> bool: + """ + Return True is any op already has the scopes info. 
+ """ + + def _is_scopes_present_in_block(block: Block) -> bool: + for op in block.operations: + for b in op.blocks: + if _is_scopes_present_in_block(b): + return True + if len(op.scopes) > 0: + return True + + for func in prog.functions.values(): + if _is_scopes_present_in_block(func): + return True + + def _decorate_prog_with_default_torch_scope(prog: Program): + """ + Decorate every op in the program with a default TORCHSCRIPT_MODULE_TYPE scope info. + """ + + def _decorate_block_with_default_torch_scope(block: Block): + for op in block.operations: + for b in op.blocks: + _decorate_block_with_default_torch_scope(b) + assert ScopeSource.TORCHSCRIPT_MODULE_TYPE not in op.scopes + op.scopes[ScopeSource.TORCHSCRIPT_MODULE_TYPE] = ["dummy"] + + for func in prog.functions.values(): + _decorate_block_with_default_torch_scope(func) + + prog._add_essential_scope_source(ScopeSource.TORCHSCRIPT_MODULE_TYPE) + + if not _is_scopes_present_in_program(prog): + _decorate_prog_with_default_torch_scope(prog) + def apply_pass_and_basic_check( - prog, - pass_name, - skip_output_name_check=False, - skip_output_type_check=False, - skip_input_name_check=False, - skip_input_type_check=False, -): + prog: Program, + pass_name: Union[str, AbstractGraphPass], + skip_output_name_check: Optional[bool] = False, + skip_output_type_check: Optional[bool] = False, + skip_input_name_check: Optional[bool] = False, + skip_input_type_check: Optional[bool] = False, + skip_function_name_check: Optional[bool] = False, + func_name: Optional[str] = "main", + skip_essential_scope_check: Optional[bool] = False, +) -> Tuple[Program, Block, Block]: """ Apply pass to the program """ prev_prog = copy.deepcopy(prog) - graph_pass = pass_name if isinstance(pass_name, AbstractQuantizationPass) else PASS_REGISTRY[pass_name] + + graph_pass = pass_name if isinstance(pass_name, AbstractGraphPass) else PASS_REGISTRY[pass_name] + + _decorate_prog_with_scope_if_not_present(prog) graph_pass(prog) - block = prog.functions["main"] - prev_block = prev_prog.functions["main"] - if not skip_output_name_check: - assert_same_output_names(prev_prog, prog) - if not skip_output_type_check: - assert_same_output_types(prev_prog, prog) - assert_same_output_shapes(prev_prog, prog) + prog.validate(check_essential_scope=not skip_essential_scope_check) + + if not skip_function_name_check: + if prev_prog.functions.keys() != prog.functions.keys(): + raise ValueError("function names changed during {pass_name}.") + + for name in prev_prog.functions: + if not skip_output_name_check: + assert_same_output_names(prev_prog, prog, name) + if not skip_output_type_check: + assert_same_output_types(prev_prog, prog, name) + assert_same_output_shapes(prev_prog, prog, name) + + if not skip_input_name_check: + assert_same_input_names(prev_prog, prog, name) + if not skip_input_type_check: + assert_same_input_types(prev_prog, prog, name) - if not skip_input_name_check: - assert_same_input_names(prev_prog, prog) - if not skip_input_type_check: - assert_same_input_types(prev_prog, prog) - return prev_prog, prev_block, block + return prev_prog, prev_prog.functions[func_name], prog.functions[func_name] def assert_prog_input_type(prog, expected_dtype_str, expected_name=None, index=0): @@ -644,7 +700,9 @@ def verify_prediction(mlmodel, multiarray_type=None): input_dict[input_desc.name] = random_gen_input_feature_type(input_desc) if multiarray_type is not None: input_dict[input_desc.name] = input_dict[input].astype(multiarray_type) - mlmodel.predict(input_dict) + res = 
mlmodel.predict(input_dict) + assert isinstance(res, dict) + assert len(res) >= 1 def assert_spec_input_image_type(spec, expected_feature_type): assert spec.description.input[0].type.imageType.colorSpace == expected_feature_type diff --git a/coremltools/models/_compiled_model.py b/coremltools/models/_compiled_model.py index bba2baf4f..fe9f7a168 100644 --- a/coremltools/models/_compiled_model.py +++ b/coremltools/models/_compiled_model.py @@ -16,6 +16,21 @@ class CompiledMLModel: + + @staticmethod + def _init_check(path: str, compute_units: _ComputeUnit): + if _macos_version() < (10, 13): + raise Exception("Loading compiled Core ML models is only support on macOS 10.13 or higher.") + if _MLModelProxy is None: + raise Exception("Unable to load any compiled models. This is most likely because" + " coremltools was installed from an egg rather than a wheel.") + + if not isinstance(path, str): + raise TypeError('The "path" parameter must be of type "str".') + if not isinstance(compute_units, _ComputeUnit): + raise TypeError('The "compute_units" parameter must be of type: "coremltools.ComputeUnit".') + + def __init__(self, path: str, compute_units: _ComputeUnit =_ComputeUnit.ALL): """ Loads a compiled Core ML model. @@ -46,19 +61,9 @@ def __init__(self, path: str, compute_units: _ComputeUnit =_ComputeUnit.ALL): -------- predict """ - if _macos_version() < (10, 13): - raise Exception("Loading compiled Core ML models is only support on macOS 10.13 or higher.") - if _MLModelProxy is None: - raise Exception("Unable to load any compiled models. This is most likely because" - " coremltools was installed from an egg rather than a wheel.") - - if not isinstance(path, str): - raise TypeError('The "path" parameter must be of type "str".') - if not isinstance(compute_units, _ComputeUnit): - raise TypeError('The "compute_units" parameter must be of type: "coremltools.ComputeUnit".') + self._init_check(path, compute_units) path = _expanduser(path) - self._proxy = _MLModelProxy(path, compute_units.name) diff --git a/coremltools/models/model.py b/coremltools/models/model.py index bc427a438..478a1a191 100644 --- a/coremltools/models/model.py +++ b/coremltools/models/model.py @@ -4,22 +4,24 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import atexit as _atexit +from copy import deepcopy as _deepcopy +import json import os as _os import shutil as _shutil import tempfile as _tempfile +from typing import Optional as _Optional import warnings as _warnings -from copy import deepcopy as _deepcopy + import numpy as _np import numpy as _numpy from coremltools import ComputeUnit as _ComputeUnit +from coremltools import _logger as logger +from coremltools import proto as _proto from coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH from coremltools.converters.mil.mil.program import Program as _Program -from ..proto import FeatureTypes_pb2 as _ft -from ..proto import MIL_pb2 as _MIL_pb2 -from ..proto import Model_pb2 as _Model_pb2 from .utils import ( _MLMODEL_EXTENSION, _MLPACKAGE_AUTHOR_NAME, @@ -46,6 +48,12 @@ except: _ModelPackage = None +try: + from ..libcoremlpython import _MLModelProxy +except Exception as e: + logger.warning(f"Failed to load _MLModelProxy: {e}") + _MLModelProxy = None + _HAS_PIL = True try: from PIL import Image as _PIL_IMAGE @@ -130,38 +138,6 @@ def __iter__(self): yield f.name -def _get_proxy_and_spec(filename, compute_units, skip_model_load=False): - try: - from ..libcoremlpython import _MLModelProxy - except Exception: - _MLModelProxy = None - - 
filename = _os.path.expanduser(filename) - specification = _load_spec(filename) - - if _MLModelProxy and not skip_model_load: - - # check if the version is supported - engine_version = _MLModelProxy.maximum_supported_specification_version() - if specification.specificationVersion > engine_version: - # in this case the specification is a newer kind of .mlmodel than this - # version of the engine can support so we'll not try to have a proxy object - return None, specification, None - - try: - return _MLModelProxy(filename, compute_units.name), specification, None - except RuntimeError as e: - _warnings.warn( - "You will not be able to run predict() on this Core ML model." - + " Underlying exception message was: " - + str(e), - RuntimeWarning, - ) - return None, specification, e - - return None, specification, None - - def _try_get_weights_dir_path(mlpackage_path): """ Try to find the weights in mlpackage and return the path to the weights directory if found. @@ -182,7 +158,7 @@ def _try_get_weights_dir_path(mlpackage_path): class MLModel: """ - This class defines the minimal interface to a CoreML object in Python. + This class defines the minimal interface to a Core ML object in Python. At a high level, the protobuf specification consists of: @@ -379,10 +355,10 @@ def does_model_contain_mlprogram(model) -> bool: self.package_path = model self.is_temp_package = is_temp_package self._weights_dir = _try_get_weights_dir_path(model) - self.__proxy__, self._spec, self._framework_error = _get_proxy_and_spec( + self.__proxy__, self._spec, self._framework_error = self._get_proxy_and_spec( model, compute_units, skip_model_load=skip_model_load, ) - elif isinstance(model, _Model_pb2.Model): + elif isinstance(model, _proto.Model_pb2.Model): if does_model_contain_mlprogram(model): if model.WhichOneof("Type") == "mlProgram" and weights_dir is None: raise Exception( @@ -399,7 +375,7 @@ def does_model_contain_mlprogram(model) -> bool: filename = _tempfile.mktemp(suffix=_MLMODEL_EXTENSION) _save_spec(model, filename) - self.__proxy__, self._spec, self._framework_error = _get_proxy_and_spec( + self.__proxy__, self._spec, self._framework_error = self._get_proxy_and_spec( filename, compute_units, skip_model_load=skip_model_load, ) try: @@ -413,10 +389,43 @@ def does_model_contain_mlprogram(model) -> bool: self._input_description = _FeatureDescription(self._spec.description.input) self._output_description = _FeatureDescription(self._spec.description.output) + self._model_input_names_set = set([i.name for i in self._spec.description.input]) if self.is_package and self.is_temp_package: _atexit.register(cleanup, self.package_path) + + def _get_proxy_and_spec(self, + filename: str, + compute_units: _ComputeUnit, + skip_model_load: _Optional[bool] = False): + + filename = _os.path.expanduser(filename) + specification = _load_spec(filename) + + if _MLModelProxy and not skip_model_load: + + # check if the version is supported + engine_version = _MLModelProxy.maximum_supported_specification_version() + if specification.specificationVersion > engine_version: + # in this case the specification is a newer kind of .mlmodel than this + # version of the engine can support so we'll not try to have a proxy object + return None, specification, None + + try: + return _MLModelProxy(filename, compute_units.name), specification, None + except RuntimeError as e: + _warnings.warn( + "You will not be able to run predict() on this Core ML model." 
+ + " Underlying exception message was: " + + str(e), + RuntimeWarning, + ) + return None, specification, e + + return None, specification, None + + @property def short_description(self): return self._spec.description.metadata.shortDescription @@ -509,6 +518,23 @@ def save(self, save_path: str): ) _shutil.copytree(self.package_path, save_path) + if self._mil_program is not None: + debug_handle_to_ops_mapping = ( + self._mil_program.construct_debug_handle_to_ops_mapping() + ) + if len(debug_handle_to_ops_mapping) > 0: + debug_handle_to_ops_mapping_as_json = json.dumps( + [ + {_METADATA_VERSION: self.user_defined_metadata[_METADATA_VERSION]}, + debug_handle_to_ops_mapping, + ] + ) + saved_debug_handle_to_ops_mapping_path = _os.path.join( + save_path, "executorch_debug_handle_mapping.json" + ) + with open(saved_debug_handle_to_ops_mapping_path, "w") as f: + f.write(debug_handle_to_ops_mapping_as_json) + saved_spec_path = _os.path.join( save_path, "Data", _MLPACKAGE_AUTHOR_NAME, _MODEL_FILE_NAME ) @@ -600,15 +626,6 @@ def verify_and_convert_input_dict(d): "Model prediction is only supported on macOS version 10.13 or later." ) - try: - from ..libcoremlpython import _MLModelProxy - except Exception as e: - print("Exception loading model proxy: %s\n" % e) - _MLModelProxy = None - except: - print("Exception while loading model proxy.\n") - _MLModelProxy = None - if not _MLModelProxy: raise Exception("Unable to load CoreML.framework. Cannot make predictions.") elif ( @@ -670,9 +687,9 @@ def _set_build_info_mil_attributes(self, metadata): build_info_proto = ml_program_attributes["buildInfo"] # Set ValueType to dictionary of string to string - str_type = _MIL_pb2.ValueType() - str_type.tensorType.dataType = _MIL_pb2.DataType.STRING - dict_type_str_to_str = _MIL_pb2.ValueType() + str_type = _proto.MIL_pb2.ValueType() + str_type.tensorType.dataType = _proto.MIL_pb2.DataType.STRING + dict_type_str_to_str = _proto.MIL_pb2.ValueType() dict_type_str_to_str.dictionaryType.keyType.CopyFrom(str_type) dict_type_str_to_str.dictionaryType.valueType.CopyFrom(str_type) build_info_proto.type.CopyFrom(dict_type_str_to_str) @@ -680,7 +697,7 @@ def _set_build_info_mil_attributes(self, metadata): # Copy the metadata build_info_dict = build_info_proto.immediateValue.dictionary for k, v in metadata.items(): - key_pair = _MIL_pb2.DictionaryValue.KeyValuePair() + key_pair = _proto.MIL_pb2.DictionaryValue.KeyValuePair() key_pair.key.immediateValue.tensor.strings.values.append(k) key_pair.key.type.CopyFrom(str_type) key_pair.value.immediateValue.tensor.strings.values.append(v) @@ -728,27 +745,36 @@ def _verify_pil_image_modes(self, input_dict): if not isinstance(input_val, _PIL_IMAGE.Image): msg = "Image input, '{}' must be of type PIL.Image.Image in the input dict" raise TypeError(msg.format(input_desc.name)) - if input_desc.type.imageType.colorSpace in (_ft.ImageFeatureType.BGR, _ft.ImageFeatureType.RGB): - if input_val.mode != 'RGB': + if input_desc.type.imageType.colorSpace in ( + _proto.FeatureTypes_pb2.ImageFeatureType.BGR, + _proto.FeatureTypes_pb2.ImageFeatureType.RGB, + ): + if input_val.mode != "RGB": msg = "RGB/BGR image input, '{}', must be of type PIL.Image.Image with mode=='RGB'" raise TypeError(msg.format(input_desc.name)) - elif input_desc.type.imageType.colorSpace == _ft.ImageFeatureType.GRAYSCALE: - if input_val.mode != 'L': + elif ( + input_desc.type.imageType.colorSpace + == _proto.FeatureTypes_pb2.ImageFeatureType.GRAYSCALE + ): + if input_val.mode != "L": msg = "GRAYSCALE image input, '{}', must be of 
type PIL.Image.Image with mode=='L'" raise TypeError(msg.format(input_desc.name)) - elif input_desc.type.imageType.colorSpace == _ft.ImageFeatureType.GRAYSCALE_FLOAT16: - if input_val.mode != 'F': + elif ( + input_desc.type.imageType.colorSpace + == _proto.FeatureTypes_pb2.ImageFeatureType.GRAYSCALE_FLOAT16 + ): + if input_val.mode != "F": msg = "GRAYSCALE_FLOAT16 image input, '{}', must be of type PIL.Image.Image with mode=='F'" raise TypeError(msg.format(input_desc.name)) + def _verify_input_name_exists(self, input_dict): - model_input_names = [inp.name for inp in self._spec.description.input] - model_input_names_set = set(model_input_names) for given_input in input_dict.keys(): - if given_input not in model_input_names_set: + if given_input not in self._model_input_names_set: err_msg = "Provided key \"{}\", in the input dict, " \ "does not match any of the model input name(s), which are: {}" - raise KeyError(err_msg.format(given_input, ",".join(model_input_names))) + raise KeyError(err_msg.format(given_input, self._model_input_names_set)) + @staticmethod def _update_float16_multiarray_input_to_float32(input_data: dict): diff --git a/coremltools/models/neural_network/builder.py b/coremltools/models/neural_network/builder.py index 7f791c2c4..6b28a0008 100644 --- a/coremltools/models/neural_network/builder.py +++ b/coremltools/models/neural_network/builder.py @@ -10,18 +10,16 @@ import numpy as _np -from ... import (_MINIMUM_NDARRAY_SPEC_VERSION, - _MINIMUM_UPDATABLE_SPEC_VERSION, - _SPECIFICATION_VERSION_IOS_14) +from ... import ( + _MINIMUM_NDARRAY_SPEC_VERSION, + _MINIMUM_UPDATABLE_SPEC_VERSION, + _SPECIFICATION_VERSION_IOS_14, +) from ... import SPECIFICATION_VERSION as _SPECIFICATION_VERSION -from ...proto import FeatureTypes_pb2 as _FeatureTypes_pb2 -from ...proto import Model_pb2 as _Model_pb2 -from ...proto import NeuralNetwork_pb2 as _NeuralNetwork_pb2 +from ... import proto as _proto from .. import datatypes -from .._interface_management import (set_training_features, - set_transform_interface_params) -from .quantization_utils import (_convert_array_to_nbit_quantized_bytes, - _unpack_to_bytes) +from .._interface_management import set_training_features, set_transform_interface_params +from .quantization_utils import _convert_array_to_nbit_quantized_bytes, _unpack_to_bytes from .spec_inspection_utils import _summarize_network_layer_info from .update_optimizer_utils import AdamParams, SgdParams @@ -146,7 +144,7 @@ def _get_nn_spec(spec): def _get_lstm_weight_fields(lstm_wp): """ Get LSTM weight fields. - lstm_wp: _NeuralNetwork_pb2.LSTMWeightParams + lstm_wp: _proto.NeuralNetwork_pb2.LSTMWeightParams """ return [ lstm_wp.inputGateWeightMatrix, @@ -387,7 +385,7 @@ def __init__( # Set the interface params. 
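# A minimal illustrative sketch (assuming `from coremltools import proto`, as in the
# imports above): the consolidated namespace resolves to the same generated protobuf
# modules as the old per-module imports, so an empty spec is still created with
#     spec = proto.Model_pb2.Model()
#     spec.specificationVersion = 1
# The renamed accessors in the surrounding hunks therefore do not change behavior.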
if self.spec is None: - self.spec = _Model_pb2.Model() + self.spec = _proto.Model_pb2.Model() self.spec.specificationVersion = _SPECIFICATION_VERSION if disable_rank5_shape_mapping: self.spec.specificationVersion = _MINIMUM_NDARRAY_SPEC_VERSION @@ -408,9 +406,9 @@ def __init__( del self.spec.description.output[:] if use_float_arraytype: - array_datatype = _Model_pb2.ArrayFeatureType.FLOAT32 + array_datatype = _proto.Model_pb2.ArrayFeatureType.FLOAT32 else: - array_datatype = _Model_pb2.ArrayFeatureType.DOUBLE + array_datatype = _proto.Model_pb2.ArrayFeatureType.DOUBLE self.spec = set_transform_interface_params( self.spec, @@ -439,11 +437,13 @@ def __init__( self.nn_spec = nn_spec if disable_rank5_shape_mapping and self.nn_spec: - self.nn_spec.arrayInputShapeMapping = _NeuralNetwork_pb2.NeuralNetworkMultiArrayShapeMapping.Value( - "EXACT_ARRAY_MAPPING" + self.nn_spec.arrayInputShapeMapping = ( + _proto.NeuralNetwork_pb2.NeuralNetworkMultiArrayShapeMapping.Value( + "EXACT_ARRAY_MAPPING" + ) ) - self.nn_spec.imageInputShapeMapping = _NeuralNetwork_pb2.NeuralNetworkImageShapeMapping.Value( - "RANK4_IMAGE_MAPPING" + self.nn_spec.imageInputShapeMapping = ( + _proto.NeuralNetwork_pb2.NeuralNetworkImageShapeMapping.Value("RANK4_IMAGE_MAPPING") ) def set_input(self, input_names, input_dims): @@ -503,7 +503,7 @@ def set_input(self, input_names, input_dims): # TODO: if it's an embedding, this should be integer spec.description.input[ idx - ].type.multiArrayType.dataType = _Model_pb2.ArrayFeatureType.DOUBLE + ].type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.DOUBLE spec.description.input[idx].name = input_names[idx] @@ -542,7 +542,7 @@ def set_output(self, output_names, output_dims): spec.description.output[idx].type.multiArrayType.shape.extend(dim) spec.description.output[ idx - ].type.multiArrayType.dataType = _Model_pb2.ArrayFeatureType.DOUBLE + ].type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.DOUBLE spec.description.output[idx].name = output_names[idx] @@ -735,11 +735,11 @@ def add_optionals(self, optionals_in, optionals_out): for idx in range(len_before_in, len(spec.description.input)): spec.description.input[ idx - ].type.multiArrayType.dataType = _Model_pb2.ArrayFeatureType.DOUBLE + ].type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.DOUBLE for idx in range(len_before_out, len(spec.description.output)): spec.description.output[ idx - ].type.multiArrayType.dataType = _Model_pb2.ArrayFeatureType.DOUBLE + ].type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.DOUBLE def _check_fp16_weight_params_lstms(self, lstm_wp, has_peephole=True): @@ -939,11 +939,11 @@ def make_updatable(self, trainables): typed_layer = getattr(spec_layer, spec_layer.WhichOneof("layer")) for fd in typed_layer.DESCRIPTOR.fields: field = getattr(typed_layer, fd.name) - if type(field) == _NeuralNetwork_pb2.LSTMWeightParams: + if type(field) == _proto.NeuralNetwork_pb2.LSTMWeightParams: wfs = _get_lstm_weight_fields(field) for wf in wfs: wf.isUpdatable = True - elif type(field) == _NeuralNetwork_pb2.WeightParams: + elif type(field) == _proto.NeuralNetwork_pb2.WeightParams: field.isUpdatable = True else: pass @@ -1042,9 +1042,7 @@ def set_categorical_cross_entropy_loss(self, name, input): else: training_input.name = target datatypes._set_datatype(training_input.type, datatypes.Array(1)) - training_input.type.multiArrayType.dataType = ( - _Model_pb2.ArrayFeatureType.INT32 - ) + training_input.type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.INT32 print( 
"Now adding input {} as target for categorical cross-entropy loss layer.".format( @@ -1112,7 +1110,7 @@ def set_mean_squared_error_loss(self, name, input_feature=None): training_input.name = target datatypes._set_datatype(training_input.type, input_feature[1]) - training_input.type.multiArrayType.dataType = _Model_pb2.ArrayFeatureType.DOUBLE + training_input.type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.DOUBLE print( "Now adding input {} as target for mean squared error loss layer.".format( target @@ -2097,11 +2095,13 @@ def add_upsample( spec_layer_params.scalingFactor.append(int(scaling_factor_h)) spec_layer_params.scalingFactor.append(int(scaling_factor_w)) - spec_layer_params.mode = _NeuralNetwork_pb2.UpsampleLayerParams.InterpolationMode.Value( - mode + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.UpsampleLayerParams.InterpolationMode.Value(mode) ) - spec_layer_params.linearUpsampleMode = _NeuralNetwork_pb2.UpsampleLayerParams.LinearUpsampleMode.Value( - linear_upsample_mode + spec_layer_params.linearUpsampleMode = ( + _proto.NeuralNetwork_pb2.UpsampleLayerParams.LinearUpsampleMode.Value( + linear_upsample_mode + ) ) return spec_layer @@ -2469,8 +2469,10 @@ def add_convolution( "Invalid value %d of same_padding_asymmetry_mode parameter" % same_padding_asymmetry_mode ) - spec_layer_params.same.asymmetryMode = _NeuralNetwork_pb2.SamePadding.SamePaddingMode.Value( - same_padding_asymmetry_mode + spec_layer_params.same.asymmetryMode = ( + _proto.NeuralNetwork_pb2.SamePadding.SamePaddingMode.Value( + same_padding_asymmetry_mode + ) ) else: raise NotImplementedError( @@ -2722,8 +2724,10 @@ def add_convolution3d( spec_layer_params.customPaddingBottom = padding_bottom spec_layer_params.customPaddingLeft = padding_left spec_layer_params.customPaddingRight = padding_right - spec_layer_params.paddingType = _NeuralNetwork_pb2.Convolution3DLayerParams.PaddingType.Value( - padding_mode.upper() + spec_layer_params.paddingType = ( + _proto.NeuralNetwork_pb2.Convolution3DLayerParams.PaddingType.Value( + padding_mode.upper() + ) ) spec_layer_params.dilationDepth = dilation_depth @@ -2837,7 +2841,7 @@ def add_pooling( spec_layer_params = spec_layer.pooling # Set the parameters - spec_layer_params.type = _NeuralNetwork_pb2.PoolingLayerParams.PoolingType.Value( + spec_layer_params.type = _proto.NeuralNetwork_pb2.PoolingLayerParams.PoolingType.Value( layer_type.upper() ) @@ -2868,8 +2872,10 @@ def add_pooling( "Invalid value %d of same_padding_asymmetry_mode parameter" % same_padding_asymmetry_mode ) - spec_layer_params.same.asymmetryMode = _NeuralNetwork_pb2.SamePadding.SamePaddingMode.Value( - same_padding_asymmetry_mode + spec_layer_params.same.asymmetryMode = ( + _proto.NeuralNetwork_pb2.SamePadding.SamePaddingMode.Value( + same_padding_asymmetry_mode + ) ) elif padding_type == "INCLUDE_LAST_PIXEL": if padding_top != padding_bottom or padding_left != padding_right: @@ -2968,7 +2974,7 @@ def add_pooling3d( spec_layer = self._add_generic_layer(name, [input_name], [output_name]) spec_layer_params = spec_layer.pooling3d - spec_layer_params.type = _NeuralNetwork_pb2.Pooling3DLayerParams.PoolingType3D.Value( + spec_layer_params.type = _proto.NeuralNetwork_pb2.Pooling3DLayerParams.PoolingType3D.Value( pooling_type.upper() ) @@ -2993,8 +2999,10 @@ def add_pooling3d( spec_layer_params.customPaddingBottom = custom_padding_bottom spec_layer_params.customPaddingLeft = custom_padding_left spec_layer_params.customPaddingRight = custom_padding_right - spec_layer_params.paddingType = 
_NeuralNetwork_pb2.Pooling3DLayerParams.Pooling3DPaddingType.Value( - padding_mode.upper() + spec_layer_params.paddingType = ( + _proto.NeuralNetwork_pb2.Pooling3DLayerParams.Pooling3DPaddingType.Value( + padding_mode.upper() + ) ) spec_layer_params.countExcludePadding = average_pooling_count_excludes_padding @@ -3034,8 +3042,10 @@ def add_global_pooling3d(self, name, input_name, output_name, pooling_type): spec_layer = self._add_generic_layer(name, [input_name], [output_name]) spec_layer_params = spec_layer.globalPooling3d - spec_layer_params.type = _NeuralNetwork_pb2.GlobalPooling3DLayerParams.GlobalPoolingType3D.Value( - pooling_type.upper() + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.GlobalPooling3DLayerParams.GlobalPoolingType3D.Value( + pooling_type.upper() + ) ) return spec_layer @@ -3743,11 +3753,11 @@ def add_flatten(self, name, mode, input_name, output_name): # Set the parameters if mode == 0: - spec_layer_params.mode = _NeuralNetwork_pb2.FlattenLayerParams.FlattenOrder.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.FlattenLayerParams.FlattenOrder.Value( "CHANNEL_FIRST" ) elif mode == 1: - spec_layer_params.mode = _NeuralNetwork_pb2.FlattenLayerParams.FlattenOrder.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.FlattenLayerParams.FlattenOrder.Value( "CHANNEL_LAST" ) else: @@ -3803,15 +3813,15 @@ def add_slice( axis = axis.lower() if isinstance(axis, str) else axis if axis == "channel": - spec_layer_params.axis = _NeuralNetwork_pb2.SliceLayerParams.SliceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.SliceLayerParams.SliceAxis.Value( "CHANNEL_AXIS" ) elif axis == "height": - spec_layer_params.axis = _NeuralNetwork_pb2.SliceLayerParams.SliceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.SliceLayerParams.SliceAxis.Value( "HEIGHT_AXIS" ) elif axis == "width": - spec_layer_params.axis = _NeuralNetwork_pb2.SliceLayerParams.SliceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.SliceLayerParams.SliceAxis.Value( "WIDTH_AXIS" ) else: @@ -3909,12 +3919,16 @@ def add_reorganize_data( mode = mode.upper() if isinstance(mode, str) else mode if mode == "SPACE_TO_DEPTH": - spec_layer_params.mode = _NeuralNetwork_pb2.ReorganizeDataLayerParams.ReorganizationType.Value( - "SPACE_TO_DEPTH" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReorganizeDataLayerParams.ReorganizationType.Value( + "SPACE_TO_DEPTH" + ) ) elif mode == "DEPTH_TO_SPACE": - spec_layer_params.mode = _NeuralNetwork_pb2.ReorganizeDataLayerParams.ReorganizationType.Value( - "DEPTH_TO_SPACE" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReorganizeDataLayerParams.ReorganizationType.Value( + "DEPTH_TO_SPACE" + ) ) elif mode == "PIXEL_SHUFFLE": if self.spec and ( @@ -3922,8 +3936,10 @@ def add_reorganize_data( or self.spec.specificationVersion < _SPECIFICATION_VERSION_IOS_14 ): self.spec.specificationVersion = _SPECIFICATION_VERSION_IOS_14 - spec_layer_params.mode = _NeuralNetwork_pb2.ReorganizeDataLayerParams.ReorganizationType.Value( - "PIXEL_SHUFFLE" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReorganizeDataLayerParams.ReorganizationType.Value( + "PIXEL_SHUFFLE" + ) ) else: raise NotImplementedError("Unknown reorganization mode %s." 
% mode) @@ -4085,11 +4101,11 @@ def add_reshape(self, name, input_name, output_name, target_shape, mode): spec_layer_params = spec_layer.reshape spec_layer_params.targetShape.extend(target_shape) if mode == 0: - spec_layer_params.mode = _NeuralNetwork_pb2.ReshapeLayerParams.ReshapeOrder.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ReshapeLayerParams.ReshapeOrder.Value( "CHANNEL_FIRST" ) else: - spec_layer_params.mode = _NeuralNetwork_pb2.ReshapeLayerParams.ReshapeOrder.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ReshapeLayerParams.ReshapeOrder.Value( "CHANNEL_LAST" ) @@ -4139,67 +4155,67 @@ def add_reduce(self, name, input_name, output_name, axis, mode, epsilon=1e-6): mode = mode.lower() if isinstance(mode, str) else mode if mode == "sum": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "SUM" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("SUM") ) elif mode == "avg": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "AVG" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("AVG") ) elif mode == "prod": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "PROD" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("PROD") ) elif mode == "logsum": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "LOGSUM" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("LOGSUM") ) elif mode == "sumsquare": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "SUMSQUARE" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("SUMSQUARE") ) elif mode == "l1": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "L1" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("L1") ) elif mode == "l2": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "L2" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("L2") ) elif mode == "max": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "MAX" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("MAX") ) elif mode == "min": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "MIN" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("MIN") ) elif mode == "argmax": - spec_layer_params.mode = _NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value( - "ARGMAX" + spec_layer_params.mode = ( + _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceOperation.Value("ARGMAX") ) else: raise NotImplementedError("Unknown reduction operation %s." 
% mode) axis = axis.upper() if isinstance(axis, str) else axis if axis == "CHW": - spec_layer_params.axis = _NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( "CHW" ) elif axis == "HW": - spec_layer_params.axis = _NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( "HW" ) elif axis == "C": - spec_layer_params.axis = _NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( "C" ) elif axis == "H": - spec_layer_params.axis = _NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( "H" ) elif axis == "W": - spec_layer_params.axis = _NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( + spec_layer_params.axis = _proto.NeuralNetwork_pb2.ReduceLayerParams.ReduceAxis.Value( "W" ) else: @@ -4377,36 +4393,36 @@ def add_unary( mode = mode.lower() if isinstance(mode, str) else mode if mode == "sqrt": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "SQRT" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("SQRT") ) elif mode == "rsqrt": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "RSQRT" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("RSQRT") ) elif mode == "inverse": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "INVERSE" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("INVERSE") ) elif mode == "power": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "POWER" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("POWER") ) elif mode == "exp": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "EXP" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("EXP") ) elif mode == "log": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "LOG" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("LOG") ) elif mode == "abs": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "ABS" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("ABS") ) elif mode == "threshold": - spec_layer_params.type = _NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value( - "THRESHOLD" + spec_layer_params.type = ( + _proto.NeuralNetwork_pb2.UnaryFunctionLayerParams.Operation.Value("THRESHOLD") ) else: raise NotImplementedError("Unknown unary function %s " % mode) @@ -4552,20 +4568,20 @@ def add_resize_bilinear( mode = mode.upper() if isinstance(mode, str) else mode if mode == "ALIGN_ENDPOINTS_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "ALIGN_ENDPOINTS_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("ALIGN_ENDPOINTS_MODE") ) elif mode == "STRICT_ALIGN_ENDPOINTS_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - 
"STRICT_ALIGN_ENDPOINTS_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("STRICT_ALIGN_ENDPOINTS_MODE") ) elif mode == "UPSAMPLE_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "UPSAMPLE_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("UPSAMPLE_MODE") ) elif mode == "ROI_ALIGN_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "ROI_ALIGN_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("ROI_ALIGN_MODE") ) else: raise ValueError("Unsupported resize bilinear mode %s" % mode) @@ -4669,39 +4685,45 @@ def add_crop_resize( ) if mode == "ALIGN_ENDPOINTS_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "ALIGN_ENDPOINTS_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("ALIGN_ENDPOINTS_MODE") ) elif mode == "STRICT_ALIGN_ENDPOINTS_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "STRICT_ALIGN_ENDPOINTS_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("STRICT_ALIGN_ENDPOINTS_MODE") ) elif mode == "UPSAMPLE_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "UPSAMPLE_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("UPSAMPLE_MODE") ) elif mode == "ROI_ALIGN_MODE": - spec_layer_params.mode.samplingMethod = _NeuralNetwork_pb2.SamplingMode.Method.Value( - "ROI_ALIGN_MODE" + spec_layer_params.mode.samplingMethod = ( + _proto.NeuralNetwork_pb2.SamplingMode.Method.Value("ROI_ALIGN_MODE") ) else: raise ValueError("Unsupported crop resize mode %s" % mode) if box_indices_mode == "CORNERS_HEIGHT_FIRST": - spec_layer_params.boxIndicesMode.boxMode = _NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( - "CORNERS_HEIGHT_FIRST" + spec_layer_params.boxIndicesMode.boxMode = ( + _proto.NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( + "CORNERS_HEIGHT_FIRST" + ) ) elif box_indices_mode == "CORNERS_WIDTH_FIRST": - spec_layer_params.boxIndicesMode.boxMode = _NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( - "CORNERS_WIDTH_FIRST" + spec_layer_params.boxIndicesMode.boxMode = ( + _proto.NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value("CORNERS_WIDTH_FIRST") ) elif box_indices_mode == "CENTER_SIZE_HEIGHT_FIRST": - spec_layer_params.boxIndicesMode.boxMode = _NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( - "CENTER_SIZE_HEIGHT_FIRST" + spec_layer_params.boxIndicesMode.boxMode = ( + _proto.NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( + "CENTER_SIZE_HEIGHT_FIRST" + ) ) elif box_indices_mode == "CENTER_SIZE_WIDTH_FIRST": - spec_layer_params.boxIndicesMode.boxMode = _NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( - "CENTER_SIZE_WIDTH_FIRST" + spec_layer_params.boxIndicesMode.boxMode = ( + _proto.NeuralNetwork_pb2.BoxCoordinatesMode.Coordinates.Value( + "CENTER_SIZE_WIDTH_FIRST" + ) ) else: raise ValueError( @@ -4892,22 +4914,22 @@ def check_valid_preprocessing_keys(input, target, input_name): # TODO: If input is not rank 3 or 4, then accordingly handle # e.g. 
for rank-2 input, squeeze additional dimension in case of Gray scale image if channels == 1: - input_.type.imageType.colorSpace = _FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value( - "GRAYSCALE" + input_.type.imageType.colorSpace = ( + _proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("GRAYSCALE") ) elif channels == 3: if input_.name in is_bgr: if is_bgr[input_.name]: - input_.type.imageType.colorSpace = _FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value( - "BGR" + input_.type.imageType.colorSpace = ( + _proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("BGR") ) else: - input_.type.imageType.colorSpace = _FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value( - "RGB" + input_.type.imageType.colorSpace = ( + _proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("RGB") ) else: - input_.type.imageType.colorSpace = _FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value( - "RGB" + input_.type.imageType.colorSpace = ( + _proto.FeatureTypes_pb2.ImageFeatureType.ColorSpace.Value("RGB") ) else: raise ValueError( @@ -5058,15 +5080,15 @@ def add_gelu(self, name, input_name, output_name, mode="EXACT"): spec_layer_params = spec_layer.gelu if mode == "EXACT": - spec_layer_params.mode = _NeuralNetwork_pb2.GeluLayerParams.GeluMode.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.GeluLayerParams.GeluMode.Value( "EXACT" ) elif mode == "TANH_APPROXIMATION": - spec_layer_params.mode = _NeuralNetwork_pb2.GeluLayerParams.GeluMode.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.GeluLayerParams.GeluMode.Value( "TANH_APPROXIMATION" ) elif mode == "SIGMOID_APPROXIMATION": - spec_layer_params.mode = _NeuralNetwork_pb2.GeluLayerParams.GeluMode.Value( + spec_layer_params.mode = _proto.NeuralNetwork_pb2.GeluLayerParams.GeluMode.Value( "SIGMOID_APPROXIMATION" ) else: @@ -6530,21 +6552,19 @@ def add_scatter(self, name, input_names, output_name, axis=0, mode="UPDATE"): mode = mode.upper() if isinstance(mode, str) else mode if mode == "UPDATE": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value( - "SCATTER_UPDATE" - ) + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_UPDATE") elif mode == "ADD": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_ADD") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_ADD") elif mode == "SUB": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_SUB") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_SUB") elif mode == "MUL": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MUL") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MUL") elif mode == "DIV": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_DIV") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_DIV") elif mode == "MAX": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MAX") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MAX") elif mode == "MIN": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MIN") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MIN") else: raise ValueError("Unsupported Scatter mode %s" % mode) @@ -6611,21 +6631,19 @@ def add_scatter_along_axis( mode = mode.upper() if isinstance(mode, str) else mode if mode == "UPDATE": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value( - "SCATTER_UPDATE" - ) + 
spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_UPDATE") elif mode == "ADD": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_ADD") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_ADD") elif mode == "SUB": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_SUB") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_SUB") elif mode == "MUL": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MUL") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MUL") elif mode == "DIV": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_DIV") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_DIV") elif mode == "MAX": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MAX") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MAX") elif mode == "MIN": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MIN") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MIN") else: raise ValueError("Unsupported scatter_along_axis mode %s" % mode) @@ -6690,21 +6708,19 @@ def add_scatter_nd(self, name, input_names, output_name, mode="UPDATE"): mode = mode.upper() if isinstance(mode, str) else mode if mode == "UPDATE": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value( - "SCATTER_UPDATE" - ) + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_UPDATE") elif mode == "ADD": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_ADD") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_ADD") elif mode == "SUB": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_SUB") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_SUB") elif mode == "MUL": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MUL") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MUL") elif mode == "DIV": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_DIV") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_DIV") elif mode == "MAX": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MAX") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MAX") elif mode == "MIN": - spec_layer_params.mode = _NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MIN") + spec_layer_params.mode = _proto.NeuralNetwork_pb2.ScatterMode.Value("SCATTER_MIN") else: raise ValueError("Unsupported scatter mode %s" % mode) diff --git a/coremltools/models/neural_network/flexible_shape_utils.py b/coremltools/models/neural_network/flexible_shape_utils.py index f2e42794e..fbb8d1f03 100644 --- a/coremltools/models/neural_network/flexible_shape_utils.py +++ b/coremltools/models/neural_network/flexible_shape_utils.py @@ -7,8 +7,12 @@ Utilities to annotate Neural Network Features with flexible shape information. """ -from ... import (_MINIMUM_FLEXIBLE_SHAPES_SPEC_VERSION, - _MINIMUM_NDARRAY_SPEC_VERSION) +from typing import List as _List +from typing import Tuple as _Tuple + +from coremltools.proto import Model_pb2 as _ml + +from ... 
import _MINIMUM_FLEXIBLE_SHAPES_SPEC_VERSION, _MINIMUM_NDARRAY_SPEC_VERSION from ..utils import _get_feature _SEQUENCE_KEY = "S" @@ -296,6 +300,150 @@ def get_height_range(self): return self._height_range +def _set_multiarray_ndshape_range_for_feature( + feature: _ml.FeatureDescription, + lower_bounds: _List[int], + upper_bounds: _List[int], +): + + if not isinstance(lower_bounds, list): + raise Exception("lower_bounds must be a list") + if not isinstance(upper_bounds, list): + raise Exception("upper_bounds must be a list") + + if feature.type.WhichOneof("Type") != "multiArrayType": + raise Exception("Trying to update shape range for " "a non-multiArray feature type") + + shape = feature.type.multiArrayType.shape + + if len(shape) != len(lower_bounds): + raise Exception( + "Length of lower_bounds is not equal to the number of dimensions in the default shape" + ) + if len(shape) != len(upper_bounds): + raise Exception( + "Length of upper_bounds is not equal to the number of dimensions in the default shape" + ) + + feature.type.multiArrayType.ClearField("ShapeFlexibility") + + for i in range(len(lower_bounds)): + if shape[i] < lower_bounds[i]: + raise Exception( + "Default shape in %d-th dimension, which is %d, is smaller" + " than the lower bound of %d" % (i, int(shape[i]), lower_bounds[i]) + ) + if upper_bounds[i] != -1: + if shape[i] > upper_bounds[i]: + raise Exception( + "Default shape in %d-th dimension, which is %d, is greater" + " than the upper bound of %d" % (i, int(shape[i]), upper_bounds[i]) + ) + + s = feature.type.multiArrayType.shapeRange.sizeRanges.add() + s.lowerBound = lower_bounds[i] + s.upperBound = upper_bounds[i] + + +def _update_image_size_range_for_feature( + feature: _ml.FeatureDescription, + size_range: NeuralNetworkImageSizeRange, +): + if not isinstance(size_range, NeuralNetworkImageSizeRange): + raise Exception("Shape ranges should be of type NeuralNetworkImageSizeRange") + + if feature.type.WhichOneof("Type") != "imageType": + raise Exception("Trying to add size ranges for " "a non-image feature type") + + feature.type.imageType.ClearField("SizeFlexibility") + feature.type.imageType.imageSizeRange.heightRange.lowerBound = ( + size_range.get_height_range().lowerBound + ) + feature.type.imageType.imageSizeRange.heightRange.upperBound = ( + size_range.get_height_range().upperBound + ) + + feature.type.imageType.imageSizeRange.widthRange.lowerBound = ( + size_range.get_width_range().lowerBound + ) + feature.type.imageType.imageSizeRange.widthRange.upperBound = ( + size_range.get_width_range().upperBound + ) + + +def _add_multiarray_ndshape_enumeration_for_feature( + feature: _ml.FeatureDescription, + enumerated_shapes: _List[_Tuple[int]], +): + if not isinstance(enumerated_shapes, list): + raise Exception("enumerated_shapes must be a list") + if len(enumerated_shapes) == 0: + raise Exception("enumerated_shapes is empty") + + if feature.type.WhichOneof("Type") != "multiArrayType": + raise Exception("Trying to update shape range for " "a non-multiArray feature type") + + shape = feature.type.multiArrayType.shape + + if feature.type.multiArrayType.WhichOneof("ShapeFlexibility") != "enumeratedShapes": + feature.type.multiArrayType.ClearField("ShapeFlexibility") + + eshape_len = len(feature.type.multiArrayType.enumeratedShapes.shapes) + + shapes_added_so_far = [] + + # Add default array shape to list of enumerated shapes if enumerated shapes + # field is currently empty + if eshape_len == 0: + fixed_shape = feature.type.multiArrayType.shape + s = 
feature.type.multiArrayType.enumeratedShapes.shapes.add() + s.shape.extend(fixed_shape) + shapes_added_so_far.append(list(fixed_shape)) + + for shape in enumerated_shapes: + if not isinstance(shape, tuple): + raise Exception("An element in 'enumerated_shapes' is not a tuple") + if list(shape) not in shapes_added_so_far: + s = feature.type.multiArrayType.enumeratedShapes.shapes.add() + s.shape.extend(list(shape)) + shapes_added_so_far.append(list(shape)) + + +def _add_enumerated_image_sizes_for_feature( + feature: _ml.FeatureDescription, + sizes: _List[NeuralNetworkImageSize], +): + if not isinstance(sizes, list): + sizes = [sizes] + + for size in sizes: + if not isinstance(size, NeuralNetworkImageSize): + raise Exception("Shape ranges should be of type NeuralNetworkImageSize") + + if feature.type.WhichOneof("Type") != "imageType": + raise Exception("Trying to add enumerated sizes to " "a non-image feature type") + + if feature.type.imageType.WhichOneof("SizeFlexibility") != "enumeratedSizes": + feature.type.imageType.ClearField("SizeFlexibility") + + esizes_len = len(feature.type.imageType.enumeratedSizes.sizes) + + # Add default image size to list of enumerated sizes if enumerated sizes + # field is currently empty + if esizes_len == 0: + fixed_height = feature.type.imageType.height + fixed_width = feature.type.imageType.width + sizes.append(NeuralNetworkImageSize(fixed_height, fixed_width)) + + shapes_added_so_far = [] + for size in sizes: + if [size.height, size.width] not in shapes_added_so_far: + s = feature.type.imageType.enumeratedSizes.sizes.add() + s.height = size.height + s.width = size.width + shapes_added_so_far.append([s.height, s.width]) + + def add_enumerated_multiarray_shapes(spec, feature_name, shapes): """ Annotate an input or output multiArray feature in a Neural Network spec to @@ -317,16 +465,19 @@ def add_enumerated_multiarray_shapes(spec, feature_name, shapes): -------- .. sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import flexible_shape_utils - >>> spec = coremltools.utils.load_spec('mymodel.mlmodel') - >>> array_shapes = [flexible_shape_utils.NeuralNetworkMultiArrayShape(3)] - >>> second_shape = flexible_shape_utils.NeuralNetworkMultiArrayShape() - >>> second_shape.set_channel_shape(3) - >>> second_shape.set_height_shape(10) - >>> second_shape.set_width_shape(15) - >>> array_shapes.append(second_shape) - >>> flexible_shape_utils.add_enumerated_multiarray_shapes(spec, feature_name='my_multiarray_featurename', shapes=array_shapes) + import coremltools + from coremltools.models.neural_network import flexible_shape_utils + + spec = coremltools.utils.load_spec("mymodel.mlmodel") + array_shapes = [flexible_shape_utils.NeuralNetworkMultiArrayShape(3)] + second_shape = flexible_shape_utils.NeuralNetworkMultiArrayShape() + second_shape.set_channel_shape(3) + second_shape.set_height_shape(10) + second_shape.set_width_shape(15) + array_shapes.append(second_shape) + flexible_shape_utils.add_enumerated_multiarray_shapes( + spec, feature_name="my_multiarray_featurename", shapes=array_shapes + ) :return: None. The spec object is updated @@ -380,7 +531,6 @@ def add_enumerated_multiarray_shapes(spec, feature_name, shapes): _MINIMUM_FLEXIBLE_SHAPES_SPEC_VERSION, spec.specificationVersion ) - def add_enumerated_image_sizes(spec, feature_name, sizes): """ Annotate an input or output image feature in a Neural Network spec to @@ -402,12 +552,15 @@ def add_enumerated_image_sizes(spec, feature_name, sizes): -------- .. 
sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import flexible_shape_utils - >>> spec = coremltools.utils.load_spec('mymodel.mlmodel') - >>> image_sizes = [flexible_shape_utils.NeuralNetworkImageSize(128, 128)] - >>> image_sizes.append(flexible_shape_utils.NeuralNetworkImageSize(256, 256)) - >>> flexible_shape_utils.add_enumerated_image_sizes(spec, feature_name='my_multiarray_featurename', sizes=image_sizes) + import coremltools + from coremltools.models.neural_network import flexible_shape_utils + + spec = coremltools.utils.load_spec("mymodel.mlmodel") + image_sizes = [flexible_shape_utils.NeuralNetworkImageSize(128, 128)] + image_sizes.append(flexible_shape_utils.NeuralNetworkImageSize(256, 256)) + flexible_shape_utils.add_enumerated_image_sizes( + spec, feature_name="my_multiarray_featurename", sizes=image_sizes + ) :return: None. The spec object is updated @@ -448,7 +601,6 @@ def add_enumerated_image_sizes(spec, feature_name, sizes): _MINIMUM_FLEXIBLE_SHAPES_SPEC_VERSION, spec.specificationVersion ) - def update_image_size_range(spec, feature_name, size_range): """ Annotate an input or output Image feature in a Neural Network spec to @@ -470,38 +622,22 @@ def update_image_size_range(spec, feature_name, size_range): -------- .. sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import flexible_shape_utils - >>> spec = coremltools.utils.load_spec('mymodel.mlmodel') - >>> img_size_ranges = flexible_shape_utils.NeuralNetworkImageSizeRange() - >>> img_size_ranges.add_height_range(64, 128) - >>> img_size_ranges.add_width_range(128, -1) - >>> flexible_shape_utils.update_image_size_range(spec, feature_name='my_multiarray_featurename', size_range=img_size_ranges) + import coremltools + from coremltools.models.neural_network import flexible_shape_utils + + spec = coremltools.utils.load_spec("mymodel.mlmodel") + img_size_ranges = flexible_shape_utils.NeuralNetworkImageSizeRange() + img_size_ranges.add_height_range(64, 128) + img_size_ranges.add_width_range(128, -1) + flexible_shape_utils.update_image_size_range( + spec, feature_name="my_multiarray_featurename", size_range=img_size_ranges + ) :return: None. The spec object is updated """ - if not isinstance(size_range, NeuralNetworkImageSizeRange): - raise Exception("Shape ranges should be of type NeuralNetworkImageSizeRange") - feature = _get_feature(spec, feature_name) - if feature.type.WhichOneof("Type") != "imageType": - raise Exception("Trying to add size ranges for " "a non-image feature type") - - feature.type.imageType.ClearField("SizeFlexibility") - feature.type.imageType.imageSizeRange.heightRange.lowerBound = ( - size_range.get_height_range().lowerBound - ) - feature.type.imageType.imageSizeRange.heightRange.upperBound = ( - size_range.get_height_range().upperBound - ) - - feature.type.imageType.imageSizeRange.widthRange.lowerBound = ( - size_range.get_width_range().lowerBound - ) - feature.type.imageType.imageSizeRange.widthRange.upperBound = ( - size_range.get_width_range().upperBound - ) + _update_image_size_range_for_feature(feature, size_range) # Bump up specification version spec.specificationVersion = max( @@ -532,14 +668,17 @@ def update_multiarray_shape_range(spec, feature_name, shape_range): -------- .. 
sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import flexible_shape_utils - >>> spec = coremltools.utils.load_spec('mymodel.mlmodel') - >>> shape_range = flexible_shape_utils.NeuralNetworkMultiArrayShapeRange() - >>> shape_range.add_channel_range((1, 3)) - >>> shape_range.add_width_range((128, 256)) - >>> shape_range.add_height_range((128, 256)) - >>> flexible_shape_utils.update_multiarray_shape_range(spec, feature_name='my_multiarray_featurename', shape_range=shape_range) + import coremltools + from coremltools.models.neural_network import flexible_shape_utils + + spec = coremltools.utils.load_spec("mymodel.mlmodel") + shape_range = flexible_shape_utils.NeuralNetworkMultiArrayShapeRange() + shape_range.add_channel_range((1, 3)) + shape_range.add_width_range((128, 256)) + shape_range.add_height_range((128, 256)) + flexible_shape_utils.update_multiarray_shape_range( + spec, feature_name="my_multiarray_featurename", shape_range=shape_range + ) :return: None. The spec is updated @@ -606,63 +745,29 @@ def set_multiarray_ndshape_range(spec, feature_name, lower_bounds, upper_bounds) -------- .. sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import flexible_shape_utils - >>> spec = coremltools.utils.load_spec('mymodel.mlmodel') - >>> # say, the default shape of "my_multiarray_featurename" is (2,3) - >>> flexible_shape_utils.set_multiarray_ndshape_range(spec, feature_name='my_multiarray_featurename', lower_bounds=[1,2], upper_bounds=[10,-1]) + import coremltools + from coremltools.models.neural_network import flexible_shape_utils + + spec = coremltools.utils.load_spec("mymodel.mlmodel") + # say, the default shape of "my_multiarray_featurename" is (2,3) + flexible_shape_utils.set_multiarray_ndshape_range( + spec, + feature_name="my_multiarray_featurename", + lower_bounds=[1, 2], + upper_bounds=[10, -1], + ) :return: None. 
The spec is updated """ - if not isinstance(lower_bounds, list): - raise Exception("lower_bounds must be a list") - if not isinstance(upper_bounds, list): - raise Exception("upper_bounds must be a list") - feature = _get_feature(spec, feature_name) - - if feature.type.WhichOneof("Type") != "multiArrayType": - raise Exception( - "Trying to update shape range for " "a non-multiArray feature type" - ) - - shape = feature.type.multiArrayType.shape - - if len(shape) != len(lower_bounds): - raise Exception( - "Length of lower_bounds is not equal to the number of dimensions in the default shape" - ) - if len(shape) != len(upper_bounds): - raise Exception( - "Length of upper_bounds is not equal to the number of dimensions in the default shape" - ) - - feature.type.multiArrayType.ClearField("ShapeFlexibility") - - for i in range(len(lower_bounds)): - if shape[i] < lower_bounds[i]: - raise Exception( - "Default shape in %d-th dimension, which is %d, is smaller" - " than the lower bound of %d" % (i, int(shape[i]), lower_bounds[i]) - ) - if upper_bounds[i] != -1: - if shape[i] > upper_bounds[i]: - raise Exception( - "Default shape in %d-th dimension, which is %d, is greater" - " than the upper bound of %d" % (i, int(shape[i]), upper_bounds[i]) - ) - - s = feature.type.multiArrayType.shapeRange.sizeRanges.add() - s.lowerBound = lower_bounds[i] - s.upperBound = upper_bounds[i] + _set_multiarray_ndshape_range_for_feature(feature, lower_bounds, upper_bounds) # Bump up specification version spec.specificationVersion = max( _MINIMUM_NDARRAY_SPEC_VERSION, spec.specificationVersion ) - def add_multiarray_ndshape_enumeration(spec, feature_name, enumerated_shapes): """ Annotate an input or output MLMultiArray feature in a Neural Network spec @@ -687,50 +792,20 @@ def add_multiarray_ndshape_enumeration(spec, feature_name, enumerated_shapes): -------- .. sourcecode:: python - >>> import coremltools - >>> from coremltools.models.neural_network import flexible_shape_utils - >>> spec = coremltools.utils.load_spec('mymodel.mlmodel') - >>> # say, the default shape of "my_multiarray_featurename" is (2,3) - >>> flexible_shape_utils.add_multiarray_ndshape_enumeration(spec, feature_name='my_multiarray_featurename', enumerated_shapes=[(2,4), (2,6)]) + import coremltools + from coremltools.models.neural_network import flexible_shape_utils + + spec = coremltools.utils.load_spec("mymodel.mlmodel") + # say, the default shape of "my_multiarray_featurename" is (2,3) + flexible_shape_utils.add_multiarray_ndshape_enumeration( + spec, feature_name="my_multiarray_featurename", enumerated_shapes=[(2, 4), (2, 6)] + ) :return: None. 
The spec is updated """ - if not isinstance(enumerated_shapes, list): - raise Exception("enumerated_shapes must be a list") - if len(enumerated_shapes) == 0: - raise Exception("enumerated_shapes is empty") - feature = _get_feature(spec, feature_name) - if feature.type.WhichOneof("Type") != "multiArrayType": - raise Exception( - "Trying to update shape range for " "a non-multiArray feature type" - ) - - shape = feature.type.multiArrayType.shape - - if feature.type.multiArrayType.WhichOneof("ShapeFlexibility") != "enumeratedShapes": - feature.type.multiArrayType.ClearField("ShapeFlexibility") - - eshape_len = len(feature.type.multiArrayType.enumeratedShapes.shapes) - - shapes_added_so_far = [] - - # Add default array shape to list of enumerated shapes if enumerated shapes - # field is currently empty - if eshape_len == 0: - fixed_shape = feature.type.multiArrayType.shape - s = feature.type.multiArrayType.enumeratedShapes.shapes.add() - s.shape.extend(fixed_shape) - shapes_added_so_far.append(list(fixed_shape)) - - for shape in enumerated_shapes: - if not isinstance(shape, tuple): - raise Exception("An element in 'enumerated_shapes' is not a tuple") - if list(shape) not in shapes_added_so_far: - s = feature.type.multiArrayType.enumeratedShapes.shapes.add() - s.shape.extend(list(shape)) - shapes_added_so_far.append(list(shape)) + _add_multiarray_ndshape_enumeration_for_feature(feature, enumerated_shapes) # Bump up specification version spec.specificationVersion = max( diff --git a/coremltools/models/utils.py b/coremltools/models/utils.py index 36e4e269b..851598c16 100644 --- a/coremltools/models/utils.py +++ b/coremltools/models/utils.py @@ -7,24 +7,24 @@ Utilities for the entire package. """ -from collections.abc import Iterable as _Iterable -from functools import lru_cache as _lru_cache import math as _math import os as _os import shutil as _shutil import subprocess as _subprocess import sys as _sys import tempfile as _tempfile -from typing import Optional as _Optional, Union as _Union import warnings as _warnings +from collections.abc import Iterable as _Iterable +from functools import lru_cache as _lru_cache +from typing import Optional as _Optional +from typing import Union as _Union import numpy as _np import coremltools as _ct from coremltools import ComputeUnit as _ComputeUnit +from coremltools import proto as _proto from coremltools.converters.mil.mil.passes.defs.preprocess import NameSanitizer as _NameSanitizer -from coremltools.proto import Model_pb2 as _Model_pb2 -import coremltools.proto.MIL_pb2 as _mil_proto from .._deps import _HAS_SCIPY @@ -62,7 +62,7 @@ def _remove_invalid_keys(input_dict, model): def _create_mlpackage( - proto_spec: _Model_pb2, + proto_spec: _proto.Model_pb2, weights_dir: _Optional[str] = None, package_path: _Optional[str] = None, ) -> str: @@ -190,7 +190,7 @@ def save_spec(spec, filename, auto_set_specification_version=False, weights_dir= f.write(spec.SerializeToString()) -def load_spec(model_path: str) -> _Model_pb2: +def load_spec(model_path: str) -> _proto.Model_pb2: """ Load a protobuf model specification from file (mlmodel) or directory (mlpackage). 
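Note on the import consolidation in coremltools/models/utils.py (this hunk and the ones that follow): the direct Model_pb2 and MIL_pb2 imports are replaced by the aggregated coremltools.proto package, so specs are now built as _proto.Model_pb2.Model(). A minimal sketch of the resulting usage pattern, mirroring the calls made in this patch (illustrative only, not part of the change itself):

    from coremltools import proto

    # Build an empty model spec through the aggregated proto package.
    spec = proto.Model_pb2.Model()
    spec.specificationVersion = 1

    # Array-type constants used by convert_double_to_float_multiarray_type are
    # reached the same way.
    fp32 = proto.Model_pb2.ArrayFeatureType.FLOAT32

    # The MIL Value message patched by make_pipeline lives alongside it.
    mil_value = proto.MIL_pb2.Value()
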
@@ -221,7 +221,7 @@ def load_spec(model_path: str) -> _Model_pb2: else: specfile = model_path - spec = _Model_pb2.Model() + spec = _proto.Model_pb2.Model() with open(specfile, "rb") as f: spec.ParseFromString(f.read()) return spec @@ -304,7 +304,9 @@ def _wp_to_fp16wp(wp): def _convert_neural_network_spec_weights_to_fp16(fp_spec): from .neural_network.quantization_utils import ( - _QUANTIZATION_MODE_LINEAR_QUANTIZATION, _quantize_spec_weights) + _QUANTIZATION_MODE_LINEAR_QUANTIZATION, + _quantize_spec_weights, + ) qspec = _quantize_spec_weights(fp_spec, 16, _QUANTIZATION_MODE_LINEAR_QUANTIZATION) return qspec @@ -344,7 +346,6 @@ def _get_model(spec, compute_units=_ComputeUnit.ALL): else: return MLModel(spec, compute_units=compute_units) - def evaluate_regressor(model, data, target="target", verbose=False): """ Evaluate a CoreML regression model and compare against predictions @@ -982,13 +983,8 @@ def convert_double_to_float_multiarray_type(spec): def _convert_to_float(feature): if feature.type.HasField("multiArrayType"): - if ( - feature.type.multiArrayType.dataType - == _Model_pb2.ArrayFeatureType.DOUBLE - ): - feature.type.multiArrayType.dataType = ( - _Model_pb2.ArrayFeatureType.FLOAT32 - ) + if feature.type.multiArrayType.dataType == _proto.Model_pb2.ArrayFeatureType.DOUBLE: + feature.type.multiArrayType.dataType = _proto.Model_pb2.ArrayFeatureType.FLOAT32 for feature in spec.description.input: _convert_to_float(feature) @@ -1004,7 +1000,7 @@ def _convert_to_float(feature): convert_double_to_float_multiarray_type(model_spec) -def compile_model(model: _Model_pb2.Model, destination_path: _Optional[str]=None) -> str: +def compile_model(model: _proto.Model_pb2.Model, destination_path: _Optional[str] = None) -> str: """ Compiles a Core ML model spec. @@ -1036,13 +1032,13 @@ def compile_model(model: _Model_pb2.Model, destination_path: _Optional[str]=None spec.specificationVersion = 1 input_ = spec.description.input.add() - input_.name = 'x' + input_.name = "x" input_.type.doubleType.MergeFromString(b"") output_ = spec.description.output.add() - output_.name = 'y' + output_.name = "y" output_.type.doubleType.MergeFromString(b"") - spec.description.predictedFeatureName = 'y' + spec.description.predictedFeatureName = "y" lr = spec.glmRegressor lr.offset.append(0.1) @@ -1051,7 +1047,7 @@ def compile_model(model: _Model_pb2.Model, destination_path: _Optional[str]=None compiled_model_path = compile_model(spec) model = CompiledMLModel(compiled_model_path) - y = model.predict({'x': 2}) + y = model.predict({"x": 2}) See Also -------- @@ -1072,7 +1068,7 @@ def compile_model(model: _Model_pb2.Model, destination_path: _Optional[str]=None if isinstance(model, _ct.models.MLModel): raise TypeError("This model has already been compiled. Call \"get_compiled_model_path\"" " to get the compiled model.") - if not isinstance(model, _Model_pb2.Model): + if not isinstance(model, _proto.Model_pb2.Model): raise TypeError("Unrecognized input for \"model\" parameter. It should be a spec.") # Check file extension of destination_path parameter @@ -1128,20 +1124,20 @@ def make_pipeline( -------- .. 
sourcecode:: python - my_model1 = ct.models.MLModel('/tmp/m1.mlpackage') - my_model2 = ct.models.MLModel('/tmp/m2.mlmodel') - + my_model1 = ct.models.MLModel("/tmp/m1.mlpackage") + my_model2 = ct.models.MLModel("/tmp/m2.mlmodel") + my_pipeline_model = ct.utils.make_pipeline(my_model1, my_model2) - y = my_pipeline_model.predict({'x': 12}) + y = my_pipeline_model.predict({"x": 12}) - my_pipeline_model.save('/tmp/my_pipeline.mlpackage') - new_my_pipeline = ct.model.MLModel('/tmp/my_pipeline.mlpackage') + my_pipeline_model.save("/tmp/my_pipeline.mlpackage") + new_my_pipeline = ct.model.MLModel("/tmp/my_pipeline.mlpackage") """ def updateBlobFileName(proto_message, new_path): - if type(proto_message) == _mil_proto.Value: + if type(proto_message) == _proto.MIL_pb2.Value: # Value protobuf message. This is what might need to be updated. if proto_message.WhichOneof('value') == 'blobFileValue': assert proto_message.blobFileValue.fileName == "@model_path/weights/weight.bin" diff --git a/coremltools/optimize/coreml/_config.py b/coremltools/optimize/coreml/_config.py index d43f46d25..9835099f8 100644 --- a/coremltools/optimize/coreml/_config.py +++ b/coremltools/optimize/coreml/_config.py @@ -790,8 +790,8 @@ def _get_const_op_config(self, op: Operation): if not isinstance(op, Operation): raise TypeError(f"op must be type of Operation. Got {type(op)}") - if op.op_type != "const": - raise TypeError(f"op must be of type const. Got {op.op_type}") + if not (op.op_type == "const" or op.op_type.startswith("constexpr_")): + raise TypeError(f"op must be of type const or constexpr. Got {op.op_type}") if op.name in self.op_name_configs: return self.op_name_configs[op.name] diff --git a/coremltools/optimize/coreml/_post_training_quantization.py b/coremltools/optimize/coreml/_post_training_quantization.py index 8b86704c3..3e4d0ae03 100644 --- a/coremltools/optimize/coreml/_post_training_quantization.py +++ b/coremltools/optimize/coreml/_post_training_quantization.py @@ -16,6 +16,8 @@ from coremltools.converters.mil.mil.passes.defs.quantization import ( AbstractQuantizationPass as _AbstractQuantizationPass, ) +from coremltools.converters.mil.mil.passes.graph_pass import PassOption +from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY from coremltools.models import MLModel as _MLModel from coremltools.optimize.coreml import OptimizationConfig as _OptimizationConfig from coremltools.optimize.coreml._config import _MetaDataDict @@ -23,7 +25,6 @@ from ._quantization_passes import WeightDecompressor as _WeightDecompressor from ._quantization_passes import linear_quantize_weights as _linear_quantize_weights from ._quantization_passes import palettize_weights as _palettize_weights -from ._quantization_passes import prune_weights as _prune_weights def _convert_model_spec_to_pymil_prog( @@ -319,8 +320,8 @@ def prune_weights(mlmodel: _MLModel, config: _OptimizationConfig): compressed_model = cto.coreml.prune_weights(model, config) """ - - weight_pruner = _prune_weights(config, fake_compression=False) + weight_pruner = PASS_REGISTRY["compression::prune_weights"] + weight_pruner.set_options([PassOption("config", config)]) return _apply_graph_pass(mlmodel, weight_pruner) def decompress_weights(mlmodel: _MLModel): @@ -477,7 +478,7 @@ def _get_weight_metadata(op): def get_weights_meta_block(block): # get the candidates ops with the given op_type candidate_ops = [] - for op in list(block.operations): + for op in block.operations: for b in op.blocks: get_weights_meta_block(b) diff --git 
a/coremltools/optimize/coreml/_quantization_passes.py b/coremltools/optimize/coreml/_quantization_passes.py index 878fa3d68..0338c2e4c 100644 --- a/coremltools/optimize/coreml/_quantization_passes.py +++ b/coremltools/optimize/coreml/_quantization_passes.py @@ -8,6 +8,7 @@ import numpy as np from tqdm import tqdm +import coremltools.converters.mil.frontend._utils as frontend_utils from coremltools import _logger as logger from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.backend.mil.load import should_use_weight_file @@ -24,7 +25,10 @@ from coremltools.converters.mil.mil.passes.helper import block_context_manager from coremltools.converters.mil.mil.passes.pass_registry import register_pass from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin +from coremltools.converters.mil.mil.var import Var +from coremltools.models._deprecation import deprecated as _deprecated from coremltools.models.neural_network.quantization_utils import _get_kmeans_lookup_table_and_weight +from coremltools.optimize.coreml import _utils from coremltools.optimize.coreml._config import ( OpLinearQuantizerConfig, OpMagnitudePrunerConfig, @@ -117,36 +121,48 @@ def apply_block(block): apply_block(f) @property - def config(self): + def config(self) -> OptimizationConfig: return self._config @config.setter - def config(self, value): + def config(self, value: OptimizationConfig): self._check_config_type(value) self._config = value + if value._op_selector is not None: + self.op_selector = value._op_selector - @staticmethod - def need_compress_const(op: Operation, _is_deprecated: bool, weight_threshold: float): + def need_compress_const( + self, op: Operation, _is_deprecated: bool, weight_threshold: float + ) -> bool: """ The utility function is checking whether a const op can be compressed. If ``_is_deprecated = True``, the user is using the ``ct.compression_utils``, in which the ops are already filtered by ``op_selector``. For the new ``ct.optimize.coreml`` API, ``op_selector`` is no longer supported, so the ``weight_threshold`` is checked explicitly instead. """ - val = op.outputs[0].val + val = self._get_const_value(op) if _is_deprecated and weight_threshold != None: raise ValueError("weight_threshold cannot be set through the deprecated ct.compression_util API") if _is_deprecated: return should_use_weight_file(val) - # const fed into constexpr ops cannot be compressed - if any([child_op.op_type.startswith("constexpr") for child_op in op.outputs[0].child_ops]): + if not self._validate_child_constexpr_for_compress(op): return False if weight_threshold is None: raise ValueError("weight_threshold cannot be None") - return should_use_weight_file(val) and val.size > weight_threshold + return ( + should_use_weight_file(val) and self._get_weight_to_compress_size(op) > weight_threshold + ) + + def _validate_child_constexpr_for_compress(self, op: Operation) -> bool: + """Check if child constexpr ops support current op to be compressed.""" + for child_op in op.outputs[0].child_ops: + if child_op.op_type.startswith("constexpr_"): + # Const fed into constexpr_ ops cannot be further compressed. + return False + return True def _check_config_type(self, config: OptimizationConfig): """ @@ -169,28 +185,53 @@ def get_supported_types_as_str(supported_type): raise ValueError(f"{self.__class__.__name__} only accept {supported_type_str} type config. 
Got {config.__class__.__name__}.") @staticmethod - def pick_channnel_axis(op: Operation) -> int: + def select_input_output_channel_axis(op: Operation) -> Tuple[int, int]: """ - By default, output channel is used as the channel axis. Here are some representative ops: + Here are some representative ops: - linear: [D_out, D_in] - matmul's y: [..., D_in, D_out] if transpose_y is False, else [..., D_out, D_in] - conv: [C_out, C_in_div_group, KH, KW] - conv_transpose: [C_in, C_out_div_group, KH, KW] - So the channel axis picking criterial is: - - For conv_transpose it's 1 - - For matmul's y it's -1 (transpose_y=False) or -2 (transpose_y=True) - - For all other ops, it's 0 + The input output channel axis selection criteria is: + - For conv_transpose the output channel is 1 and input channel is 0. + - For matmul's y: + - When transpose_y=False, output channel is -1 and input channel is -2 + - When transpose_y=True, output channel is -2 and input channel is -1 + - For all other ops, output channel is 0 and input channel is 1. """ - channel_axis = 0 + output_channel_axis, input_channel_axis = 0, 1 var = op.outputs[0] if len(var.child_ops) == 1: child_op = var.child_ops[0] if child_op.op_type == "conv_transpose": - channel_axis = 1 + output_channel_axis = 1 + input_channel_axis = 0 if child_op.op_type == "matmul" and child_op.y == var: - channel_axis = -1 if child_op.transpose_y else -2 - return channel_axis + if child_op.transpose_y.val: + output_channel_axis = -2 + input_channel_axis = -1 + else: + output_channel_axis = -1 + input_channel_axis = -2 + if child_op.op_type.startswith("constexpr_"): + return AbstractCompressionPass.select_input_output_channel_axis(child_op) + return input_channel_axis, output_channel_axis + + def is_valid_op(self, op: Operation): + if op.op_type == "const" and should_use_weight_file(self._get_const_value(op)): + return True + return False + + def _get_const_value(self, op: Operation) -> np.ndarray: + if op.op_type != "const": + raise ValueError(f"The op {op} is not a const") + return op.outputs[0].val + + def _get_weight_to_compress_size(self, op: Operation) -> int: + if op.op_type != "const": + raise ValueError("Only const weight can be compressed") + return np.prod(op.outputs[0].shape) @register_pass(namespace="compression") @@ -210,11 +251,6 @@ class prune_weights(AbstractCompressionPass): """ _SUPPORTED_CONFIG_TYPE = (OpMagnitudePrunerConfig, OpThresholdPrunerConfig) - def is_valid_op(self, op: Operation): - if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): - return True - return False - @staticmethod def _pack_val_to_sparse_param(val): flattened_val = val.flatten() @@ -367,6 +403,16 @@ def decompress(params): raise ValueError("Invalid type of params") return constexpr_sparse_to_dense.decompress(params.nonzero_data, params.mask, params.shape) + @staticmethod + def _create_constexpr_var(op: Operation, sparse_params: SparseParams) -> Var: + return mb.constexpr_sparse_to_dense( + nonzero_data=sparse_params.nonzero_data, + mask=sparse_params.mask, + shape=np.uint32(sparse_params.shape), + before_op=op, + name=op.name + "_sparsified", + ) + def transform_op(self, op: Operation): op_config = self.config._get_const_op_config(op) if op_config is None: @@ -374,15 +420,16 @@ def transform_op(self, op: Operation): if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): return - if not isinstance(op.outputs[0].val, (np.ndarray, np.generic)): + const_val = self._get_const_value(op) + if not isinstance(const_val, 
(np.ndarray, np.generic)): raise ValueError("Only numpy arrays are supported") if isinstance(op_config, OpThresholdPrunerConfig): sparse_params = self.compress_by_threshold( - val=op.outputs[0].val, - threshold=op_config.threshold, - minimum_sparsity_percentile=op_config.minimum_sparsity_percentile - ) + val=const_val, + threshold=op_config.threshold, + minimum_sparsity_percentile=op_config.minimum_sparsity_percentile, + ) elif isinstance(op_config, OpMagnitudePrunerConfig): # Structural sparsity can only be applied to conv / linear weight # For non applicable constant, we skip the compression, @@ -395,29 +442,23 @@ def transform_op(self, op: Operation): if op_config.target_sparsity is not None: sparse_params = self.compress_by_magnitude( - val=op.outputs[0].val, - target_sparsity=op_config.target_sparsity, - block_size=op_config.block_size, - dim=op_config.dim, - ) + val=const_val, + target_sparsity=op_config.target_sparsity, + block_size=op_config.block_size, + dim=op_config.dim, + ) elif op_config.n_m_ratio is not None: sparse_params = self.compress_by_nm_sparsity( - val=op.outputs[0].val, - n_m_ratio=op_config.n_m_ratio, - dim=op_config.dim, - ) + val=const_val, + n_m_ratio=op_config.n_m_ratio, + dim=op_config.dim, + ) if sparse_params is None: return if not self.fake_compression: - new_var = mb.constexpr_sparse_to_dense( - nonzero_data=sparse_params.nonzero_data, - mask=sparse_params.mask, - shape=np.uint32(sparse_params.shape), - before_op=op, - name=op.name + "_sparsified", - ) + new_var = self._create_constexpr_var(op, sparse_params) else: decompressed_val = self.decompress(sparse_params) new_var = mb.const( @@ -453,19 +494,22 @@ class palettize_weights(AbstractCompressionPass): _SUPPORTED_CONFIG_TYPE = OpPalettizerConfig _SUPPORTED_NBITS = (1, 2, 4, 6, 8) - def is_valid_op(self, op: Operation): - if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): - return True - return False - @staticmethod def _get_nbits_for_unique_mode(val: np.ndarray, allowed_nbits: Tuple[int, ...]) -> int: + """ + Try each nbit in allowed_nbits to find one that can represent number of unique values in val. + + Note that the values in `allowed_nbits` need to be in ascending order. + """ val = val.flatten() unique_vals = np.unique(val).tolist() for nbits in allowed_nbits: if len(unique_vals) <= 1 << nbits: return nbits - raise ValueError("Unique values in weight cannot be represented by 8 bits palettization.") + raise ValueError( + f"Unique values in weight cannot be represented by {allowed_nbits[-1]} " + "bits palettization." 
+ ) @staticmethod def _get_lut_and_indices( @@ -572,6 +616,16 @@ def decompress(params): raise ValueError("Invalid type of params") return constexpr_lut_to_dense.decompress(params.lut, params.indices, params.shape) + @staticmethod + def _create_constexpr_var(op: Operation, lut_params: LutParams) -> Var: + return mb.constexpr_lut_to_dense( + indices=lut_params.indices, + lut=lut_params.lut, + shape=np.uint32(lut_params.shape), + before_op=op, + name=op.name + "_palettized", + ) + def transform_op(self, op: Operation): op_config = self.config._get_const_op_config(op) if op_config is None: @@ -596,13 +650,7 @@ def transform_op(self, op: Operation): ) if not self.fake_compression: - new_var = mb.constexpr_lut_to_dense( - indices=lut_params.indices, - lut=lut_params.lut, - shape=np.uint32(lut_params.shape), - before_op=op, - name=op.name + "_palettized", - ) + new_var = palettize_weights._create_constexpr_var(op, lut_params) else: decompressed_val = self.decompress(lut_params) new_var = mb.const( @@ -620,6 +668,7 @@ def transform_op(self, op: Operation): op.enclosing_block.remove_ops([op]) + @register_pass(namespace="compression") class linear_quantize_weights(AbstractCompressionPass): """ @@ -642,16 +691,16 @@ class linear_quantize_weights(AbstractCompressionPass): (types.uint8, "LINEAR_SYMMETRIC"): (0, 254), } - def is_valid_op(self, op: Operation): - if op.op_type == "const" and should_use_weight_file(op.outputs[0].val): - return True - return False - @classmethod + @_deprecated( + suffix="Please use _utils.quantize_weight", + version="8.0", + obj_prefix="coremltools.optimize.coreml._quantization_passes.", + ) def _get_quantized_data( cls, original_data: np.ndarray, axes: Tuple[int, ...], mode: str, dtype: type ) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]: - """Get quantized data along with metadata (scale, zero_point).""" + """[Deprecated] Get quantized data along with metadata (scale, zero_point).""" if not np.issubdtype(original_data.dtype, np.floating): raise ValueError("Only floating numpy arrays are supported.") @@ -696,8 +745,21 @@ def _get_quantized_data( def compress(cls, val: np.ndarray, axis: int, mode: str, dtype: type) -> AffineQuantParams: if not isinstance(val, (np.ndarray, np.generic)): raise ValueError("Only numpy arrays are supported") + if isinstance(dtype, np.dtype): + dtype = types.numpy_type_to_builtin_type(dtype) + if not types.is_builtin(dtype): + raise ValueError(f"The input dtype is should be a built-in type, but got {type(dtype)}") + axes = tuple([i for i in range(len(val.shape)) if i != axis]) - quantized_data, scale, zero_point = cls._get_quantized_data(val, axes, mode, dtype) + quantized_data, scale, zero_point = _utils.quantize_weight( + val, + axes, + nbits=dtype.get_bitwidth(), + signed=not dtype.is_unsigned(), + quantization_mode=mode, + dtype=types.nptype_from_builtin(dtype), + ) + if zero_point is None: # The iOS16 constexpr_affine_dequantize op requires zero_point. 
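        # With the quantize_weight helper added in
        # coremltools/optimize/coreml/_utils.py later in this patch, zero_point
        # comes back as None only for signed LINEAR_SYMMETRIC quantization;
        # unsigned symmetric mode returns a mid-range zero point (q_val_max // 2)
        # and LINEAR mode always computes one, so this zeros fallback only fires
        # for the signed symmetric case.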
zero_point = np.zeros_like(scale).astype(quantized_data.dtype) @@ -718,18 +780,19 @@ def transform_op(self, op: Operation): if not self.need_compress_const(op, self.config._is_deprecated, op_config.weight_threshold): return + output_channel = self.select_input_output_channel_axis(op)[1] quant_params = self.compress( - op.outputs[0].val, self.pick_channnel_axis(op), op_config.mode, op_config.dtype + op.outputs[0].val, output_channel, op_config.mode, op_config.dtype ) if not self.fake_compression: - new_var = mb.constexpr_affine_dequantize( - quantized_data=quant_params.quantized_data, - zero_point=quant_params.zero_point, - scale=quant_params.scale, - axis=quant_params.axis, - before_op=op, + new_var = frontend_utils._construct_constexpr_affine_op( + quant_params.quantized_data, + quant_params.zero_point, + quant_params.scale, + quant_params.axis, name=op.name + "_affine_quantized", + before_op=op, ) else: decompressed_val = self.decompress(quant_params) @@ -752,11 +815,7 @@ def transform_op(self, op: Operation): class WeightDecompressor(AbstractQuantizationPass): """ This graph pass transforms the ``constexpr`` op back into ``mb.const`` op. - The ``constexpr`` op includes: - - - ``constexpr_affine_dequantize`` - - ``constexpr_lut_to_dense`` - - ``constexpr_sparse_to_dense`` + The ``constexpr`` op has op_type starts with the "constexpr_" prefix. """ def __init__(self, op_selector): @@ -767,18 +826,24 @@ def is_valid_op(self, op): def transform_op(self, op): decompressed_val = op.materialized_val_inference() - new_var = mb.const( - val=decompressed_val, - before_op=op, - name=op.name, - ) - op.enclosing_block.replace_uses_of_var_after_op( - anchor_op=op, - old_var=op.outputs[0], - new_var=new_var, - no_check_var_types=True, - force_replace=True, - ) + if not isinstance(decompressed_val, (list, tuple)): + decompressed_val = [decompressed_val] + + if len(decompressed_val) != len(op.outputs): + raise ValueError( + "The number of decompressed value should match the number of op outputs. " + f"But got {len(decompressed_val)} vs {len(op.outputs)}" + ) + + for decomp_val, output_var in zip(decompressed_val, op.outputs): + new_const = mb.const(val=decomp_val, before_op=op, name=op.name) + op.enclosing_block.replace_uses_of_var_after_op( + anchor_op=op, + old_var=output_var, + new_var=new_const, + no_check_var_types=True, + force_replace=True, + ) op.enclosing_block.remove_ops([op]) diff --git a/coremltools/optimize/coreml/_utils.py b/coremltools/optimize/coreml/_utils.py new file mode 100644 index 000000000..75a953419 --- /dev/null +++ b/coremltools/optimize/coreml/_utils.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Apple Inc. All rights reserved. 
+# +# Use of this source code is governed by a BSD-3-clause license that can be +# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import numpy as np +from typing import Tuple, Optional + + +def get_quant_range(n_bits: int, signed: bool, mode: str) -> Tuple[int, int]: + """ + Utility to get the quantization range for a given quantization config + Adapted from phoenix/quatization/_utils.py + """ + max_q = 2**n_bits + if not signed: + quant_min = 0 + quant_max = max_q - 1 + if mode == "LINEAR_SYMMETRIC": + quant_max -= 1 + else: + quant_min = -max_q / 2 + quant_max = max_q / 2 - 1 + if mode == "LINEAR_SYMMETRIC": + quant_min += 1 + return int(quant_min), int(quant_max) + + +def quantize_weight( + weight: np.ndarray, + axes: Tuple[int, ...], + nbits: int, + signed: bool, + quantization_mode: str, + dtype: np.dtype, +) -> Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]]: + """Get quantized data along with metadata (scale, zero_point).""" + if not np.issubdtype(weight.dtype, np.floating): + raise ValueError("Only floating numpy arrays are supported.") + + val_min = np.amin(weight, axis=axes, keepdims=True) + val_max = np.amax(weight, axis=axes, keepdims=True) + + q_val_min, q_val_max = get_quant_range(nbits, signed, quantization_mode) + + zero_point = None + if quantization_mode == "LINEAR_SYMMETRIC": + # For the linear_symmetric quantization_mode, the range is symmetrical to 0 + max_abs = np.maximum(np.abs(val_min), np.abs(val_max)) + val_min = -max_abs + val_max = max_abs + + if not signed: + zero_point_shift = q_val_max // 2 + zero_point = zero_point_shift * np.ones(val_min.shape) + else: + assert quantization_mode == "LINEAR" + # For the linear quantization_mode, we need to make sure the data range contains `0` + val_min = np.minimum(0.0, val_min) + val_max = np.maximum(0.0, val_max) + zero_point = (q_val_min * val_max - q_val_max * val_min) / (val_max - val_min) + zero_point = np.round(zero_point) + zero_point = np.clip(zero_point, q_val_min, q_val_max) + + scale = (val_max - val_min) / (q_val_max - q_val_min) + quantized_data = np.round(weight / scale) + if zero_point is not None: + quantized_data += zero_point + zero_point = zero_point.squeeze().astype(dtype) + quantized_data = np.clip(quantized_data, q_val_min, q_val_max).astype(dtype) + scale = scale.astype(weight.dtype).squeeze() + + return quantized_data, scale, zero_point diff --git a/coremltools/proto/MIL_pb2.py b/coremltools/proto/MIL_pb2.py index 0e9bf64f9..b1be30e92 100644 --- a/coremltools/proto/MIL_pb2.py +++ b/coremltools/proto/MIL_pb2.py @@ -20,7 +20,7 @@ name='MIL.proto', package='CoreML.Specification.MILSpec', syntax='proto3', - serialized_pb=_b('\n\tMIL.proto\x12\x1c\x43oreML.Specification.MILSpec\"\xf3\x02\n\x07Program\x12\x0f\n\x07version\x18\x01 \x01(\x03\x12G\n\tfunctions\x18\x02 \x03(\x0b\x32\x34.CoreML.Specification.MILSpec.Program.FunctionsEntry\x12\x11\n\tdocString\x18\x03 \x01(\t\x12I\n\nattributes\x18\x04 \x03(\x0b\x32\x35.CoreML.Specification.MILSpec.Program.AttributesEntry\x1aX\n\x0e\x46unctionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x35\n\x05value\x18\x02 \x01(\x0b\x32&.CoreML.Specification.MILSpec.Function:\x02\x38\x01\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"\xbe\x03\n\x08\x46unction\x12<\n\x06inputs\x18\x01 \x03(\x0b\x32,.CoreML.Specification.MILSpec.NamedValueType\x12\r\n\x05opset\x18\x02 \x01(\t\x12_\n\x15\x62lock_specializations\x18\x03 
\x03(\x0b\x32@.CoreML.Specification.MILSpec.Function.BlockSpecializationsEntry\x12J\n\nattributes\x18\x04 \x03(\x0b\x32\x36.CoreML.Specification.MILSpec.Function.AttributesEntry\x1a`\n\x19\x42lockSpecializationsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Block:\x02\x38\x01\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"\xb4\x02\n\x05\x42lock\x12<\n\x06inputs\x18\x01 \x03(\x0b\x32,.CoreML.Specification.MILSpec.NamedValueType\x12\x0f\n\x07outputs\x18\x02 \x03(\t\x12;\n\noperations\x18\x03 \x03(\x0b\x32\'.CoreML.Specification.MILSpec.Operation\x12G\n\nattributes\x18\x04 \x03(\x0b\x32\x33.CoreML.Specification.MILSpec.Block.AttributesEntry\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"\xa9\x01\n\x08\x41rgument\x12\x41\n\targuments\x18\x01 \x03(\x0b\x32..CoreML.Specification.MILSpec.Argument.Binding\x1aZ\n\x07\x42inding\x12\x0e\n\x04name\x18\x01 \x01(\tH\x00\x12\x34\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.ValueH\x00\x42\t\n\x07\x62inding\"\xce\x03\n\tOperation\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x43\n\x06inputs\x18\x02 \x03(\x0b\x32\x33.CoreML.Specification.MILSpec.Operation.InputsEntry\x12=\n\x07outputs\x18\x03 \x03(\x0b\x32,.CoreML.Specification.MILSpec.NamedValueType\x12\x33\n\x06\x62locks\x18\x04 \x03(\x0b\x32#.CoreML.Specification.MILSpec.Block\x12K\n\nattributes\x18\x05 \x03(\x0b\x32\x37.CoreML.Specification.MILSpec.Operation.AttributesEntry\x1aU\n\x0bInputsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x35\n\x05value\x18\x02 \x01(\x0b\x32&.CoreML.Specification.MILSpec.Argument:\x02\x38\x01\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"U\n\x0eNamedValueType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x35\n\x04type\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\"\x95\x02\n\tValueType\x12>\n\ntensorType\x18\x01 \x01(\x0b\x32(.CoreML.Specification.MILSpec.TensorTypeH\x00\x12:\n\x08listType\x18\x02 \x01(\x0b\x32&.CoreML.Specification.MILSpec.ListTypeH\x00\x12<\n\ttupleType\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.TupleTypeH\x00\x12\x46\n\x0e\x64ictionaryType\x18\x04 \x01(\x0b\x32,.CoreML.Specification.MILSpec.DictionaryTypeH\x00\x42\x06\n\x04type\"\xb7\x02\n\nTensorType\x12\x38\n\x08\x64\x61taType\x18\x01 \x01(\x0e\x32&.CoreML.Specification.MILSpec.DataType\x12\x0c\n\x04rank\x18\x02 \x01(\x03\x12;\n\ndimensions\x18\x03 \x03(\x0b\x32\'.CoreML.Specification.MILSpec.Dimension\x12L\n\nattributes\x18\x04 \x03(\x0b\x32\x38.CoreML.Specification.MILSpec.TensorType.AttributesEntry\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"C\n\tTupleType\x12\x36\n\x05types\x18\x01 \x03(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\"z\n\x08ListType\x12\x35\n\x04type\x18\x01 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\x12\x37\n\x06length\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.Dimension\"\x86\x01\n\x0e\x44ictionaryType\x12\x38\n\x07keyType\x18\x01 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\x12:\n\tvalueType\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\"\xfd\x01\n\tDimension\x12M\n\x08\x63onstant\x18\x01 
\x01(\x0b\x32\x39.CoreML.Specification.MILSpec.Dimension.ConstantDimensionH\x00\x12K\n\x07unknown\x18\x02 \x01(\x0b\x32\x38.CoreML.Specification.MILSpec.Dimension.UnknownDimensionH\x00\x1a!\n\x11\x43onstantDimension\x12\x0c\n\x04size\x18\x01 \x01(\x04\x1a$\n\x10UnknownDimension\x12\x10\n\x08variadic\x18\x01 \x01(\x08\x42\x0b\n\tdimension\"\xb9\x04\n\x05Value\x12\x11\n\tdocString\x18\x01 \x01(\t\x12\x35\n\x04type\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\x12L\n\x0eimmediateValue\x18\x03 \x01(\x0b\x32\x32.CoreML.Specification.MILSpec.Value.ImmediateValueH\x00\x12J\n\rblobFileValue\x18\x05 \x01(\x0b\x32\x31.CoreML.Specification.MILSpec.Value.BlobFileValueH\x00\x1a\x8f\x02\n\x0eImmediateValue\x12;\n\x06tensor\x18\x01 \x01(\x0b\x32).CoreML.Specification.MILSpec.TensorValueH\x00\x12\x39\n\x05tuple\x18\x02 \x01(\x0b\x32(.CoreML.Specification.MILSpec.TupleValueH\x00\x12\x37\n\x04list\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ListValueH\x00\x12\x43\n\ndictionary\x18\x04 \x01(\x0b\x32-.CoreML.Specification.MILSpec.DictionaryValueH\x00\x42\x07\n\x05value\x1a\x31\n\rBlobFileValue\x12\x10\n\x08\x66ileName\x18\x01 \x01(\t\x12\x0e\n\x06offset\x18\x02 \x01(\x04\x42\x07\n\x05value\"\xac\x06\n\x0bTensorValue\x12J\n\x06\x66loats\x18\x01 \x01(\x0b\x32\x38.CoreML.Specification.MILSpec.TensorValue.RepeatedFloatsH\x00\x12\x46\n\x04ints\x18\x02 \x01(\x0b\x32\x36.CoreML.Specification.MILSpec.TensorValue.RepeatedIntsH\x00\x12H\n\x05\x62ools\x18\x03 \x01(\x0b\x32\x37.CoreML.Specification.MILSpec.TensorValue.RepeatedBoolsH\x00\x12L\n\x07strings\x18\x04 \x01(\x0b\x32\x39.CoreML.Specification.MILSpec.TensorValue.RepeatedStringsH\x00\x12N\n\x08longInts\x18\x05 \x01(\x0b\x32:.CoreML.Specification.MILSpec.TensorValue.RepeatedLongIntsH\x00\x12L\n\x07\x64oubles\x18\x06 \x01(\x0b\x32\x39.CoreML.Specification.MILSpec.TensorValue.RepeatedDoublesH\x00\x12H\n\x05\x62ytes\x18\x07 \x01(\x0b\x32\x37.CoreML.Specification.MILSpec.TensorValue.RepeatedBytesH\x00\x1a$\n\x0eRepeatedFloats\x12\x12\n\x06values\x18\x01 \x03(\x02\x42\x02\x10\x01\x1a%\n\x0fRepeatedDoubles\x12\x12\n\x06values\x18\x01 \x03(\x01\x42\x02\x10\x01\x1a\"\n\x0cRepeatedInts\x12\x12\n\x06values\x18\x01 \x03(\x05\x42\x02\x10\x01\x1a&\n\x10RepeatedLongInts\x12\x12\n\x06values\x18\x01 \x03(\x03\x42\x02\x10\x01\x1a#\n\rRepeatedBools\x12\x12\n\x06values\x18\x01 \x03(\x08\x42\x02\x10\x01\x1a!\n\x0fRepeatedStrings\x12\x0e\n\x06values\x18\x01 \x03(\t\x1a\x1f\n\rRepeatedBytes\x12\x0e\n\x06values\x18\x01 \x01(\x0c\x42\x07\n\x05value\"A\n\nTupleValue\x12\x33\n\x06values\x18\x01 \x03(\x0b\x32#.CoreML.Specification.MILSpec.Value\"@\n\tListValue\x12\x33\n\x06values\x18\x01 \x03(\x0b\x32#.CoreML.Specification.MILSpec.Value\"\xd3\x01\n\x0f\x44ictionaryValue\x12J\n\x06values\x18\x01 \x03(\x0b\x32:.CoreML.Specification.MILSpec.DictionaryValue.KeyValuePair\x1at\n\x0cKeyValuePair\x12\x30\n\x03key\x18\x01 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value*\xb2\x01\n\x08\x44\x61taType\x12\x0f\n\x0bUNUSED_TYPE\x10\x00\x12\x08\n\x04\x42OOL\x10\x01\x12\n\n\x06STRING\x10\x02\x12\x0b\n\x07\x46LOAT16\x10\n\x12\x0b\n\x07\x46LOAT32\x10\x0b\x12\x0b\n\x07\x46LOAT64\x10\x0c\x12\x08\n\x04INT8\x10\x15\x12\t\n\x05INT16\x10\x16\x12\t\n\x05INT32\x10\x17\x12\t\n\x05INT64\x10\x18\x12\t\n\x05UINT8\x10\x1f\x12\n\n\x06UINT16\x10 \x12\n\n\x06UINT32\x10!\x12\n\n\x06UINT64\x10\"B\x02H\x03\x62\x06proto3') + 
serialized_pb=_b('\n\tMIL.proto\x12\x1c\x43oreML.Specification.MILSpec\"\xf3\x02\n\x07Program\x12\x0f\n\x07version\x18\x01 \x01(\x03\x12G\n\tfunctions\x18\x02 \x03(\x0b\x32\x34.CoreML.Specification.MILSpec.Program.FunctionsEntry\x12\x11\n\tdocString\x18\x03 \x01(\t\x12I\n\nattributes\x18\x04 \x03(\x0b\x32\x35.CoreML.Specification.MILSpec.Program.AttributesEntry\x1aX\n\x0e\x46unctionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x35\n\x05value\x18\x02 \x01(\x0b\x32&.CoreML.Specification.MILSpec.Function:\x02\x38\x01\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"\xbe\x03\n\x08\x46unction\x12<\n\x06inputs\x18\x01 \x03(\x0b\x32,.CoreML.Specification.MILSpec.NamedValueType\x12\r\n\x05opset\x18\x02 \x01(\t\x12_\n\x15\x62lock_specializations\x18\x03 \x03(\x0b\x32@.CoreML.Specification.MILSpec.Function.BlockSpecializationsEntry\x12J\n\nattributes\x18\x04 \x03(\x0b\x32\x36.CoreML.Specification.MILSpec.Function.AttributesEntry\x1a`\n\x19\x42lockSpecializationsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Block:\x02\x38\x01\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"\xb4\x02\n\x05\x42lock\x12<\n\x06inputs\x18\x01 \x03(\x0b\x32,.CoreML.Specification.MILSpec.NamedValueType\x12\x0f\n\x07outputs\x18\x02 \x03(\t\x12;\n\noperations\x18\x03 \x03(\x0b\x32\'.CoreML.Specification.MILSpec.Operation\x12G\n\nattributes\x18\x04 \x03(\x0b\x32\x33.CoreML.Specification.MILSpec.Block.AttributesEntry\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"\xa9\x01\n\x08\x41rgument\x12\x41\n\targuments\x18\x01 \x03(\x0b\x32..CoreML.Specification.MILSpec.Argument.Binding\x1aZ\n\x07\x42inding\x12\x0e\n\x04name\x18\x01 \x01(\tH\x00\x12\x34\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.ValueH\x00\x42\t\n\x07\x62inding\"\xce\x03\n\tOperation\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x43\n\x06inputs\x18\x02 \x03(\x0b\x32\x33.CoreML.Specification.MILSpec.Operation.InputsEntry\x12=\n\x07outputs\x18\x03 \x03(\x0b\x32,.CoreML.Specification.MILSpec.NamedValueType\x12\x33\n\x06\x62locks\x18\x04 \x03(\x0b\x32#.CoreML.Specification.MILSpec.Block\x12K\n\nattributes\x18\x05 \x03(\x0b\x32\x37.CoreML.Specification.MILSpec.Operation.AttributesEntry\x1aU\n\x0bInputsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x35\n\x05value\x18\x02 \x01(\x0b\x32&.CoreML.Specification.MILSpec.Argument:\x02\x38\x01\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"U\n\x0eNamedValueType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x35\n\x04type\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\"\x95\x02\n\tValueType\x12>\n\ntensorType\x18\x01 \x01(\x0b\x32(.CoreML.Specification.MILSpec.TensorTypeH\x00\x12:\n\x08listType\x18\x02 \x01(\x0b\x32&.CoreML.Specification.MILSpec.ListTypeH\x00\x12<\n\ttupleType\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.TupleTypeH\x00\x12\x46\n\x0e\x64ictionaryType\x18\x04 \x01(\x0b\x32,.CoreML.Specification.MILSpec.DictionaryTypeH\x00\x42\x06\n\x04type\"\xb7\x02\n\nTensorType\x12\x38\n\x08\x64\x61taType\x18\x01 \x01(\x0e\x32&.CoreML.Specification.MILSpec.DataType\x12\x0c\n\x04rank\x18\x02 \x01(\x03\x12;\n\ndimensions\x18\x03 
\x03(\x0b\x32\'.CoreML.Specification.MILSpec.Dimension\x12L\n\nattributes\x18\x04 \x03(\x0b\x32\x38.CoreML.Specification.MILSpec.TensorType.AttributesEntry\x1aV\n\x0f\x41ttributesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value:\x02\x38\x01\"C\n\tTupleType\x12\x36\n\x05types\x18\x01 \x03(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\"z\n\x08ListType\x12\x35\n\x04type\x18\x01 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\x12\x37\n\x06length\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.Dimension\"\x86\x01\n\x0e\x44ictionaryType\x12\x38\n\x07keyType\x18\x01 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\x12:\n\tvalueType\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\"\xfd\x01\n\tDimension\x12M\n\x08\x63onstant\x18\x01 \x01(\x0b\x32\x39.CoreML.Specification.MILSpec.Dimension.ConstantDimensionH\x00\x12K\n\x07unknown\x18\x02 \x01(\x0b\x32\x38.CoreML.Specification.MILSpec.Dimension.UnknownDimensionH\x00\x1a!\n\x11\x43onstantDimension\x12\x0c\n\x04size\x18\x01 \x01(\x04\x1a$\n\x10UnknownDimension\x12\x10\n\x08variadic\x18\x01 \x01(\x08\x42\x0b\n\tdimension\"\xb9\x04\n\x05Value\x12\x11\n\tdocString\x18\x01 \x01(\t\x12\x35\n\x04type\x18\x02 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ValueType\x12L\n\x0eimmediateValue\x18\x03 \x01(\x0b\x32\x32.CoreML.Specification.MILSpec.Value.ImmediateValueH\x00\x12J\n\rblobFileValue\x18\x05 \x01(\x0b\x32\x31.CoreML.Specification.MILSpec.Value.BlobFileValueH\x00\x1a\x8f\x02\n\x0eImmediateValue\x12;\n\x06tensor\x18\x01 \x01(\x0b\x32).CoreML.Specification.MILSpec.TensorValueH\x00\x12\x39\n\x05tuple\x18\x02 \x01(\x0b\x32(.CoreML.Specification.MILSpec.TupleValueH\x00\x12\x37\n\x04list\x18\x03 \x01(\x0b\x32\'.CoreML.Specification.MILSpec.ListValueH\x00\x12\x43\n\ndictionary\x18\x04 \x01(\x0b\x32-.CoreML.Specification.MILSpec.DictionaryValueH\x00\x42\x07\n\x05value\x1a\x31\n\rBlobFileValue\x12\x10\n\x08\x66ileName\x18\x01 \x01(\t\x12\x0e\n\x06offset\x18\x02 \x01(\x04\x42\x07\n\x05value\"\xac\x06\n\x0bTensorValue\x12J\n\x06\x66loats\x18\x01 \x01(\x0b\x32\x38.CoreML.Specification.MILSpec.TensorValue.RepeatedFloatsH\x00\x12\x46\n\x04ints\x18\x02 \x01(\x0b\x32\x36.CoreML.Specification.MILSpec.TensorValue.RepeatedIntsH\x00\x12H\n\x05\x62ools\x18\x03 \x01(\x0b\x32\x37.CoreML.Specification.MILSpec.TensorValue.RepeatedBoolsH\x00\x12L\n\x07strings\x18\x04 \x01(\x0b\x32\x39.CoreML.Specification.MILSpec.TensorValue.RepeatedStringsH\x00\x12N\n\x08longInts\x18\x05 \x01(\x0b\x32:.CoreML.Specification.MILSpec.TensorValue.RepeatedLongIntsH\x00\x12L\n\x07\x64oubles\x18\x06 \x01(\x0b\x32\x39.CoreML.Specification.MILSpec.TensorValue.RepeatedDoublesH\x00\x12H\n\x05\x62ytes\x18\x07 \x01(\x0b\x32\x37.CoreML.Specification.MILSpec.TensorValue.RepeatedBytesH\x00\x1a$\n\x0eRepeatedFloats\x12\x12\n\x06values\x18\x01 \x03(\x02\x42\x02\x10\x01\x1a%\n\x0fRepeatedDoubles\x12\x12\n\x06values\x18\x01 \x03(\x01\x42\x02\x10\x01\x1a\"\n\x0cRepeatedInts\x12\x12\n\x06values\x18\x01 \x03(\x05\x42\x02\x10\x01\x1a&\n\x10RepeatedLongInts\x12\x12\n\x06values\x18\x01 \x03(\x03\x42\x02\x10\x01\x1a#\n\rRepeatedBools\x12\x12\n\x06values\x18\x01 \x03(\x08\x42\x02\x10\x01\x1a!\n\x0fRepeatedStrings\x12\x0e\n\x06values\x18\x01 \x03(\t\x1a\x1f\n\rRepeatedBytes\x12\x0e\n\x06values\x18\x01 \x01(\x0c\x42\x07\n\x05value\"A\n\nTupleValue\x12\x33\n\x06values\x18\x01 \x03(\x0b\x32#.CoreML.Specification.MILSpec.Value\"@\n\tListValue\x12\x33\n\x06values\x18\x01 
\x03(\x0b\x32#.CoreML.Specification.MILSpec.Value\"\xd3\x01\n\x0f\x44ictionaryValue\x12J\n\x06values\x18\x01 \x03(\x0b\x32:.CoreML.Specification.MILSpec.DictionaryValue.KeyValuePair\x1at\n\x0cKeyValuePair\x12\x30\n\x03key\x18\x01 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.CoreML.Specification.MILSpec.Value*\xc0\x01\n\x08\x44\x61taType\x12\x0f\n\x0bUNUSED_TYPE\x10\x00\x12\x08\n\x04\x42OOL\x10\x01\x12\n\n\x06STRING\x10\x02\x12\x0b\n\x07\x46LOAT16\x10\n\x12\x0b\n\x07\x46LOAT32\x10\x0b\x12\x0b\n\x07\x46LOAT64\x10\x0c\x12\x0c\n\x08\x42\x46LOAT16\x10\r\x12\x08\n\x04INT8\x10\x15\x12\t\n\x05INT16\x10\x16\x12\t\n\x05INT32\x10\x17\x12\t\n\x05INT64\x10\x18\x12\t\n\x05UINT8\x10\x1f\x12\n\n\x06UINT16\x10 \x12\n\n\x06UINT32\x10!\x12\n\n\x06UINT64\x10\"B\x02H\x03\x62\x06proto3') ) _DATATYPE = _descriptor.EnumDescriptor( @@ -54,42 +54,46 @@ options=None, type=None), _descriptor.EnumValueDescriptor( - name='INT8', index=6, number=21, + name='BFLOAT16', index=6, number=13, options=None, type=None), _descriptor.EnumValueDescriptor( - name='INT16', index=7, number=22, + name='INT8', index=7, number=21, options=None, type=None), _descriptor.EnumValueDescriptor( - name='INT32', index=8, number=23, + name='INT16', index=8, number=22, options=None, type=None), _descriptor.EnumValueDescriptor( - name='INT64', index=9, number=24, + name='INT32', index=9, number=23, options=None, type=None), _descriptor.EnumValueDescriptor( - name='UINT8', index=10, number=31, + name='INT64', index=10, number=24, options=None, type=None), _descriptor.EnumValueDescriptor( - name='UINT16', index=11, number=32, + name='UINT8', index=11, number=31, options=None, type=None), _descriptor.EnumValueDescriptor( - name='UINT32', index=12, number=33, + name='UINT16', index=12, number=32, options=None, type=None), _descriptor.EnumValueDescriptor( - name='UINT64', index=13, number=34, + name='UINT32', index=13, number=33, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='UINT64', index=14, number=34, options=None, type=None), ], containing_type=None, options=None, serialized_start=4816, - serialized_end=4994, + serialized_end=5008, ) _sym_db.RegisterEnumDescriptor(_DATATYPE) @@ -100,6 +104,7 @@ FLOAT16 = 10 FLOAT32 = 11 FLOAT64 = 12 +BFLOAT16 = 13 INT8 = 21 INT16 = 22 INT32 = 23 diff --git a/coremltools/test/api/test_api_examples.py b/coremltools/test/api/test_api_examples.py index 758bd049e..1c54b3853 100644 --- a/coremltools/test/api/test_api_examples.py +++ b/coremltools/test/api/test_api_examples.py @@ -6,6 +6,7 @@ import copy import os import tempfile +from collections import Counter import numpy as np import pytest @@ -13,7 +14,8 @@ import coremltools as ct from coremltools._deps import _HAS_TORCH from coremltools.converters.mil import Builder as mb -from coremltools.converters.mil.mil import Function, Program, get_new_symbol +from coremltools.converters.mil import mil +from coremltools.converters.mil.mil import Function, get_new_symbol from coremltools.converters.mil.testing_utils import get_op_types_in_program if _HAS_TORCH: @@ -57,7 +59,7 @@ def test_unsanitized_input_name_during_prediction(convert_to): ''' input name : "x/0" becomes "x_0" due to name sanitization applied during conversion ''' - prog = Program() + prog = mil.Program() func_inputs = {"x/0": mb.placeholder(shape=[2, 3]), "y": mb.placeholder(shape=[2, 3])} with Function(func_inputs) as ssa_fun: @@ -79,7 +81,7 @@ def test_unsanitized_input_name_during_prediction(convert_to): @staticmethod def 
_test_variant_input_type_prediction(to_tensor, convert_to): - prog = Program() + prog = mil.Program() func_inputs = {"x": mb.placeholder(shape=[2, 3]), "y": mb.placeholder(shape=[2, 3])} with Function(func_inputs) as ssa_fun: @@ -167,7 +169,7 @@ def prog(x): @staticmethod @pytest.mark.skipif(not ct.utils._is_macos(), reason="Platform is not Mac OS") def test_deepcopy_error_with_symbols_in_prog(): - prog = Program() + prog = mil.Program() func_inputs = {"x": mb.placeholder(shape=[get_new_symbol(), 3]), "y": mb.placeholder(shape=[2, 3])} with Function(func_inputs) as ssa_fun: @@ -390,12 +392,21 @@ def test_skip_passes_in_different_pipelines(self): convert_to="mlprogram", pass_pipeline=pipeline, ) - assert ( - get_op_types_in_program( - model_converted._get_mil_internal(), skip_const_ops=False - ).count("const") - == 24 - ) + + op_types = get_op_types_in_program(model_converted._mil_program, skip_const_ops=False) + expected_counts = { + "const": 26, + "cast": 7, + "conv": 1, + "matmul": 1, + "add": 1, + "shape": 1, + "slice_by_index": 2, + "concat": 1, + "reshape": 1, + "leaky_relu": 1, + } + assert Counter(op_types) == expected_counts def test_empty_pipeline(self): model = self._get_test_model() @@ -491,7 +502,7 @@ def test_pass_option_skip_const_by_size(self): get_op_types_in_program( model_converted._get_mil_internal(), skip_const_ops=False ).count("const") - == 23 + == 25 ) def test_pass_unsupported_option(self): diff --git a/coremltools/test/api/test_api_visibilities.py b/coremltools/test/api/test_api_visibilities.py index 235990359..b783828bc 100644 --- a/coremltools/test/api/test_api_visibilities.py +++ b/coremltools/test/api/test_api_visibilities.py @@ -52,7 +52,7 @@ class TestApiVisibilities: def test_top_level(self): if not ct.utils._is_macos(): - EXPECTED_MODULES.remove("libcoremlpython") + EXPECTED_MODULES.remove("libcoremlpython") _check_visible_modules(_get_visible_items(ct), EXPECTED_MODULES) def test_utils(self): diff --git a/coremltools/test/modelpackage/test_modelpackage.py b/coremltools/test/modelpackage/test_modelpackage.py index e64a4ca55..2f3f0591a 100644 --- a/coremltools/test/modelpackage/test_modelpackage.py +++ b/coremltools/test/modelpackage/test_modelpackage.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import json import os import shutil import tempfile @@ -12,16 +13,19 @@ import coremltools from coremltools import ComputeUnit, utils -from coremltools._deps import _HAS_TORCH +from coremltools._deps import _HAS_EXECUTORCH, _HAS_TORCH from coremltools.converters.mil import Builder as mb from coremltools.libmodelpackage import ModelPackage -from coremltools.models import MLModel +from coremltools.models import _METADATA_VERSION, MLModel from coremltools.models.utils import _MLPACKAGE_AUTHOR_NAME, _WEIGHTS_DIR_NAME from coremltools.proto import Model_pb2 if _HAS_TORCH: import torch +if _HAS_EXECUTORCH: + import executorch.exir + def _remove_path(path): if os.path.isdir(path): @@ -265,6 +269,77 @@ def test_save_in_place(self): _remove_path(package.name) + @pytest.mark.skipif(not _HAS_EXECUTORCH, reason="requires ExecuTorch") + def test_save_EXIR_debug_handle(self): + """ + If we update EXIR debug handle serialization, we should update this test as well + """ + INPUT_SHAPE = (2, 10) + LINEAR_SHAPE = (INPUT_SHAPE[-1], 20) + + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = 
torch.nn.Linear(*LINEAR_SHAPE) + + def forward(self, x): + return self.linear(x) + + def _compare_loaded_debug_handle_mapping_with_original(package): + debug_handle_mapping_json_path = os.path.join( + package, "executorch_debug_handle_mapping.json" + ) + assert os.path.exists(debug_handle_mapping_json_path) + with open(debug_handle_mapping_json_path, "r") as f: + loaded_debug_handle_mapping = json.load(f) + assert loaded_debug_handle_mapping == debug_handle_mapping + + def _compare_prediction_with_torch(coreml_model, torch_model): + x = torch.rand(2, 10) + coreml_x = {list(coreml_model.input_description)[0]: x.numpy()} + + coreml_preds = coreml_model.predict(coreml_x) + assert coreml_preds is not None + coreml_y = list(coreml_preds.values())[0] + + torch_y = torch_model(x).detach().numpy() + np.testing.assert_allclose(coreml_y, torch_y, rtol=1e-6, atol=1e-6) + + torch_model = TestModule() + torch_model.eval() + + example_input = (torch.rand(*INPUT_SHAPE),) + exir_program_aten = torch.export.export(torch_model, example_input) + exir_program_edge = executorch.exir.to_edge(exir_program_aten).exported_program() + + coreml_model = coremltools.convert( + exir_program_edge, compute_precision=coremltools.precision.FLOAT32 + ) + debug_handle_mapping = [ + {_METADATA_VERSION: coreml_model.user_defined_metadata[_METADATA_VERSION]}, + { + str(k): v + for k, v in coreml_model._mil_program.construct_debug_handle_to_ops_mapping().items() + }, + ] + + with tempfile.TemporaryDirectory(suffix=".mlpackage") as package0: + coreml_model.save(package0) + loaded_model0 = MLModel(package0) + if utils._macos_version() >= (12, 0): + _compare_prediction_with_torch(loaded_model0, torch_model) + _compare_loaded_debug_handle_mapping_with_original(package0) + + with tempfile.TemporaryDirectory(suffix=".mlpackage") as package1: + loaded_model0.save(package1) + loaded_model1 = MLModel(package1) + if utils._macos_version() >= (12, 0): + _compare_prediction_with_torch(loaded_model1, torch_model) + # Although debug handle info will be lost in loaded model due to we do not + # deserialize executorch_debug_handle_mapping.json, package1 will still have + # executorch_debug_handle_mapping.json, which is copied from package0 + _compare_loaded_debug_handle_mapping_with_original(package1) + @pytest.mark.skipif(not _HAS_TORCH, reason="requires torch") def test_mil_as_package(self): num_tokens = 3 diff --git a/coremltools/test/neural_network/test_numpy_nn_layers.py b/coremltools/test/neural_network/test_numpy_nn_layers.py index c939b3cc1..8d7c77fa3 100644 --- a/coremltools/test/neural_network/test_numpy_nn_layers.py +++ b/coremltools/test/neural_network/test_numpy_nn_layers.py @@ -5016,6 +5016,9 @@ def test_gather_cpu(self, cpu_only=True): ) def test_gather_gpu(self): + # This test can be stochastically failing, so we set the below seed: + np.random.seed(0) + pytest.xfail("rdar://124260627 ([CI] Two tests are random failing on CI)") self.test_gather_cpu(cpu_only=False) def test_gather_along_axis_cpu(self, cpu_only=True): diff --git a/coremltools/test/neural_network/test_tf_numeric.py b/coremltools/test/neural_network/test_tf_numeric.py index b6ff42ff9..e248a98c2 100644 --- a/coremltools/test/neural_network/test_tf_numeric.py +++ b/coremltools/test/neural_network/test_tf_numeric.py @@ -396,6 +396,8 @@ def test_resize_bilinear_cpu_only(self): @unittest.skipUnless(_macos_version() >= (10, 14), "Only supported on MacOS 10.14+") def test_crop_resize(self, cpu_only=False): + # This test can be stochastically failing, so we set the below 
seed: + np.random.seed(0) if _macos_version()[0] == 12: pytest.xfail("rdar://110274216") diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py index b0d8e045a..e799fcec5 100644 --- a/coremltools/test/optimize/coreml/test_post_training_quantization.py +++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py @@ -114,7 +114,7 @@ def create_sparse_weight(weight, target_sparsity): return np.reshape(weight, shape).astype(np.float32) -def verify_model_outputs(model, compressed_model, input_values): +def verify_model_outputs(model, compressed_model, input_values, rtol=1e-7, atol=0): """ This utility functions does the following checks: @@ -144,7 +144,8 @@ def verify_model_outputs(model, compressed_model, input_values): de_output_dict = decompressed_model.predict(input_values) for k, v in de_output_dict.items(): assert k in output_dict - np.testing.assert_allclose(v, output_dict[k]) + np.testing.assert_allclose(v, output_dict[k], rtol=rtol, atol=atol) + class TestLinearQuantizeWeights: @staticmethod @@ -720,7 +721,9 @@ def test_weight_decopmression_coreml_optimize(): pipeline = ct.PassPipeline.DEFAULT_PRUNING - pipeline.insert_pass(1, "compression::palettize_weights") + # Add a palettization pass after the pruning pass. + prune_pass_idx = pipeline.passes.index("compression::prune_weights") + pipeline.insert_pass(prune_pass_idx + 1, "compression::palettize_weights") config = cto.coreml.OptimizationConfig( global_config=cto.coreml.OpPalettizerConfig(mode="unique"), ) @@ -773,7 +776,9 @@ def test_convert_sparse_and_palettized_source_model_custom(): pipeline = ct.PassPipeline.DEFAULT_PRUNING - pipeline.insert_pass(1, "compression::palettize_weights") + # Add a palettization pass after the pruning pass. + prune_pass_idx = pipeline.passes.index("compression::prune_weights") + pipeline.insert_pass(prune_pass_idx + 1, "compression::palettize_weights") config = cto.coreml.OptimizationConfig( global_config=cto.coreml.OpPalettizerConfig(mode="unique"), ) diff --git a/coremltools/test/pipeline/test_pipeline.py b/coremltools/test/pipeline/test_pipeline.py index 3874f9387..b0b438c5e 100644 --- a/coremltools/test/pipeline/test_pipeline.py +++ b/coremltools/test/pipeline/test_pipeline.py @@ -12,6 +12,7 @@ import coremltools as ct from coremltools._deps import _HAS_LIBSVM, _HAS_SKLEARN +from coremltools.converters.mil import mil from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Function, Program from coremltools.models.pipeline import PipelineClassifier, PipelineRegressor @@ -238,7 +239,7 @@ def _make_model(input_name, input_length, weight_tensor = np.arange(input_length * output_length, dtype='float32') weight_tensor = weight_tensor.reshape(output_length, input_length) - prog = Program() + prog = mil.Program() func_inputs = {input_name: mb.placeholder(shape=(input_length,))} with Function(func_inputs) as ssa_fun: input = ssa_fun.inputs[input_name] @@ -319,7 +320,7 @@ def test_compute_unit(): @staticmethod def test_second_model_needs_pipeline_input(): # First model takes one parameter - p1 = Program() + p1 = mil.Program() func_inputs = {'x1': mb.placeholder(shape=(2,))} with Function(func_inputs) as ssa_fun: x1 = ssa_fun.inputs['x1'] @@ -330,7 +331,7 @@ def test_second_model_needs_pipeline_input(): # Second model takes two parameters. One will be from previous model in pipeline. # The other as pipeline input. 
- p2 = Program() + p2 = mil.Program() func_inputs = { 'y1': mb.placeholder(shape=(2,)), 'x2': mb.placeholder(shape=(2,)), diff --git a/coremltools/version.py b/coremltools/version.py index c21924b8c..a5f2975a9 100644 --- a/coremltools/version.py +++ b/coremltools/version.py @@ -4,4 +4,4 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -__version__ = "7.1" # VERSION_STRING +__version__ = "7.1.2" # VERSION_STRING diff --git a/docs/source/coremltools.converters.mil.mil.passes.defs.rst b/docs/source/coremltools.converters.mil.mil.passes.defs.rst index a32d4e3ad..eacc0a398 100644 --- a/docs/source/coremltools.converters.mil.mil.passes.defs.rst +++ b/docs/source/coremltools.converters.mil.mil.passes.defs.rst @@ -12,6 +12,7 @@ cleanup .. autoclass:: const_elimination .. autoclass:: dead_code_elimination .. autoclass:: dedup_op_and_var_names + .. autoclass:: expand_dynamic_linear .. autoclass:: fuse_reduce_mean .. autoclass:: loop_invariant_elimination .. autoclass:: noop_elimination @@ -63,6 +64,7 @@ optimize_linear .. autoclass:: fuse_linear_bias .. autoclass:: fuse_matmul_weight_bias + .. autoclass:: fuse_transpose_matmul optimize_normalization @@ -78,6 +80,7 @@ optimize_quantization .. automodule:: coremltools.converters.mil.mil.passes.defs.optimize_quantization + .. autoclass:: merge_affine_dequantize_with_consecutive_ops .. autoclass:: int_op_canonicalization .. autoclass:: nullify_redundant_quantization_zero_point .. autoclass:: dequantize_quantize_pair_elimination diff --git a/mlmodel/build/format/MIL.pb.cc b/mlmodel/build/format/MIL.pb.cc index d481c3ade..c265cff06 100644 --- a/mlmodel/build/format/MIL.pb.cc +++ b/mlmodel/build/format/MIL.pb.cc @@ -316,6 +316,7 @@ bool DataType_IsValid(int value) { case 10: case 11: case 12: + case 13: case 21: case 22: case 23: diff --git a/mlmodel/build/format/MIL.pb.h b/mlmodel/build/format/MIL.pb.h index 1d20d6cea..6776cba76 100644 --- a/mlmodel/build/format/MIL.pb.h +++ b/mlmodel/build/format/MIL.pb.h @@ -178,6 +178,7 @@ enum DataType { FLOAT16 = 10, FLOAT32 = 11, FLOAT64 = 12, + BFLOAT16 = 13, INT8 = 21, INT16 = 22, INT32 = 23, diff --git a/mlmodel/build/format/MIL_enums.h b/mlmodel/build/format/MIL_enums.h index e3a2a2a6f..911353d4f 100644 --- a/mlmodel/build/format/MIL_enums.h +++ b/mlmodel/build/format/MIL_enums.h @@ -7,6 +7,7 @@ enum MLDataType: int { MLDataTypeFLOAT16 = 10, MLDataTypeFLOAT32 = 11, MLDataTypeFLOAT64 = 12, + MLDataTypeBFLOAT16 = 13, MLDataTypeINT8 = 21, MLDataTypeINT16 = 22, MLDataTypeINT32 = 23, diff --git a/reqs/test.pip b/reqs/test.pip index 784ce6769..86cb7a346 100644 --- a/reqs/test.pip +++ b/reqs/test.pip @@ -24,27 +24,15 @@ scipy==1.9.2; python_version == '3.11' six sympy > 1.6 gast==0.4.0 -torch==2.1.0 -torchaudio==2.1.0 -torchvision==0.16.0 +torch==2.2.0 +torchaudio==2.2.0 +torchvision==0.17.0 xgboost==1.4.2; platform_machine != "arm64" mock wrapt tqdm pytest-timeout -# TensorFlow (x86) related package -tensorflow==2.12.0; platform_machine != "arm64" -tensorflow-estimator==2.12.0; platform_machine != "arm64" -keras==2.12.0; platform_machine != "arm64" - -# TensorFlow (arm64) related package. Currently no Python 3.11 support. 
diff --git a/reqs/test.pip b/reqs/test.pip
index 784ce6769..86cb7a346 100644
--- a/reqs/test.pip
+++ b/reqs/test.pip
@@ -24,27 +24,15 @@ scipy==1.9.2; python_version == '3.11'
 six
 sympy > 1.6
 gast==0.4.0
-torch==2.1.0
-torchaudio==2.1.0
-torchvision==0.16.0
+torch==2.2.0
+torchaudio==2.2.0
+torchvision==0.17.0
 xgboost==1.4.2; platform_machine != "arm64"
 mock
 wrapt
 tqdm
 pytest-timeout
 
-# TensorFlow (x86) related package
-tensorflow==2.12.0; platform_machine != "arm64"
-tensorflow-estimator==2.12.0; platform_machine != "arm64"
-keras==2.12.0; platform_machine != "arm64"
-
-# TensorFlow (arm64) related package. Currently no Python 3.11 support.
-tensorflow-macos==2.11.0; platform_machine == "arm64" and python_version < "3.11"
-tensorflow-estimator==2.11.0; platform_machine == "arm64" and python_version < "3.11"
-keras==2.11.0; platform_machine == "arm64" and python_version < "3.11"
-
-tensorflow-addons==0.19.0; python_version < "3.11"
-tensorflow-hub==0.12.0
 transformers==4.26.0
 
 # coremltools.optimize.torch
diff --git a/reqs/test_tf2.pip b/reqs/test_tf2.pip
new file mode 100644
index 000000000..b178018ad
--- /dev/null
+++ b/reqs/test_tf2.pip
@@ -0,0 +1,16 @@
+-r ./test.pip
+
+# TODO(rdar://123269464): Support a recent version of TensorFlow
+
+# TensorFlow (x86) related package
+tensorflow==2.12.0; platform_machine != "arm64"
+tensorflow-estimator==2.12.0; platform_machine != "arm64"
+keras==2.12.0; platform_machine != "arm64"
+
+# TensorFlow (arm64) related package. Currently no Python 3.11 support.
+tensorflow-macos==2.11.0; platform_machine == "arm64" and python_version < "3.11"
+tensorflow-estimator==2.11.0; platform_machine == "arm64" and python_version < "3.11"
+keras==2.11.0; platform_machine == "arm64" and python_version < "3.11"
+
+tensorflow-addons==0.19.0; python_version < "3.11"
+tensorflow-hub==0.12.0
diff --git a/scripts/build.sh b/scripts/build.sh
index bc311e987..56998f8b0 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -107,6 +107,9 @@ CMAKE_COMMAND=""
 if [[ $OSTYPE == darwin* ]]; then
     CMAKE_COMMAND="xcrun --sdk ${sdk} "
 fi
+if [ -z "`which cmake`" ] || [ "`which cmake`" = "cmake not found" ]; then
+    conda install cmake -y
+fi
 CMAKE_COMMAND+="cmake $ADDITIONAL_CMAKE_OPTIONS \
     -DCMAKE_BUILD_TYPE=$BUILD_MODE \
     -DPYTHON_EXECUTABLE:FILEPATH=$PYTHON_EXECUTABLE \
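With the split above, reqs/test.pip no longer pulls in TensorFlow at all; the new reqs/test_tf2.pip begins with `-r ./test.pip` and layers the TensorFlow pins on top, and only the TF-facing CI jobs install it. A small, purely illustrative guard (not part of this change) showing how a TF-dependent test can skip cleanly on jobs that installed only reqs/test.pip:

import pytest

# Skip, rather than fail, when TensorFlow is absent, i.e. when the job did not
# install the extra pins from reqs/test_tf2.pip.
tf = pytest.importorskip("tensorflow")

def test_pinned_tf_is_2x():
    # Both the x86 (2.12.0) and arm64 (2.11.0) pins are TensorFlow 2 releases.
    assert tf.__version__.startswith("2.")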
diff --git a/scripts/test.sh b/scripts/test.sh
index 3389ed0cb..7265b05c3 100755
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -32,6 +32,7 @@ print_help() {
     echo " --wheel-path=* Specify which wheel to test. Otherwise, test the current coremltools dir."
     echo " --xml-path=* Path to test xml file."
     echo " --test-package=* Test package to run."
+    echo " --ignores=* Test packages to ignore"
     echo " --python=* Python to use for configuration."
     echo " --requirements=* [Optional] Path to the requirements.txt file."
     echo " --cov=* Generate coverage report for these dirs."
@@ -50,6 +51,7 @@ while [ $# -gt 0 ]
     --requirements=*) REQUIREMENTS=${1##--requirements=} ;;
     --python=*) PYTHON=${1##--python=} ;;
     --test-package=*) TEST_PACKAGE=${1##--test-package=} ;;
+    --ignores=*) IGNORES=${1##--ignores=} ;;
     --wheel-path=*) WHEEL_PATH=${1##--wheel-path=} ;;
     --xml-path=*) XML_PATH=${1##--xml-path=} ;;
     --cov=*) COV=${1##--cov=} ;;
@@ -102,21 +104,29 @@ fi
 
 # Now run the tests
 echo "Running tests"
-TEST_CMD=($PYTEST_EXECUTABLE -v -ra -W "ignore::UserWarning" -W "ignore::FutureWarning" -W "ignore::DeprecationWarning" -W "ignore::ResourceWarning" --durations=100 --pyargs ${TEST_PACKAGE} --junitxml=${XML_PATH} --timeout=${TIME_OUT})
+TEST_CMD=$PYTEST_EXECUTABLE" -v -ra -W \"ignore::UserWarning\" -W \"ignore::FutureWarning\" -W \"ignore::DeprecationWarning\" -W \"ignore::ResourceWarning\" --durations=100"
+TEST_CMD+=" --junitxml="${XML_PATH}
+TEST_CMD+=" --timeout="${TIME_OUT}
+TEST_CMD+=" --pyargs "${TEST_PACKAGE//,/ }
+
+IFS=',' read -A ignore_array <<< "${IGNORES}"
+for ignore in ${ignore_array[@]}; do
+    TEST_CMD+=" --ignore "${CONDA_PREFIX}"/lib/python"${PYTHON}"/site-packages/"${ignore}
+done
 
 if [[ $SLOW != 1 || $FAST != 1 ]]; then
     if [[ $SLOW == 1 ]]; then
-        TEST_CMD+=(-m "slow")
+        TEST_CMD+=" -m \"slow\""
     elif [[ $FAST == 1 ]]; then
-        TEST_CMD+=(-m "not slow")
+        TEST_CMD+=" -m \"not slow\""
     fi
 fi
 
 if [[ $COV != "" ]]; then
-    TEST_CMD+=(--cov $COV)
+    TEST_CMD+=" --cov ${COV}"
 fi
 
-echo $TEST_CMD
-${TEST_CMD[@]}
+echo ${TEST_CMD}
+eval ${TEST_CMD}
 
 pip uninstall -y coremltools
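The rewritten runner above assembles TEST_CMD as a string and evals it, expanding the new comma-separated --ignores value into repeated pytest --ignore options and the comma-separated --test-package value into space-separated --pyargs targets. A rough Python rendering of that argument expansion; the helper name and the example values are made up for illustration, and the real logic lives in scripts/test.sh:

def build_pytest_args(test_packages: str, ignores: str, site_packages: str) -> list:
    """Mirror how scripts/test.sh assembles the pytest invocation."""
    args = ["-v", "-ra", "--durations=100"]
    # --test-package takes a comma-separated list and becomes --pyargs targets.
    args += ["--pyargs", *test_packages.split(",")]
    # --ignores takes a comma-separated list; each entry becomes its own
    # --ignore pointing into the installed site-packages tree.
    for ignore in filter(None, ignores.split(",")):
        args += ["--ignore", f"{site_packages}/{ignore}"]
    return args

print(build_pytest_args(
    "coremltools.test.optimize,coremltools.test.pipeline",  # hypothetical packages
    "coremltools/converters/mil/frontend/tensorflow",       # hypothetical ignore
    "/opt/miniconda/lib/python3.10/site-packages",           # hypothetical prefix
))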