From 6c698cff939cbf1b19980a08bfdd53e17c6cef6b Mon Sep 17 00:00:00 2001
From: Songhao Jia
Date: Fri, 3 May 2024 11:36:24 -0700
Subject: [PATCH] introduce _to_dim_order_copy op to runtime (#1970)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/1970

This diff creates a new, special operator, `_to_dim_order_copy`. The operator introduces two critical capabilities to the runtime:

1. Extract memory_format information from a tensor based on its dim_order instead of its strides.
2. Support both the channels_last and contiguous memory formats in the runtime.

Please note that memory format is a concept parallel to memory layout, so supporting a new memory format does not violate our contract of only supporting tensors with a contiguous memory layout. Details can be found [here](https://discuss.pytorch.org/t/contigious-vs-non-contigious-tensor/30107) and [here](https://pytorch.org/blog/tensor-memory-format-matters/). (A standalone sketch of how a dim order determines strides is appended after the patch.)

Furthermore, `_to_dim_order_copy` is a special operator: it has no native ATen variant, yet it is needed by most models in the edge dialect. It therefore cannot simply be placed in `kernels/portable/custom_ops.yaml` (which would require manual registration every time and does not scale to an operator needed by many models) or in `kernels/portable/functions.yaml` (which is reserved for operators with a native ATen variant). To resolve this, the diff puts the ATen-mode kernel of `_to_dim_order_copy` under `kernels/aten`, while the lean-mode kernel is registered in `kernels/portable/functions.yaml`.

Also update dependencies and utils.

Differential Revision: https://internalfb.com/D53747744
---
 kernels/aten/cpu/TARGETS                      |   8 +
 kernels/aten/cpu/op__to_dim_order_copy.cpp    | 124 ++++
 kernels/aten/cpu/targets.bzl                  |  41 ++
 kernels/aten/cpu/util/TARGETS                 |   8 +
 kernels/aten/cpu/util/copy_ops_util.cpp       |  68 ++
 kernels/aten/cpu/util/copy_ops_util.h         |  23 +
 kernels/aten/cpu/util/targets.bzl             |  28 +
 kernels/aten/edge_dialect_aten_op.yaml        |   8 +
 kernels/aten/targets.bzl                      |  33 +-
 .../portable/cpu/op__to_dim_order_copy.cpp    | 120 ++++
 kernels/portable/cpu/targets.bzl              |   6 +
 kernels/portable/cpu/util/copy_ops_util.cpp   |  40 ++
 kernels/portable/cpu/util/copy_ops_util.h     |   6 +
 kernels/portable/functions.yaml               |   5 +
 kernels/test/op__to_dim_order_copy_test.cpp   | 650 ++++++++++++++++++
 kernels/test/targets.bzl                      |   1 +
 runtime/core/exec_aten/exec_aten.h            |   7 +
 .../kernels/portable/op_registration_util.bzl |  24 +-
 shim/xplat/executorch/kernels/test/util.bzl   |   2 +
 19 files changed, 1190 insertions(+), 12 deletions(-)
 create mode 100644 kernels/aten/cpu/TARGETS
 create mode 100644 kernels/aten/cpu/op__to_dim_order_copy.cpp
 create mode 100644 kernels/aten/cpu/targets.bzl
 create mode 100644 kernels/aten/cpu/util/TARGETS
 create mode 100644 kernels/aten/cpu/util/copy_ops_util.cpp
 create mode 100644 kernels/aten/cpu/util/copy_ops_util.h
 create mode 100644 kernels/aten/cpu/util/targets.bzl
 create mode 100644 kernels/aten/edge_dialect_aten_op.yaml
 create mode 100644 kernels/portable/cpu/op__to_dim_order_copy.cpp
 create mode 100644 kernels/test/op__to_dim_order_copy_test.cpp

diff --git a/kernels/aten/cpu/TARGETS b/kernels/aten/cpu/TARGETS
new file mode 100644
index 00000000000..2341af9282f
--- /dev/null
+++ b/kernels/aten/cpu/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/kernels/aten/cpu/op__to_dim_order_copy.cpp b/kernels/aten/cpu/op__to_dim_order_copy.cpp new file mode 100644 index 00000000000..63a301531d9 --- /dev/null +++ b/kernels/aten/cpu/op__to_dim_order_copy.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using SizesArrayRef = exec_aten::ArrayRef; +using DimOrderArrayRef = exec_aten::ArrayRef; +using MemoryFormat = exec_aten::MemoryFormat; + +template +using OptionalArrayRef = exec_aten::OptionalArrayRef; + +template +using Optional = exec_aten::optional; + +namespace { +Optional get_memory_format(OptionalArrayRef dim_order) { + if (!dim_order.has_value()) { + return exec_aten::nullopt; + } + if (is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())) { + return MemoryFormat::Contiguous; + } else if (is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size())) { + return MemoryFormat::ChannelsLast; + } else { + ET_ASSERT_UNREACHABLE(); + } +} + +bool check__to_dim_order_copy_args( + const Tensor& input, + bool non_blocking, + exec_aten::OptionalArrayRef dim_order, + Tensor& out) { + // Right now we only support blocking data transfer + ET_LOG_AND_RETURN_IF_FALSE(non_blocking == false); + + // dim_order is set, the target dim_order will be either contiguous or + // channels_last memory format + if (dim_order.has_value()) { + exec_aten::ArrayRef dim_order_ref = dim_order.value(); + + // dim order size shall equal to input dim + ET_LOG_AND_RETURN_IF_FALSE(dim_order_ref.size() == input.dim()); + + ET_LOG_AND_RETURN_IF_FALSE( + is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size()) || + is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())); + + // Out Aten tensor shall have same memory format stride as dim_order + const size_t kMaxNumOfDimensions = 16; + ET_LOG_AND_RETURN_IF_FALSE(kMaxNumOfDimensions >= out.dim()); + exec_aten::StridesType target_strides[kMaxNumOfDimensions]; + dim_order_to_stride_nocheck( + out.sizes().data(), + dim_order_ref.data(), + dim_order_ref.size(), + target_strides); + ET_LOG_AND_RETURN_IF_FALSE(out.dim() == dim_order_ref.size()); + for (size_t i = 0; i < dim_order_ref.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(target_strides[i] == out.strides()[i]); + } + + } else { // dim_order is not set, preserve the dim order of input + + auto out_strides = out.strides(); + auto input_strides = input.strides(); + ET_LOG_AND_RETURN_IF_FALSE(input_strides.size() == out_strides.size()); + for (size_t i = 0; i < input_strides.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(input_strides[i] == out_strides[i]); + } + } + return true; +} +} // namespace + +// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? +// dim_order=None, Tensor(a!) out) -> Tensor(a!) 
+Tensor& _to_dim_order_copy_out( + RuntimeContext& ctx, + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + // TODO(T181345875): enable sanity check in aten mode + ET_KERNEL_CHECK( + ctx, + check__to_dim_order_copy_args(self, non_blocking, dim_order, out), + InvalidArgument, + out); + + Optional memory_format = get_memory_format(dim_order); + at::_to_copy_outf(self, non_blocking, memory_format, out); + + return out; +} + +Tensor& _to_dim_order_copy_out( + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + exec_aten::RuntimeContext ctx{}; + return _to_dim_order_copy_out(ctx, self, non_blocking, dim_order, out); +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/aten/cpu/targets.bzl b/kernels/aten/cpu/targets.bzl new file mode 100644 index 00000000000..bdd93bda9ed --- /dev/null +++ b/kernels/aten/cpu/targets.bzl @@ -0,0 +1,41 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl", "define_op_target", "op_target") + +# Operators that are listed in `functions.yaml`, and are thus compatible with +# the core ATen operators. Every entry here will be backed by a cxx_library +# target with the given name and deps. +# +# Note that a single target (or single .cpp file) can't mix ATen and non-ATen +# ops, and must be split. They can, however, share common code via a library dep +# if necessary. +_EDGE_DIALECT_OPS = ( + op_target( + name = "op__to_dim_order_copy", + deps = [ + "//executorch/kernels/aten/cpu/util:copy_ops_util", + ], + ), +) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. + for op in _EDGE_DIALECT_OPS: + define_op_target(is_aten_op = False, is_et_op = False, **op) + + all_op_targets = [":{}".format(op["name"]) for op in _EDGE_DIALECT_OPS] + + runtime.cxx_library( + name = "cpu", + srcs = [], + visibility = [ + "//executorch/kernels/aten/...", + "//executorch/kernels/test/...", + ], + exported_deps = [t + "_aten" for t in all_op_targets], + ) diff --git a/kernels/aten/cpu/util/TARGETS b/kernels/aten/cpu/util/TARGETS new file mode 100644 index 00000000000..2341af9282f --- /dev/null +++ b/kernels/aten/cpu/util/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/kernels/aten/cpu/util/copy_ops_util.cpp b/kernels/aten/cpu/util/copy_ops_util.cpp new file mode 100644 index 00000000000..b20fe127863 --- /dev/null +++ b/kernels/aten/cpu/util/copy_ops_util.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace torch { +namespace executor { + +using Tensor = exec_aten::Tensor; + +bool check__to_dim_order_copy_args( + const Tensor& input, + bool non_blocking, + exec_aten::OptionalArrayRef dim_order, + Tensor& out) { + // Right now we only support blocking data transfer + ET_LOG_AND_RETURN_IF_FALSE(non_blocking == false); + + // dim_order is set, the target dim_order will be either contiguous or + // channels_last memory format + if (dim_order.has_value()) { + exec_aten::ArrayRef dim_order_ref = dim_order.value(); + + // dim order size shall equal to input dim + ET_LOG_AND_RETURN_IF_FALSE(dim_order_ref.size() == input.dim()); + + ET_LOG_AND_RETURN_IF_FALSE( + is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size()) || + is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())); + + // Out Aten tensor shall have same memory format stride as dim_order + const size_t kMaxNumOfDimensions = 16; + ET_LOG_AND_RETURN_IF_FALSE(kMaxNumOfDimensions >= out.dim()); + exec_aten::StridesType target_strides[kMaxNumOfDimensions]; + dim_order_to_stride_nocheck( + out.sizes().data(), + dim_order_ref.data(), + dim_order_ref.size(), + target_strides); + ET_LOG_AND_RETURN_IF_FALSE(out.dim() == dim_order_ref.size()); + for (size_t i = 0; i < dim_order_ref.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(target_strides[i] == out.strides()[i]); + } + + } else { // dim_order is not set, preserve the dim order of input + + auto out_strides = out.strides(); + auto input_strides = input.strides(); + ET_LOG_AND_RETURN_IF_FALSE(input_strides.size() == out_strides.size()); + for (size_t i = 0; i < input_strides.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(input_strides[i] == out_strides[i]); + } + } + return true; +} + +} // namespace executor +} // namespace torch diff --git a/kernels/aten/cpu/util/copy_ops_util.h b/kernels/aten/cpu/util/copy_ops_util.h new file mode 100644 index 00000000000..6ddc7371006 --- /dev/null +++ b/kernels/aten/cpu/util/copy_ops_util.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +bool check__to_dim_order_copy_args( + const Tensor& input, + bool non_blocking, + exec_aten::OptionalArrayRef dim_order, + Tensor& out); + +} // namespace executor +} // namespace torch diff --git a/kernels/aten/cpu/util/targets.bzl b/kernels/aten/cpu/util/targets.bzl new file mode 100644 index 00000000000..d1ebddd7275 --- /dev/null +++ b/kernels/aten/cpu/util/targets.bzl @@ -0,0 +1,28 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. 
+ """ + + # Utility functions that can be used by operators that perform reduction + runtime.cxx_library( + name = "copy_ops_util", + srcs = ["copy_ops_util.cpp"], + exported_headers = [ + "copy_ops_util.h", + ], + compiler_flags = ["-Wno-missing-prototypes"], + deps = [ + "//executorch/runtime/kernel:kernel_includes_aten", + "//executorch/runtime/core/exec_aten/util:tensor_util_aten", + ], + exported_preprocessor_flags = ["-DUSE_ATEN_LIB"], + visibility = [ + "//executorch/kernels/aten/cpu/...", + "//executorch/kernels/portable/cpu/...", + "//executorch/kernels/optimized/cpu/...", + ], + ) diff --git a/kernels/aten/edge_dialect_aten_op.yaml b/kernels/aten/edge_dialect_aten_op.yaml new file mode 100644 index 00000000000..016f8dbfab5 --- /dev/null +++ b/kernels/aten/edge_dialect_aten_op.yaml @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This yaml file contains operators that are defined by ExecuTorch and used in ATen mode. + +- func: dim_order_ops::_to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_to_dim_order_copy_out diff --git a/kernels/aten/targets.bzl b/kernels/aten/targets.bzl index 519dfaf3484..2e007a16d7e 100644 --- a/kernels/aten/targets.bzl +++ b/kernels/aten/targets.bzl @@ -16,20 +16,51 @@ def define_common_targets(): ], ) + runtime.export_file( + name = "edge_dialect_aten_op.yaml", + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + et_operator_library( name = "executorch_aten_ops", ops_schema_yaml_target = ":functions.yaml", define_static_targets = True, ) + runtime.cxx_library( + name = "operators_edge_dialect_aten", + srcs = [], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + exported_deps = [ + "//executorch/kernels/aten/cpu:cpu", + ], + ) + + et_operator_library( + name = "edge_dialect_aten_ops", + ops_schema_yaml_target = ":edge_dialect_aten_op.yaml", + define_static_targets = True, + ) + executorch_generated_lib( name = "generated_lib", aten_mode = True, deps = [ ":executorch_aten_ops", + ":edge_dialect_aten_ops", + ], + kernel_deps = [ + ":operators_edge_dialect_aten", ], - functions_yaml_target = None, + custom_ops_yaml_target = "//executorch/kernels/aten:edge_dialect_aten_op.yaml", define_static_targets = True, + custom_ops_requires_aot_registration = False, visibility = [ "//executorch/...", "@EXECUTORCH_CLIENTS", diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp new file mode 100644 index 00000000000..8a6a65e7034 --- /dev/null +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = exec_aten::Tensor; +using SizesArrayRef = exec_aten::ArrayRef; +using DimOrderArrayRef = exec_aten::ArrayRef; +using MemoryFormat = exec_aten::MemoryFormat; + +template +using OptionalArrayRef = exec_aten::OptionalArrayRef; + +template +using Optional = exec_aten::optional; + +namespace { + +// TODO(T179241236): Update core/exec_aten/util/tensor_util.h to support dim +// order other than contiguous. 
+int64_t coordinateToIndexWithDimOrder( + const Tensor& self, + const size_t* cur_indices) { + int64_t index = 0; + exec_aten::StridesType strides[kTensorDimensionLimit]; + SizesArrayRef sizes = self.sizes(); + DimOrderArrayRef dim_order = self.dim_order(); + + dim_order_to_stride_nocheck( + sizes.data(), dim_order.data(), sizes.size(), strides); + for (size_t i = 0; i < self.dim(); ++i) { + index += cur_indices[i] * strides[i]; + } + return index; +} + +template +void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) { + auto self_data = self.mutable_data_ptr(); + auto out_data = out.mutable_data_ptr(); + + size_t coordinate[kTensorDimensionLimit] = {0}; + + // Copy data from self to out index by index. Same index in self and out + // should have same value, no matter the order of dimensions. + for (ssize_t i = 0; i < self.numel(); i++) { + // Update the current indices. + for (ssize_t j = self.dim() - 1; j >= 0; j--) { + if (coordinate[j] + 1 < self.size(j)) { + coordinate[j]++; + break; + } else { + coordinate[j] = 0; + } + } + // Get the corresponding index of self_data and out_data by stride. + int64_t self_data_index = coordinateToIndexWithDimOrder(self, coordinate); + int64_t out_data_index = coordinateToIndexWithDimOrder(out, coordinate); + + out_data[out_data_index] = + static_cast(self_data[self_data_index]); + } +} +} // namespace + +// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? +// dim_order=None, Tensor(a!) out) -> Tensor(a!) +Tensor& _to_dim_order_copy_out( + RuntimeContext& ctx, + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + (void)ctx; + ET_KERNEL_CHECK( + ctx, + check__to_dim_order_copy_args(self, non_blocking, dim_order, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, self.sizes()) == torch::executor::Error::Ok, + InvalidArgument, + out); + + ET_SWITCH_REALHB_TYPES( + self.scalar_type(), ctx, "_to_dim_order_copy_out", CTYPE_IN, [&] { + ET_SWITCH_REALHB_TYPES( + out.scalar_type(), ctx, "_to_dim_order_copy_out", CTYPE_OUT, [&] { + _to_dim_order_copy_impl(self, out); + }); + }); + + return out; +} + +Tensor& _to_dim_order_copy_out( + const Tensor& self, + bool non_blocking, + OptionalArrayRef dim_order, + Tensor& out) { + exec_aten::RuntimeContext context{}; + return _to_dim_order_copy_out(context, self, non_blocking, dim_order, out); +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index c86ac2b9f6d..77796c68526 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -977,6 +977,12 @@ _ATEN_OPS = ( op_target( name = "op_zeros", ), + op_target( + name = "op__to_dim_order_copy", + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + ], + ), ) # Operators that are not listed in `functions.yaml` (i.e., operators listed in diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp index 48038f175b9..314e38c2b53 100644 --- a/kernels/portable/cpu/util/copy_ops_util.cpp +++ b/kernels/portable/cpu/util/copy_ops_util.cpp @@ -9,6 +9,7 @@ #include #include +#include #include namespace torch { @@ -735,6 +736,45 @@ bool check_to_copy_args( return true; } +bool check__to_dim_order_copy_args( + const Tensor& input, + bool non_blocking, + exec_aten::OptionalArrayRef dim_order, + Tensor& out) { + // Right now we only support blocking data transfer + 
ET_LOG_AND_RETURN_IF_FALSE(non_blocking == false); + + if (dim_order.has_value()) { + exec_aten::ArrayRef dim_order_ref = dim_order.value(); + + // dim order size shall equal to input dim + ET_LOG_AND_RETURN_IF_FALSE(dim_order_ref.size() == input.dim()); + + ET_LOG_AND_RETURN_IF_FALSE( + is_channels_last_dim_order( + dim_order.value().data(), dim_order.value().size()) || + is_contiguous_dim_order( + dim_order.value().data(), dim_order.value().size())); + + // Out tensor shall have same dim order as dim_order + auto out_dim_order = out.dim_order(); + ET_LOG_AND_RETURN_IF_FALSE(out_dim_order.size() == dim_order_ref.size()); + for (size_t i = 0; i < dim_order_ref.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(out_dim_order[i] == dim_order_ref[i]); + } + } else { // dim_order is not set, preserve the dim order of input + + // Out tensor shall have same dim order as input dim_order + auto out_dim_order = out.dim_order(); + auto input_dim_order = input.dim_order(); + ET_LOG_AND_RETURN_IF_FALSE(out_dim_order.size() == input_dim_order.size()); + for (size_t i = 0; i < input_dim_order.size(); i++) { + ET_LOG_AND_RETURN_IF_FALSE(out_dim_order[i] == input_dim_order[i]); + } + } + return true; +} + bool check_unsqueeze_copy_args( const Tensor input, int64_t dim, diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h index 5f341b0c2b9..d5362ae373a 100644 --- a/kernels/portable/cpu/util/copy_ops_util.h +++ b/kernels/portable/cpu/util/copy_ops_util.h @@ -198,6 +198,12 @@ bool check_to_copy_args( exec_aten::optional memory_format, Tensor& out); +bool check__to_dim_order_copy_args( + const Tensor& input, + bool non_blocking, + exec_aten::OptionalArrayRef dim_order, + Tensor& out); + bool check_unsqueeze_copy_args( const Tensor input, int64_t dim, diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index ca3f3c702a1..93256b2a05f 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -881,3 +881,8 @@ kernels: - arg_meta: null kernel_name: torch::executor::zeros_out + +- func: dim_order_ops::_to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_to_dim_order_copy_out diff --git a/kernels/test/op__to_dim_order_copy_test.cpp b/kernels/test/op__to_dim_order_copy_test.cpp new file mode 100644 index 00000000000..e888e0fc7f6 --- /dev/null +++ b/kernels/test/op__to_dim_order_copy_test.cpp @@ -0,0 +1,650 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include // Declares the operator +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; +using exec_aten::ArrayRef; +using exec_aten::optional; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::testing::TensorFactory; + +// To further emphasize the accuracy of our op_to, we TEST_F the conversion +// from floating-point types to signed int types directly by the TEST_F cases +// generated by core Pytorch directly. Such data is random generated in [-5, 5]. 
+ +// clang-format off +typedef std::map< + std::type_index, + std::variant< + std::vector, + std::vector>> + FloatingTypeToDataMap; + +typedef std::map< + std::type_index, + std::variant< + std::vector, + std::vector, + std::vector, + std::vector, + std::vector>> + IntTypeToDataMap; +// clang-format on + +class OpToDimOrderCopyTest : public OperatorTest { + protected: + Tensor& op__to_dim_order_copy_out( + const Tensor& self, + bool non_blocking, + exec_aten::optional> dim_order, + Tensor& out) { + return torch::executor::dim_order_ops::_to_dim_order_copy_outf( + context_, self, non_blocking, dim_order, out); + } + // Cast float vector to OUTPUT_CTYPE vector + template + std::vector vector_type_cast(std::vector input) { + std::vector output(input.size()); + std::transform( + input.begin(), input.end(), output.begin(), [](INPUT_CTYPE x) { + return static_cast(x); + }); + return output; + } + + template + struct ToTestCase { + const std::vector sizes; + const std::vector data_in; + const std::vector data_out; + }; + + // Each TEST_F has different combination of input and output types. Therefore + // it is a little bit mess if create template TEST_F case and custom data + // types for both input data and output data. We choose another way: for all + // TEST_F cases, their data are all in double. And we are gonna cast them into + // desired type when delievering them into tf.make function. Based on our + // experiments, type cast of core PyTorch is same as static_cast in c++ in the + // representable scope, so here we believe using static_cast to generate + // ground truth is reasonable. + template < + typename INPUT_CTYPE, + ScalarType INPUT_DTYPE, + typename OUTPUT_CTYPE, + ScalarType OUTPUT_DTYPE> + void test_runner_static_cast( + std::vector> test_cases) { + TensorFactory tf_in; + TensorFactory tf_out; + + for (const auto& test_case : test_cases) { + auto data_in = vector_type_cast(test_case.data_in); + auto data_out = vector_type_cast(data_in); + + Tensor input = tf_in.make(test_case.sizes, data_in); + Tensor output = tf_out.zeros_like(input); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + output); + + Tensor expected = tf_out.make(test_case.sizes, data_out); + + // The original tensor a should share same value with the out variable and + // return variable of to function + EXPECT_TENSOR_EQ(ret, output); + EXPECT_TENSOR_EQ(ret, expected); + } + } + + template + void test_runner_to_bool( + std::vector test_case, + std::vector data_out) { + TensorFactory tf_in; + TensorFactory tf_out; + + auto data_in = vector_type_cast(test_case); + + Tensor input = tf_in.make({(int)test_case.size()}, data_in); + Tensor output = tf_out.zeros_like(input); + + std::vector dim_order_vec; + for (int i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + output); + + Tensor expected = tf_out.make({(int)data_out.size()}, data_out); + + // The return value of op__to_dim_order_copy_out and the values written to + // output should be the same. + EXPECT_TENSOR_EQ(ret, output); + // The return value of op__to_dim_order_copy_out and the values in expected + // which are the reference values should be the same. 
+ EXPECT_TENSOR_EQ(ret, expected); + } + + template + void test_runner_from_bool( + std::vector test_case, + std::vector out) { + TensorFactory tf_in; + TensorFactory tf_out; + + auto data_out = vector_type_cast(out); + + Tensor input = tf_in.make({(int)test_case.size()}, test_case); + Tensor output = tf_out.zeros_like(input); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + output); + + Tensor expected = tf_out.make({(int)data_out.size()}, data_out); + + // The return value of op__to_dim_order_copy_out and the values written to + // output should be the same. + EXPECT_TENSOR_EQ(ret, output); + // The return value of op__to_dim_order_copy_out and the values in expected + // which are the reference values should be the same. + EXPECT_TENSOR_EQ(ret, expected); + } + + /* %python + import torch + torch.manual_seed(0) + x = torch.rand(2, 3) + res = x.to(non_blocking = False, memory_format = torch.preserve_format) + op = "op__to_dim_order_copy_out" + opt_setup_params = """ + bool non_blocking = false; + optional memory_format; + """ + opt_extra_params = "non_blocking, memory_format," + out_args = "out_shape, dynamism" + dtype = "ScalarType::Float" + check = "EXPECT_TENSOR_EQ" */ + + void test_dynamic_shape( + const std::vector& out_shape, + enum torch::executor::TensorShapeDynamism dynamism) { + /* %python + %rewrite(unary_op) */ + + TensorFactory tf; + + Tensor x = tf.make( + {2, 3}, + {0.49625658988952637, + 0.7682217955589294, + 0.08847743272781372, + 0.13203048706054688, + 0.30742281675338745, + 0.6340786814689636}); + Tensor expected = tf.make( + {2, 3}, + {0.49625658988952637, + 0.7682217955589294, + 0.08847743272781372, + 0.13203048706054688, + 0.30742281675338745, + 0.6340786814689636}); + + bool non_blocking = false; + + Tensor out = tf.zeros(out_shape, dynamism); + + std::vector dim_order_vec; + for (int64_t i = 0; i < x.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__to_dim_order_copy_out( + /*self=*/x, non_blocking, dim_order, out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); + } + + template < + typename INPUT_CTYPE, + ScalarType INPUT_DTYPE, + typename OUTPUT_CTYPE, + ScalarType OUTPUT_DTYPE> + void test_runner_hardcode_data( + FloatingTypeToDataMap floating_point_data, + IntTypeToDataMap int_data) { + TensorFactory tf_in; + TensorFactory tf_out; + + if (typeid(OUTPUT_CTYPE) == typeid(uint8_t)) { + // Would cause underflow when testing uint8_t. 
+ return; + } + + ToTestCase test_case = { + /*sizes=*/{3, 5}, /*data_in=*/ + std::get>( + floating_point_data[typeid(INPUT_CTYPE)]), + /*data_out=*/ + std::get>(int_data[typeid(OUTPUT_CTYPE)])}; + + Tensor input = tf_in.make(test_case.sizes, test_case.data_in); + Tensor output = tf_out.zeros_like(input); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + Tensor ret = op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + output); + + Tensor expected = tf_out.make(test_case.sizes, test_case.data_out); + + // The original tensor a should share same value with the out variable and + // return variable of to function + EXPECT_TENSOR_EQ(ret, output); + EXPECT_TENSOR_EQ(ret, expected); + } +}; + +/* Here we temporary not try to implement or TEST_F the behavior about casting a + * number can not be represented in some type to this type (e.g. inf to + * int32_t nan to int64_t or 2147483648 to int32_t), because + * - a. The result of such kind of cast is undefined according to c++ + * standard; + * - b. No explicit rules can be found in core pytorch for such transaction + * (not same as static_cast or any other casting function in c++); + * - c. If user tries to cast a unrepresentable value to certain type, they + * should take the risk; + * - d. Even though we can always use if/switch to cover these boundry cases, + * the code will be lengthy and jumbled. I believe using these disordered + * code to meet some undefine behavior is meaningless, and we can not + * cover all such cases. + */ + +// Regular TEST_F for to_copy.out +// TEST_F if to_copy.out works well under all kinds of data pairs +TEST_F(OpToDimOrderCopyTest, AllDtypesSupported) { + std::vector> test_cases = { + { + /*sizes=*/{2, 4}, /*data_in=*/ + {2.11, 3.2, 2.3, 4.0, 1.1, 5.2, 1.1, 6.3}, /*data_out=*/ + {}, // data_out shouldn't be used in test_runner_static_cast + }, + { + /*sizes=*/{3, 4, 0, 5}, + /*data_in=*/{}, + /*data_out=*/{}, + }, + { + /*sizes=*/{}, + /*data_in=*/{10.0}, + /*data_out=*/{}, // data_out shouldn't be used in + // test_runner_static_cast + }, + }; + +#define TEST_KERNEL(INPUT_CTYPE, INPUT_DTYPE, OUTPUT_CTYPE, OUTPUT_DTYPE) \ + test_runner_static_cast< \ + INPUT_CTYPE, \ + ScalarType::INPUT_DTYPE, \ + OUTPUT_CTYPE, \ + ScalarType::OUTPUT_DTYPE>(test_cases); + +#define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ + ET_FORALL_REAL_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + + ET_FORALL_REAL_TYPES(TEST_ENTRY); + +#undef TEST_ENTRY +#undef TEST_KERNEL +} + +TEST_F(OpToDimOrderCopyTest, BoolTests) { + std::vector test_case_to_bool = {1.1, 2.2, 0}; + std::vector result_to_bool = {true, true, false}; +#define TEST_TO_BOOL(INPUT_CTYPE, INPUT_DTYPE) \ + test_runner_to_bool( \ + test_case_to_bool, result_to_bool); + ET_FORALL_REAL_TYPES(TEST_TO_BOOL); + + std::vector test_case_from_bool = {true, true, false}; + std::vector result_from_bool = {1.0, 1.0, 0}; +#define TEST_FROM_BOOL(OUTPUT_CTYPE, OUTPUT_DTYPE) \ + test_runner_from_bool( \ + test_case_from_bool, result_from_bool); + ET_FORALL_REAL_TYPES(TEST_FROM_BOOL); +} + +TEST_F(OpToDimOrderCopyTest, NanInfSupported) { + constexpr auto floatInfinity = std::numeric_limits::infinity(); + std::vector> test_cases = {{ + /*sizes=*/{2, 4}, + /*data_in=*/{2, 3, NAN, 4, floatInfinity, 5, -floatInfinity, 6}, + /*data_out=*/{2, 3, NAN, 4, floatInfinity, 5, -floatInfinity, 6}, + }}; + +#define TEST_KERNEL(INPUT_CTYPE, 
INPUT_DTYPE, OUTPUT_CTYPE, OUTPUT_DTYPE) \ + test_runner_static_cast< \ + INPUT_CTYPE, \ + ScalarType::INPUT_DTYPE, \ + OUTPUT_CTYPE, \ + ScalarType::OUTPUT_DTYPE>(test_cases); + +#define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ + ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + + ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + +#undef TEST_ENTRY +#undef TEST_KERNEL +} + +TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) { + // Hardcode input and output generated from core PyTorch + // clang-format off + std::vector float_data = { + -1.47900056838989257812, -4.59277725219726562500, + 2.15365791320800781250, -2.55494546890258789062, + 3.06999135017395019531, 3.27460670471191406250, + -3.98865103721618652344, -4.81065988540649414062, + 3.67902207374572753906, 3.72226405143737792969, + 0.80567771196365356445, 2.23788332939147949219, + -0.52035576105117797852, -1.58493483066558837891, + -0.30919688940048217773}; + + std::vector double_data = { + -1.47900053955270172068, -4.59277735274143061872, + 2.15365796963871947156, -2.55494554556038755422, + 3.06999137834642255029, 3.27460679459944969949, + -3.98865109243288795682, -4.81065977167646074975, + 3.67902198302105531980, 3.72226414774102742911, + 0.80567768667100203572, 2.23788335717029518435, + -0.52035578832931150828, -1.58493480710766210251, + -0.30919688936285893988}; + // clang-format on + + std::vector int64_data = { + -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; + std::vector int32_data = { + -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; + std::vector int16_data = { + -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; + std::vector int8_data = { + -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; + + // Gathering all floating point data together for better traversial + FloatingTypeToDataMap floating_point_data; + floating_point_data[typeid(float)] = float_data; + floating_point_data[typeid(double)] = double_data; + + // Gathering all int data together for better traversial + IntTypeToDataMap int_data; + int_data[typeid(int64_t)] = int64_data; + int_data[typeid(int32_t)] = int32_data; + int_data[typeid(int16_t)] = int16_data; + int_data[typeid(int8_t)] = int8_data; + +#define TEST_KERNEL(INPUT_CTYPE, INPUT_DTYPE, OUTPUT_CTYPE, OUTPUT_DTYPE) \ + test_runner_hardcode_data< \ + INPUT_CTYPE, \ + ScalarType::INPUT_DTYPE, \ + OUTPUT_CTYPE, \ + ScalarType::OUTPUT_DTYPE>(floating_point_data, int_data); + +#define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ + ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + + ET_FORALL_FLOAT_TYPES(TEST_ENTRY); +} + +TEST_F(OpToDimOrderCopyTest, MismatchedSizesDie) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen kernel can handle mismatched sizes"; + } + TensorFactory tf; + Tensor input = tf.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6}); + Tensor out = tf.zeros({3, 2, 1, 1}); + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + out)); +} + +// Only contiguous memory is supported, the memory type MemoryFormat::Contiguous +// should not be allowed. The function is expected death if using the illegal +// memory format. 
+TEST_F(OpToDimOrderCopyTest, MismatchedMemoryFormatDies) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen kernel can handle non contiguous memory formats"; + } + TensorFactory tf_in; + TensorFactory tf_out; + Tensor input = + tf_in.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6}); + Tensor out = tf_out.zeros({3, 1, 1, 2}); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + + // mutate dim_order_vec to create a illegal one. + dim_order_vec[1] = 3; + dim_order_vec[3] = 1; + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/false, + dim_order, + out)); +} + +// Only blocking data transfer supported +TEST_F(OpToDimOrderCopyTest, MismatchedBlockingDie) { + if (torch::executor::testing::SupportedFeatures::get()->is_aten) { + GTEST_SKIP() << "ATen kernel can handle non blocking data transfer"; + } + TensorFactory tf; + Tensor input = tf.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6}); + Tensor out = tf.zeros(/*sizes=*/{3, 1, 1, 2}); + + std::vector dim_order_vec; + for (int64_t i = 0; i < input.dim(); i++) { + dim_order_vec.push_back(i); + } + ArrayRef dim_order(dim_order_vec.data(), dim_order_vec.size()); + + ET_EXPECT_KERNEL_FAILURE( + context_, + op__to_dim_order_copy_out( + /*self=*/input, + /*non_blocking=*/true, + dim_order, + out)); +} + +TEST_F(OpToDimOrderCopyTest, DynamicShapeUpperBoundSameAsExpected) { + test_dynamic_shape( + {2, 3}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpToDimOrderCopyTest, DynamicShapeUpperBoundLargerThanExpected) { + test_dynamic_shape( + {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); +} + +TEST_F(OpToDimOrderCopyTest, DynamicShapeUnbound) { + if (!torch::executor::testing::SupportedFeatures::get()->output_resize) { + GTEST_SKIP() << "Dynamic shape unbound not supported"; + } + test_dynamic_shape( + {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); +} + +TEST_F(OpToDimOrderCopyTest, ContiguousToChannelsLast) { + TensorFactory tf; + + Tensor x = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.5248, 0.5361, 0.8513, 0.8184, 0.8206, 0.7357, 0.9655, 0.6138, + 0.1112, 0.2799, 0.1079, 0.9680, 0.2548, 0.0393, 0.6002, 0.2257, 0.8766, + 0.2715, 0.1595, 0.2029, 0.7026, 0.6982, 0.8529, 0.4405, 0.6560, 0.9217, + 0.6372, 0.2446, 0.6590, 0.3866, 0.7185, 0.4439, 0.5346, 0.3179, 0.4492, + 0.3491, 0.6970, 0.8456, 0.2516, 0.2345, 0.2924, 0.7695, 0.0911, 0.8530, + 0.8560, 0.6909, 0.7719, 0.8923, 0.5546, 0.6978, 0.8151, 0.3007, 0.3961, + 0.8416, 0.4296, 0.7203, 0.8963, 0.3597, 0.5552}); + + Tensor out = tf.full_channels_last({3, 5, 2, 2}, 0.0); + Tensor expected = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + std::vector dim_order_vec = {0, 2, 3, 1}; + exec_aten::ArrayRef dim_order( + dim_order_vec.data(), dim_order_vec.size()); + Tensor ret = 
op__to_dim_order_copy_out( + /*self*/ x, /*non_blocking*/ false, /*dim_order*/ dim_order, out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); +} + +TEST_F(OpToDimOrderCopyTest, ChannelsLastToContiguous) { + TensorFactory tf; + + Tensor out = tf.full({3, 5, 2, 2}, 0.0); + Tensor x = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + Tensor expected = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.5248, 0.5361, 0.8513, 0.8184, 0.8206, 0.7357, 0.9655, 0.6138, + 0.1112, 0.2799, 0.1079, 0.9680, 0.2548, 0.0393, 0.6002, 0.2257, 0.8766, + 0.2715, 0.1595, 0.2029, 0.7026, 0.6982, 0.8529, 0.4405, 0.6560, 0.9217, + 0.6372, 0.2446, 0.6590, 0.3866, 0.7185, 0.4439, 0.5346, 0.3179, 0.4492, + 0.3491, 0.6970, 0.8456, 0.2516, 0.2345, 0.2924, 0.7695, 0.0911, 0.8530, + 0.8560, 0.6909, 0.7719, 0.8923, 0.5546, 0.6978, 0.8151, 0.3007, 0.3961, + 0.8416, 0.4296, 0.7203, 0.8963, 0.3597, 0.5552}); + + std::vector dim_order_vec = {0, 1, 2, 3}; + exec_aten::ArrayRef dim_order( + dim_order_vec.data(), dim_order_vec.size()); + Tensor ret = op__to_dim_order_copy_out( + /*self*/ x, /*non_blocking*/ false, /*dim_order*/ dim_order, out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); +} + +TEST_F(OpToDimOrderCopyTest, PreserveChanneslLast) { + TensorFactory tf; + + Tensor out = tf.full_channels_last({3, 5, 2, 2}, 0.0); + Tensor x = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + Tensor expected = tf.make_with_dimorder( + {3, 5, 2, 2}, + {0.2432, 0.8184, 0.6138, 0.9680, 0.2257, 0.5248, 0.8206, 0.1112, 0.2548, + 0.8766, 0.5361, 0.7357, 0.2799, 0.0393, 0.2715, 0.8513, 0.9655, 0.1079, + 0.6002, 0.1595, 0.2029, 0.4405, 0.2446, 0.4439, 0.3491, 0.7026, 0.6560, + 0.6590, 0.5346, 0.6970, 0.6982, 0.9217, 0.3866, 0.3179, 0.8456, 0.8529, + 0.6372, 0.7185, 0.4492, 0.2516, 0.2345, 0.8530, 0.8923, 0.3007, 0.7203, + 0.2924, 0.8560, 0.5546, 0.3961, 0.8963, 0.7695, 0.6909, 0.6978, 0.8416, + 0.3597, 0.0911, 0.7719, 0.8151, 0.4296, 0.5552}, + /*dim_order=*/{0, 2, 3, 1}); + + Tensor ret = op__to_dim_order_copy_out( + /*self*/ x, + /*non_blocking*/ false, + /*dim_order*/ exec_aten::nullopt, + out); + + EXPECT_TENSOR_EQ(out, expected); + EXPECT_TENSOR_EQ(ret, expected); +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index 1daa66c58fa..1205ca3d87d 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -158,6 +158,7 @@ def define_common_targets(is_fbcode = False): codegen_function_header_wrapper("executorch/kernels/quantized", 
"quantized") codegen_function_header_wrapper("executorch/kernels/test/custom_kernel_example", "custom_kernel_example") + _common_op_test("op__to_dim_order_copy_test", ["aten", "portable"]) _common_op_test("op_abs_test", ["aten", "portable"]) _common_op_test("op_acos_test", ["aten", "portable"]) _common_op_test("op_acosh_test", ["aten", "portable"]) diff --git a/runtime/core/exec_aten/exec_aten.h b/runtime/core/exec_aten/exec_aten.h index 9eb1f12cd18..919b5420b3a 100644 --- a/runtime/core/exec_aten/exec_aten.h +++ b/runtime/core/exec_aten/exec_aten.h @@ -82,6 +82,9 @@ using quint4x2 = c10::quint4x2; using quint2x4 = c10::quint2x4; using IntArrayRef = at::IntArrayRef; +template +using OptionalArrayRef = c10::OptionalArrayRef; + #else // Use executor types using Tensor = torch::executor::Tensor; @@ -118,6 +121,10 @@ using quint2x4 = torch::executor::quint2x4; using IntArrayRef = torch::executor::IntArrayRef; +template +using OptionalArrayRef = + torch::executor::optional>; + #endif // Use executor types } // namespace exec_aten diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index 32a6da5260f..b8817b61885 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -147,7 +147,7 @@ def define_op_library(name, deps, android_deps, aten_target, _allow_third_party_ link_whole = True, ) -def define_op_target(name, deps, android_deps, is_aten_op, _allow_third_party_deps = False, _aten_mode_deps = []): +def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _allow_third_party_deps = False, _aten_mode_deps = []): """Possibly defines cxx_library targets for the named operator group. Args: @@ -155,6 +155,7 @@ def define_op_target(name, deps, android_deps, is_aten_op, _allow_third_party_de deps: List of deps for the targets. android_deps: List of fbandroid_platform_deps for the target. is_aten_op: True if the operator overload group is ATen-compatible. + is_et_op: True if the operator overload group is ET-compatible. _allow_third_party_deps: If True, the op is allowed to depend on third-party deps outside of //executorch. Should only be used by targets under //executorch/kernels/optimized. @@ -171,13 +172,14 @@ def define_op_target(name, deps, android_deps, is_aten_op, _allow_third_party_de _allow_third_party_deps = _allow_third_party_deps, ) - # When building in ATen mode, ATen-compatible (non-custom) operators will - # use the implementations provided by ATen, so we should not build the - # versions defined here. - define_op_library( - name = name, - deps = deps, - android_deps = android_deps, - aten_target = False, - _allow_third_party_deps = _allow_third_party_deps, - ) + if is_et_op: + # When building in ATen mode, ATen-compatible (non-custom) operators will + # use the implementations provided by ATen, so we should not build the + # versions defined here. 
+        define_op_library(
+            name = name,
+            deps = deps,
+            android_deps = android_deps,
+            aten_target = False,
+            _allow_third_party_deps = _allow_third_party_deps,
+        )
diff --git a/shim/xplat/executorch/kernels/test/util.bzl b/shim/xplat/executorch/kernels/test/util.bzl
index a7e1c458fa5..cefb4fae6f0 100644
--- a/shim/xplat/executorch/kernels/test/util.bzl
+++ b/shim/xplat/executorch/kernels/test/util.bzl
@@ -21,6 +21,8 @@ def op_test(name, deps = [], kernel_name = "portable", use_kernel_prefix = False
     if kernel_name == "aten":
         generated_lib_and_op_deps = [
             "//executorch/kernels/aten:generated_lib",
+            # TODO(T187390274): consolidate all aten ops into one target
+            "//executorch/kernels/aten/cpu:op__to_dim_order_copy_aten",
             "//executorch/kernels/aten:generated_lib_headers",
             "//executorch/kernels/test:supported_features_aten",
         ]
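
Appendix: standalone sketch of how a dim order determines strides (not part of the patch, and not ExecuTorch code). The summary above describes deriving the memory format from dim_order rather than from strides; the minimal C++ program below illustrates that relationship for a 4-D NCHW tensor. The helper name strides_from_dim_order and the fixed rank of 4 are chosen only for this illustration; the dim orders {0, 1, 2, 3} (contiguous) and {0, 2, 3, 1} (channels-last) match the ones exercised in the tests.

// Standalone sketch: how a dim order determines strides for a 4-D tensor.
// dim_order lists dimensions from outermost to innermost; the innermost
// dimension gets stride 1. {0, 1, 2, 3} is contiguous (NCHW) and
// {0, 2, 3, 1} is channels-last (NHWC storage of an NCHW tensor).
#include <array>
#include <cstdint>
#include <cstdio>

std::array<int64_t, 4> strides_from_dim_order(
    const std::array<int64_t, 4>& sizes,
    const std::array<uint8_t, 4>& dim_order) {
  std::array<int64_t, 4> strides{};
  int64_t running = 1;
  // Walk from the innermost dimension outward, accumulating the product of
  // the sizes visited so far.
  for (int i = 3; i >= 0; --i) {
    strides[dim_order[i]] = running;
    running *= sizes[dim_order[i]];
  }
  return strides;
}

int main() {
  const std::array<int64_t, 4> sizes = {2, 3, 4, 5}; // N, C, H, W
  const auto contiguous = strides_from_dim_order(sizes, {0, 1, 2, 3});
  const auto channels_last = strides_from_dim_order(sizes, {0, 2, 3, 1});
  // Prints 60 20 5 1: the usual contiguous NCHW strides.
  std::printf(
      "contiguous:    %lld %lld %lld %lld\n",
      (long long)contiguous[0],
      (long long)contiguous[1],
      (long long)contiguous[2],
      (long long)contiguous[3]);
  // Prints 60 1 15 3: the channels-last strides, recovered purely from the
  // dim order without inspecting any stride metadata.
  std::printf(
      "channels_last: %lld %lld %lld %lld\n",
      (long long)channels_last[0],
      (long long)channels_last[1],
      (long long)channels_last[2],
      (long long)channels_last[3]);
  return 0;
}

This mirrors the intent of the dim_order_to_stride_nocheck calls in the patch: the memory format of a tensor can be classified directly from its dim order, so the runtime no longer needs to inspect stride metadata to distinguish contiguous from channels-last tensors.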