Revert "Optimize transpose copy on CPU using fbgemm transpose (pytorc…
Browse files Browse the repository at this point in the history
…h#83327)"

This reverts commit f56720e.

Reverted pytorch#83327 on behalf of https://github.com/janeyx99 due to Sorry, reverting as this breaks mac functorch tests on trunk https://hud.pytorch.org/pytorch/pytorch/commit/f56720ea7c7ad0bcb4c5af669e28bf7de8122cb6
pytorchmergebot committed Aug 22, 2022
1 parent d1be36c commit 53cda90
Showing 2 changed files with 0 additions and 67 deletions.
54 changes: 0 additions & 54 deletions aten/src/ATen/native/Copy.cpp
@@ -16,7 +16,6 @@
#include <ATen/Parallel.h>
#include <c10/util/irange.h>
#include <torch/library.h>
#include <ATen/native/cpu/utils.h>

#ifdef USE_FBGEMM
#include <fbgemm/Fbgemm.h>
@@ -27,53 +26,6 @@ namespace {

using namespace at;

bool fbgemm_copy_transpose_valid(const Tensor& self, const Tensor& src) {
  const int MIN_SZ = 16 * 32;
  if ((self.device().is_cpu() && src.device().is_cpu()) &&
      (self.layout() == c10::kStrided) && (src.layout() == c10::kStrided) &&
      !self.is_sparse() && !src.is_sparse() && self.is_contiguous() &&
      (self.is_conj() == src.is_conj()) && (self.is_neg() == src.is_neg()) &&
      !self.is_complex() && !src.is_complex() &&
      self.sizes().equals(src.sizes()) && self.dim() >= 2 &&
      src.size(src.dim() - 1) * src.size(src.dim() - 2) >= MIN_SZ &&
      src.stride(src.dim() - 2) == 1 && src.stride(src.dim() - 1) == src.size(src.dim() - 2) &&
      !(src.size(src.dim() - 2) == 1 && src.size(src.dim() - 1) == 1)) {
    // Check src is in contiguous block
    for (long i = 0; i < src.dim() - 2; i++) {
      if (!(src.stride(i) == ((i + 1) == (src.dim() - 2)) ?
          src.stride(src.dim() - 1) * src.size(src.dim() - 1) : src.stride(i + 1) * src.size(i + 1))){
        return false;
      }
    }
  } else {
    return false;
  }
  return true;
}
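For context, a minimal sketch (not from this diff; shapes and names are illustrative) of a source layout that would have satisfied the predicate above: a contiguous 2-D float tensor viewed through t(), copied into a contiguous destination of the same shape.

#include <ATen/ATen.h>

// Sketch only: a layout the removed check would have accepted.
void transposed_copy_example() {
  at::Tensor base = at::rand({64, 64});   // contiguous, strides {64, 1}
  at::Tensor src  = base.t();             // same storage, strides {1, 64}
  at::Tensor dst  = at::empty({64, 64});  // contiguous destination
  // src.stride(dim-2) == 1, src.stride(dim-1) == src.size(dim-2) == 64,
  // and 64 * 64 >= MIN_SZ (16 * 32), so dst.copy_(src) would have taken
  // the fbgemm transpose path before this revert.
  dst.copy_(src);
}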

void fbgemm_copy_transpose_same_type(Tensor& self, const Tensor& src) {
  auto block_size = src.size(src.dim() - 1) * src.size(src.dim() - 2);
  auto ntrans = src.numel() / block_size;
  AT_DISPATCH_ALL_TYPES_AND(kBFloat16, src.scalar_type(),
    "fbgemm_transpose_copy_same_type", [&] {
      at::parallel_for(
          0,
          ntrans,
          at::internal::GRAIN_SIZE / block_size,
          [&](int64_t begin, int64_t end) {
            for (int64_t i = begin; i < end; i++) {
              native::utils::transpose(
                  src.size(src.dim() - 1),
                  src.size(src.dim() - 2),
                  src.data_ptr<scalar_t>() + i * block_size,
                  src.stride(src.dim() - 1),
                  self.data_ptr<scalar_t>() + i * block_size,
                  self.stride(self.dim() - 2));
            }
          });
  });
}
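For a src of logical shape {B, M, N} with the last two dims swapped in memory, block_size is M * N and ntrans is B, so each parallel iteration i transposes the i-th contiguous block. A scalar reference sketch (my own, under the same M/N/leading-dimension convention as native::utils::transpose; fbgemm's transpose_simd does this work with vectorized kernels):

#include <cstdint>

// Sketch only: naive equivalent of one blocked transpose iteration.
// Reads an M x N source with leading dimension ld_src and writes the
// N x M transpose with leading dimension ld_dst.
void transpose_block_reference(int64_t M, int64_t N,
                               const float* src, int64_t ld_src,
                               float* dst, int64_t ld_dst) {
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      dst[n * ld_dst + m] = src[m * ld_src + n];
    }
  }
}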

bool copy_transpose_valid(const Tensor& self, const Tensor& src) {
const int MIN_SZ = 60 * 60;
return self.is_contiguous() && src.numel() != 0 && src.dim() == 2 &&
@@ -206,12 +158,6 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
    }
    return self;
  }

  if (fbgemm_copy_transpose_valid(self, src) && src.dtype() == self.dtype() &&
      (src.dtype() == at::kFloat || src.dtype() == at::kBFloat16)) {
    fbgemm_copy_transpose_same_type(self, src);
    return self;
  }
#endif

  if (self.is_same(src)) {
13 changes: 0 additions & 13 deletions aten/src/ATen/native/cpu/utils.h
@@ -93,19 +93,6 @@ inline void transpose<float>(int64_t M, int64_t N, const float* src, int64_t ld_
  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  fbgemm::transpose_simd<float>(M, N, src, ld_src, dst, ld_dst);
}

template <>
inline void transpose<BFloat16>(int64_t M, int64_t N, const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst) {
  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  fbgemm::transpose_simd<uint16_t>(M, N, reinterpret_cast<const uint16_t*>(src), ld_src, reinterpret_cast<uint16_t*>(dst), ld_dst);
}

template <>
inline void transpose<uint8_t>(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) {
  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
  fbgemm::transpose_simd<uint8_t>(M, N, src, ld_src, dst, ld_dst);
}

#endif

} // namespace utils
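The removed BFloat16 specialization works because a transpose only moves bytes and never interprets them, so any 16-bit element type can ride on the uint16_t SIMD kernel. A generic sketch of that reinterpret trick (names are mine, not from the tree; the kernel parameter stands in for fbgemm::transpose_simd<uint16_t>):

#include <cstdint>
#include <type_traits>

// Sketch only: route any 16-bit trivially-copyable element type through
// a uint16_t transpose kernel, as the BFloat16 specialization did.
template <typename T, typename Kernel>
void transpose_as_u16(int64_t M, int64_t N, const T* src, int64_t ld_src,
                      T* dst, int64_t ld_dst, Kernel kernel) {
  static_assert(sizeof(T) == sizeof(uint16_t), "16-bit elements only");
  static_assert(std::is_trivially_copyable<T>::value, "bitwise moves only");
  kernel(M, N, reinterpret_cast<const uint16_t*>(src), ld_src,
         reinterpret_cast<uint16_t*>(dst), ld_dst);
}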
