diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 659a3e143..4b7760973 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -320,42 +320,10 @@ struct PvcBenchmarkRunner : BenchmarkRunner { using ProblemShapeType = typename Base::ProblemShapeType; - cutlass::DeviceAllocation block_B_vnni; - - template - void vnni_matrix( - T* dst, const T* src, - int batch, int numRows, int numCols, int factor) - { - for (int b = 0; b < batch; b++) { - for (int r = 0; r < numRows / factor; r++) { - for (int c = 0; c < numCols; c++) { - for (int k = 0; k < factor; k++) { - dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = - src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; - } - } - } - } - } - void initialize(const ProblemShapeType& problem_size) override { Base::initialize(problem_size); - - auto problem_shape_MNKL = cute::append<4>(problem_size, 1); - auto [M, N, K, L] = problem_shape_MNKL; - - block_B_vnni.reset(Base::block_B.size()); - - std::vector b(K * N * L); - std::vector b_vnni(b.size()); - - Base::block_B.copy_to_host(b.data()); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); - - block_B_vnni.copy_from_host(b_vnni.data()); } - + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; @@ -364,7 +332,7 @@ struct PvcBenchmarkRunner : BenchmarkRunner { typename Gemm::GemmKernel::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, problem_size, - {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, + {Base::block_A.get(), Base::stride_A, Base::block_B.get(), Base::stride_B}, { {options.alpha, options.beta}, Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp index ea90d89b7..3203e7f36 100644 --- a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp +++ b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp @@ -85,15 +85,15 @@ int main(int argc, const char** argv) using LayoutD = cutlass::layout::RowMajor; // Workgroup-level tile - using TileShape = Shape<_32, _256, _32>; + using TileShape = Shape<_256, _256, _32>; using TiledMma = TiledMMA< - MMA_Atom, + MMA_Atom, Layout>, Tile<_32,_64,_32>>; // Subgroup level-tile using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; - using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V; using GEMMDispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated; using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue; diff --git a/examples/sycl/pvc/pvc_gemm.cpp b/examples/sycl/pvc/pvc_gemm.cpp index 3bf9cb53b..9cec462ca 100644 --- a/examples/sycl/pvc/pvc_gemm.cpp +++ b/examples/sycl/pvc/pvc_gemm.cpp @@ -89,7 +89,7 @@ struct Options { Options(): help(false), error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), + m(4096), n(4096), k(4096), l(1), iterations(20), alpha(1.f), beta(0.f) { } @@ -108,7 +108,7 @@ struct Options { cmd.get_cmd_line_argument("l", l, 1); cmd.get_cmd_line_argument("alpha", alpha, 1.f); cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); + cmd.get_cmd_line_argument("iterations", iterations, 20); } /// Prints the usage statement. diff --git a/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp b/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp index 207537958..1352ef621 100644 --- a/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp +++ b/examples/sycl/pvc/pvc_gemm_with_epilogue_relu.cpp @@ -90,7 +90,7 @@ struct Options { Options(): help(false), error(false), - m(4096), n(4096), k(4096), l(1), iterations(100), + m(4096), n(4096), k(4096), l(1), iterations(10), alpha(1.f), beta(0.f) { } @@ -109,7 +109,7 @@ struct Options { cmd.get_cmd_line_argument("l", l, 1); cmd.get_cmd_line_argument("alpha", alpha, 1.f); cmd.get_cmd_line_argument("beta", beta, 0.f); - cmd.get_cmd_line_argument("iterations", iterations, 100); + cmd.get_cmd_line_argument("iterations", iterations, 10); } /// Prints the usage statement. @@ -171,7 +171,6 @@ struct ExampleRunner { cutlass::DeviceAllocation block_A; cutlass::DeviceAllocation block_B; - cutlass::DeviceAllocation block_B_vnni; cutlass::DeviceAllocation block_C; cutlass::DeviceAllocation block_D; cutlass::DeviceAllocation block_ref_D; @@ -238,7 +237,6 @@ struct ExampleRunner { block_A.reset(M * K * L); block_B.reset(K * N * L); - block_B_vnni.reset(K * N * L); block_C.reset(M * N * L); block_D.reset(M * N * L); block_ref_D.reset(M * N * L); @@ -247,18 +245,15 @@ struct ExampleRunner { // available through SYCL. std::vector a(K * M * L); std::vector b(K * N * L); - std::vector b_vnni(b.size()); std::vector c(M * N * L); std::vector d(M * N * L, ElementC{0}); fill_matrix(a); fill_matrix(b); fill_matrix(c); - vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); syclcompat::memcpy(block_A.get(), a.data(), a.size() * sizeof(ElementA)); syclcompat::memcpy(block_B.get(), b.data(), b.size() * sizeof(ElementB)); - syclcompat::memcpy(block_B_vnni.get(), b_vnni.data(), b.size() * sizeof(ElementB)); syclcompat::memcpy(block_C.get(), c.data(), c.size() * sizeof(ElementC)); syclcompat::memcpy(block_D.get(), d.data(), d.size() * sizeof(ElementC)); } @@ -271,7 +266,7 @@ struct ExampleRunner { typename Gemm::GemmKernel::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, problem_size, - {block_A.get(), stride_A, block_B_vnni.get(), stride_B}, + {block_A.get(), stride_A, block_B.get(), stride_B}, {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, hw_info }; @@ -302,7 +297,7 @@ struct ExampleRunner { } syclcompat::wait(); - float cute_time = timer.seconds() / options.iterations; + float cute_time = timer.seconds(); double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); @@ -361,12 +356,12 @@ int main(int argc, const char** argv) using LayoutD = cutlass::layout::RowMajor; using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; - using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16x2x2_V; // Workgroup-level tile - using TileShape = Shape<_32, _256, _32>; + using TileShape = Shape<_256, _256, _32>; - using TiledMma = TiledMMA, + using TiledMma = TiledMMA, Layout>, Tile<_32,_64,_32>>; // Subgroup level-tile