Merge branch 'develop' into bf16_verify_tests
richagadgil authored Dec 16, 2024
2 parents ffa2e46 + f9e276b commit c978082
Showing 31 changed files with 397 additions and 62 deletions.
77 changes: 77 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,83 @@
Full documentation for MIGraphX is available at
[https://rocmdocs.amd.com/projects/AMDMIGraphX/en/latest/](https://rocmdocs.amd.com/projects/AMDMIGraphX/en/latest/).

## MIGraphX 2.11 for ROCm 6.3.0

### Added

* Initial code to run on Windows
* Support for gfx120x GPU
* Support for FP8 and INT4
* Support for the Log2 internal operator
* Support for the GCC 14 compiler
* The BitwiseAnd, Scan, SoftmaxCrossEntropyLoss, GridSample, and NegativeLogLikelihoodLoss ONNX operators
* The MatMulNBits, QuantizeLinear/DequantizeLinear, GroupQueryAttention, SkipSimplifiedLayerNormalization, and SimplifiedLayerNormalization Microsoft Contrib operators
* Dynamic batch parameter support for the OneHot operator
* Split-K as an optional performance improvement
* Scripts to validate ONNX models from the ONNX Model Zoo
* GPU Pooling Kernel
* --mlir flag to the migraphx-driver program to offload the entire module to MLIR
* Fusing split-reduce with MLIR
* Multiple outputs for the MLIR + Pointwise fusions
* Pointwise fusions with MLIR across reshape operations
* MIGRAPHX_MLIR_DUMP environment variable to dump MLIR modules to MXRs
* The 3 option for MIGRAPHX_TRACE_BENCHMARKING, which prints the MLIR program for improved debug output
* MIGRAPHX_ENABLE_HIPBLASLT_GEMM environment variable to call hipBLASLt libraries
* MIGRAPHX_VERIFY_DUMP_DIFF to improve the debugging of accuracy issues
* reduce_any and reduce_all options to the Reduce operation via Torch MIGraphX
* Examples for RNNT and ControlNet


### Changed

* Switched to MLIR's 3D Convolution operator.
* MLIR is now used for Attention operations by default on gfx942 and newer ASICs.
* Names and locations of VRM-specific libraries have changed.
* Use random mode for benchmarking GEMMs and convolutions.
* Python version is now printed with an actual version number.


### Removed

* Disabled requirements for MIOpen and rocBLAS when running on Windows.
* Removed inaccurate warning messages when using exhaustive-tune.
* Removed the hard-coded path in MIGRAPHX_CXX_COMPILER, allowing the compiler to be installed in different locations.


### Optimized

* Improved:
* Infrastructure code to enable better Kernel fusions with all supported data types
* Subsequent model compile time by creating a cache for already performant kernels
* Use of Attention fusion with models
* Performance of the Softmax JIT kernel and of the Pooling operator
* Tuning operations through a new 50ms delay before running the next kernel
* Performance of several convolution based models through an optimized NHWC layout
* Performance for the FP8 datatype
* GPU utilization
* Verification tools
* Debug prints
* Documentation, including gpu-driver utility documentation
* Summary section of the migraphx-driver perf command
* Reduced model compilation time
* Reordered some compiler passes to allow for more fusions
* Preloaded tiles into LDS to improve performance of pointwise transposes
* Exposed the external_data_path property in onnx_options to set the path from onnxruntime


### Resolved Issues

* Fixed a bug with gfx1030 that overwrote dpp_reduce.
* Fixed a bug in single-argument dynamic reshape that created a failure.
* Fixed a bug with dot_broadcast and inner_broadcast that caused compile failures.
* Fixed a bug where some configs were failing when using exhaustive-tune.
* Fixed the ROCm Install Guide URL.
* Fixed an issue while building a whl package due to an apostrophe.
* Fixed the BERT Squad example requirements file to support different versions of Python.
* Fixed a bug that stopped the Vicuna model from compiling.
* Fixed failures with the verify option of migraphx-driver that would cause the application to exit early.


## MIGraphX 2.10 for ROCm 6.2.0

### Additions
2 changes: 1 addition & 1 deletion Dockerfile
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y software-properties-common gnupg2 --no-
curl -sL http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -

# Add rocm repository
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/6.2/ jammy main > /etc/apt/sources.list.d/rocm.list'
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/6.3/ jammy main > /etc/apt/sources.list.d/rocm.list'

# From docs.amd.com for installing rocm. Needed to install properly
RUN sh -c "echo 'Package: *\nPin: release o=repo.radeon.com\nPin-priority: 600' > /etc/apt/preferences.d/rocm-pin-600"
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -42,7 +42,7 @@ def rocmtestnode(Map conf) {
rm -rf build
mkdir build
cd build
cmake -DCTEST_TIMEOUT=3600 -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DBUILD_DEV=On -DCMAKE_EXECUTE_PROCESS_COMMAND_ECHO=STDOUT -DMIGRAPHX_DISABLE_VIRTUAL_ENV=ON ${flags} ..
cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DBUILD_DEV=On -DCMAKE_EXECUTE_PROCESS_COMMAND_ECHO=STDOUT -DMIGRAPHX_DISABLE_VIRTUAL_ENV=ON ${flags} ..
git diff
git diff-index --quiet HEAD || (echo "Git repo is not clean after running cmake." && exit 1)
make -j\$(nproc) generate VERBOSE=1
6 changes: 3 additions & 3 deletions docs/dev/env_vars.rst
@@ -116,9 +116,9 @@ Disables the ``schedule`` pass.
Set to "1", "enable", "enabled", "yes", or "true" to use.
Disables the ``fuse_reduce`` pass.

.. envvar:: MIGRAPHX_ENABLE_SPLIT_REDUCE
Set to "1", "enable", "enabled", "yes", or "true" to use.
Enable split_reduce.
.. envvar:: MIGRAPHX_SPLIT_REDUCE_SIZE
Set to the minimum size of a reduction to do a split reduce. Overrides what
is set in the backend. Set to -1 to disable split reduce completely.

.. envvar:: MIGRAPHX_ENABLE_NHWC

2 changes: 1 addition & 1 deletion hip-clang.docker
@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
RUN dpkg --add-architecture i386

# Add rocm repository
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/6.2/ focal main > /etc/apt/sources.list.d/rocm.list'
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/6.3/ jammy main > /etc/apt/sources.list.d/rocm.list'

# From docs.amd.com for installing rocm. Needed to install properly
RUN sh -c "echo 'Package: *\nPin: release o=repo.radeon.com\nPin-priority: 600' > /etc/apt/preferences.d/rocm-pin-600"
2 changes: 1 addition & 1 deletion requirements.txt
@@ -28,4 +28,4 @@ pybind/pybind11@3e9dfa2866941655c56877882565e7577de6fc7b --build
msgpack/[email protected] -DMSGPACK_BUILD_TESTS=Off
sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCm/composable_kernel@b7775add2d28251674d81e220cd4a857b90b997a -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCm/rocMLIR@e61b0f0e516f09144445b3c8eb372f39eb82d53b -DBUILD_FAT_LIBROCKCOMPILER=On
ROCm/rocMLIR@13065c4b3a216e1b13dfb8f746b8a0d421f124e8 -DBUILD_FAT_LIBROCKCOMPILER=On
10 changes: 10 additions & 0 deletions src/driver/main.cpp
@@ -482,6 +482,7 @@ struct compiler
compiler_target ct;
compile_options co;
bool to_fp16 = false;
bool to_bf16 = false;
bool to_fp8 = false;
bool to_int8 = false;
bool to_int4 = false;
@@ -506,6 +507,7 @@
ap.help("Exhastively search for best tuning parameters for kernels"),
ap.set_value(true));
ap(to_fp16, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(true));
ap(to_bf16, {"--bf16"}, ap.help("Quantize for bf16"), ap.set_value(true));
ap(to_int8, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(true));
ap(to_fp8, {"--fp8"}, ap.help("Quantize for fp8"), ap.set_value(true));
ap(to_int4, {"--int4-weights"}, ap.help("Quantize weights for int4"), ap.set_value(true));
@@ -555,6 +557,10 @@ struct compiler
{
quantize_fp16(p);
}
if(to_bf16)
{
quantize_bf16(p);
}
if(to_int8)
{
quantize_int8(p, t, {host_params(p)});
@@ -639,6 +645,10 @@ struct verify : command<verify>
{
vo.quantize = precision::fp16;
}
if(c.to_bf16)
{
vo.quantize = precision::bf16;
}
if(c.to_int8)
{
vo.quantize = precision::int8;
1 change: 1 addition & 0 deletions src/driver/precision.hpp
@@ -32,6 +32,7 @@ enum class precision
{
fp32,
fp16,
bf16,
int8
};

13 changes: 10 additions & 3 deletions src/driver/verify.cpp
@@ -50,11 +50,14 @@ verify::tolerance get_tolerances(const program& p,
std::optional<double> atol,
std::optional<double> rtol)
{
bool has_fp16 = any_of(p.get_modules(), [](auto&& m) {
return any_of(*m, [](auto&& ins) { return (ins.get_shape().type() == shape::half_type); });
bool has_16bit = any_of(p.get_modules(), [](auto&& m) {
return any_of(*m, [](auto&& ins) {
return (ins.get_shape().type() == shape::half_type or
ins.get_shape().type() == shape::bf16_type);
});
});
migraphx::verify::tolerance result{};
if(has_fp16 or vo.quantize == precision::fp16)
if(has_16bit or vo.quantize == precision::fp16 or vo.quantize == precision::bf16)
{
result.rms_tol = 8e-2;
result.atol = 4e-2;
@@ -100,6 +103,10 @@ std::vector<argument> run_target(program p,
{
quantize_fp16(p);
}
if(vo.quantize == precision::bf16)
{
quantize_bf16(p);
}
p.compile(t, options);

parameter_map m;
13 changes: 13 additions & 0 deletions src/fuse_pointwise_reduce.cpp
@@ -26,16 +26,29 @@
#include <migraphx/pass_manager.hpp>
#include <migraphx/fuse_pointwise.hpp>
#include <migraphx/fuse_reduce.hpp>
#include <migraphx/split_reduce.hpp>
#include <migraphx/env.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_SPLIT_REDUCE_SIZE);

static std::size_t get_split_size(std::size_t default_split)
{
std::string value = string_value_of(MIGRAPHX_SPLIT_REDUCE_SIZE{});
if(value.empty())
return default_split;
return std::stoul(value);
}

void fuse_pointwise_reduce::apply(module_pass_manager& mpm) const
{
mpm.run_pass(fuse_pointwise{.enable_rewrite_reshapes = false});
mpm.run_pass(fuse_reduce{.enable_rewrite_reshapes = false});
mpm.run_pass(fuse_pointwise{.enable_rewrite_reshapes = true});
mpm.run_pass(fuse_reduce{.enable_rewrite_reshapes = true});
mpm.run_pass(split_reduce{.split_size = get_split_size(split_size)});
mpm.run_pass(fuse_pointwise{.enable_rewrite_broadcasts = true});
}

} // namespace MIGRAPHX_INLINE_NS
1 change: 1 addition & 0 deletions src/include/migraphx/fuse_pointwise_reduce.hpp
@@ -35,6 +35,7 @@ struct module_pass_manager;

struct MIGRAPHX_EXPORT fuse_pointwise_reduce
{
std::size_t split_size = 32768;
std::string name() const { return "fuse_pointwise_reduce"; }
void apply(module_pass_manager& mpm) const;
};
3 changes: 3 additions & 0 deletions src/include/migraphx/quantization.hpp
@@ -51,6 +51,9 @@ quantize_fp8(program& prog, const target& t, const std::vector<parameter_map>& c

MIGRAPHX_EXPORT void quantize_int4_weights(program& prog);

MIGRAPHX_EXPORT void quantize_bf16(program& prog,
const std::vector<std::string>& ins_names = {"all"});

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

43 changes: 41 additions & 2 deletions src/program.cpp
@@ -44,6 +44,7 @@
#include <iostream>
#include <queue>
#include <sstream>
#include <fstream>
#include <algorithm>
#include <set>
#include <unordered_map>
@@ -845,6 +846,31 @@ double common_average(const std::vector<double>& v)
return total / std::distance(v.begin() + n, v.end() - n);
}

double mean(const std::vector<double>& v)
{
double total = std::accumulate(v.begin(), v.end(), 0.0);
return total / v.size();
}

double median(const std::vector<double>& v)
{
size_t mid = v.size() / 2;
if(v.size() % 2 == 0)
{
return (v[mid - 1] + v[mid]) / 2.0;
}
else
{
return v[mid];
}
}

double percentile(const std::vector<double>& v, double percentile)
{
size_t index = (percentile * (v.size() - 1));
return v[index];
}

std::string perf_group(instruction_ref ins, bool detailed)
{
std::string result;
@@ -925,8 +951,14 @@ void program::perf_report(
{
overhead_vec.push_back(time<milliseconds>([&] { dry_run(params); }));
}

double total_time = common_average(total_vec);
double min_time = total_vec.front();
double max_time = total_vec.back();
double mean_time = mean(total_vec);
double median_time = median(total_vec);
double percentile_90_time = percentile(total_vec, 0.90);
double percentile_95_time = percentile(total_vec, 0.95);
double percentile_99_time = percentile(total_vec, 0.99);
double rate = 1000.0 / total_time;
double overhead_time = common_average(overhead_vec);
double overhead_percent = overhead_time * 100.0 / total_time;
@@ -978,7 +1010,14 @@

os << "Batch size: " << batch << std::endl;
os << "Rate: " << rate * batch << " inferences/sec" << std::endl;
os << "Total time: " << total_time << "ms" << std::endl;
os << "Total time: " << total_time << "ms ";
os << "(Min: " << min_time << "ms, ";
os << "Max: " << max_time << "ms, ";
os << "Mean: " << mean_time << "ms, ";
os << "Median: " << median_time << "ms)" << std::endl;
os << "Percentiles (90%, 95%, 99%): (";
os << percentile_90_time << "ms, " << percentile_95_time << "ms, " << percentile_99_time
<< "ms)" << std::endl;
os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
os << "Overhead time: " << overhead_time << "ms"
<< ", " << calculate_overhead_time << "ms" << std::endl;
17 changes: 17 additions & 0 deletions src/py/migraphx_py.cpp
@@ -263,6 +263,13 @@ migraphx::shape to_shape(const py::buffer_info& info)
{
migraphx::shape::type_t t;
std::size_t n = 0;
// Unsupported pybuffer types lead to undefined behaviour when comparing with migraphx type enum
if(info.format == "z")
{
MIGRAPHX_THROW(
"MIGRAPHX PYTHON: Unsupported data type. For fp8 and bf16 literals try using "
"migraphx.generate_argument with migraphx.add_literal");
}
visit_types([&](auto as) {
if(info.format == py::format_descriptor<decltype(as())>::format() or
(info.format == "l" and py::format_descriptor<decltype(as())>::format() == "q") or
@@ -388,6 +395,12 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
py::arg("op"),
py::arg("args"),
py::arg("mod_args") = std::vector<migraphx::module*>{})
.def(
"add_literal",
[](migraphx::module& mm, migraphx::argument a) {
return mm.add_literal(a.get_shape(), a.data());
},
py::arg("data"))
.def(
"add_literal",
[](migraphx::module& mm, py::buffer data) {
@@ -651,6 +664,10 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
},
"Auto-convert FP8 parameters and return values to Float for MIGraphX Program",
py::arg("prog"));
m.def("quantize_bf16",
&migraphx::quantize_bf16,
py::arg("prog"),
py::arg("ins_names") = std::vector<std::string>{"all"});

#ifdef HAVE_GPU
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
10 changes: 10 additions & 0 deletions src/quantization.cpp
@@ -74,6 +74,16 @@ void quantize_fp16(program& prog, const std::vector<std::string>& ins_names)
quant_tracer());
}

void quantize_bf16(program& prog, const std::vector<std::string>& ins_names)
{
run_passes(prog,
{normalize_ops{},
optimize_module{{"quantizelinear", "dequantizelinear"}},
truncate_float_pass{ins_names, shape::bf16_type},
optimize_module{{"quantizelinear", "dequantizelinear"}}},
quant_tracer());
}

void quantize_8bits(program& prog,
const target& t,
shape::type_t precision,
2 changes: 0 additions & 2 deletions src/split_reduce.cpp
@@ -237,8 +237,6 @@ void split_reduce::apply(module_pass_manager& mpm) const
assert(replaced.size() == 1);
mpm.get_module().replace_instruction(ins, replaced.front());
}

mpm.run_pass(fuse_pointwise{.enable_rewrite_broadcasts = true});
}

} // namespace MIGRAPHX_INLINE_NS