From 2d28d7cb29d129ebe28b905c451fe3872e6fd549 Mon Sep 17 00:00:00 2001
From: "y01000.you"
Date: Wed, 18 Dec 2024 17:45:31 +0900
Subject: [PATCH 1/2] [luci/pass] Add basic quantization support for weights in
 QuantizeDequantizeWeightsWithGPTQPass

This commit implements basic quantization of weights in
`QuantizeDequantizeWeightsWithGPTQPass`, supporting both 4-bit and 8-bit
quantization. Only channel-wise quantization is supported.

ONE-DCO-1.0-Signed-off-by: y01000.you
---
 .../QuantizeDequantizeWeightsWithGPTQPass.cpp | 253 +++++++++++++++++-
 1 file changed, 251 insertions(+), 2 deletions(-)

diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
index 9a3f702f4bf..d831983110e 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
@@ -15,11 +15,17 @@
  */
 
 #include "luci/Pass/QuantizeDequantizeWeightsWithGPTQPass.h"
+#include "QuantizationUtils.h"
 #include "helpers/LayerInfoMap.h"
 
 #include <luci/IR/CircleNodes.h>
 #include <luci/IR/CircleNodeVisitor.h>
 #include <luci/Log.h>
+#include <luci/IR/CircleQuantParam.h>
+
+#include <cmath>
+#include <functional>
+#include <vector>
 
 namespace luci
 {
@@ -27,6 +33,218 @@ namespace luci
 namespace
 {
 
+using IterFunc = std::function<void(uint32_t *, loco::TensorShape &, int32_t)>;
+
+void iterate_per_channel(CircleConst *node, IterFunc func)
+{
+  loco::TensorShape dimension;
+  dimension.rank(4);
+  uint32_t indices[4] = {
+    0,
+  };
+  int32_t channel_dim_index{0};
+
+  if (!get_channel_dim_index(node, dimension, channel_dim_index))
+  {
+    assert(false);
+    return;
+  }
+
+  for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
+  {
+    for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
+    {
+      for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
+      {
+        for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
+        {
+          func(indices, dimension, channel_dim_index);
+        }
+      }
+    }
+  }
+}
+
+size_t calculate_quantized_value(CircleConst *node, uint32_t *indices, loco::TensorShape &dimension,
+                                 int index_channel_dim, std::vector<float> &scaling_factor,
+                                 std::vector<float> &max, std::vector<float> &min)
+{
+  assert(node != nullptr);
+
+  int idx_channel = indices[index_channel_dim];
+
+  assert(scaling_factor[idx_channel] > 0);
+  const float scaling_factor_inv = 1.0 / scaling_factor[idx_channel];
+  auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+  auto data_clipped = data < min[idx_channel] ? min[idx_channel] : data;
+  data_clipped = data_clipped > max[idx_channel] ? max[idx_channel] : data_clipped;
+
+  return static_cast<size_t>(std::round((data_clipped - min[idx_channel]) * scaling_factor_inv));
+}
+
+void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max)
+{
+  loco::TensorShape dimension;
+  dimension.rank(4);
+  int32_t index_channel_dim{0};
+
+  if (!get_channel_dim_index(node, dimension, index_channel_dim))
+  {
+    throw std::runtime_error("GPTQPass: Failed to get channel dim index.");
+  }
+  auto size = dimension.dim(index_channel_dim).value();
+
+  std::vector<bool> has_min_max_value(size, false);
+  min.resize(size);
+  max.resize(size);
+
+  auto cal_minmax = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    int idx_channel = indices[index_channel_dim];
+    auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+    if (has_min_max_value[idx_channel])
+    {
+      min[idx_channel] = data < min[idx_channel] ? data : min[idx_channel];
+      max[idx_channel] = data > max[idx_channel] ? data : max[idx_channel];
+    }
+    else
+    {
+      min[idx_channel] = data;
+      max[idx_channel] = data;
+      has_min_max_value[idx_channel] = true;
+    }
+  };
+
+  iterate_per_channel(node, cal_minmax);
+}
+
+/**
+ * @brief Compute the scale and zero point for the given range of values
+ */
+void compute_asym_scale_zp(float min, float max, loco::DataType data_type, float &scaling_factor,
+                           int64_t &zp, float &nudged_min, float &nudged_max)
+{
+  LOGGER(l);
+
+  assert(min <= max);
+  const int32_t kMinScale = 0;
+  const int32_t kMaxScale = data_type == loco::DataType::U4 ? 15 : 255;
+
+  const double qmin_double = kMinScale;
+  const double qmax_double = kMaxScale;
+  const double rmin = std::fmin(0, min);
+  const double rmax = std::fmax(0, max);
+  const double qrange = qmax_double - qmin_double;
+  assert(qrange > 0);
+
+  double scale = (rmax - rmin) / qrange;
+  double zero_point_double = 0;
+  uint8_t nudged_zero_point = 0;
+
+  if (scale == 0)
+  {
+    WARN(l) << "GPTQPass: The minimum and maximum values are the same." << std::endl;
+    if (min >= 0 && max >= 0)
+      zero_point_double = kMinScale;
+    else
+      zero_point_double = kMaxScale;
+  }
+  else
+    zero_point_double = qmin_double - rmin / scale;
+  if (min >= 0)
+  {
+    assert(min >= 0 && max >= 0);
+    nudged_zero_point = kMinScale;
+    scale = max / qrange;
+    if (min > 0 && max > 0)
+      WARN(l) << "GPTQPass: The minimum and maximum values are all positive." << std::endl;
+  }
+  else if (max < 0)
+  {
+    assert(min < 0 && max < 0);
+    nudged_zero_point = kMaxScale;
+    scale = -min / qrange;
+    WARN(l) << "GPTQPass: The minimum and maximum values are all negative." << std::endl;
+  }
+  else
+  {
+    assert(min < 0 && max >= 0);
+    nudged_zero_point = fp32_to_uint8_cast(std::round(zero_point_double));
+  }
+
+  // Protect scale from becoming very small due to overflow
+  if (scale < 1e-5)
+  {
+    scale = 1e-5;
+    nudged_zero_point = fp32_to_uint8_cast(std::round(qmin_double - rmin / scale));
+  }
+
+  nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale);
+  nudged_max = static_cast<float>((qmax_double - nudged_zero_point) * scale);
+
+  scaling_factor = scale;
+  zp = nudged_zero_point;
+}
+
+void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
+                                   std::vector<float> &max, std::vector<float> &scaling_factor,
+                                   std::vector<int64_t> &zp, std::vector<float> &nudged_min,
+                                   std::vector<float> &nudged_max, loco::DataType output_type)
+{
+  assert(node->dtype() == loco::DataType::FLOAT32);
+  assert(output_type == loco::DataType::U8 || output_type == loco::DataType::U4);
+
+  IterFunc quantize;
+
+  const int32_t kMinScale = 0;
+  const int32_t kMaxScale = output_type == loco::DataType::U4 ? 15 : 255;
+
+  uint32_t input_size = node->size<loco::DataType::FLOAT32>();
+  std::vector<int32_t> quantized_values(input_size);
+
+  for (size_t i = 0; i < min.size(); ++i)
+  {
+    compute_asym_scale_zp(min[i], max[i], output_type, scaling_factor[i], zp[i], nudged_min[i],
+                          nudged_max[i]);
+  }
+
+  quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    quantized_values[cal_offset(dimension, indices)] = calculate_quantized_value(
+      node, indices, dimension, index_channel_dim, scaling_factor, nudged_max, nudged_min);
+  };
+  iterate_per_channel(node, quantize);
+
+  node->dtype(loco::DataType::U8);            // Change the type of tensor
+  node->size<loco::DataType::U8>(input_size); // Resize tensor
+  for (uint32_t i = 0; i < input_size; ++i)
+  {
+    node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+  }
+}
+
+void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
+                                     std::vector<float> &nudged_min)
+{
+  assert(node->dtype() == loco::DataType::U8);
+  uint32_t size = node->size<loco::DataType::U8>();
+  std::vector<float> dequantized_values(size);
+
+  auto dequantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    int idx_channel = indices[index_channel_dim];
+    auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices));
+    dequantized_values[cal_offset(dimension, indices)] =
+      static_cast<float>(data) * scaling_factor[idx_channel] + nudged_min[idx_channel];
+  };
+
+  iterate_per_channel(node, dequantize);
+
+  node->dtype(loco::DataType::FLOAT32);      // Change the type of tensor
+  node->size<loco::DataType::FLOAT32>(size); // Resize tensor
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i];
+  }
+}
+
 /**
  * @brief QuantizeWeightsWithGPTQ quantizes and dequantizes tensors for weights uisng GPTQ algorithm
  * @details Compensate for the quantization error and update weights using Hessian matrix
@@ -50,9 +268,40 @@ class QuantizeDequantizeWeightsWithGPTQ final : public luci::CircleNodeMutableVi
   void fake_quantize(luci::CircleConst *weights)
   {
-    // To be implemented
-    (void)weights;
+    if (_granularity != luci::QuantizationGranularity::ChannelWise)
+    {
+      throw std::invalid_argument("GPTQPass: Unsupported granularity");
+    }
+
+    if (_output_type != loco::DataType::U4 && _output_type != loco::DataType::U8)
+    {
+      throw std::runtime_error("GPTQPass: GPTQ quantization supports uint4/uint8");
+    }
+
+    // Find min/max per channel
+    std::vector<float> min;
+    std::vector<float> max;
+
+    cal_minmax_per_channel(weights, min, max);
+
+    std::vector<float> nudged_min(min.size());
+    std::vector<float> nudged_max(min.size());
+    std::vector<float> scaling_factor(min.size());
+    std::vector<int64_t> zp(min.size());
+
+    asymmetric_wquant_per_channel(weights, min, max, scaling_factor, zp, nudged_min, nudged_max,
+                                  _output_type);
+    asymmetric_wdequant_per_channel(weights, scaling_factor, nudged_min);
+
+    auto quantparam = std::make_unique<CircleQuantParam>();
+    quantparam->min = nudged_min;
+    quantparam->max = nudged_max;
+    quantparam->scale = scaling_factor;
+    quantparam->zerop = zp;
+
+    weights->quantparam(std::move(quantparam));
   }
+
   void fake_quantize_with_gptq(luci::CircleConst *weights, std::vector<float> &hessian)
   {
     // To be implemented

From dc9ecfec0e6fb704b155aca2659eeed1f9fffb56 Mon Sep 17 00:00:00 2001
From: "y01000.you"
Date: Thu, 19 Dec 2024 16:53:17 +0900
Subject: [PATCH 2/2] [luci/pass] Refactor QuantizeDequantizeWeightsWithGPTQPass.cpp

This commit refactors QuantizeDequantizeWeightsWithGPTQPass.cpp to improve its
readability and maintainability.

ONE-DCO-1.0-Signed-off-by: y01000.you
---
 .../src/QuantizeDequantizeWeightsWithGPTQPass.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
index d831983110e..4fcd52c06d6 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
@@ -74,10 +74,10 @@ size_t calculate_quantized_value(CircleConst *node, uint32_t *indices, loco::Ten
   int idx_channel = indices[index_channel_dim];
 
   assert(scaling_factor[idx_channel] > 0);
+
   const float scaling_factor_inv = 1.0 / scaling_factor[idx_channel];
   auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
-  auto data_clipped = data < min[idx_channel] ? min[idx_channel] : data;
-  data_clipped = data_clipped > max[idx_channel] ? max[idx_channel] : data_clipped;
+  auto data_clipped = std::min(std::max(data, min[idx_channel]), max[idx_channel]);
 
   return static_cast<size_t>(std::round((data_clipped - min[idx_channel]) * scaling_factor_inv));
 }
@@ -103,8 +103,8 @@ void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vec
     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
     if (has_min_max_value[idx_channel])
     {
-      min[idx_channel] = data < min[idx_channel] ? data : min[idx_channel];
-      max[idx_channel] = data > max[idx_channel] ? data : max[idx_channel];
+      min[idx_channel] = std::min(data, min[idx_channel]);
+      max[idx_channel] = std::max(data, max[idx_channel]);
     }
     else
     {
@@ -126,6 +126,7 @@ void compute_asym_scale_zp(float min, float max, loco::DataType data_type, float
   LOGGER(l);
 
   assert(min <= max);
+
   const int32_t kMinScale = 0;
   const int32_t kMaxScale = data_type == loco::DataType::U4 ? 15 : 255;
 
@@ -193,8 +194,6 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
   assert(node->dtype() == loco::DataType::FLOAT32);
   assert(output_type == loco::DataType::U8 || output_type == loco::DataType::U4);
 
-  IterFunc quantize;
-
   const int32_t kMinScale = 0;
   const int32_t kMaxScale = output_type == loco::DataType::U4 ? 15 : 255;
 
@@ -207,7 +206,7 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
                           nudged_max[i]);
   }
 
-  quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+  auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
     quantized_values[cal_offset(dimension, indices)] = calculate_quantized_value(
      node, indices, dimension, index_channel_dim, scaling_factor, nudged_max, nudged_min);
   };
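
For reviewers, the standalone sketch below illustrates the per-channel asymmetric round trip
that the new fake_quantize() performs: take a channel's min/max extended to include zero,
derive a scale for the U8 (or U4) range, quantize by rounding relative to the channel minimum,
and dequantize back. It is illustrative only and not part of the patch: the sample values are
made up, it covers a single U8 channel, and it omits the zero-point nudging and degenerate-range
guards that compute_asym_scale_zp() adds.

// Minimal sketch of the per-channel asymmetric quantize/dequantize round trip.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  const std::vector<float> weights = {-0.8f, -0.1f, 0.0f, 0.3f, 0.7f}; // one channel (sample data)

  // Channel-wise min/max, extended to include 0 as the pass does.
  float rmin = 0.0f, rmax = 0.0f;
  for (float w : weights)
  {
    rmin = std::min(rmin, w);
    rmax = std::max(rmax, w);
  }

  const float qrange = 255.0f; // U8 range; U4 would use 15
  const float scale = (rmax - rmin) / qrange;

  for (float w : weights)
  {
    // Quantize: clip to [rmin, rmax], shift by rmin, round to an integer step.
    const float clipped = std::min(std::max(w, rmin), rmax);
    const int32_t q = static_cast<int32_t>(std::round((clipped - rmin) / scale));

    // Dequantize: map the integer step back into float space.
    const float deq = static_cast<float>(q) * scale + rmin;

    std::cout << w << " -> q=" << q << " -> " << deq << '\n';
  }
  return 0;
}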