From 2d28d7cb29d129ebe28b905c451fe3872e6fd549 Mon Sep 17 00:00:00 2001
From: "y01000.you"
Date: Wed, 18 Dec 2024 17:45:31 +0900
Subject: [PATCH 1/2] [luci/pass] Add basic quantization support for weights in
 QuantizeDequantizeWeightsWithGPTQPass

This commit implements basic quantization of weights in
`QuantizeDequantizeWeightsWithGPTQPass`, supporting both 4-bit and 8-bit
quantization. Only channel-wise quantization is supported.

ONE-DCO-1.0-Signed-off-by: y01000.you
---
 .../QuantizeDequantizeWeightsWithGPTQPass.cpp | 253 +++++++++++++++++-
 1 file changed, 251 insertions(+), 2 deletions(-)

diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
index 9a3f702f4bf..d831983110e 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
@@ -15,11 +15,17 @@
  */
 
 #include "luci/Pass/QuantizeDequantizeWeightsWithGPTQPass.h"
+#include "QuantizationUtils.h"
 #include "helpers/LayerInfoMap.h"
 
 #include <luci/IR/CircleNodes.h>
 #include <luci/IR/CircleNodeVisitor.h>
 #include <luci/Log.h>
+#include <luci/IR/CircleQuantParam.h>
+
+#include <cmath>
+#include <functional>
+#include <vector>
 
 namespace luci
 {
@@ -27,6 +33,218 @@ namespace luci
 namespace
 {
 
+using IterFunc = std::function<void(uint32_t *, loco::TensorShape &, int32_t)>;
+
+void iterate_per_channel(CircleConst *node, IterFunc func)
+{
+  loco::TensorShape dimension;
+  dimension.rank(4);
+  uint32_t indices[4] = {
+    0,
+  };
+  int32_t channel_dim_index{0};
+
+  if (!get_channel_dim_index(node, dimension, channel_dim_index))
+  {
+    assert(false);
+    return;
+  }
+
+  for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
+  {
+    for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
+    {
+      for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
+      {
+        for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
+        {
+          func(indices, dimension, channel_dim_index);
+        }
+      }
+    }
+  }
+}
+
+size_t calculate_quantized_value(CircleConst *node, uint32_t *indices, loco::TensorShape &dimension,
+                                 int index_channel_dim, std::vector<float> &scaling_factor,
+                                 std::vector<float> &max, std::vector<float> &min)
+{
+  assert(node != nullptr);
+
+  int idx_channel = indices[index_channel_dim];
+
+  assert(scaling_factor[idx_channel] > 0);
+  const float scaling_factor_inv = 1.0 / scaling_factor[idx_channel];
+  auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+  auto data_clipped = data < min[idx_channel] ? min[idx_channel] : data;
+  data_clipped = data_clipped > max[idx_channel] ? max[idx_channel] : data_clipped;
+
+  return static_cast<size_t>(std::round((data_clipped - min[idx_channel]) * scaling_factor_inv));
+}
+
+void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max)
+{
+  loco::TensorShape dimension;
+  dimension.rank(4);
+  int32_t index_channel_dim{0};
+
+  if (!get_channel_dim_index(node, dimension, index_channel_dim))
+  {
+    throw std::runtime_error("GPTQPass: Failed to get channel dim index.");
+  }
+  auto size = dimension.dim(index_channel_dim).value();
+
+  std::vector<bool> has_min_max_value(size, false);
+  min.resize(size);
+  max.resize(size);
+
+  auto cal_minmax = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    int idx_channel = indices[index_channel_dim];
+    auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+    if (has_min_max_value[idx_channel])
+    {
+      min[idx_channel] = data < min[idx_channel] ? data : min[idx_channel];
+      max[idx_channel] = data > max[idx_channel] ? data : max[idx_channel];
+    }
+    else
+    {
+      min[idx_channel] = data;
+      max[idx_channel] = data;
+      has_min_max_value[idx_channel] = true;
+    }
+  };
+
+  iterate_per_channel(node, cal_minmax);
+}
+
+/**
+ * @brief Compute the scale and zero point for the given range of values
+ */
+void compute_asym_scale_zp(float min, float max, loco::DataType data_type, float &scaling_factor,
+                           int64_t &zp, float &nudged_min, float &nudged_max)
+{
+  LOGGER(l);
+
+  assert(min <= max);
+  const int32_t kMinScale = 0;
+  const int32_t kMaxScale = data_type == loco::DataType::U4 ? 15 : 255;
+
+  const double qmin_double = kMinScale;
+  const double qmax_double = kMaxScale;
+  const double rmin = std::fmin(0, min);
+  const double rmax = std::fmax(0, max);
+  const double qrange = qmax_double - qmin_double;
+  assert(qrange > 0);
+
+  double scale = (rmax - rmin) / qrange;
+  double zero_point_double = 0;
+  uint8_t nudged_zero_point = 0;
+
+  if (scale == 0)
+  {
+    WARN(l) << "GPTQPass: The minimum and maximum values are the same." << std::endl;
+    if (min >= 0 && max >= 0)
+      zero_point_double = kMinScale;
+    else
+      zero_point_double = kMaxScale;
+  }
+  else
+    zero_point_double = qmin_double - rmin / scale;
+  if (min >= 0)
+  {
+    assert(min >= 0 && max >= 0);
+    nudged_zero_point = kMinScale;
+    scale = max / qrange;
+    if (min > 0 && max > 0)
+      WARN(l) << "GPTQPass: The minimum and maximum values are all positive." << std::endl;
+  }
+  else if (max < 0)
+  {
+    assert(min < 0 && max < 0);
+    nudged_zero_point = kMaxScale;
+    scale = -min / qrange;
+    WARN(l) << "GPTQPass: The minimum and maximum values are all negative." << std::endl;
+  }
+  else
+  {
+    assert(min < 0 && max >= 0);
+    nudged_zero_point = fp32_to_uint8_cast(std::round(zero_point_double));
+  }
+
+  // Protect scale from becoming very small due to overflow
+  if (scale < 1e-5)
+  {
+    scale = 1e-5;
+    nudged_zero_point = fp32_to_uint8_cast(std::round(qmin_double - rmin / scale));
+  }
+
+  nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale);
+  nudged_max = static_cast<float>((qmax_double - nudged_zero_point) * scale);
+
+  scaling_factor = scale;
+  zp = nudged_zero_point;
+}
+
+void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
+                                   std::vector<float> &max, std::vector<float> &scaling_factor,
+                                   std::vector<int64_t> &zp, std::vector<float> &nudged_min,
+                                   std::vector<float> &nudged_max, loco::DataType output_type)
+{
+  assert(node->dtype() == loco::DataType::FLOAT32);
+  assert(output_type == loco::DataType::U8 || output_type == loco::DataType::U4);
+
+  IterFunc quantize;
+
+  const int32_t kMinScale = 0;
+  const int32_t kMaxScale = output_type == loco::DataType::U4 ? 15 : 255;
+
+  uint32_t input_size = node->size<loco::DataType::FLOAT32>();
+  std::vector<int32_t> quantized_values(input_size);
+
+  for (size_t i = 0; i < min.size(); ++i)
+  {
+    compute_asym_scale_zp(min[i], max[i], output_type, scaling_factor[i], zp[i], nudged_min[i],
+                          nudged_max[i]);
+  }
+
+  quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    quantized_values[cal_offset(dimension, indices)] = calculate_quantized_value(
+      node, indices, dimension, index_channel_dim, scaling_factor, nudged_max, nudged_min);
+  };
+  iterate_per_channel(node, quantize);
+
+  node->dtype(loco::DataType::U8);            // Change the type of tensor
+  node->size<loco::DataType::U8>(input_size); // Resize tensor
+  for (uint32_t i = 0; i < input_size; ++i)
+  {
+    node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+  }
+}
+
+void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
+                                     std::vector<float> &nudged_min)
+{
+  assert(node->dtype() == loco::DataType::U8);
+  uint32_t size = node->size<loco::DataType::U8>();
+  std::vector<float> dequantized_values(size);
+
+  auto dequantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+    int idx_channel = indices[index_channel_dim];
+    auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices));
+    dequantized_values[cal_offset(dimension, indices)] =
+      static_cast<float>(data) * scaling_factor[idx_channel] + nudged_min[idx_channel];
+  };
+
+  iterate_per_channel(node, dequantize);
+
+  node->dtype(loco::DataType::FLOAT32);      // Change the type of tensor
+  node->size<loco::DataType::FLOAT32>(size); // Resize tensor
+  for (uint32_t i = 0; i < size; ++i)
+  {
+    node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i];
+  }
+}
+
 /**
  * @brief QuantizeWeightsWithGPTQ quantizes and dequantizes tensors for weights uisng GPTQ algorithm
  * @details Compensate for the quantization error and update weights using Hessian matrix
@@ -50,9 +268,40 @@ class QuantizeDequantizeWeightsWithGPTQ final : public luci::CircleNodeMutableVi
   void fake_quantize(luci::CircleConst *weights)
   {
-    // To be implemented
-    (void)weights;
+    if (_granularity != luci::QuantizationGranularity::ChannelWise)
+    {
+      throw std::invalid_argument("GPTQPass: Unsupported granularity");
+    }
+
+    if (_output_type != loco::DataType::U4 && _output_type != loco::DataType::U8)
+    {
+      throw std::runtime_error("GPTQPass: GPTQ quantization supports uint4/uint8");
+    }
+
+    // Find min/max per channel
+    std::vector<float> min;
+    std::vector<float> max;
+
+    cal_minmax_per_channel(weights, min, max);
+
+    std::vector<float> nudged_min(min.size());
+    std::vector<float> nudged_max(min.size());
+    std::vector<float> scaling_factor(min.size());
+    std::vector<int64_t> zp(min.size());
+
+    asymmetric_wquant_per_channel(weights, min, max, scaling_factor, zp, nudged_min, nudged_max,
+                                  _output_type);
+    asymmetric_wdequant_per_channel(weights, scaling_factor, nudged_min);
+
+    auto quantparam = std::make_unique<CircleQuantParam>();
+    quantparam->min = nudged_min;
+    quantparam->max = nudged_max;
+    quantparam->scale = scaling_factor;
+    quantparam->zerop = zp;
+
+    weights->quantparam(std::move(quantparam));
   }
+
   void fake_quantize_with_gptq(luci::CircleConst *weights, std::vector<float> &hessian)
   {
     // To be implemented

From dc9ecfec0e6fb704b155aca2659eeed1f9fffb56 Mon Sep 17 00:00:00 2001
From: "y01000.you"
Date: Thu, 19 Dec 2024 16:53:17 +0900
Subject: [PATCH 2/2] [luci/pass] Refactor QuantizeDequantizeWeightsWithGPTQPass.cpp

This commit refactors QuantizeDequantizeWeightsWithGPTQPass.cpp to improve its
readability and maintainability.

ONE-DCO-1.0-Signed-off-by: y01000.you
---
 .../src/QuantizeDequantizeWeightsWithGPTQPass.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
index d831983110e..4fcd52c06d6 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsWithGPTQPass.cpp
@@ -74,10 +74,10 @@ size_t calculate_quantized_value(CircleConst *node, uint32_t *indices, loco::Ten
   int idx_channel = indices[index_channel_dim];
 
   assert(scaling_factor[idx_channel] > 0);
+
   const float scaling_factor_inv = 1.0 / scaling_factor[idx_channel];
   auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
-  auto data_clipped = data < min[idx_channel] ? min[idx_channel] : data;
-  data_clipped = data_clipped > max[idx_channel] ? max[idx_channel] : data_clipped;
+  auto data_clipped = std::min(std::max(data, min[idx_channel]), max[idx_channel]);
 
   return static_cast<size_t>(std::round((data_clipped - min[idx_channel]) * scaling_factor_inv));
 }
@@ -103,8 +103,8 @@ void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vec
     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
     if (has_min_max_value[idx_channel])
     {
-      min[idx_channel] = data < min[idx_channel] ? data : min[idx_channel];
-      max[idx_channel] = data > max[idx_channel] ? data : max[idx_channel];
+      min[idx_channel] = std::min(data, min[idx_channel]);
+      max[idx_channel] = std::max(data, max[idx_channel]);
     }
     else
     {
@@ -126,6 +126,7 @@ void compute_asym_scale_zp(float min, float max, loco::DataType data_type, float
   LOGGER(l);
 
   assert(min <= max);
+
   const int32_t kMinScale = 0;
   const int32_t kMaxScale = data_type == loco::DataType::U4 ? 15 : 255;
 
@@ -193,8 +194,6 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
   assert(node->dtype() == loco::DataType::FLOAT32);
   assert(output_type == loco::DataType::U8 || output_type == loco::DataType::U4);
 
-  IterFunc quantize;
-
   const int32_t kMinScale = 0;
   const int32_t kMaxScale = output_type == loco::DataType::U4 ? 15 : 255;
 
@@ -207,7 +206,7 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
                           nudged_max[i]);
   }
 
-  quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
+  auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int index_channel_dim) {
     quantized_values[cal_offset(dimension, indices)] = calculate_quantized_value(
      node, indices, dimension, index_channel_dim, scaling_factor, nudged_max, nudged_min);
   };
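
For reviewers, the standalone sketch below illustrates the per-channel asymmetric round trip
that the new fake_quantize() performs: take a channel's min/max extended to include zero,
derive a scale for the U8 (or U4) range, quantize by rounding relative to the channel minimum,
and dequantize back. It is illustrative only and not part of the patch: the sample values are
made up, it covers a single U8 channel, and it omits the zero-point nudging and degenerate-range
guards that compute_asym_scale_zp() adds.

// Minimal sketch of the per-channel asymmetric quantize/dequantize round trip.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  const std::vector<float> weights = {-0.8f, -0.1f, 0.0f, 0.3f, 0.7f}; // one channel (sample data)

  // Channel-wise min/max, extended to include 0 as the pass does.
  float rmin = 0.0f, rmax = 0.0f;
  for (float w : weights)
  {
    rmin = std::min(rmin, w);
    rmax = std::max(rmax, w);
  }

  const float qrange = 255.0f; // U8 range; U4 would use 15
  const float scale = (rmax - rmin) / qrange;

  for (float w : weights)
  {
    // Quantize: clip to [rmin, rmax], shift by rmin, round to an integer step.
    const float clipped = std::min(std::max(w, rmin), rmax);
    const int32_t q = static_cast<int32_t>(std::round((clipped - rmin) / scale));

    // Dequantize: map the integer step back into float space.
    const float deq = static_cast<float>(q) * scale + rmin;

    std::cout << w << " -> q=" << q << " -> " << deq << '\n';
  }
  return 0;
}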