Skip to content

Commit

Permalink
naive impl
Browse files Browse the repository at this point in the history
  • Loading branch information
tarekziade committed Dec 22, 2024
1 parent 28b488f commit 63d7cf6
Showing 1 changed file with 156 additions and 1 deletion.
157 changes: 156 additions & 1 deletion onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,28 @@
#include "core/providers/cpu/quantization/matmul_integer_base.h"
#include "core/util/math_cpuonly.h"
#include "core/util/qmath.h"
#include <cassert>

#include <algorithm>

namespace onnxruntime {
namespace contrib {

namespace {


using Index = uint32_t;
extern "C" void
__attribute__((import_module("wasm_gemm"), import_name("int8_multiply")))

Check warning on line 24 in onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc

View workflow job for this annotation

GitHub Actions / Vcpkg

unknown attribute 'import_module' ignored [-Wunknown-attributes]

Check warning on line 24 in onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc

View workflow job for this annotation

GitHub Actions / Vcpkg

unknown attribute 'import_name' ignored [-Wunknown-attributes]
int8Multiply(const uint8_t* input_A,
float zero_point_A,
const int8_t* input_B,
//const uint8_t* zero_point_B,
Index rows_A,
Index width,
Index cols_B,
float* output);

void ScaleOutput(const Tensor& scale, Tensor& output) {
ProcessBroadcastSpanFuncs funcs{
[](BroadcastHelper& per_iter_bh) {
Expand Down Expand Up @@ -51,12 +66,65 @@ class MatMulIntegerToFloatBase : public MatMulIntegerBase {
float a_scale,
uint8_t a_zp,
bool a_is_signed,
const Tensor* b_tensor,
const Tensor* b_tensor,
const Tensor* b_scale,
const Tensor* b_zp,
const Tensor* bias_tensor) const;
};

// Naive reference implementation of a dynamically quantized matrix multiply:
//
//   output[m][n] = sum_k (A[m][k] - zeroPointA) * (B[k][n] - zpB(n)) * scale(n)
//
// Layouts (both row-major):
//   inputMatrixA : rowsA x width   (M x K), uint8
//   inputMatrixB : width x colsB   (K x N), int8
//   output       : rowsA x colsB   (M x N), float
//
// Quantization parameters for B:
//   is_b_scale_per_column == true  -> zeroPointB / b_scale_data hold one entry
//                                     per output column (length colsB).
//   is_b_scale_per_column == false -> only element [0] of each is read
//                                     (per-tensor quantization).
void MatMulFull(const uint8_t* inputMatrixA,
                const int8_t* inputMatrixB,
                float* output,
                size_t rowsA,
                size_t width,
                size_t colsB,
                uint8_t zeroPointA,
                const uint8_t* zeroPointB,
                const float* b_scale_data,
                bool is_b_scale_per_column) {
  const int32_t zpA = static_cast<int32_t>(zeroPointA);

  for (size_t rowIndex = 0; rowIndex < rowsA; ++rowIndex) {
    const uint8_t* aRow = inputMatrixA + rowIndex * width;  // start of row in A (row-major M x K)
    float* outRow = output + rowIndex * colsB;
    for (size_t colIndex = 0; colIndex < colsB; ++colIndex) {
      // The zero point and scale of B are invariant over the reduction
      // dimension, so select them once per output element instead of
      // re-branching inside the k loop.
      const int32_t zpB =
          static_cast<int32_t>(is_b_scale_per_column ? zeroPointB[colIndex] : zeroPointB[0]);
      const float scale = is_b_scale_per_column ? b_scale_data[colIndex] : b_scale_data[0];

      int32_t acc = 0;
      for (size_t k = 0; k < width; ++k) {
        // B is row-major K x N, so element (k, colIndex) is at k * colsB + colIndex.
        const int32_t adjustedA = static_cast<int32_t>(aRow[k]) - zpA;
        const int32_t adjustedB = static_cast<int32_t>(inputMatrixB[k * colsB + colIndex]) - zpB;
        acc += adjustedA * adjustedB;
      }

      // Dequantize the integer accumulator and store it.
      outRow[colIndex] = static_cast<float>(acc) * scale;
    }
  }
}

Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
const uint8_t* a_data,
const TensorShape& a_shape,
Expand Down Expand Up @@ -150,11 +218,95 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
params.ldc = gemm_shape.N;
}

// rowsA = M
// width = K
// colsB = N
size_t rowsA = static_cast<size_t>(helper.M());
size_t width = static_cast<size_t>(helper.K());
size_t colsB = static_cast<size_t>(helper.N());
const int8_t* b_data = static_cast<const int8_t*>(b_tensor->DataRaw());

#if 0
size_t total_elements = rowsA * colsB;
size_t display_limit = std::min(total_elements, static_cast<size_t>(100));
std::vector<float> y_data_2(rowsA * colsB, 0.0f);
std::cout << "Calling MatMulFull with the following parameters:\n";
std::cout << "rowsA: " << rowsA << ", width: " << width << ", colsB: " << colsB << "\n";
std::cout << "a_zp: " << static_cast<int>(a_zp) << "\n";
std::cout << "is_b_scale_per_column: " << is_b_scale_per_column << "\n";
std::cout << "multiplier_per_tensor: " << multiplier_per_tensor << "\n";
std::cout << "b_scale_data sample: [";
for (size_t i = 0; i < 25; ++i) {
if (i > 0) std::cout << ", ";
std::cout << b_scale_data[i];
}
std::cout << "]\n";
std::cout << "b_zero point sample: [";
for (size_t i = 0; i < 25; ++i) {
if (i > 0) std::cout << ", ";
std::cout << static_cast<int>(b_zp_ptr[i]) << ", ";
}
std::cout << "]\n";

if (bias_data != nullptr) {
size_t bias_size = static_cast<size_t>(bias_tensor->Shape().Size()); // Get the total size of bias_data
size_t display_limit = std::min(bias_size, static_cast<size_t>(100));
std::cout << "First " << display_limit << " elements of bias_data: [";
for (size_t i = 0; i < display_limit; ++i) {
if (i > 0) std::cout << ", ";
std::cout << bias_data[i];
}
std::cout << "]" << std::endl;
}
std::cout << "multiplier_per_tensor: " << multiplier_per_tensor << std::endl;
std::cout << "b_scale_data[0]: " << b_scale_data[0] << std::endl;
#endif

MatMulFull(a_data, b_data, y_data_2.data(), rowsA, width, colsB, a_zp, b_zp_ptr, b_scale_data, is_b_scale_per_column);

Check failure on line 265 in onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc

View workflow job for this annotation

GitHub Actions / Vcpkg

use of undeclared identifier 'y_data_2'

#if 0
MlasGemmBatch(gemm_shape, gemm_data_vec.data(), num_gemms, ctx->GetOperatorThreadPool());

// Compare y_data and y_data_2
bool mismatch_found = false;
for (size_t i = 0; i < total_elements; ++i) {
if (std::fabs(y_data[i] - y_data_2[i]) > 1e-6) { // Tolerance for floating-point comparison
std::cerr << "Mismatch at index " << i << ": y_data=" << y_data[i] << ", y_data_2=" << y_data_2[i] << std::endl;
mismatch_found = true;
break;
}
}

if (mismatch_found) {
std::cerr << "Displaying the first 100 elements of y_data and y_data_2:" << std::endl;
std::cerr << "[";
for (size_t i = 0; i < display_limit; ++i) {
std::cerr << "(Index " << i << ": y_data=" << y_data[i] << ", y_data_2=" << y_data_2[i] << ")";
if (i != display_limit - 1) {
std::cerr << ", ";
}
}
std::cerr << "]" << std::endl;
std::cerr << "Mismatch found between y_data and y_data_2!" << std::endl;
assert(false && "Validation failed: y_data and y_data_2 are not equal.");
}
#endif
return Status::OK();
}

/*
int8Multiply(
reinterpret_cast<const uint8_t*>(a_data),
a_zp,
b_data,
//reinterpret_cast<const uint8_t*>(b_zero_point->DataRaw()),
rowsA,
width,
colsB,
reinterpret_cast<float*>(y_data)
);
*/

class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
public:
DynamicQuantizeMatMul(const OpKernelInfo& info) : MatMulIntegerToFloatBase(info) {}
Expand Down Expand Up @@ -234,6 +386,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
ctx->Input<Tensor>(IN_BIAS)));

if (!is_b_scale_supported) {
std::cout << "dynamic quantize matmul: b scale is not supported\n";
ScaleOutput(*b_scale_tensor, *ctx->Output<Tensor>(0));
}

Expand Down Expand Up @@ -289,9 +442,11 @@ Status MatMulIntegerToFloat::Compute(OpKernelContext* ctx) const {
ctx->Input<Tensor>(IN_BIAS)));

if (!is_a_scale_scalar) {
std::cout << "dynamic quantize matmul: a scale is not scalar\n";
ScaleOutput(*a_scale_tensor, *ctx->Output<Tensor>(0));
}
if (!is_b_scale_supported) {
std::cout << "dynamic quantize matmul: b scale is not supported\n";
ScaleOutput(*b_scale_tensor, *ctx->Output<Tensor>(0));
}

Expand Down

0 comments on commit 63d7cf6

Please sign in to comment.