now using the new API
tarekziade committed Dec 12, 2024
1 parent 0735d86 commit b661e8b
Showing 2 changed files with 20 additions and 273 deletions.
onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.cc (24 changes: 11 additions & 13 deletions)
@@ -114,24 +114,22 @@ Status FirefoxMatMulInteger8::Compute(OpKernelContext* ctx) const {
   std::vector<float> float_output(helper.M() * helper.N(), 0.0f);
 
   // Call the function
-  // matix A (M x K) * matrix B (K x N)
+  // matrix A (M x K) * matrix B (K x N)
   // matrix C (M x N)
   size_t rows_a = static_cast<size_t>(helper.M());
   size_t cols_b = static_cast<size_t>(helper.N());
   size_t width = static_cast<size_t>(helper.K());
 
-  int8MultiplyAndAddBias(reinterpret_cast<const int8_t*>(a_data),
-                         1.0f,    // scale factor for A
-                         a_offset,
-                         reinterpret_cast<const int8_t*>(b_data),
-                         1.0f,    // scale factor for B
-                         0,       // b_zero_point
-                         0,       // we don't have any bias
-                         1.0f,    // quantization multiplier
-                         rows_a,  // rows A
-                         width,   // width
-                         cols_b,  // col B
-                         reinterpret_cast<float*>(y_data));
+  // gemmology is only doing A unsigned x B signed
+  int8Multiply(reinterpret_cast<const int8_t*>(a_data),
+               a_offset,
+               reinterpret_cast<const int8_t*>(b_data),
+               0,       // b_zero_point
+               rows_a,  // rows A
+               width,   // width
+               cols_b,  // col B
+               reinterpret_cast<float*>(y_data));
+
 
   // Print the output
 #if 0
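For reference, here is a minimal scalar sketch of the computation this call site asks the new `int8Multiply` entry point to perform: an (M x K) by (K x N) product of quantized int8 matrices with per-matrix zero points and a float result. This is an illustration only, not the gemmology kernel (which is vectorized and, per the comment above, works on unsigned A and signed B); the exact accumulation and rounding behavior of the real implementation is an assumption.

#include <cstddef>
#include <cstdint>

// Reference-only sketch: C (M x N) = (A - a_zero) * (B - b_zero), with
// row-major int8 inputs and a float output, mirroring the call site above.
void int8_multiply_reference(const int8_t* A, float a_zero,
                             const int8_t* B, float b_zero,
                             size_t rows_a, size_t width, size_t cols_b,
                             float* C) {
  for (size_t m = 0; m < rows_a; ++m) {
    for (size_t n = 0; n < cols_b; ++n) {
      float acc = 0.0f;
      for (size_t k = 0; k < width; ++k) {
        acc += (static_cast<float>(A[m * width + k]) - a_zero) *
               (static_cast<float>(B[k * cols_b + n]) - b_zero);
      }
      // Unscaled result; any dequantization multiplier would be applied
      // by the caller.
      C[m * cols_b + n] = acc;
    }
  }
}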
onnxruntime/contrib_ops/cpu/quantization/firefox_matmul_integer.h (269 changes: 9 additions & 260 deletions)
@@ -36,271 +36,20 @@ class FirefoxMatMulInteger8 final : public MatMulIntegerBase {
 #include <emscripten/emscripten.h>
 
 
-/** Main interface for integer matrix multiplication followed by addition of bias for wasm.
- *
- * C = A * B + Bias
- *
- * Input matrix A:
- *  - is a 2-D matrix that typically represents activations as floating point values
- *  - no. of rows should be a multiple of 1 (i.e. no restriction)
- *  - no. of columns should be a multiple of 64
- *  - is represented as array (contiguous memory locations) in row-major format
- *
- * Input matrix B:
- *  - is a 2-D matrix that typically represents fixed model parameters as floating point values
- *  - no. of rows should be:
- *    -- equal to no. of columns of Input matrix A
- *    -- a multiple of 64
- *  - no. of columns should be a multiple of 8
- *  - is represented as array (contiguous memory locations) in row-major format
- *
- * Please note that it is also possible to pass Input matrix B in 2 more forms:
- *  - One that is already a quantized and transposed version of Input matrix B
- *  - Other that is already a transposed version of Input matrix B
- *
- * Input Bias:
- *  - is an array (contiguous memory locations) that represents bias
- *  - size of the array should be equal to the no. of columns of Input matrix B
- *
- * Output matrix C:
- *  - is a 2-D matrix that represents the result (= A * B + Bias)
- *  - no. of rows will be equal to no. of rows of Input matrix A
- *  - no. of columns will be equal to no. of columns of Input matrix B (in untransposed form)
- *  - is represented as array (contiguous memory locations) in row-major format
- *
- * Please note that most of the functions in this interface might have architecture specific
- * implementations.
- *
- * Conventions followed throughout this file:
- *  - Unless explicitly mentioned, Input matrix B always means an unquantized (i.e. float values)
- *    and non-transposed version
- *  - no. of rows of Input matrix A = `rows_A`
- *  - no. of columns of Input matrix A = no. of rows of Input matrix B = `width`
- *  - no. of columns of Input matrix B = `cols_B`
- */
 
 #include <cstdint>
 
 using Index = uint32_t;
 
-/**
- * Prepare B for the Matrix Multiply function from Input matrix B.
- *
- * Quantization is performed on the input.
- * The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
- * function (`int8MultiplyAndAddBias`).
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_B     An array representing the Input matrix B in row-major format.
- *                         Size of the array = `width` * `cols_B`.
- *                         Shape of the matrix: (`width`, `cols_B`)
- * @param[in]  scale       The scaling factor (for quantization)
- * @param[in]  zero_point  The zero point (for quantization)
- * @param[in]  width       No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B      No. of columns of Input matrix B. It should be a multiple of 8.
- * @param[out] output      An array representing the prepared B matrix.
- *                         Size of the array = `width` * `cols_B`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_b")))
-int8PrepareB(const float* input_B,
-             float scale,
-             float zero_point,
-             Index width,
-             Index cols_B,
-             int8_t* output);
-
-/**
- * Prepare B for the Matrix Multiply function from transposed version of Input matrix B.
- *
- * Quantization is performed on floating values of input.
- * The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
- * function (`int8MultiplyAndAddBias`).
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_B_transposed  An array representing transposed version of Input matrix B.
- *                                 It is in column-major format.
- *                                 Size of the array = `width` * `cols_B`.
- *                                 Shape of the matrix: (`cols_B`, `width`)
- * @param[in]  scale               The scaling factor (for quantization)
- * @param[in]  zero_point          The zero point (for quantization)
- * @param[in]  width               No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B              No. of columns of Input matrix B. Should be a multiple of 8.
- * @param[out] output              An array representing the prepared B matrix.
- *                                 Size of the array = `width` * `cols_B`.
- */
-extern "C" void
-__attribute__((import_module("wasm_gemm"), import_name("int8_prepare_b_from_transposed")))
-int8PrepareBFromTransposed(const float* input_B_transposed,
-                           float scale,
-                           float zero_point,
-                           Index width,
-                           Index cols_B,
-                           int8_t* output);
-
-/**
- * Prepare B for the Matrix Multiply function from a quantized and transposed version of Input
- * matrix B which is also in a CPU-independent format.
- *
- * The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
- * function (`int8MultiplyAndAddBias`).
- *
- * This function is useful while using the quantized models that are stored in a CPU-independent
- * format on the disk.
- *
- * @param[in]  input_B_quant_transposed  An array representing the quantized and transposed
- *                                       version of Input matrix B. It is in column-major format.
- *                                       Size of the array = `width` * `cols_B`.
- *                                       Shape of the matrix: (`cols_B`, `width`)
- * @param[in]  width                     No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B                    No. of columns of Input matrix B. It should be a multiple of 8.
- * @param[out] output                    An array representing the prepared B matrix.
- *                                       Size of the array = `width` * `cols_B`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"),
-                               import_name("int8_prepare_b_from_quantized_transposed")))
-int8PrepareBFromQuantizedTransposed(const int8_t* input_B_quant_transposed,
-                                    Index width,
-                                    Index cols_B,
-                                    int8_t* output);
-
-/**
- * Prepare A for the Matrix Multiply function from Input matrix A.
- *
- * It performs quantization on floating values of input.
- * The final prepared A might be architecture dependent. e.g. On some architectures like x86, it
- * might be unsigned (achieved by adding 127 to quantized values) while on others like Arm, it
- * might be signed.
- * The final prepared A can be used as an input to matrix multiply function
- * (`int8MultiplyAndAddBias`).
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_A     An array representing the Input matrix A in row-major format.
- *                         Size of the array = `rows_A` * `width`.
- *                         Shape of the matrix: (`rows_A`, `width`)
- * @param[in]  scale       The scaling factor (for quantization)
- * @param[in]  zero_point  The zero point (for quantization)
- * @param[in]  rows_A      No. of rows of Input matrix A. No restriction on its size.
- * @param[in]  width       No. of columns of Input matrix A. It should be a multiple of 64.
- * @param[out] output      An array representing the prepared A matrix.
- *                         Size of the array = `rows_A` * `width`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_a")))
-int8PrepareA(const float* input_A,
-             float scale,
-             float zero_point,
-             Index rows_A,
-             Index width,
-             int8_t* output);
-
-/**
- * Prepares bias for the Matrix Multiply function.
- *
- * It uses the prepared B (which must be obtained by using any of the int8PrepareB* functions) and
- * a bias input to prepare the final bias.
- *
- * The final bias can be used as an input to matrix multiply function (`int8MultiplyAndAddBias`).
- *
- * @param[in]  input_B_prepared  An array representing the prepared B matrix.
- *                               Size of the array = `width` * `cols_B`.
- * @param[in]  scale_A           The scaling factor (for quantization) of A
- * @param[in]  zero_point_A      The zero point (for quantization) of A
- * @param[in]  scale_B           The scaling factor (for quantization) of B
- * @param[in]  zero_point_B      The zero point (for quantization) of B
- *                               factor that is prepared from `scale_A` and `scale_B`.
- * @param[in]  width             No. of rows of Input matrix B (unquantized & non-transposed).
- *                               It should be a multiple of 64.
- * @param[in]  cols_B            No. of columns of Input matrix B (unquantized & non-transposed).
- *                               It should be a multiple of 8.
- * @param[in]  input_bias        An array representing the input bias. Size of array = `cols_B`
- * @param[out] output            An array representing the final prepared bias.
- *                               Size of the array = `cols_B`
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_prepare_bias")))
-int8PrepareBias(const int8_t* input_B_prepared,
-                float scale_A,
-                float zero_point_A,
-                float scale_B,
-                float zero_point_B,
-                Index width,
-                Index cols_B,
-                const float* input_bias,
-                float* output);
-
-/**
- * Perform multiplication of 2 matrices followed by adding a bias.
- *
- * i.e. Output = A_prepared * B_prepared + Bias_prepared
- *
- * The inputs A_prepared, B_prepared and Bias_prepared of this function must be
- * obtained by using `int8PrepareA`, one of the `int8PrepareB*` and `int8PrepareBias`
- * functions respectively.
- *
- * Please note that this interface might have architecture specific implementation.
- *
- * @param[in]  input_A_prepared     An array representing the prepared A matrix.
- *                                  This must be obtained by using `int8PrepareA` function.
- *                                  Size of the array = `rows_A` * `width`.
- * @param[in]  scale_A              The scaling factor (for quantization) of A
- * @param[in]  zero_point_A         The zero point (for quantization) of A
- * @param[in]  input_B_prepared     An array representing the prepared B matrix.
- *                                  This must be obtained by using one of the `int8PrepareB*`
- *                                  functions. Size of the array = `width` * `cols_B`.
- * @param[in]  scale_B              The scaling factor (for quantization) of B
- * @param[in]  zero_point_B         The zero point (for quantization) of B
- * @param[in]  input_bias_prepared  An array representing the prepared bias.
- *                                  This must be obtained by using `int8PrepareBias` function.
- *                                  Size of the array = `cols_B`
- * @param[in]  unquant_multiplier   A value that will be multiplied to the final unquantization
- *                                  factor that is prepared from `scale_A` and `scale_B`.
- * @param[in]  rows_A               No. of rows of Input matrix A. No restriction on its size.
- * @param[in]  width                No. of columns of Input matrix A (same as no. of rows of
- *                                  Input matrix B). It should be a multiple of 64.
- * @param[in]  cols_B               No. of columns of Input matrix B. Should be a multiple of 8.
- * @param[out] output               An array representing the result matrix in row-major format.
- *                                  Size of the array = `rows_A` * `cols_B`.
- */
extern "C" void
-__attribute__((import_module("wasm_gemm"), import_name("int8_multiply_and_add_bias")))
-int8MultiplyAndAddBias(const int8_t* input_A_prepared,
-                       float scale_A,
-                       float zero_point_A,
-                       const int8_t* input_B_prepared,
-                       float scale_B,
-                       float zero_point_B,
-                       const float* input_bias_prepared,
-                       float unquant_multiplier,
-                       Index rows_A,
-                       Index width,
-                       Index cols_B,
-                       float* output);
-
-/**
- * Select a subset of columns of prepared B.
- *
- * Indices of the columns to be selected are specified by an array.
- *
- * @param[in]  input_B_prepared  An array representing the prepared B matrix.
- *                               This must be obtained by using one of the `int8PrepareB*`
- *                               functions. Size of the array = `width` * `cols_B`.
- * @param[in]  width             No. of rows of Input matrix B. It should be a multiple of 64.
- * @param[in]  cols_B            No. of columns of Input matrix B. It should be a multiple of 8.
- * @param[in]  cols              An array of column indices to be selected from prepared B.
- *                               All indices of the array should be valid, i.e.
- *                               0 <= cols[N] < cols_B where N = 0, 1, 2 ... (`num_cols` - 1)
- * @param[in]  num_cols          Size of the `cols` array. It should be a multiple of 8.
- * @param[out] output            An array representing the selected columns of prepared B.
- *                               Size of the array = `width` * `num_cols`.
- */
-extern "C" void __attribute__((import_module("wasm_gemm"), import_name("int8_select_columns_of_b")))
-int8SelectColumnsOfB(const int8_t* input_B_prepared,
-                     Index width,
-                     Index cols_B,
-                     const Index* cols,
-                     const Index num_cols,
-                     int8_t* output);
+__attribute__((import_module("wasm_gemm"), import_name("int8_multiply")))
+int8Multiply(const int8_t* input_A,
+             float zero_point_A,
+             const int8_t* input_B,
+             float zero_point_B,
+             Index rows_A,
+             Index width,
+             Index cols_B,
+             float* output);
 
-
 #endif
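Assuming the shape conventions of the removed interface still apply to the new entry point (`width` a multiple of 64, `cols_B` a multiple of 8; the new declaration does not restate them, so this is an assumption), a hypothetical caller would look like:

#include <cstdint>
#include <vector>

// Illustrative only: shapes and zero points are made up; the symbols
// `Index` and `int8Multiply` come from the header above.
void example_call() {
  const Index rows_A = 2, width = 64, cols_B = 8;
  std::vector<int8_t> A(rows_A * width);   // quantized activations, row-major
  std::vector<int8_t> B(width * cols_B);   // quantized weights, row-major
  std::vector<float> C(rows_A * cols_B);   // float output, row-major
  int8Multiply(A.data(), /*zero_point_A=*/0.0f,
               B.data(), /*zero_point_B=*/0.0f,
               rows_A, width, cols_B, C.data());
}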
