IntegerGemmIntrinsic.h

namespace intgemm {

/* Interface for integer matrix multiplication followed by addition of bias.

 * C = A * B + Bias

 * Input matrix A:

 *  - A 2-D matrix that typically represents activations as floating point

 * values

 *  - no. of rows should be a positive integer

 *  - no. of columns should be a positive integral multiple of 64

 *  - is represented as array (contiguous memory locations) in row-major format

 * Input matrix B:

 *  - A 2-D matrix that typically represents fixed model parameters as

 * floating point values

 *  - no. of rows should be:

 *    -- equal to no. of columns of Input matrix A

 *    -- a positive integral multiple of 64

 *  - no. of columns should be a positive integral multiple of 8

 *  - is represented as array (contiguous memory locations) in row-major format

 *  Please note that it is also possible to pass Input matrix B in 2 more forms:

 *   - One that is already a quantized and transposed version of Input matrix B

 *   - Other that is already a transposed version of Input matrix B

 * Input Bias:

 *  - is an array (contiguous memory locations) that represents bias

 *  - size of the array should be equal to the no. of columns of Input matrix B

 * Output matrix C:

 *  - is a 2-D matrix that represents the result (= A * B + Bias)

 *  - no. of rows = no. of rows of Input matrix A

 *  - no. of columns = no. of columns of Input matrix B (in

 * untransposed form)

 *  - is represented as array (contiguous memory locations) in row-major format

 * Please note that most of the functions in this interface might have

 * architecture specific implementations.

 * Conventions followed for the interface:

 *  - Unless explicitly mentioned, Input matrix B refers to an unquantized

 * (i.e. float values) and non-transposed version

 *  - no. of rows of Input matrix A = `rowsA`

 *  - no. of columns of Input matrix A (`colsA`) = no. of rows of Input matrix B

 * (`rowsB`) = `width`

 *  - no. of columns of Input matrix B = `colsB`

*/

/* Prepare B for the Matrix Multiply function from Input matrix B.

 * Quantization is performed on the input.

 * The final prepared B is in CPU-dependent format and can be used as an input

 * to matrix multiply function (`int8_multiply_and_add_bias`).

 * Please note that this interface might have architecture specific

 * implementation.

 * @param[in]   inputMatrixB        An array representing the Input matrix B in

 *                                  row-major format.

 *                                  Size of the array = `rowsB` * `colsB`.

 *                                  Shape of the matrix: (`rowsB`, `colsB`)

 * @param[in]   scale               The scaling factor (for quantization)

 * @param[in]   zeroPoint           The zero point (for quantization)

 * @param[in]   rowsB               No. of rows of Input matrix B. It should be

 *                                  a positive integer and a multiple of 64.

 * @param[in]   colsB               No. of columns of Input matrix B. It should

 *                                  be a positive integer and a multiple of 8.

 * @param[out]  outputMatrixB       An array representing the prepared B matrix.

 *                                  Size of the array = `rowsB` * `colsB`.

 * This function implements the intrinsic:

 *   int8_prepare_b(inputMatrixB: i32, scale: f32, zeroPoint: f32, rowsB: i32,

 * colsB: i32, outputMatrixB: i32) which implements the function:

 *   int8_prepare_b(const float* inputMatrixB, float scale, float zeroPoint,

 * uint32_t rowsB, uint32_t colsB, int8_t* outputMatrixB)

*/

/* C++ entry point for the `int8_prepare_b` intrinsic described above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixB` and `outputMatrixB` as `uint32_t` values rather than
 * pointers, and returns an `int32_t`.
 * NOTE(review): presumably the `uint32_t` matrix arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8PrepareB(wasm::Instance* instance, uint32_t inputMatrixB,

                       float scale, float zeroPoint, uint32_t rowsB,

                       uint32_t colsB, uint32_t outputMatrixB,

                       uint8_t* memBase);

/* Prepare B for the Matrix Multiply function from transposed version of Input

 * matrix B.

 * Quantization is performed on floating values of input.

 * The final prepared B is in CPU-dependent format and can be used as an input

 * to matrix multiply function (`int8_multiply_and_add_bias`).

 * Please note that this interface might have architecture specific

 * implementation.

 * @param[in]   inputMatrixBTransposed An array representing transposed version

 *                                     of Input matrix B.

 *                                     It is in column-major format.

 *                                     Size of the array = `rowsB` * `colsB`.

 *                                     Shape of the matrix: (`colsB`, `rowsB`)

 * @param[in]   scale                  The scaling factor (for quantization)

 * @param[in]   zeroPoint              The zero point (for quantization)

 * @param[in]   rowsB                  No. of rows of Input matrix B. It should

 *                                     be a positive integer and a multiple of

 *                                     64.

 * @param[in]   colsB                  No. of columns of Input matrix B. It

 *                                     should be a positive integer and a

 *                                     multiple of 8.

 * @param[out]  outputMatrixB          An array representing the prepared B

 *                                     matrix. Size of array = `rowsB`*`colsB`

 * This function implements the intrinsic:

 *   int8_prepare_b_from_transposed(inputMatrixBTransposed: i32, scale: f32,

 * zeroPoint: f32, rowsB: i32, colsB: i32, outputMatrixB: i32) which implements

 * the function: int8_prepare_b_from_transposed(const float*

 * inputMatrixBTransposed, float scale, float zeroPoint, uint32_t rowsB,

 * uint32_t colsB, int8_t* outputMatrixB)

*/

/* C++ entry point for the `int8_prepare_b_from_transposed` intrinsic
 * described above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixBTransposed` and `outputMatrixB` as `uint32_t` values rather
 * than pointers, and returns an `int32_t`.
 * NOTE(review): presumably the `uint32_t` matrix arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8PrepareBFromTransposed(wasm::Instance* instance,

                                     uint32_t inputMatrixBTransposed,

                                     float scale, float zeroPoint,

                                     uint32_t rowsB, uint32_t colsB,

                                     uint32_t outputMatrixB, uint8_t* memBase);

/* Prepare B for the Matrix Multiply function from a quantized and transposed

 * version of Input matrix B which is also in a CPU-independent format.

 * The final prepared B is in CPU-dependent format and can be used as an input

 * to matrix multiply function (`int8_multiply_and_add_bias`).

 * This function is useful while using the quantized models that are stored in a

 * CPU-independent format on the disk.

 * @param[in]   inputMatrixBQuantizedTransposed  An array representing the

 *                                               quantized and transposed

 *                                               version of Input matrix B.

 *                                               It is in column-major format.

 *                                               Size of array =

 *                                                 `rowsB`*`colsB`

 *                                               Shape of the matrix:

 *                                                 (`colsB`,`rowsB`)

 * @param[in]   rowsB                            No. of rows of Input matrix B.

 *                                               Should be a positive integer

 *                                               and a multiple of 64.

 * @param[in]   colsB                            No. of columns of Input matrix

 *                                               B. Should be a positive

 *                                               integer and a multiple of 8

 * @param[out]  outputMatrixB                    An array representing the

 *                                               prepared B matrix.

 *                                               Size: `rowsB` * `colsB`.

 * This function implements the intrinsic:

 *   int8_prepare_b_from_quantized_transposed(inputMatrixBQuantizedTransposed:

 * i32, rowsB: i32, colsB: i32, outputMatrixB: i32) which implements the

 * function: int8_prepare_b_from_quantized_transposed(const int8_t*

 * inputMatrixBQuantizedTransposed, uint32_t rowsB, uint32_t colsB, int8_t*

 * outputMatrixB)

*/

/* C++ entry point for the `int8_prepare_b_from_quantized_transposed`
 * intrinsic described above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixBQuantizedTransposed` and `outputMatrixB` as `uint32_t` values
 * rather than pointers, and returns an `int32_t`.
 * NOTE(review): presumably the `uint32_t` matrix arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8PrepareBFromQuantizedTransposed(

    wasm::Instance* instance, uint32_t inputMatrixBQuantizedTransposed,

    uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB, uint8_t* memBase);

/* Prepare A for the Matrix Multiply function from Input matrix A.

 * It performs quantization on floating values of input.

 * The final prepared A might be architecture dependent. e.g. On some

 * architectures like x86, it might be unsigned (achieved by adding 127 to

 * quantized values) while on others like Arm, it might be signed. The final

 * prepared A can be used as an input to matrix multiply function

 * (`int8_multiply_and_add_bias`).

 * Please note that this interface might have architecture specific

 * implementation.

 * @param[in]   inputMatrixA   An array representing the Input matrix A in

 *                             row-major format.

 *                             Size of the array = `rowsA` * `colsA`.

 *                             Shape of the matrix: (`rowsA`, `colsA`)

 * @param[in]   scale          The scaling factor (for quantization)

 * @param[in]   zeroPoint      The zero point (for quantization)

 * @param[in]   rowsA          No. of rows of Input matrix A. It should be a

 *                             positive integer.

 * @param[in]   colsA          No. of columns of Input matrix A. It should be a

 *                             positive integer and a multiple of 64.

 * @param[out]  outputMatrixA  An array representing the prepared A matrix.

 *                             Size of the array = `rowsA` * `colsA`.

 * This function implements the intrinsic:

 *   int8_prepare_a(inputMatrixA: i32, scale: f32, zeroPoint: f32, rowsA: i32,

 * colsA: i32, outputMatrixA: i32) which implements the function:

 *   int8_prepare_a(const float* inputMatrixA, float scale, float zeroPoint,

 * uint32_t rowsA, uint32_t colsA, int8_t* outputMatrixA)

*/

/* C++ entry point for the `int8_prepare_a` intrinsic described above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixA` and `outputMatrixA` as `uint32_t` values rather than
 * pointers, and returns an `int32_t`.
 * NOTE(review): presumably the `uint32_t` matrix arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8PrepareA(wasm::Instance* instance, uint32_t inputMatrixA,

                       float scale, float zeroPoint, uint32_t rowsA,

                       uint32_t colsA, uint32_t outputMatrixA,

                       uint8_t* memBase);

/* Prepares bias for the Matrix Multiply function.

 * It uses the prepared B (which must be obtained by using any of the

 * int8_prepare_b* functions) and a bias input to prepare the final bias.

 * The final bias can be used as an input to matrix multiply function

 * (`int8_multiply_and_add_bias`).

 * @param[in]   inputMatrixBPrepared An array representing the prepared B

 *                                   matrix. Size of array = `rowsB`*`colsB`.

 * @param[in]   scaleA               The scaling factor (for quantization) of A

 * @param[in]   zeroPointA           The zero point (for quantization) of A

 * @param[in]   scaleB               The scaling factor (for quantization) of B

 * @param[in]   zeroPointB           The zero point (for quantization) of B

 * @param[in]   rowsB                No. of rows of Input matrix B (unquantized

 *                                   & non-transposed). It should be a positive

 *                                   integer and a multiple of 64.

 * @param[in]   colsB                No. of columns of Input matrix B

 *                                   (unquantized & non-transposed). It should

 *                                   be a positive integer and a multiple of 8.

 * @param[in]   inputBias            An array representing the input bias. Size

 *                                   of array = `colsB`

 * @param[out]  output               An array representing the final prepared

 *                                   bias. Size of the array = `colsB`

 * This function implements the intrinsic:

 *   int8_prepare_bias(inputMatrixBPrepared: i32, scaleA: f32, zeroPointA: f32,

 * scaleB: f32, zeroPointB: f32, rowsB: i32, colsB: i32, inputBias: i32, output:

 * i32) which implements the function: int8_prepare_bias(const int8_t*

 * inputMatrixBPrepared, float scaleA, float zeroPointA, float scaleB, float

 * zeroPointB, uint32_t rowsB, uint32_t colsB, const float* inputBias, float*

 * output)

*/

/* C++ entry point for the `int8_prepare_bias` intrinsic described above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixBPrepared`, `inputBias` and `output` as `uint32_t` values
 * rather than pointers, and returns an `int32_t`.
 * NOTE(review): presumably the `uint32_t` array arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8PrepareBias(wasm::Instance* instance,

                          uint32_t inputMatrixBPrepared, float scaleA,

                          float zeroPointA, float scaleB, float zeroPointB,

                          uint32_t rowsB, uint32_t colsB, uint32_t inputBias,

                          uint32_t output, uint8_t* memBase);

/* Perform multiplication of 2 matrices followed by adding a bias.

 * i.e. Output = inputMatrixAPrepared * inputMatrixBPrepared + inputBiasPrepared

 * The inputs inputMatrixAPrepared, inputMatrixBPrepared and inputBiasPrepared

 * of this function must be obtained by using `int8_prepare_a`, one of the

 * `int8_prepare_b*` and `int8_prepare_bias` functions respectively.

 * Please note that this interface might have architecture specific

 * implementation.

 * @param[in]   inputMatrixAPrepared   An array representing the prepared A

 *                                     matrix. This must be obtained by using

 *                                     `int8_prepare_a` function. Size of the

 *                                     array = `rowsA` * `width`.

 * @param[in]   scaleA                 The scaling factor (quantization) of A

 * @param[in]   zeroPointA             The zero point (for quantization) of A

 * @param[in]   inputMatrixBPrepared   An array representing the prepared B

 *                                     matrix. This must be obtained by using

 *                                     one of `int8_prepare_b*` functions.

 *                                     Size of the array = `width` * `colsB`.

 * @param[in]   scaleB                 The scaling factor (quantization) of B

 * @param[in]   zeroPointB             The zero point (for quantization) of B

 * @param[in]   inputBiasPrepared      An array representing the prepared bias.

 *                                     This must be obtained by using

 *                                     `int8_prepare_bias` function.

 *                                     Size of the array = `colsB`

 * @param[in]   unquantMultiplier      A value that will be multiplied to the

 *                                     final unquantization factor that is

 *                                     prepared from `scaleA` and `scaleB`.

 * @param[in]   rowsA                  No. of rows of Input matrix A. It should

 *                                     be a positive integer.

 * @param[in]   width                  No. of columns of Input matrix A (same as

 *                                     no. of rows of Input matrix B). It

 *                                     should be a positive integer and a

 *                                     multiple of 64.

 * @param[in]   colsB                  No. of columns of Input matrix B. Should

 *                                     be a multiple of 8.

 * @param[out]  output                 An array representing the result matrix

 *                                     in row-major format.

 *                                     Size of the array = `rowsA` * `colsB`.

 * This function implements the intrinsic:

 *   int8_multiply_and_add_bias(inputMatrixAPrepared: i32, scaleA: f32,

 * zeroPointA: f32, inputMatrixBPrepared: i32, scaleB: f32, zeroPointB: f32,

 *                     inputBiasPrepared: i32, unquantMultiplier: f32,

 *                     rowsA: i32, width: i32, colsB: i32, output: i32)

 * which implements the function:

 *   int8_multiply_and_add_bias(const int8_t* inputMatrixAPrepared, float

 * scaleA, float zeroPointA, const int8_t* inputMatrixBPrepared, float scaleB,

 * float zeroPointB, const float* inputBiasPrepared, float unquantMultiplier,

 *                     uint32_t rowsA, uint32_t width, uint32_t colsB, float*

 * output)

*/

/* C++ entry point for the `int8_multiply_and_add_bias` intrinsic described
 * above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixAPrepared`, `inputMatrixBPrepared`, `inputBiasPrepared` and
 * `output` as `uint32_t` values rather than pointers, and returns an
 * `int32_t`.
 * NOTE(review): presumably the `uint32_t` array arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8MultiplyAndAddBias(wasm::Instance* instance,

                                 uint32_t inputMatrixAPrepared, float scaleA,

                                 float zeroPointA,

                                 uint32_t inputMatrixBPrepared, float scaleB,

                                 float zeroPointB, uint32_t inputBiasPrepared,

                                 float unquantMultiplier, uint32_t rowsA,

                                 uint32_t width, uint32_t colsB,

                                 uint32_t output, uint8_t* memBase);

/* Select a subset of columns of prepared B.

 * Indices of the columns to be selected are specified by an array.

 * @param[in]   inputMatrixBPrepared  An array representing the prepared B

 *                                    matrix. This must be obtained by using

 *                                    one of the `int8_prepare_b*` functions.

 *                                    Size of the array = `rowsB` * `colsB`.

 * @param[in]   rowsB                 No. of rows of Input matrix B. It should

 *                                    be a positive integer and a multiple

 *                                    of 64.

 * @param[in]   colsB                 No. of columns of Input matrix B. It

 *                                    should be a positive integer and a

 *                                    multiple of 8.

 * @param[in]   colIndexList          An array of column indices to be selected

 *                                    from prepared B. All indices of the array

 *                                    should be valid

 *                                    i.e. 0 <= colIndexList[N] < colsB

 *                                    where N = 0, 1 ....(`sizeColIndexList`-1)

 * @param[in]   sizeColIndexList      Size of the `colIndexList` array. It

 *                                    should be a positive integer and a

 *                                    multiple of 8.

 * @param[out]  output                An array representing the selected columns

 *                                    of prepared B.

 *                                    Size = `rowsB` * `sizeColIndexList`.

 * This function implements the intrinsic:

 *   int8_select_columns_of_b(inputMatrixBPrepared: i32, rowsB: i32, colsB: i32,

 * colIndexList: i32, sizeColIndexList: i32, output: i32) which implements the

 * function: int8_select_columns_of_b(const int8_t* inputMatrixBPrepared,

 * uint32_t rowsB, uint32_t colsB, const uint32_t* colIndexList, const uint32_t

 * sizeColIndexList, int8_t* output)

*/

/* C++ entry point for the `int8_select_columns_of_b` intrinsic described
 * above.
 * Compared with the documented intrinsic signature, this declaration takes
 * two extra parameters (`instance` first, `memBase` last), passes
 * `inputMatrixBPrepared`, `colIndexList` and `output` as `uint32_t` values
 * rather than pointers, and returns an `int32_t`.
 * NOTE(review): presumably the `uint32_t` array arguments are offsets into
 * the wasm memory that starts at `memBase`, and the return value is a status
 * code -- neither is visible in this header; confirm against the definition.
 */
int32_t IntrI8SelectColumnsOfB(wasm::Instance* instance,

                               uint32_t inputMatrixBPrepared, uint32_t rowsB,

                               uint32_t colsB, uint32_t colIndexList,

                               uint32_t sizeColIndexList, uint32_t output,

                               uint8_t* memBase);

Source code

Revision control

Copy as Markdown

Other Tools