CMSIS-DSP: Improvements to pull request #1363

To avoid changing the API in an incompatible way,
arm_mat_mult_opt_q31 was introduced. It provides a faster implementation
for use with Helium (but requires more storage for intermediate results).

Some improvements to tests for matrix functions added.
pull/19/head
Christophe Favergeon 4 years ago
parent cfc30c12b8
commit e45dc7c22e

@ -444,6 +444,21 @@ arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
/**
* @brief Q31 matrix multiplication (variant taking a caller-provided scratch buffer)
* @param[in] pSrcA points to the first input matrix structure
* @param[in] pSrcB points to the second input matrix structure
* @param[out] pDst points to output matrix structure
* @param[in] pState points to the array for storing intermediate results.
*            The Helium (MVE) implementation writes the transpose of pSrcB into it;
*            the non-MVE implementation ignores it.
*            NOTE(review): the buffer presumably needs room for
*            pSrcB->numRows * pSrcB->numCols q31_t values - confirm against arm_mat_trans_q31.
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_opt_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst,
q31_t *pState);
/**
* @brief Q31 matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure

@ -44,6 +44,7 @@
#include "arm_mat_mult_q7.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
#include "arm_mat_mult_opt_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"

@ -0,0 +1,784 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_opt_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 3 Nov 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
@remark
This function is a faster implementation of arm_mat_mult_q31 for MVE, but it requires
additional storage for intermediate results.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
/**
  @brief         Q31 2x2 matrix multiplication (MVE specialisation).
  @param[in]     pSrcA  points to the first 2x2 input matrix structure
  @param[in]     pSrcB  points to the second 2x2 input matrix structure
  @param[out]    pDst   points to the 2x2 output matrix structure
  @return        always ARM_MATH_SUCCESS (sizes are checked by the caller)
 */
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_2x2_mve(
    const arm_matrix_instance_q31 * pSrcA,
    const arm_matrix_instance_q31 * pSrcB,
    arm_matrix_instance_q31 * pDst)
{
    const q31_t *pInB = pSrcB->pData;   /* input data matrix pointer B */
    const q31_t *pInA = pSrcA->pData;   /* input data matrix pointer A */
    q31_t       *pOut = pDst->pData;    /* output data matrix pointer */
    uint32x4_t   vecColBOffs;
    q63_t        acc0, acc1;
    q31x4_t      vecB, vecA0, vecA1;
    uint32_t     col;
    /* only the first MATRIX_DIM2 lanes carry data; the others are disabled */
    mve_pred16_t p0 = vctp32q(MATRIX_DIM2);

    /* element offsets {0, 2, 4, 6}: walk down one column of the row-major B */
    vecColBOffs = vidupq_u32((uint32_t)0, 1);
    vecColBOffs = vecColBOffs * MATRIX_DIM2;

    /*
     * Load the two A rows once.
     * The loads are predicated so that no memory beyond the 2x2 matrix is
     * touched (an unpredicated 4-lane load of the second row would read two
     * words past the end of A). Inactive lanes are zeroed, which does not
     * change the result because the matching B lanes are zero as well.
     */
    vecA0 = vldrwq_z_s32(pInA, p0);
    vecA1 = vldrwq_z_s32(pInA + MATRIX_DIM2, p0);

    for (col = 0U; col < MATRIX_DIM2; col++)
    {
        /* gather column 'col' of B (partial load, upper lanes zeroed) */
        vecB = vldrwq_gather_shifted_offset_z_s32(pInB + col, vecColBOffs, p0);

        acc0 = vrmlaldavhq(vecA0, vecB);
        acc1 = vrmlaldavhq(vecA1, vecB);

        /* scale the accumulators down before narrowing to q31_t */
        acc0 = asrl(acc0, 23);
        acc1 = asrl(acc1, 23);

        pOut[0 * MATRIX_DIM2 + col] = (q31_t) acc0;
        pOut[1 * MATRIX_DIM2 + col] = (q31_t) acc1;
    }

    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
  @brief         Q31 3x3 matrix multiplication (MVE specialisation).
  @param[in]     pSrcA  points to the first 3x3 input matrix structure
  @param[in]     pSrcB  points to the second 3x3 input matrix structure
  @param[out]    pDst   points to the 3x3 output matrix structure
  @return        always ARM_MATH_SUCCESS (sizes are checked by the caller)
 */
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_3x3_mve(
    const arm_matrix_instance_q31 * pSrcA,
    const arm_matrix_instance_q31 * pSrcB,
    arm_matrix_instance_q31 * pDst)
{
    const q31_t *pInB  = pSrcB->pData;          /* input data matrix pointer B */
    const q31_t *pInA0 = pSrcA->pData;          /* row 0 of A */
    const q31_t *pInA1 = pInA0 + MATRIX_DIM3;   /* row 1 of A */
    const q31_t *pInA2 = pInA1 + MATRIX_DIM3;   /* row 2 of A */
    q31_t       *pOut  = pDst->pData;           /* output data matrix pointer */
    uint32x4_t   vecColBOffs;
    q63_t        acc0, acc1, acc2;
    q31x4_t      vecB, vecA;
    uint32_t     col;
    /* enable predication to disable last (4th) vector element */
    mve_pred16_t p0 = vctp32q(MATRIX_DIM3);

    /* element offsets {0, 3, 6, 9}: walk down one column of the row-major B */
    vecColBOffs = vidupq_u32((uint32_t)0, 1);
    vecColBOffs = vecColBOffs * MATRIX_DIM3;

    for (col = 0U; col < MATRIX_DIM3; col++)
    {
        /* gather column 'col' of B (4th lane zeroed) */
        vecB = vldrwq_gather_shifted_offset_z_s32(pInB + col, vecColBOffs, p0);

        /*
         * The A rows are loaded with the same predicate so that the load of
         * the last row does not read one word past the end of the 3x3 matrix.
         * The zeroed 4th lane does not affect the sums: the 4th lane of vecB
         * is zero as well.
         */
        vecA = vldrwq_z_s32(pInA0, p0);
        acc0 = vrmlaldavhq(vecA, vecB);
        vecA = vldrwq_z_s32(pInA1, p0);
        acc1 = vrmlaldavhq(vecA, vecB);
        vecA = vldrwq_z_s32(pInA2, p0);
        acc2 = vrmlaldavhq(vecA, vecB);

        /* scale the accumulators down before narrowing to q31_t */
        acc0 = asrl(acc0, 23);
        acc1 = asrl(acc1, 23);
        acc2 = asrl(acc2, 23);

        pOut[0 * MATRIX_DIM3 + col] = (q31_t) acc0;
        pOut[1 * MATRIX_DIM3 + col] = (q31_t) acc1;
        pOut[2 * MATRIX_DIM3 + col] = (q31_t) acc2;
    }

    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
  @brief         Q31 4x4 matrix multiplication (MVE specialisation).
  @param[in]     pSrcA  points to the first 4x4 input matrix structure
  @param[in]     pSrcB  points to the second 4x4 input matrix structure
  @param[out]    pDst   points to the 4x4 output matrix structure
  @return        always ARM_MATH_SUCCESS (sizes are checked by the caller)
 */
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_4x4_mve(
    const arm_matrix_instance_q31 * pSrcA,
    const arm_matrix_instance_q31 * pSrcB,
    arm_matrix_instance_q31 * pDst)
{
    q31_t     *pColB = pSrcB->pData;            /* top of the B column being processed */
    q31_t     *pRow0 = pSrcA->pData;            /* the four rows of A */
    q31_t     *pRow1 = pRow0 + MATRIX_DIM4;
    q31_t     *pRow2 = pRow1 + MATRIX_DIM4;
    q31_t     *pRow3 = pRow2 + MATRIX_DIM4;
    q31_t     *pRes  = pDst->pData;             /* top of the output column being written */
    uint32x4_t colOffs;                         /* element offsets {0, 4, 8, 12} */
    q31x4_t    colB, rowA;
    q63_t      sum0, sum1, sum2, sum3;
    uint32_t   n;

    colOffs = vidupq_u32((uint32_t)0, 4);

    /* one iteration per column of B / column of the result */
    for (n = 0U; n < MATRIX_DIM4; n++)
    {
        colB = vldrwq_gather_shifted_offset_s32(pColB, colOffs);

        /* dot product of every A row with the current B column */
        rowA = vldrwq_s32(pRow0);
        sum0 = vrmlaldavhq(rowA, colB);
        rowA = vldrwq_s32(pRow1);
        sum1 = vrmlaldavhq(rowA, colB);
        rowA = vldrwq_s32(pRow2);
        sum2 = vrmlaldavhq(rowA, colB);
        rowA = vldrwq_s32(pRow3);
        sum3 = vrmlaldavhq(rowA, colB);

        /* scale the accumulators down before narrowing to q31_t */
        sum0 = asrl(sum0, 23);
        sum1 = asrl(sum1, 23);
        sum2 = asrl(sum2, 23);
        sum3 = asrl(sum3, 23);

        pRes[0 * MATRIX_DIM4] = (q31_t) sum0;
        pRes[1 * MATRIX_DIM4] = (q31_t) sum1;
        pRes[2 * MATRIX_DIM4] = (q31_t) sum2;
        pRes[3 * MATRIX_DIM4] = (q31_t) sum3;

        /* step to the next column of B and of the result */
        pColB++;
        pRes++;
    }

    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_opt_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst,
q31_t *pState)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA2;
q31_t *pInB2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q31_t *pSrcBT = pState; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
arm_matrix_instance_q31 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
} else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
pDst->pData[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_opt_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_opt_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_opt_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
* Matrix transpose
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q31(pSrcB, &BT);
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
/*
* main loop
* compute 2 x 2 output blocks
* with dot products (Matrix A rows * Transposed MAtrix B rows)
*/
while (row > 0u) {
/*
* For every row wise process, the column loop counter is to be initiated
* Compute 2 columns and 2 rows in parrallel
*/
col = numColsB >> 1;
j = 0;
/*
* column pair loop
*/
while (col > 0u) {
q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q31x4_t vecA, vecA2, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointers
* - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q31_t const *) pInA;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcBVec = (q31_t const *) pInB;
pSrcB2Vec = (q31_t const *) pInB2;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
/* load scheduling */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
}
/* Convert to 1.31 */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
/* Store the results (2 x 2 block) in the destination buffer */
*px++ = (q31_t) acc0;
*px++ = (q31_t) acc2;
*px2++ = (q31_t) acc1;
*px2++ = (q31_t) acc3;
j += numRowsB * 2;
/*
* Decrement the column pair loop counter
*/
col--;
}
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
* Decrement the row pair loop counter
*/
row--;
}
/*
* Compute remaining row and/or column below
*/
if (numColsB & 1u) {
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
* row loop
*/
while (row > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
/* single dot-product */
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
acc0 = asrl(acc0, 23);
*px = (q31_t) acc0;
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
}
}
if (numRowsA & 1u) {
col = numColsB;
i = 0u;
/*
* point to last row in output matrix
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
* col loop
*/
while (col > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Set the variable sum, that acts as accumulator, to zero
*/
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
acc0 = asrl(acc0, 23);
*px++ = (q31_t) acc0;
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/*
* Return to application
*/
return (status);
}
#else
/*
 * Generic (non-MVE) implementation.
 *
 * Each output element is the dot product of one row of A with one column of
 * B, accumulated in 64 bits and shifted right by 31 bits. pState is not used.
 *
 * The row and column loops are counted loops, so a matrix with zero rows or
 * zero columns produces no memory access at all and reports success.
 */
arm_status arm_mat_mult_opt_q31(
    const arm_matrix_instance_q31 * pSrcA,
    const arm_matrix_instance_q31 * pSrcB,
    arm_matrix_instance_q31 * pDst,
    q31_t *pState)
{
    const q31_t *pInA = pSrcA->pData;       /* Start of the row of A being processed */
    const q31_t *pInB = pSrcB->pData;       /* Start of matrix B */
    q31_t *pOut = pDst->pData;              /* Next output element (row-major order) */
    q63_t sum;                              /* Accumulator */
    uint16_t numRowsA = pSrcA->numRows;     /* Number of rows of input matrix A */
    uint16_t numColsB = pSrcB->numCols;     /* Number of columns of input matrix B */
    uint16_t numColsA = pSrcA->numCols;     /* Number of columns of input matrix A */
    uint32_t row, col, colCnt;              /* Loop counters */
    arm_status status;                      /* Status of matrix multiplication */

    (void)pState;

#ifdef ARM_MATH_MATRIX_CHECK
    /* Check for matrix mismatch condition */
    if ((pSrcA->numCols != pSrcB->numRows) ||
        (pSrcA->numRows != pDst->numRows)  ||
        (pSrcB->numCols != pDst->numCols)    )
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        /* row loop: one pass per row of A / row of the output */
        for (row = 0U; row < numRowsA; row++)
        {
            /* column loop: one pass per column of B / output element */
            for (col = 0U; col < numColsB; col++)
            {
                const q31_t *pIn1 = pInA;           /* walks along the current row of A */
                const q31_t *pIn2 = pInB + col;     /* walks down column 'col' of B */

                /* Set the variable sum, that acts as accumulator, to zero */
                sum = 0;

#if defined (ARM_MATH_LOOPUNROLL)
                /* Loop unrolling: Compute 4 MACs at a time. */
                colCnt = numColsA >> 2U;

                while (colCnt > 0U)
                {
                    /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
                    sum += (q63_t) *pIn1++ * *pIn2;
                    pIn2 += numColsB;
                    sum += (q63_t) *pIn1++ * *pIn2;
                    pIn2 += numColsB;
                    sum += (q63_t) *pIn1++ * *pIn2;
                    pIn2 += numColsB;
                    sum += (q63_t) *pIn1++ * *pIn2;
                    pIn2 += numColsB;

                    /* Decrement loop counter */
                    colCnt--;
                }

                /* Loop unrolling: Compute remaining MACs */
                colCnt = numColsA % 0x4U;
#else
                /* Initialize colCnt with number of columns */
                colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

                while (colCnt > 0U)
                {
                    /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
                    sum += (q63_t) *pIn1++ * *pIn2;
                    pIn2 += numColsB;

                    /* Decrement loop counter */
                    colCnt--;
                }

                /* Convert result from 2.62 to 1.31 format and store in destination buffer */
                *pOut++ = (q31_t) (sum >> 31);
            }

            /* Advance to the starting address of the next row of A */
            pInA += numColsA;
        }

        /* Set status as ARM_MATH_SUCCESS */
        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

@ -42,7 +42,7 @@
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results (Unused)
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@ -617,7 +617,7 @@ arm_status arm_mat_mult_q15(
return (status);
}
#else
#else
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
@ -639,8 +639,8 @@ arm_status arm_mat_mult_q15(
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
arm_matrix_instance_q15 BT;
#ifdef ARM_MATH_MATRIX_CHECK
@ -655,89 +655,13 @@ arm_status arm_mat_mult_q15(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read two elements from row */
in = read_q15x2_ia ((q15_t **) &pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pInB++;
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U);
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q15(pSrcB,&BT);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;

@ -3,8 +3,8 @@
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 3 Nov 2021
* $Revision: V1.10.0
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@ -332,45 +332,44 @@ __STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA2;
q31_t *pInB2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q31_t State[numRowsB * numColsB * 1];
q31_t *pSrcBT = State; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
arm_matrix_instance_q31 BT;
#ifdef ARM_MATH_MATRIX_CHECK
q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint32x4_t vecOffs, vecColBOffs;
uint32_t blkCnt, rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
} else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* small squared matrix specialized routines */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
pDst->pData[0] = (q31_t)(sum >> 31);
pOut[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
@ -381,263 +380,246 @@ arm_status arm_mat_mult_q31(
return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * (uint32_t) (numColsB);
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 2;
while (rowCnt > 0U)
{
/*
* Matrix transpose
* Output pointer is set to starting address of the row being processed
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q31(pSrcB, &BT);
px = pOut + i;
i = i + 4 * numColsB;
/*
* Reset the variables for the usage in the following multiplication process
* For every row wise process, the column loop counter is to be initiated
*/
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* main loop
* compute 2 x 2 output blocks
* with dot products (Matrix A rows * Transposed MAtrix B rows)
* column loop
*/
while (row > 0u) {
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* For every row wise process, the column loop counter is to be initiated
* Compute 2 columns and 2 rows in parrallel
* Matrix A columns number of MAC operations are to be performed
*/
col = numColsB >> 1;
j = 0;
q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
q31_t const *pInA0 = pInA;
q31_t const *pInA1 = pInA0 + numColsA;
q31_t const *pInA2 = pInA1 + numColsA;
q31_t const *pInA3 = pInA2 + numColsA;
q63_t acc0, acc1, acc2, acc3;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
pSrcA1Vec = (q31_t const *) pInA1;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcA3Vec = (q31_t const *) pInA3;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
blkCnt--;
}
/*
* column pair loop
* tail
* (will be merged thru tail predication)
*/
while (col > 0u) {
q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q31x4_t vecA, vecA2, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointers
* - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q31_t const *) pInA;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcBVec = (q31_t const *) pInB;
pSrcB2Vec = (q31_t const *) pInB2;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
/* load scheduling */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
}
/* Convert to 1.31 */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
/* Store the results (2 x 2 block) in the destination buffer */
*px++ = (q31_t) acc0;
*px++ = (q31_t) acc2;
*px2++ = (q31_t) acc1;
*px2++ = (q31_t) acc3;
j += numRowsB * 2;
/*
* Decrement the column pair loop counter
*/
col--;
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
}
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
px[0] = (q31_t) acc0;
px[1 * numColsB] = (q31_t) acc1;
px[2 * numColsB] = (q31_t) acc2;
px[3 * numColsB] = (q31_t) acc3;
px++;
/*
* Decrement the row pair loop counter
* Decrement the column loop counter
*/
row--;
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/*
* Compute remaining row and/or column below
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 4);
/*
* Decrement the row loop counter
*/
rowCnt --;
}
rowCnt = row & 3;
while (rowCnt > 0U)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
if (numColsB & 1u) {
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
while (col > 0U)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
q31_t const *pSrcA0Vec;
q31_t const *pInA0 = pInA;
q63_t acc0;
acc0 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* row loop
* tail
* (will be merged thru tail predication)
*/
while (row > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
/* single dot-product */
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
acc0 = asrl(acc0, 23);
*px = (q31_t) acc0;
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
}
}
if (numRowsA & 1u) {
col = numColsB;
i = 0u;
acc0 = asrl(acc0, 23);
px[0] = (q31_t) acc0;
px++;
/*
* point to last row in output matrix
* Decrement the column loop counter
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
col--;
/*
* col loop
* Update the pointer pInB to point to the starting address of the next column
*/
while (col > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Set the variable sum, that acts as accumulator, to zero
*/
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
acc0 = asrl(acc0, 23);
*px++ = (q31_t) acc0;
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += numColsA;
/*
* Decrement the row loop counter
*/
rowCnt--;
}
/*
* Return to application
* set status as ARM_MATH_SUCCESS
*/
return (status);
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else

@ -14,6 +14,7 @@ class BinaryQ31:public Client::Suite
Client::Pattern<q31_t> input1;
Client::Pattern<q31_t> input2;
Client::LocalPattern<q31_t> output;
Client::LocalPattern<q31_t> tmp;
int nbr;
int nbi;
@ -22,5 +23,6 @@ class BinaryQ31:public Client::Suite
arm_matrix_instance_q31 in1;
arm_matrix_instance_q31 in2;
arm_matrix_instance_q31 out;
q31_t *tmpPtr;
};

@ -16,6 +16,8 @@ class BinaryTestsQ31:public Client::Suite
Client::Pattern<q31_t> ref;
Client::Pattern<int16_t> dims;
Client::LocalPattern<q31_t> output;
Client::LocalPattern<q31_t> tmp;
/* Local copies of the inputs, since matrix instances in CMSIS-DSP do not use
pointers to const.

@ -17,6 +17,11 @@
arm_mat_mult_fast_q31(&this->in1,&this->in2,&this->out);
}
void BinaryQ31::test_mat_mult_opt_q31()
{
arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,this->tmpPtr);
}
void BinaryQ31::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
{
@ -35,6 +40,14 @@
output.create(2*this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr);
break;
case BinaryQ31::TEST_MAT_MULT_OPT_Q31_4:
input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi);
input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc);
output.create(this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr);
tmp.create(this->nbi*this->nbc,BinaryQ31::TMP_Q31_ID,mgr);
this->tmpPtr=tmp.ptr();
break;
default:
input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi);
input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc);

@ -16,6 +16,13 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/* Assert that the four float32_t values starting at b are all equal to zero.
   The tests in this file call it on the output pointer right after it has
   been advanced past the samples produced by the function under test. */
static void checkInnerTail(float32_t *b)
{
    for (int k = 0; k < 4; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const float32_t *inp1=input1.ptr(); \
@ -68,6 +75,7 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
@ -99,6 +107,7 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (2*rows * columns);
checkInnerTail(outp);
}

@ -23,6 +23,19 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/* Assert that the eight q15_t values starting at b are all zero.
   Callers in this file apply it to the position just past the output
   produced by a multiply, and to the position just past the internal*columns
   samples of the temporary buffer. */
static void checkInnerTail(q15_t *b)
{
    for (int k = 0; k < 8; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const q15_t *inp1=input1.ptr(); \
@ -39,7 +52,7 @@ a double precision computation.
int i;
#define PREPAREDATA2() \
#define PREPAREDATA2C() \
in1.numRows=rows; \
in1.numCols=internal; \
memcpy((void*)ap,(const void*)inp1,2*sizeof(q15_t)*rows*internal);\
@ -54,29 +67,45 @@ a double precision computation.
out.numCols=columns; \
out.pData = outp;
/* Set up the three matrix instances for a multiplication of real q15_t data.
 *
 * The macro relies on names defined at the expansion site:
 *   rows, internal, columns - dimensions of the current test case
 *   inp1, inp2              - source data for the two operands
 *   ap, bp                  - writable buffers that receive copies of the operands
 *   outp                    - destination for the result
 *   in1, in2, out           - matrix instances being filled in
 *
 * in1 becomes rows x internal, in2 becomes internal x columns and out becomes
 * rows x columns. Each operand is copied with one q15_t per matrix element
 * (sizeof(q15_t) * number of elements bytes) before the instance is pointed
 * at the copy.
 */
#define PREPAREDATA2R() \
in1.numRows=rows; \
in1.numCols=internal; \
memcpy((void*)ap,(const void*)inp1,sizeof(q15_t)*rows*internal);\
in1.pData = ap; \
\
in2.numRows=internal; \
in2.numCols=columns; \
memcpy((void*)bp,(const void*)inp2,sizeof(q15_t)*internal*columns);\
in2.pData = bp; \
\
out.numRows=rows; \
out.numCols=columns; \
out.pData = outp;
void BinaryTestsQ15::test_mat_mult_q15()
{
LOADDATA2();
arm_status status;
for(i=0;i < nbMatrixes ; i ++)
{
rows = *dimsp++;
internal = *dimsp++;
columns = *dimsp++;
PREPAREDATA2();
PREPAREDATA2R();
memset(tmpPtr,0,sizeof(q15_t)*internal*columns + 16);
status=arm_mat_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
checkInnerTail(tmpPtr + internal * columns);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_LOW_THRESHOLD);
@ -99,17 +128,16 @@ a double precision computation.
columns = *dimsp++;
PREPAREDATA2();
PREPAREDATA2C();
status=arm_mat_cmplx_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (2*rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)MULT_SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15);

@ -18,6 +18,14 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/* Assert that the four q31_t values starting at b are all zero.
   Callers in this file apply it to the position just past the output
   produced by a multiply, and to the position just past the internal*columns
   samples of the temporary buffer. */
static void checkInnerTail(q31_t *b)
{
    for (int k = 0; k < 4; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const q31_t *inp1=input1.ptr(); \
@ -68,11 +76,10 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31);
@ -98,10 +105,38 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (2*rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31);
}
void BinaryTestsQ31::test_mat_mult_opt_q31()
{
LOADDATA2();
q31_t *tmpPtr=tmp.ptr();
arm_status status;
for(i=0;i < nbMatrixes ; i ++)
{
rows = *dimsp++;
internal = *dimsp++;
columns = *dimsp++;
PREPAREDATA2();
memset(tmpPtr,0,sizeof(q31_t)*internal*columns + 16);
status=arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
checkInnerTail(tmpPtr + internal*columns);
}
ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
@ -141,6 +176,21 @@ a double precision computation.
b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr);
break;
case TEST_MAT_MULT_OPT_Q31_3:
input1.reload(BinaryTestsQ31::INPUTS1_Q31_ID,mgr);
input2.reload(BinaryTestsQ31::INPUTS2_Q31_ID,mgr);
dims.reload(BinaryTestsQ31::DIMSBINARY1_S16_ID,mgr);
ref.reload(BinaryTestsQ31::REFMUL1_Q31_ID,mgr);
output.create(ref.nbSamples(),BinaryTestsQ31::OUT_Q31_ID,mgr);
a.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPA_Q31_ID,mgr);
b.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr);
tmp.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPC_Q31_ID,mgr);
break;

@ -19,6 +19,26 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 47
/* Assert that the sixteen q7_t values starting at b are all zero.
   Callers in this file apply it to the position just past the output
   produced by a multiply, and to the position just past the internal*columns
   samples of the temporary buffer (both before and after the call). */
static void checkInnerTail(q7_t *b)
{
    for (int k = 0; k < 16; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const q7_t *inp1=input1.ptr(); \
@ -65,12 +85,15 @@ a double precision computation.
columns = *dimsp++;
PREPAREDATA2();
memset(tmpPtr,0,sizeof(q7_t)*internal*columns + 16);
checkInnerTail(tmpPtr + internal*columns);
status=arm_mat_mult_q7(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
checkInnerTail(tmpPtr + internal*columns);
}
ASSERT_EMPTY_TAIL(output);

@ -46,6 +46,14 @@ Comparison for Cholesky
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/* Assert that the four float32_t values starting at b are all equal to zero.
   The tests in this file call it on an output pointer after it has been
   advanced past the samples written for the current test case. */
static void checkInnerTailOverflow(float32_t *b)
{
    for (int k = 0; k < 4; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const float32_t *inp1=input1.ptr(); \
const float32_t *inp2=input2.ptr(); \
@ -192,6 +200,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32()
arm_mat_vec_mult_f32(&this->in1, bp, outp);
outp += rows ;
checkInnerTailOverflow(outp);
}
@ -219,6 +228,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -246,6 +256,7 @@ void UnaryTestsF32::test_mat_sub_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -273,6 +284,7 @@ void UnaryTestsF32::test_mat_scale_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -300,6 +312,7 @@ void UnaryTestsF32::test_mat_trans_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -327,6 +340,7 @@ void UnaryTestsF32::test_mat_cmplx_trans_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += 2*(rows * columns);
checkInnerTailOverflow(outp);
}
@ -421,6 +435,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
outp += (rows * columns);
inp1 += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -461,6 +476,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
outp += (rows * columns);
inp1 += (rows * rows);
inp2 += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -501,6 +517,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
outp += (rows * columns);
inp1 += (rows * rows);
inp2 += (rows * columns);
checkInnerTailOverflow(outp);
}
@ -668,6 +685,9 @@ void UnaryTestsF32::test_mat_inverse_f32()
inp1 += (rows * columns);
checkInnerTailOverflow(outllp);
checkInnerTailOverflow(outdp);
}

@ -18,6 +18,31 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/* Write the eight-value marker pattern 1, -1, 2, -2, 3, -3, 4, -4 at b.
   The tests in this file place it just past the region the function under
   test is expected to fill, before calling that function. */
static void refInnerTail(q15_t *b)
{
    for (int k = 0; k < 4; k++)
    {
        const q15_t magnitude = (q15_t)(k + 1);

        b[2 * k]     = magnitude;
        b[2 * k + 1] = (q15_t)(-magnitude);
    }
}
/* Assert that the eight q15_t values starting at b still hold the marker
   pattern 1, -1, 2, -2, 3, -3, 4, -4. The tests in this file call it on the
   output pointer after advancing it past the samples of the current case. */
static void checkInnerTail(q15_t *b)
{
    for (int k = 0; k < 4; k++)
    {
        const int magnitude = k + 1;

        ASSERT_TRUE(b[2 * k] == magnitude);
        ASSERT_TRUE(b[2 * k + 1] == -magnitude);
    }
}
#define LOADDATA2() \
const q15_t *inp1=input1.ptr(); \
const q15_t *inp2=input2.ptr(); \
@ -127,14 +152,14 @@ a double precision computation.
internal = *dimsp++;
PREPAREVECDATA2();
refInnerTail(outp + rows);
arm_mat_vec_mult_q15(&this->in1, bp, outp);
outp += rows ;
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@ -153,15 +178,15 @@ a double precision computation.
columns = *dimsp++;
PREPAREDATA2();
refInnerTail(outp + rows * columns);
status=arm_mat_add_q15(&this->in1,&this->in2,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@ -180,15 +205,15 @@ void UnaryTestsQ15::test_mat_sub_q15()
columns = *dimsp++;
PREPAREDATA2();
refInnerTail(outp + rows * columns);
status=arm_mat_sub_q15(&this->in1,&this->in2,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@ -207,15 +232,15 @@ void UnaryTestsQ15::test_mat_scale_q15()
columns = *dimsp++;
PREPAREDATA1(false);
refInnerTail(outp + rows * columns);
status=arm_mat_scale_q15(&this->in1,ONEHALF,0,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@ -234,16 +259,15 @@ void UnaryTestsQ15::test_mat_trans_q15()
columns = *dimsp++;
PREPAREDATA1(true);
refInnerTail(outp + rows * columns);
status=arm_mat_trans_q15(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15);
@ -261,15 +285,15 @@ void UnaryTestsQ15::test_mat_cmplx_trans_q15()
columns = *dimsp++;
PREPAREDATA1C(true);
refInnerTail(outp + 2*rows * columns);
status=arm_mat_cmplx_trans_q15(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += 2*(rows * columns);
checkInnerTail(outp);
}
ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

@ -18,6 +18,15 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
/* Assert that the four q31_t values starting at b are all zero.
   The tests in this file call it on the output pointer right after it has
   been advanced past the samples written for the current test case. */
static void checkInnerTail(q31_t *b)
{
    for (int k = 0; k < 4; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const q31_t *inp1=input1.ptr(); \
const q31_t *inp2=input2.ptr(); \
@ -129,6 +138,7 @@ a double precision computation.
arm_mat_vec_mult_q31(&this->in1, bp, outp);
outp += rows ;
checkInnerTail(outp);
}
@ -156,6 +166,7 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
@ -183,6 +194,7 @@ void UnaryTestsQ31::test_mat_sub_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
@ -210,6 +222,7 @@ void UnaryTestsQ31::test_mat_scale_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
@ -237,6 +250,7 @@ void UnaryTestsQ31::test_mat_trans_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}
@ -264,6 +278,7 @@ void UnaryTestsQ31::test_mat_cmplx_trans_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += 2*(rows * columns);
checkInnerTail(outp);
}

@ -19,6 +19,27 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 47
/* Assert that the sixteen q7_t values starting at b are all zero.
   The tests in this file call it on the output pointer right after it has
   been advanced past the samples written for the current test case. */
static void checkInnerTail(q7_t *b)
{
    for (int k = 0; k < 16; k++)
    {
        ASSERT_TRUE(b[k] == 0);
    }
}
#define LOADDATA2() \
const q7_t *inp1=input1.ptr(); \
const q7_t *inp2=input2.ptr(); \
@ -112,6 +133,7 @@ a double precision computation.
arm_mat_vec_mult_q7(&this->in1, bp, outp);
outp += rows ;
checkInnerTail(outp);
}
@ -132,13 +154,13 @@ void UnaryTestsQ7::test_mat_trans_q7()
{
rows = *dimsp++;
columns = *dimsp++;
PREPAREDATA1(true);
status=arm_mat_trans_q7(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
checkInnerTail(outp);
}

@ -1583,6 +1583,7 @@ group Root {
Pattern INPUTAC_Q31_ID : InputAC1_q31.txt
Pattern INPUTBC_Q31_ID : InputBC1_q31.txt
Output OUT_Q31_ID : Output
Output TMP_Q31_ID : Temp
Params PARAM1_ID = {
NBR = [5,10,40]
@ -1595,6 +1596,7 @@ group Root {
Matrix Multiplication:test_mat_mult_q31
Complex Matrix Multiplication:test_mat_cmplx_mult_q31
Fast Matrix Multiplication:test_mat_mult_fast_q31
Opt Matrix Multiplication:test_mat_mult_opt_q31
} -> PARAM1_ID
}
@ -1614,6 +1616,7 @@ group Root {
Pattern INPUTAC_Q15_ID : InputAC1_q15.txt
Pattern INPUTBC_Q15_ID : InputBC1_q15.txt
Output OUT_Q15_ID : Output
Output TMP_Q15_ID : Temp
Params PARAM1_ID = {
NBR = [5,10,40]

@ -3459,10 +3459,12 @@ group Root {
Output OUT_Q31_ID : Output
Output TMPA_Q31_ID : TmpA
Output TMPB_Q31_ID : TmpB
Output TMPC_Q31_ID : TmpC
Functions {
test mult:test_mat_mult_q31
test complex mult:test_mat_cmplx_mult_q31
test mult opt:test_mat_mult_opt_q31
}
}

Loading…
Cancel
Save