diff --git a/Include/dsp/matrix_functions.h b/Include/dsp/matrix_functions.h
index 3d444883..f3801a98 100755
--- a/Include/dsp/matrix_functions.h
+++ b/Include/dsp/matrix_functions.h
@@ -444,6 +444,21 @@ arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst);
+ /**
+ * @brief Q31 matrix multiplication using a caller-provided scratch buffer
+ * @param[in] pSrcA points to the first input matrix structure
+ * @param[in] pSrcB points to the second input matrix structure
+ * @param[out] pDst points to output matrix structure
+ * @param[in] pState points to the scratch array for intermediate results (receives the transpose of pSrcB in the MVE build, ignored by the generic build)
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+arm_status arm_mat_mult_opt_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst,
+ q31_t *pState);
+
/**
* @brief Q31 matrix and vector multiplication
* @param[in] pSrcMat points to the input matrix structure
diff --git a/Source/MatrixFunctions/MatrixFunctions.c b/Source/MatrixFunctions/MatrixFunctions.c
index cad08519..d4fa42c1 100644
--- a/Source/MatrixFunctions/MatrixFunctions.c
+++ b/Source/MatrixFunctions/MatrixFunctions.c
@@ -44,6 +44,7 @@
#include "arm_mat_mult_q7.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
+#include "arm_mat_mult_opt_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"
diff --git a/Source/MatrixFunctions/arm_mat_mult_opt_q31.c b/Source/MatrixFunctions/arm_mat_mult_opt_q31.c
new file mode 100755
index 00000000..91b1bcd5
--- /dev/null
+++ b/Source/MatrixFunctions/arm_mat_mult_opt_q31.c
@@ -0,0 +1,784 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_opt_q31.c
+ * Description: Q31 matrix multiplication
+ *
+ * $Date: 3 Nov 2021
+ * $Revision: V1.10.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixMult
+ @{
+ */
+
+/**
+ @brief Q31 matrix multiplication.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @param[in] pState points to the scratch array for intermediate results (MVE build: receives the transpose of pSrcB, i.e. numRowsB * numColsB elements; unused otherwise)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit. There is no saturation
+ on intermediate additions. Thus, if the accumulator overflows it wraps around and
+ distorts the result. The input signals should be scaled down to avoid intermediate
+ overflows. The caller should therefore scale the inputs down by log2(numColsA) bits
+ to avoid overflows, as a total of numColsA additions are performed internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ @remark
+ Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
+ @remark
+ This function is a faster implementation of arm_mat_mult_q31 for MVE, but it requires
+ additional storage (pState) for intermediate results.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MATRIX_DIM2 2
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+__STATIC_INLINE arm_status arm_mat_mult_opt_q31_2x2_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{ /* Fully unrolled 2x2 case: one B column is gathered, then multiplied with both A rows */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs; /* gather offsets used to read one column of B */
+ q31_t *pInA0 = pInA; /* row 0 of A */
+ q31_t *pInA1 = pInA0 + MATRIX_DIM2; /* row 1 of A */
+ q63_t acc0, acc1; /* one accumulator per output row */
+ q31x4_t vecB, vecA0, vecA1;
+ /* enable predication to disable half of vector elements */
+ mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
+
+ vecColBOffs = vidupq_u32((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * MATRIX_DIM2; /* lane index scaled by the row length of B */
+
+ pInB = pSrcB->pData;
+
+ /* load 1st B column (partial load) */
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+ /* load A rows */
+ vecA0 = vldrwq_s32(pInA0); /* NOTE(review): unpredicated 4-lane load from a 2-element row */
+ vecA1 = vldrwq_s32(pInA1); /* NOTE(review): same here, reads past the 2x2 data - confirm this is acceptable */
+
+ acc0 = vrmlaldavhq(vecA0, vecB);
+ acc1 = vrmlaldavhq(vecA1, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+
+ pOut[0 * MATRIX_DIM2] = (q31_t) acc0; /* output row 0, current column */
+ pOut[1 * MATRIX_DIM2] = (q31_t) acc1; /* output row 1, current column */
+ pOut++; /* next output column */
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+ acc0 = vrmlaldavhq(vecA0, vecB); /* A rows are reused from the first column */
+ acc1 = vrmlaldavhq(vecA1, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+
+ pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+__STATIC_INLINE arm_status arm_mat_mult_opt_q31_3x3_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{ /* Fully unrolled 3x3 case: each B column is gathered, then multiplied with the three A rows */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs; /* gather offsets used to read one column of B */
+ q31_t *pInA0 = pInA; /* row 0 of A */
+ q31_t *pInA1 = pInA0 + MATRIX_DIM3; /* row 1 of A */
+ q31_t *pInA2 = pInA1 + MATRIX_DIM3; /* row 2 of A */
+ q63_t acc0, acc1, acc2; /* one accumulator per output row */
+ q31x4_t vecB, vecA;
+ /* enable predication to disable last (4th) vector element */
+ mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
+
+ vecColBOffs = vidupq_u32((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * MATRIX_DIM3; /* lane index scaled by the row length of B */
+
+ pInB = pSrcB->pData;
+
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0); /* load 1st B column (partial load) */
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2); /* NOTE(review): unpredicated 4-lane load from the last 3-element row - confirm the over-read is acceptable */
+ acc2 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+
+ pOut[0 * MATRIX_DIM3] = (q31_t) acc0; /* output row 0, current column */
+ pOut[1 * MATRIX_DIM3] = (q31_t) acc1; /* output row 1, current column */
+ pOut[2 * MATRIX_DIM3] = (q31_t) acc2; /* output row 2, current column */
+ pOut++; /* next output column */
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2);
+ acc2 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+
+ pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
+ pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
+ pOut++; /* next output column */
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2);
+ acc2 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+
+ pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
+ pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+__STATIC_INLINE arm_status arm_mat_mult_opt_q31_4x4_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{ /* Fully unrolled 4x4 case: each B column is gathered, then multiplied with the four A rows */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs; /* gather offsets used to read one column of B */
+ q31_t *pInA0 = pInA; /* row 0 of A */
+ q31_t *pInA1 = pInA0 + MATRIX_DIM4; /* row 1 of A */
+ q31_t *pInA2 = pInA1 + MATRIX_DIM4; /* row 2 of A */
+ q31_t *pInA3 = pInA2 + MATRIX_DIM4; /* row 3 of A */
+ q63_t acc0, acc1, acc2, acc3; /* one accumulator per output row */
+ q31x4_t vecB, vecA;
+
+ vecColBOffs = vidupq_u32((uint32_t)0, 4); /* step of 4 elements = one row of B; no predicate needed, all 4 lanes are used */
+
+ pInB = pSrcB->pData;
+
+ vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs); /* load 1st B column */
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2);
+ acc2 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA3);
+ acc3 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ pOut[0 * MATRIX_DIM4] = (q31_t) acc0; /* output row 0, current column */
+ pOut[1 * MATRIX_DIM4] = (q31_t) acc1; /* output row 1, current column */
+ pOut[2 * MATRIX_DIM4] = (q31_t) acc2; /* output row 2, current column */
+ pOut[3 * MATRIX_DIM4] = (q31_t) acc3; /* output row 3, current column */
+ pOut++; /* next output column */
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2);
+ acc2 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA3);
+ acc3 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
+ pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
+ pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
+
+ pOut++; /* next output column */
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2);
+ acc2 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA3);
+ acc3 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
+ pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
+ pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
+
+ pOut++; /* next output column */
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA1);
+ acc1 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA2);
+ acc2 = vrmlaldavhq(vecA, vecB);
+ vecA = vldrwq_s32(pInA3);
+ acc3 = vrmlaldavhq(vecA, vecB);
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
+ pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
+ pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+arm_status arm_mat_mult_opt_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst,
+ q31_t *pState)
+{
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA2; /* second A row of the current row pair */
+ q31_t *pInB2; /* second transposed-B row of the current column pair */
+ q31_t *px; /* Output pointer for the first row of the current row pair */
+ q31_t *px2; /* Output pointer for the second row of the current row pair */
+ uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
+ uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
+ q31_t *pSrcBT = pState; /* scratch buffer receiving the transpose of matrix B */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* Status of matrix multiplication */
+ arm_matrix_instance_q31 BT; /* descriptor of the transposed matrix B */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ } else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+
+ /* small square matrix specialized routines (these do not use pState) */
+ if(numRowsA == numColsB && numColsB == numColsA) {
+ if (numRowsA == 1)
+ {
+ q63_t sum = (q63_t) *pInA * *pInB; /* 1x1 case: a single product */
+ pDst->pData[0] = (q31_t)(sum >> 31); /* 2.62 -> 1.31 */
+ return (ARM_MATH_SUCCESS);
+ }
+ else if(numRowsA == 2)
+ return arm_mat_mult_opt_q31_2x2_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 3)
+ return arm_mat_mult_opt_q31_3x3_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 4)
+ return arm_mat_mult_opt_q31_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+
+ /*
+ * Transpose matrix B into the pState scratch buffer
+ */
+ BT.numRows = numColsB;
+ BT.numCols = numRowsB;
+ BT.pData = pSrcBT;
+
+ arm_mat_trans_q31(pSrcB, &BT);
+
+
+ /*
+ * Reset the variables for the usage in the following multiplication process
+ */
+ i = 0;
+ row = numRowsA >> 1;
+ px = pDst->pData;
+ px2 = px + numColsB;
+
+ /*
+ * main loop
+ * compute 2 x 2 output blocks
+ * with dot products (Matrix A rows * Transposed Matrix B rows)
+ */
+ while (row > 0u) {
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ * Compute 2 columns and 2 rows in parallel
+ */
+ col = numColsB >> 1;
+ j = 0;
+
+ /*
+ * column pair loop
+ */
+ while (col > 0u) {
+ q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
+ q31x4_t vecA, vecA2, vecB, vecB2;
+ q63_t acc0, acc1, acc2, acc3;
+
+ /*
+ * Initiate the pointers
+ * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
+ * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
+ */
+ pInA = pSrcA->pData + i;
+ pInA2 = pInA + numColsA;
+ pInB = pSrcBT + j;
+ pInB2 = pInB + numRowsB;
+
+
+ pSrcAVec = (q31_t const *) pInA;
+ pSrcA2Vec = (q31_t const *) pInA2;
+ pSrcBVec = (q31_t const *) pInB;
+ pSrcB2Vec = (q31_t const *) pInB2;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
+ /* load scheduling: the first A vector is fetched ahead of the loop */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+
+ blkCnt = (numColsA / 4);
+ while (blkCnt > 0U) {
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB); /* row A x row B' */
+ vecA2 = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc1 = vrmlaldavhaq(acc1, vecA2, vecB); /* row A2 x row B' */
+ vecB2 = vld1q(pSrcB2Vec);
+ pSrcB2Vec += 4;
+ acc2 = vrmlaldavhaq(acc2, vecA, vecB2); /* row A x row B2' */
+ vecA = vld1q(pSrcAVec); /* pre-load the A vector for the next iteration (or the tail) */
+ pSrcAVec += 4;
+ acc3 = vrmlaldavhaq(acc3, vecA2, vecB2); /* row A2 x row B2' */
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numColsA & 3);
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt); /* only the first blkCnt lanes take part in the MACs below */
+ vecB = vld1q(pSrcBVec); /* loads are full width; the predicate is applied in the MAC */
+ acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
+ vecA2 = vld1q(pSrcA2Vec);
+ acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
+ vecB2 = vld1q(pSrcB2Vec);
+ acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
+ vecA = vld1q(pSrcAVec); /* NOTE(review): vecA is not read again after this load - looks removable, confirm */
+ acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
+ }
+
+ /* Convert to 1.31 */
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ /* Store the results (2 x 2 block) in the destination buffer */
+ *px++ = (q31_t) acc0;
+ *px++ = (q31_t) acc2;
+ *px2++ = (q31_t) acc1;
+ *px2++ = (q31_t) acc3;
+
+ j += numRowsB * 2;
+ /*
+ * Decrement the column pair loop counter
+ */
+ col--;
+
+ }
+
+ i = i + numColsA * 2;
+ px = px2 + (numColsB & 1u); /* skip the odd last column, it is computed separately below */
+ px2 = px + numColsB;
+ /*
+ * Decrement the row pair loop counter
+ */
+ row--;
+ }
+
+ /*
+ * Compute remaining row and/or column below
+ */
+ if (numColsB & 1u) {
+ row = numRowsA & (~0x1); // even row count only: an odd last row is fully handled by the next block
+ px = pDst->pData + numColsB - 1;
+ i = 0;
+
+ /*
+ * row loop
+ */
+ while (row > 0) {
+ q31_t const *pSrcAVec, *pSrcBVec;
+ q31x4_t vecA, vecB;
+ q63_t acc0;
+
+ /*
+ * point to last column in matrix B (last row of the transposed copy)
+ */
+ pInB = pSrcBT + numRowsB * (numColsB - 1);
+ pInA = pSrcA->pData + i;
+
+ pSrcAVec = (q31_t const *) pInA;
+ pSrcBVec = (q31_t const *) pInB;
+
+ /* single dot-product */
+ acc0 = 0LL;
+ blkCnt = (numColsA / 4);
+ while (blkCnt > 0U) {
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numColsA & 3);
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
+ }
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ *px = (q31_t) acc0;
+
+ px += numColsB; /* same column, next output row */
+
+ i += numColsA;
+ /*
+ * Decrement the row loop counter
+ */
+ row--;
+ }
+ }
+
+ if (numRowsA & 1u) {
+ col = numColsB;
+ i = 0u;
+ /*
+ * point to last row in output matrix
+ */
+ px = pDst->pData + (numColsB) * (numRowsA - 1);
+ /*
+ * col loop
+ */
+ while (col > 0) {
+ q31_t const *pSrcAVec, *pSrcBVec;
+ q31x4_t vecA, vecB;
+ q63_t acc0;
+
+ /*
+ * point to last row in matrix A
+ */
+ pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
+ pInB = pSrcBT + i;
+
+ /*
+ * Set up the source pointers and clear the accumulator acc0
+ */
+ pSrcAVec = (q31_t const *) pInA;
+ pSrcBVec = (q31_t const *) pInB;
+ acc0 = 0LL;
+
+ blkCnt = (numColsA / 4);
+ while (blkCnt > 0U) {
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numColsA & 3);
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
+ }
+
+ acc0 = asrl(acc0, 23); /* Convert to 1.31 */
+ *px++ = (q31_t) acc0;
+
+ i += numColsA; /* next row of the transposed B (row length numRowsB; equals numColsA for matching sizes) */
+ /*
+ * Decrement the col loop counter
+ */
+ col--;
+ }
+ }
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /*
+ * Return to application
+ */
+ return (status);
+}
+
+#else
+arm_status arm_mat_mult_opt_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst,
+ q31_t *pState)
+{
+ q31_t *pIn1 = pSrcA->pData; /* Running pointer into the current row of matrix A */
+ q31_t *pIn2 = pSrcB->pData; /* Running pointer into the current column of matrix B */
+ q31_t *pInA = pSrcA->pData; /* Start of the row of matrix A being processed */
+ q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* Output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ q63_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+ (void)pState; /* The scratch buffer is not used by this implementation */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initialize pointer pIn1 to point to starting address of the row of A being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB; /* step down one row within the same column of B */
+
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining MACs */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize colCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Convert result from 2.62 to 1.31 format and store in destination buffer */
+ *px++ = (q31_t) (sum >> 31);
+
+ /* Decrement column loop counter */
+ col--;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while (col > 0U);
+
+ /* Advance the output offset and pointer pInA to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixMult group
+ */
diff --git a/Source/MatrixFunctions/arm_mat_mult_q15.c b/Source/MatrixFunctions/arm_mat_mult_q15.c
index 9219ed02..3e1172c5 100644
--- a/Source/MatrixFunctions/arm_mat_mult_q15.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q15.c
@@ -42,7 +42,7 @@
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
- @param[in] pState points to the array for storing intermediate results (Unused)
+ @param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@@ -617,7 +617,7 @@ arm_status arm_mat_mult_q15(
return (status);
}
-#else
+#else
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
@@ -639,8 +639,8 @@ arm_status arm_mat_mult_q15(
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
- q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
+ arm_matrix_instance_q15 BT;
#ifdef ARM_MATH_MATRIX_CHECK
@@ -655,89 +655,13 @@ arm_status arm_mat_mult_q15(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
{
- /* Matrix transpose */
- do
- {
- /* The pointer px is set to starting address of column being processed */
- px = pSrcBT + i;
-
- /* Apply loop unrolling and exchange columns with row elements */
- col = numColsB >> 2U;
-
- /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (col > 0U)
- {
- /* Read two elements from row */
- in = read_q15x2_ia ((q15_t **) &pInB);
-
- /* Unpack and store one element in destination */
-#ifndef ARM_MATH_BIG_ENDIAN
- *px = (q15_t) in;
-#else
- *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update pointer px to point to next row of transposed matrix */
- px += numRowsB;
-
- /* Unpack and store second element in destination */
-#ifndef ARM_MATH_BIG_ENDIAN
- *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#else
- *px = (q15_t) in;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
- /* Update pointer px to point to next row of transposed matrix */
- px += numRowsB;
-
- /* Read two elements from row */
- in = read_q15x2_ia ((q15_t **) &pInB);
-
- /* Unpack and store one element in destination */
-#ifndef ARM_MATH_BIG_ENDIAN
- *px = (q15_t) in;
-#else
- *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- px += numRowsB;
-
-#ifndef ARM_MATH_BIG_ENDIAN
- *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#else
- *px = (q15_t) in;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
- px += numRowsB;
- /* Decrement column loop counter */
- col--;
- }
-
- /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
- ** No loop unrolling is used. */
- col = numColsB % 0x4U;
-
- while (col > 0U)
- {
- /* Read and store input element in destination */
- *px = *pInB++;
-
- /* Update pointer px to point to next row of transposed matrix */
- px += numRowsB;
-
- /* Decrement column loop counter */
- col--;
- }
-
- i++;
-
- /* Decrement row loop counter */
- row--;
-
- } while (row > 0U);
+ BT.numRows = numColsB;
+ BT.numCols = numRowsB;
+ BT.pData = pSrcBT;
+ arm_mat_trans_q15(pSrcB,&BT);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
diff --git a/Source/MatrixFunctions/arm_mat_mult_q31.c b/Source/MatrixFunctions/arm_mat_mult_q31.c
index 08001cc2..18738279 100644
--- a/Source/MatrixFunctions/arm_mat_mult_q31.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q31.c
@@ -3,8 +3,8 @@
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
- * $Date: 3 Nov 2021
- * $Revision: V1.10.0
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -332,45 +332,44 @@ __STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
return (ARM_MATH_SUCCESS);
}
-
arm_status arm_mat_mult_q31(
- const arm_matrix_instance_q31 * pSrcA,
- const arm_matrix_instance_q31 * pSrcB,
- arm_matrix_instance_q31 * pDst)
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
{
- q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
- q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
- q31_t *pInA2;
- q31_t *pInB2;
- q31_t *px; /* Temporary output data matrix pointer */
- q31_t *px2; /* Temporary output data matrix pointer */
- uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
- uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
- uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
- uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
- uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
- q31_t State[numRowsB * numColsB * 1];
- q31_t *pSrcBT = State; /* input data matrix pointer for transpose */
- uint32_t blkCnt; /* loop counters */
- arm_status status; /* Status of matrix multiplication */
- arm_matrix_instance_q31 BT;
-#ifdef ARM_MATH_MATRIX_CHECK
+ q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */
+ q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t col, i = 0U, row = numRowsA; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ uint32x4_t vecOffs, vecColBOffs;
+ uint32_t blkCnt, rowCnt; /* loop counters */
+
+ #ifdef ARM_MATH_MATRIX_CHECK
- /* Check for matrix mismatch condition */
- if ((pSrcA->numCols != pSrcB->numRows) ||
- (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- } else
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
- {
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
- /* small squared matrix specialized routines */
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
- pDst->pData[0] = (q31_t)(sum >> 31);
+ pOut[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
@@ -381,263 +380,246 @@ arm_status arm_mat_mult_q31(
return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
+ vecColBOffs = vidupq_u32((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * (uint32_t) (numColsB);
+ /*
+ * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+ */
+
+ /*
+ * row loop
+ */
+ rowCnt = row >> 2;
+ while (rowCnt > 0U)
+ {
/*
- * Matrix transpose
+ * Output pointer is set to starting address of the row being processed
*/
- BT.numRows = numColsB;
- BT.numCols = numRowsB;
- BT.pData = pSrcBT;
-
- arm_mat_trans_q31(pSrcB, &BT);
-
-
+ px = pOut + i;
+ i = i + 4 * numColsB;
/*
- * Reset the variables for the usage in the following multiplication process
+ * For every row wise process, the column loop counter is to be initiated
*/
- i = 0;
- row = numRowsA >> 1;
- px = pDst->pData;
- px2 = px + numColsB;
-
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (q31_t const *)pSrcB->pData;
/*
- * main loop
- * compute 2 x 2 output blocks
- * with dot products (Matrix A rows * Transposed MAtrix B rows)
+ * column loop
*/
- while (row > 0u) {
+ while (col > 0U)
+ {
+ /*
+             * generate 4 output elements: one output column for 4 consecutive rows
+ */
/*
- * For every row wise process, the column loop counter is to be initiated
- * Compute 2 columns and 2 rows in parrallel
+ * Matrix A columns number of MAC operations are to be performed
*/
- col = numColsB >> 1;
- j = 0;
+
+ q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+ q31_t const *pInA0 = pInA;
+ q31_t const *pInA1 = pInA0 + numColsA;
+ q31_t const *pInA2 = pInA1 + numColsA;
+ q31_t const *pInA3 = pInA2 + numColsA;
+ q63_t acc0, acc1, acc2, acc3;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
+ pSrcA0Vec = (q31_t const *) pInA0;
+ pSrcA1Vec = (q31_t const *) pInA1;
+ pSrcA2Vec = (q31_t const *) pInA2;
+ pSrcA3Vec = (q31_t const *) pInA3;
+
+ vecOffs = vecColBOffs;
+
+ /* process 1 x 4 block output */
+ blkCnt = numColsA >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+ /* move Matrix B read offsets, 4 rows down */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+ vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
+ acc1 = vrmlaldavhaq(acc1, vecA, vecB);
+ vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
+ acc2 = vrmlaldavhaq(acc2, vecA, vecB);
+ vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
+ acc3 = vrmlaldavhaq(acc3, vecA, vecB);
+ blkCnt--;
+ }
/*
- * column pair loop
+ * tail
+ * (will be merged thru tail predication)
*/
- while (col > 0u) {
- q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
- q31x4_t vecA, vecA2, vecB, vecB2;
- q63_t acc0, acc1, acc2, acc3;
-
- /*
- * Initiate the pointers
- * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
- * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
- */
- pInA = pSrcA->pData + i;
- pInA2 = pInA + numColsA;
- pInB = pSrcBT + j;
- pInB2 = pInB + numRowsB;
-
-
- pSrcAVec = (q31_t const *) pInA;
- pSrcA2Vec = (q31_t const *) pInA2;
- pSrcBVec = (q31_t const *) pInB;
- pSrcB2Vec = (q31_t const *) pInB2;
-
- acc0 = 0LL;
- acc1 = 0LL;
- acc2 = 0LL;
- acc3 = 0LL;
-
- /* load scheduling */
- vecA = vld1q(pSrcAVec);
- pSrcAVec += 4;
-
- blkCnt = (numColsA / 4);
- while (blkCnt > 0U) {
- vecB = vld1q(pSrcBVec);
- pSrcBVec += 4;
- acc0 = vrmlaldavhaq(acc0, vecA, vecB);
- vecA2 = vld1q(pSrcA2Vec);
- pSrcA2Vec += 4;
- acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
- vecB2 = vld1q(pSrcB2Vec);
- pSrcB2Vec += 4;
- acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
- vecA = vld1q(pSrcAVec);
- pSrcAVec += 4;
- acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
-
- blkCnt--;
- }
- /*
- * tail
- * (will be merged thru tail predication)
- */
- blkCnt = (numColsA & 3);
- if (blkCnt > 0U) {
- mve_pred16_t p0 = vctp32q(blkCnt);
- vecB = vld1q(pSrcBVec);
- acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
- vecA2 = vld1q(pSrcA2Vec);
- acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
- vecB2 = vld1q(pSrcB2Vec);
- acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
- vecA = vld1q(pSrcAVec);
- acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
- }
-
- /* Convert to 1.31 */
- acc0 = asrl(acc0, 23);
- acc1 = asrl(acc1, 23);
- acc2 = asrl(acc2, 23);
- acc3 = asrl(acc3, 23);
-
- /* Store the results (2 x 2 block) in the destination buffer */
- *px++ = (q31_t) acc0;
- *px++ = (q31_t) acc2;
- *px2++ = (q31_t) acc1;
- *px2++ = (q31_t) acc3;
-
- j += numRowsB * 2;
- /*
- * Decrement the column pair loop counter
- */
- col--;
-
+ blkCnt = numColsA & 3;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp32q(blkCnt);
+                q31x4_t   vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+                /* last block of the dot product: vecOffs does not need to advance */
+                /* predicated (zeroing) loads of A: never read past the end of a row / of the A buffer */
+                vecA = vld1q_z(pSrcA0Vec, p0);
+                acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+                vecA = vld1q_z(pSrcA1Vec, p0);
+                acc1 = vrmlaldavhaq(acc1, vecA, vecB);
+                vecA = vld1q_z(pSrcA2Vec, p0);
+                acc2 = vrmlaldavhaq(acc2, vecA, vecB);
+                vecA = vld1q_z(pSrcA3Vec, p0);
+                acc3 = vrmlaldavhaq(acc3, vecA, vecB);
}
- i = i + numColsA * 2;
- px = px2 + (numColsB & 1u);
- px2 = px + numColsB;
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ px[0] = (q31_t) acc0;
+ px[1 * numColsB] = (q31_t) acc1;
+ px[2 * numColsB] = (q31_t) acc2;
+ px[3 * numColsB] = (q31_t) acc3;
+ px++;
/*
- * Decrement the row pair loop counter
+ * Decrement the column loop counter
*/
- row--;
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/*
- * Compute remaining row and/or column below
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += (numColsA * 4);
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt --;
+
+ }
+ rowCnt = row & 3;
+ while (rowCnt > 0U)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i;
+ i = i + numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (q31_t const *)pSrcB->pData;
+ /*
+ * column loop
*/
- if (numColsB & 1u) {
- row = numRowsA & (~0x1); //avoid redundant computation
- px = pDst->pData + numColsB - 1;
- i = 0;
+ while (col > 0U)
+ {
+ /*
+             * generate one output element (one remaining row of A times one column of B)
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q31_t const *pSrcA0Vec;
+ q31_t const *pInA0 = pInA;
+ q63_t acc0;
+
+ acc0 = 0LL;
+
+
+ pSrcA0Vec = (q31_t const *) pInA0;
+
+ vecOffs = vecColBOffs;
+
+ /* process 1 x 4 block output */
+ blkCnt = numColsA >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+ /* move Matrix B read offsets, 4 rows down */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+ blkCnt--;
+ }
/*
- * row loop
+ * tail
+ * (will be merged thru tail predication)
*/
- while (row > 0) {
- q31_t const *pSrcAVec, *pSrcBVec;
- q31x4_t vecA, vecB;
- q63_t acc0;
-
- /*
- * point to last column in matrix B
- */
- pInB = pSrcBT + numRowsB * (numColsB - 1);
- pInA = pSrcA->pData + i;
-
- pSrcAVec = (q31_t const *) pInA;
- pSrcBVec = (q31_t const *) pInB;
-
- /* single dot-product */
- acc0 = 0LL;
- blkCnt = (numColsA / 4);
- while (blkCnt > 0U) {
- vecA = vld1q(pSrcAVec);
- pSrcAVec += 4;
- vecB = vld1q(pSrcBVec);
- pSrcBVec += 4;
- acc0 = vrmlaldavhaq(acc0, vecA, vecB);
-
- blkCnt--;
- }
- /*
- * tail
- * (will be merged thru tail predication)
- */
- blkCnt = (numColsA & 3);
- if (blkCnt > 0U) {
- mve_pred16_t p0 = vctp32q(blkCnt);
- vecA = vld1q(pSrcAVec);
- vecB = vld1q(pSrcBVec);
- acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
- }
-
- acc0 = asrl(acc0, 23);
- *px = (q31_t) acc0;
-
- px += numColsB;
-
- i += numColsA;
- /*
- * Decrement the row loop counter
- */
- row--;
+ blkCnt = numColsA & 3;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp32q(blkCnt);
+                q31x4_t   vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+                /* last block of the dot product: vecOffs does not need to advance */
+
+                /* predicated (zeroing) load of A: never read past the end of the row / of the A buffer */
+                vecA = vld1q_z(pSrcA0Vec, p0);
+                acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
}
- }
- if (numRowsA & 1u) {
- col = numColsB;
- i = 0u;
+ acc0 = asrl(acc0, 23);
+
+
+ px[0] = (q31_t) acc0;
+ px++;
/*
- * point to last row in output matrix
+ * Decrement the column loop counter
*/
- px = pDst->pData + (numColsB) * (numRowsA - 1);
+ col--;
/*
- * col loop
+ * Update the pointer pInB to point to the starting address of the next column
*/
- while (col > 0) {
- q31_t const *pSrcAVec, *pSrcBVec;
- q31x4_t vecA, vecB;
- q63_t acc0;
-
- /*
- * point to last row in matrix A
- */
- pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
- pInB = pSrcBT + i;
-
- /*
- * Set the variable sum, that acts as accumulator, to zero
- */
- pSrcAVec = (q31_t const *) pInA;
- pSrcBVec = (q31_t const *) pInB;
- acc0 = 0LL;
-
- blkCnt = (numColsA / 4);
- while (blkCnt > 0U) {
- vecA = vld1q(pSrcAVec);
- pSrcAVec += 4;
- vecB = vld1q(pSrcBVec);
- pSrcBVec += 4;
- acc0 = vrmlaldavhaq(acc0, vecA, vecB);
-
- blkCnt--;
- }
- /*
- * tail
- * (will be merged thru tail predication)
- */
- blkCnt = (numColsA & 3);
- if (blkCnt > 0U) {
- mve_pred16_t p0 = vctp32q(blkCnt);
- vecA = vld1q(pSrcAVec);
- vecB = vld1q(pSrcBVec);
- acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
- }
-
- acc0 = asrl(acc0, 23);
- *px++ = (q31_t) acc0;
-
- i += numColsA;
- /*
- * Decrement the col loop counter
- */
- col--;
- }
+ pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
- /* Set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += numColsA;
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt--;
}
+
/*
- * Return to application
+ * set status as ARM_MATH_SUCCESS
*/
- return (status);
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
}
#else
diff --git a/Testing/Include/Benchmarks/BinaryQ31.h b/Testing/Include/Benchmarks/BinaryQ31.h
index 64502c2f..21f51508 100755
--- a/Testing/Include/Benchmarks/BinaryQ31.h
+++ b/Testing/Include/Benchmarks/BinaryQ31.h
@@ -14,6 +14,7 @@ class BinaryQ31:public Client::Suite
Client::Pattern input1;
Client::Pattern input2;
Client::LocalPattern output;
+ Client::LocalPattern tmp;
int nbr;
int nbi;
@@ -22,5 +23,6 @@ class BinaryQ31:public Client::Suite
arm_matrix_instance_q31 in1;
arm_matrix_instance_q31 in2;
arm_matrix_instance_q31 out;
+ q31_t *tmpPtr;
};
diff --git a/Testing/Include/Tests/BinaryTestsQ31.h b/Testing/Include/Tests/BinaryTestsQ31.h
index 41459e40..b5d6d9a4 100755
--- a/Testing/Include/Tests/BinaryTestsQ31.h
+++ b/Testing/Include/Tests/BinaryTestsQ31.h
@@ -16,6 +16,8 @@ class BinaryTestsQ31:public Client::Suite
Client::Pattern ref;
Client::Pattern dims;
Client::LocalPattern output;
+ Client::LocalPattern tmp;
+
/* Local copies of inputs since matrix instance in CMSIS-DSP are not using
pointers to const.
diff --git a/Testing/Source/Benchmarks/BinaryQ31.cpp b/Testing/Source/Benchmarks/BinaryQ31.cpp
index f89abd90..7cbc836e 100755
--- a/Testing/Source/Benchmarks/BinaryQ31.cpp
+++ b/Testing/Source/Benchmarks/BinaryQ31.cpp
@@ -17,6 +17,11 @@
arm_mat_mult_fast_q31(&this->in1,&this->in2,&this->out);
}
+ void BinaryQ31::test_mat_mult_opt_q31()
+ {
+ arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,this->tmpPtr);
+ }
+
void BinaryQ31::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr)
{
@@ -35,6 +40,14 @@
output.create(2*this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr);
break;
+ case BinaryQ31::TEST_MAT_MULT_OPT_Q31_4:
+ input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi);
+ input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc);
+ output.create(this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr);
+ tmp.create(this->nbi*this->nbc,BinaryQ31::TMP_Q31_ID,mgr);
+ this->tmpPtr=tmp.ptr();
+ break;
+
default:
input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi);
input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc);
diff --git a/Testing/Source/Tests/BinaryTestsF32.cpp b/Testing/Source/Tests/BinaryTestsF32.cpp
index d552d87a..16e31263 100755
--- a/Testing/Source/Tests/BinaryTestsF32.cpp
+++ b/Testing/Source/Tests/BinaryTestsF32.cpp
@@ -16,6 +16,13 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
+static void checkInnerTail(float32_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+}
#define LOADDATA2() \
const float32_t *inp1=input1.ptr(); \
@@ -68,6 +75,7 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
@@ -99,6 +107,7 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (2*rows * columns);
+ checkInnerTail(outp);
}
diff --git a/Testing/Source/Tests/BinaryTestsQ15.cpp b/Testing/Source/Tests/BinaryTestsQ15.cpp
index c6ff67a9..390a52a3 100755
--- a/Testing/Source/Tests/BinaryTestsQ15.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ15.cpp
@@ -23,6 +23,19 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
+static void checkInnerTail(q15_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+ ASSERT_TRUE(b[4] == 0);
+ ASSERT_TRUE(b[5] == 0);
+ ASSERT_TRUE(b[6] == 0);
+ ASSERT_TRUE(b[7] == 0);
+}
+
+
#define LOADDATA2() \
const q15_t *inp1=input1.ptr(); \
@@ -39,7 +52,7 @@ a double precision computation.
int i;
-#define PREPAREDATA2() \
+#define PREPAREDATA2C() \
in1.numRows=rows; \
in1.numCols=internal; \
memcpy((void*)ap,(const void*)inp1,2*sizeof(q15_t)*rows*internal);\
@@ -54,29 +67,45 @@ a double precision computation.
out.numCols=columns; \
out.pData = outp;
-
+#define PREPAREDATA2R() \
+ in1.numRows=rows; \
+ in1.numCols=internal; \
+ memcpy((void*)ap,(const void*)inp1,sizeof(q15_t)*rows*internal);\
+ in1.pData = ap; \
+ \
+ in2.numRows=internal; \
+ in2.numCols=columns; \
+ memcpy((void*)bp,(const void*)inp2,sizeof(q15_t)*internal*columns);\
+ in2.pData = bp; \
+ \
+ out.numRows=rows; \
+ out.numCols=columns; \
+ out.pData = outp;
void BinaryTestsQ15::test_mat_mult_q15()
{
LOADDATA2();
arm_status status;
+
for(i=0;i < nbMatrixes ; i ++)
{
rows = *dimsp++;
internal = *dimsp++;
columns = *dimsp++;
- PREPAREDATA2();
+ PREPAREDATA2R();
+ memset(tmpPtr,0,sizeof(q15_t)*internal*columns + 16);
status=arm_mat_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
+ checkInnerTail(tmpPtr + internal * columns);
}
- ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_LOW_THRESHOLD);
@@ -99,17 +128,16 @@ a double precision computation.
columns = *dimsp++;
- PREPAREDATA2();
+ PREPAREDATA2C();
status=arm_mat_cmplx_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (2*rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
-
ASSERT_SNR(output,ref,(q15_t)MULT_SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15);
diff --git a/Testing/Source/Tests/BinaryTestsQ31.cpp b/Testing/Source/Tests/BinaryTestsQ31.cpp
index 7dee08ea..b894e530 100755
--- a/Testing/Source/Tests/BinaryTestsQ31.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ31.cpp
@@ -18,6 +18,14 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
+static void checkInnerTail(q31_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+}
+
#define LOADDATA2() \
const q31_t *inp1=input1.ptr(); \
@@ -68,11 +76,10 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
-
ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31);
@@ -98,10 +105,38 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (2*rows * columns);
-
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
+ ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
+
+ ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31);
+
+ }
+
+ void BinaryTestsQ31::test_mat_mult_opt_q31()
+ {
+ LOADDATA2();
+ q31_t *tmpPtr=tmp.ptr();
+
+ arm_status status;
+
+ for(i=0;i < nbMatrixes ; i ++)
+ {
+ rows = *dimsp++;
+ internal = *dimsp++;
+ columns = *dimsp++;
+
+ PREPAREDATA2();
+ memset(tmpPtr,0,sizeof(q31_t)*internal*columns + 16);
+ status=arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,tmpPtr);
+ ASSERT_TRUE(status==ARM_MATH_SUCCESS);
+
+ outp += (rows * columns);
+ checkInnerTail(outp);
+ checkInnerTail(tmpPtr + internal*columns);
+
+ }
ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
@@ -141,6 +176,21 @@ a double precision computation.
b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr);
break;
+ case TEST_MAT_MULT_OPT_Q31_3:
+ input1.reload(BinaryTestsQ31::INPUTS1_Q31_ID,mgr);
+ input2.reload(BinaryTestsQ31::INPUTS2_Q31_ID,mgr);
+ dims.reload(BinaryTestsQ31::DIMSBINARY1_S16_ID,mgr);
+
+ ref.reload(BinaryTestsQ31::REFMUL1_Q31_ID,mgr);
+
+ output.create(ref.nbSamples(),BinaryTestsQ31::OUT_Q31_ID,mgr);
+ a.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPA_Q31_ID,mgr);
+ b.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr);
+
+ tmp.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPC_Q31_ID,mgr);
+
+ break;
+
diff --git a/Testing/Source/Tests/BinaryTestsQ7.cpp b/Testing/Source/Tests/BinaryTestsQ7.cpp
index accd3f4b..4fa0346d 100755
--- a/Testing/Source/Tests/BinaryTestsQ7.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ7.cpp
@@ -19,6 +19,26 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 47
+static void checkInnerTail(q7_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+ ASSERT_TRUE(b[4] == 0);
+ ASSERT_TRUE(b[5] == 0);
+ ASSERT_TRUE(b[6] == 0);
+ ASSERT_TRUE(b[7] == 0);
+ ASSERT_TRUE(b[8] == 0);
+ ASSERT_TRUE(b[9] == 0);
+ ASSERT_TRUE(b[10] == 0);
+ ASSERT_TRUE(b[11] == 0);
+ ASSERT_TRUE(b[12] == 0);
+ ASSERT_TRUE(b[13] == 0);
+ ASSERT_TRUE(b[14] == 0);
+ ASSERT_TRUE(b[15] == 0);
+
+}
#define LOADDATA2() \
const q7_t *inp1=input1.ptr(); \
@@ -65,12 +85,15 @@ a double precision computation.
columns = *dimsp++;
PREPAREDATA2();
+ memset(tmpPtr,0,sizeof(q7_t)*internal*columns + 16);
+ checkInnerTail(tmpPtr + internal*columns);
status=arm_mat_mult_q7(&this->in1,&this->in2,&this->out,tmpPtr);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
-
+ checkInnerTail(outp);
+ checkInnerTail(tmpPtr + internal*columns);
}
ASSERT_EMPTY_TAIL(output);
diff --git a/Testing/Source/Tests/UnaryTestsF32.cpp b/Testing/Source/Tests/UnaryTestsF32.cpp
index c91be8fc..8aee65c1 100755
--- a/Testing/Source/Tests/UnaryTestsF32.cpp
+++ b/Testing/Source/Tests/UnaryTestsF32.cpp
@@ -46,6 +46,14 @@ Comparison for Cholesky
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
+static void checkInnerTailOverflow(float32_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+}
+
#define LOADDATA2() \
const float32_t *inp1=input1.ptr(); \
const float32_t *inp2=input2.ptr(); \
@@ -192,6 +200,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32()
arm_mat_vec_mult_f32(&this->in1, bp, outp);
outp += rows ;
+ checkInnerTailOverflow(outp);
}
@@ -219,6 +228,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -246,6 +256,7 @@ void UnaryTestsF32::test_mat_sub_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -273,6 +284,7 @@ void UnaryTestsF32::test_mat_scale_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -300,6 +312,7 @@ void UnaryTestsF32::test_mat_trans_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -327,6 +340,7 @@ void UnaryTestsF32::test_mat_cmplx_trans_f32()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += 2*(rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -421,6 +435,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
outp += (rows * columns);
inp1 += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -461,6 +476,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
outp += (rows * columns);
inp1 += (rows * rows);
inp2 += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -501,6 +517,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
outp += (rows * columns);
inp1 += (rows * rows);
inp2 += (rows * columns);
+ checkInnerTailOverflow(outp);
}
@@ -668,6 +685,9 @@ void UnaryTestsF32::test_mat_inverse_f32()
inp1 += (rows * columns);
+ checkInnerTailOverflow(outllp);
+ checkInnerTailOverflow(outdp);
+
}
diff --git a/Testing/Source/Tests/UnaryTestsQ15.cpp b/Testing/Source/Tests/UnaryTestsQ15.cpp
index eaf43648..3de4bce8 100755
--- a/Testing/Source/Tests/UnaryTestsQ15.cpp
+++ b/Testing/Source/Tests/UnaryTestsQ15.cpp
@@ -18,6 +18,31 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
+static void refInnerTail(q15_t *b)
+{
+ b[0] = 1;
+ b[1] = -1;
+ b[2] = 2;
+ b[3] = -2;
+ b[4] = 3;
+ b[5] = -3;
+ b[6] = 4;
+ b[7] = -4;
+}
+
+static void checkInnerTail(q15_t *b)
+{
+ ASSERT_TRUE(b[0] == 1);
+ ASSERT_TRUE(b[1] == -1);
+ ASSERT_TRUE(b[2] == 2);
+ ASSERT_TRUE(b[3] == -2);
+ ASSERT_TRUE(b[4] == 3);
+ ASSERT_TRUE(b[5] == -3);
+ ASSERT_TRUE(b[6] == 4);
+ ASSERT_TRUE(b[7] == -4);
+}
+
+
#define LOADDATA2() \
const q15_t *inp1=input1.ptr(); \
const q15_t *inp2=input2.ptr(); \
@@ -127,14 +152,14 @@ a double precision computation.
internal = *dimsp++;
PREPAREVECDATA2();
-
+ refInnerTail(outp + rows);
arm_mat_vec_mult_q15(&this->in1, bp, outp);
outp += rows ;
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@@ -153,15 +178,15 @@ a double precision computation.
columns = *dimsp++;
PREPAREDATA2();
-
+ refInnerTail(outp + rows * columns);
status=arm_mat_add_q15(&this->in1,&this->in2,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@@ -180,15 +205,15 @@ void UnaryTestsQ15::test_mat_sub_q15()
columns = *dimsp++;
PREPAREDATA2();
-
+ refInnerTail(outp + rows * columns);
status=arm_mat_sub_q15(&this->in1,&this->in2,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@@ -207,15 +232,15 @@ void UnaryTestsQ15::test_mat_scale_q15()
columns = *dimsp++;
PREPAREDATA1(false);
-
+ refInnerTail(outp + rows * columns);
status=arm_mat_scale_q15(&this->in1,ONEHALF,0,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
@@ -234,16 +259,15 @@ void UnaryTestsQ15::test_mat_trans_q15()
columns = *dimsp++;
PREPAREDATA1(true);
-
+ refInnerTail(outp + rows * columns);
status=arm_mat_trans_q15(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
-
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15);
@@ -261,15 +285,15 @@ void UnaryTestsQ15::test_mat_cmplx_trans_q15()
columns = *dimsp++;
PREPAREDATA1C(true);
-
+ refInnerTail(outp + 2*rows * columns);
status=arm_mat_cmplx_trans_q15(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += 2*(rows * columns);
+ checkInnerTail(outp);
}
- ASSERT_EMPTY_TAIL(output);
ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);
diff --git a/Testing/Source/Tests/UnaryTestsQ31.cpp b/Testing/Source/Tests/UnaryTestsQ31.cpp
index f17d8d1a..9c5aac84 100755
--- a/Testing/Source/Tests/UnaryTestsQ31.cpp
+++ b/Testing/Source/Tests/UnaryTestsQ31.cpp
@@ -18,6 +18,15 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 40
+static void checkInnerTail(q31_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+}
+
+
#define LOADDATA2() \
const q31_t *inp1=input1.ptr(); \
const q31_t *inp2=input2.ptr(); \
@@ -129,6 +138,7 @@ a double precision computation.
arm_mat_vec_mult_q31(&this->in1, bp, outp);
outp += rows ;
+ checkInnerTail(outp);
}
@@ -156,6 +166,7 @@ a double precision computation.
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
@@ -183,6 +194,7 @@ void UnaryTestsQ31::test_mat_sub_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
@@ -210,6 +222,7 @@ void UnaryTestsQ31::test_mat_scale_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
@@ -237,6 +250,7 @@ void UnaryTestsQ31::test_mat_trans_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
@@ -264,6 +278,7 @@ void UnaryTestsQ31::test_mat_cmplx_trans_q31()
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += 2*(rows * columns);
+ checkInnerTail(outp);
}
diff --git a/Testing/Source/Tests/UnaryTestsQ7.cpp b/Testing/Source/Tests/UnaryTestsQ7.cpp
index 6e4200b9..bb8b2ab1 100755
--- a/Testing/Source/Tests/UnaryTestsQ7.cpp
+++ b/Testing/Source/Tests/UnaryTestsQ7.cpp
@@ -19,6 +19,27 @@ a double precision computation.
/* Upper bound of maximum matrix dimension used by Python */
#define MAXMATRIXDIM 47
+static void checkInnerTail(q7_t *b)
+{
+ ASSERT_TRUE(b[0] == 0);
+ ASSERT_TRUE(b[1] == 0);
+ ASSERT_TRUE(b[2] == 0);
+ ASSERT_TRUE(b[3] == 0);
+ ASSERT_TRUE(b[4] == 0);
+ ASSERT_TRUE(b[5] == 0);
+ ASSERT_TRUE(b[6] == 0);
+ ASSERT_TRUE(b[7] == 0);
+
+ ASSERT_TRUE(b[8] == 0);
+ ASSERT_TRUE(b[9] == 0);
+ ASSERT_TRUE(b[10] == 0);
+ ASSERT_TRUE(b[11] == 0);
+ ASSERT_TRUE(b[12] == 0);
+ ASSERT_TRUE(b[13] == 0);
+ ASSERT_TRUE(b[14] == 0);
+ ASSERT_TRUE(b[15] == 0);
+}
+
#define LOADDATA2() \
const q7_t *inp1=input1.ptr(); \
const q7_t *inp2=input2.ptr(); \
@@ -112,6 +133,7 @@ a double precision computation.
arm_mat_vec_mult_q7(&this->in1, bp, outp);
outp += rows ;
+ checkInnerTail(outp);
}
@@ -132,13 +154,13 @@ void UnaryTestsQ7::test_mat_trans_q7()
{
rows = *dimsp++;
columns = *dimsp++;
-
PREPAREDATA1(true);
status=arm_mat_trans_q7(&this->in1,&this->out);
ASSERT_TRUE(status==ARM_MATH_SUCCESS);
outp += (rows * columns);
+ checkInnerTail(outp);
}
diff --git a/Testing/bench.txt b/Testing/bench.txt
index ea5dcbf5..ec95e513 100755
--- a/Testing/bench.txt
+++ b/Testing/bench.txt
@@ -1583,6 +1583,7 @@ group Root {
Pattern INPUTAC_Q31_ID : InputAC1_q31.txt
Pattern INPUTBC_Q31_ID : InputBC1_q31.txt
Output OUT_Q31_ID : Output
+ Output TMP_Q31_ID : Temp
Params PARAM1_ID = {
NBR = [5,10,40]
@@ -1595,6 +1596,7 @@ group Root {
Matrix Multiplication:test_mat_mult_q31
Complex Matrix Multiplication:test_mat_cmplx_mult_q31
Fast Matrix Multiplication:test_mat_mult_fast_q31
+ Opt Matrix Multiplication:test_mat_mult_opt_q31
} -> PARAM1_ID
}
@@ -1614,6 +1616,7 @@ group Root {
Pattern INPUTAC_Q15_ID : InputAC1_q15.txt
Pattern INPUTBC_Q15_ID : InputBC1_q15.txt
Output OUT_Q15_ID : Output
+ Output TMP_Q15_ID : Temp
Params PARAM1_ID = {
NBR = [5,10,40]
diff --git a/Testing/desc.txt b/Testing/desc.txt
index 1233d2d7..8728218c 100644
--- a/Testing/desc.txt
+++ b/Testing/desc.txt
@@ -3459,10 +3459,12 @@ group Root {
Output OUT_Q31_ID : Output
Output TMPA_Q31_ID : TmpA
Output TMPB_Q31_ID : TmpB
+ Output TMPC_Q31_ID : TmpC
Functions {
test mult:test_mat_mult_q31
test complex mult:test_mat_cmplx_mult_q31
+ test mult opt:test_mat_mult_opt_q31
}
}