CMSIS-DSP: Improvements to pull request #1363

To avoid changing the API in an incompatible way, a new function, arm_mat_mult_opt_q31, was introduced. It provides a faster implementation for use with Helium, but requires more storage for intermediate results. Some improvements to the tests for matrix functions were also added.
5 years ago · e45dc7c22e
parent cfc30c12b8
commit e45dc7c22e
18 changed files with 1285 additions and 366 deletions
--- a/Include/dsp/matrix_functions.h
+++ b/Include/dsp/matrix_functions.h
@ -444,6 +444,21 @@ arm_status arm_mat_mult_q31(
  const arm_matrix_instance_q31 * pSrcB,
        arm_matrix_instance_q31 * pDst);

+  /**
+   * @brief Q31 matrix multiplication using caller-provided scratch storage
+   * @param[in]  pSrcA  points to the first input matrix structure
+   * @param[in]  pSrcB  points to the second input matrix structure
+   * @param[out] pDst   points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results.
+   *                     The Helium (MVE) implementation writes the transpose of
+   *                     pSrcB into it, so it must hold numRows * numCols elements
+   *                     of pSrcB; the scalar implementation does not use it.
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_opt_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst,
+        q31_t *pState);
+
  /**
   * @brief Q31 matrix and vector multiplication
   * @param[in]  pSrcMat  points to the input matrix structure
--- a/Source/MatrixFunctions/MatrixFunctions.c
+++ b/Source/MatrixFunctions/MatrixFunctions.c
@ -44,6 +44,7 @@
 #include "arm_mat_mult_q7.c"
 #include "arm_mat_mult_q15.c"
 #include "arm_mat_mult_q31.c"
+#include "arm_mat_mult_opt_q31.c"
 #include "arm_mat_scale_f32.c"
 #include "arm_mat_scale_q15.c"
 #include "arm_mat_scale_q31.c"
--- a/Source/MatrixFunctions/arm_mat_mult_opt_q31.c
+++ b/Source/MatrixFunctions/arm_mat_mult_opt_q31.c
@ -0,0 +1,784 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_opt_q31.c
+ * Description:  Q31 matrix multiplication
+ *
+ * $Date:        3 Nov 2021
+ * $Revision:    V1.10.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixMult
+  @{
+ */
+
+/**
+  @brief         Q31 matrix multiplication.
+  @param[in]     pSrcA      points to the first input matrix structure
+  @param[in]     pSrcB      points to the second input matrix structure
+  @param[out]    pDst       points to output matrix structure
+  @param[in]  pState  points to the array for storing intermediate results
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+  @par           Scaling and Overflow Behavior
+                   The function is implemented using an internal 64-bit accumulator.
+                   The accumulator has a 2.62 format and maintains full precision of the intermediate
+                   multiplication results but provides only a single guard bit. There is no saturation
+                   on intermediate additions. Thus, if the accumulator overflows it wraps around and
+                   distorts the result. The input signals should be scaled down to avoid intermediate
+                   overflows. The input should therefore be scaled down by log2(numColsA) bits
+                   to avoid overflows, as a total of numColsA additions are performed internally.
+                   The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+  @remark
+                   Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
+  @remark
+                   This function is a faster implementation of arm_mat_mult_q31() for MVE, but it requires
+                   additional storage (pState) for intermediate results.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Dimensions of the small square matrices handled by the dedicated MVE kernels below. */
+#define MATRIX_DIM2 2
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+/*
+ * 2x2 Q31 matrix product (MVE kernel used for the square special case).
+ *
+ * pSrcA, pSrcB : 2x2 input matrices (row-major pData)
+ * pDst         : 2x2 output matrix
+ * Returns ARM_MATH_SUCCESS; no size checking is done here.
+ *
+ * Each output element is the dot product of one A row with one B column.
+ * Only the first MATRIX_DIM2 vector lanes carry data, so every load is
+ * predicated: an unpredicated 4-lane load of the second A row would read two
+ * elements past the end of the 4-element matrix.
+ */
+__STATIC_INLINE arm_status arm_mat_mult_opt_q31_2x2_mve(
+    const arm_matrix_instance_q31 * pSrcA,
+    const arm_matrix_instance_q31 * pSrcB,
+    arm_matrix_instance_q31 * pDst)
+{
+    q31_t       *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    q31_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    q31_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint32x4_t   vecColBOffs;
+    q63_t        acc0, acc1;
+    q31x4_t      vecB, vecA0, vecA1;
+    uint32_t     col;
+    /* enable predication to disable half of vector elements */
+    mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
+
+    /* element offsets of successive entries of one B column: 0, 2, 4, 6 */
+    vecColBOffs = vidupq_u32((uint32_t)0, 1);
+    vecColBOffs = vecColBOffs * MATRIX_DIM2;
+
+    /* load A rows once; inactive lanes are zeroed and not read from memory */
+    vecA0 = vldrwq_z_s32(pInA, p0);
+    vecA1 = vldrwq_z_s32(pInA + MATRIX_DIM2, p0);
+
+    for (col = 0; col < MATRIX_DIM2; col++)
+    {
+        /* load current B column (partial load) */
+        vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+        acc0 = vrmlaldavhq(vecA0, vecB);
+        acc1 = vrmlaldavhq(vecA1, vecB);
+
+        /*
+         * Convert to 1.31. The shift is 23 rather than 31 because
+         * vrmlaldavhq returns the high part of the accumulation
+         * (see the ACLE MVE intrinsics reference).
+         */
+        acc0 = asrl(acc0, 23);
+        acc1 = asrl(acc1, 23);
+
+        pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
+        pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
+
+        /* move to next B column / next output column */
+        pInB++;
+        pOut++;
+    }
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+/*
+ * 3x3 Q31 matrix product (MVE kernel used for the square special case).
+ *
+ * pSrcA, pSrcB : 3x3 input matrices (row-major pData)
+ * pDst         : 3x3 output matrix
+ * Returns ARM_MATH_SUCCESS; no size checking is done here.
+ *
+ * Only the first MATRIX_DIM3 vector lanes carry data, so every load is
+ * predicated: an unpredicated 4-lane load of the last A row would read one
+ * element past the end of the 9-element matrix.
+ */
+__STATIC_INLINE arm_status arm_mat_mult_opt_q31_3x3_mve(
+    const arm_matrix_instance_q31 * pSrcA,
+    const arm_matrix_instance_q31 * pSrcB,
+    arm_matrix_instance_q31 * pDst)
+{
+    q31_t       *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    q31_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
+    q31_t       *pOut = pDst->pData;   /* output data matrix pointer */
+    uint32x4_t   vecColBOffs;
+    q31_t       *pInA0 = pInA;
+    q31_t       *pInA1 = pInA0 + MATRIX_DIM3;
+    q31_t       *pInA2 = pInA1 + MATRIX_DIM3;
+    q63_t        acc0, acc1, acc2;
+    q31x4_t      vecB, vecA;
+    uint32_t     col;
+    /* enable predication to disable last (4th) vector element */
+    mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
+
+    /* element offsets of successive entries of one B column: 0, 3, 6, 9 */
+    vecColBOffs = vidupq_u32((uint32_t)0, 1);
+    vecColBOffs = vecColBOffs * MATRIX_DIM3;
+
+    for (col = 0; col < MATRIX_DIM3; col++)
+    {
+        /* load current B column (partial load) */
+        vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+        /* A rows: inactive lane is zeroed and not read from memory */
+        vecA = vldrwq_z_s32(pInA0, p0);
+        acc0 = vrmlaldavhq(vecA, vecB);
+        vecA = vldrwq_z_s32(pInA1, p0);
+        acc1 = vrmlaldavhq(vecA, vecB);
+        vecA = vldrwq_z_s32(pInA2, p0);
+        acc2 = vrmlaldavhq(vecA, vecB);
+
+        /*
+         * Convert to 1.31. The shift is 23 rather than 31 because
+         * vrmlaldavhq returns the high part of the accumulation
+         * (see the ACLE MVE intrinsics reference).
+         */
+        acc0 = asrl(acc0, 23);
+        acc1 = asrl(acc1, 23);
+        acc2 = asrl(acc2, 23);
+
+        pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
+        pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
+        pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
+
+        /* move to next B column / next output column */
+        pInB++;
+        pOut++;
+    }
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+/*
+ * 4x4 Q31 matrix product (MVE kernel used for the square special case).
+ *
+ * pSrcA, pSrcB : 4x4 input matrices (row-major pData)
+ * pDst         : 4x4 output matrix
+ * Returns ARM_MATH_SUCCESS; no size checking is done here.
+ *
+ * A row and a B column each fill a whole 4-lane vector, so no predication
+ * is needed.
+ */
+__STATIC_INLINE arm_status arm_mat_mult_opt_q31_4x4_mve(
+    const arm_matrix_instance_q31 * pSrcA,
+    const arm_matrix_instance_q31 * pSrcB,
+    arm_matrix_instance_q31 * pDst)
+{
+    q31_t       *rowA0 = pSrcA->pData;          /* the four rows of A */
+    q31_t       *rowA1 = rowA0 + MATRIX_DIM4;
+    q31_t       *rowA2 = rowA1 + MATRIX_DIM4;
+    q31_t       *rowA3 = rowA2 + MATRIX_DIM4;
+    q31_t       *colB = pSrcB->pData;           /* top of the current B column */
+    q31_t       *dst = pDst->pData;             /* top of the current output column */
+    /* element offsets of successive entries of one B column: 0, 4, 8, 12 */
+    uint32x4_t   strideOffs = vidupq_u32((uint32_t)0, 4);
+    uint32_t     c;
+
+    for (c = 0; c < MATRIX_DIM4; c++)
+    {
+        q31x4_t vecB, vecA;
+        q63_t   dot0, dot1, dot2, dot3;
+
+        /* gather the current column of B */
+        vecB = vldrwq_gather_shifted_offset_s32(colB, strideOffs);
+
+        /* one dot product per row of A */
+        vecA = vldrwq_s32(rowA0);
+        dot0 = vrmlaldavhq(vecA, vecB);
+        vecA = vldrwq_s32(rowA1);
+        dot1 = vrmlaldavhq(vecA, vecB);
+        vecA = vldrwq_s32(rowA2);
+        dot2 = vrmlaldavhq(vecA, vecB);
+        vecA = vldrwq_s32(rowA3);
+        dot3 = vrmlaldavhq(vecA, vecB);
+
+        /* Convert to 1.31 */
+        dot0 = asrl(dot0, 23);
+        dot1 = asrl(dot1, 23);
+        dot2 = asrl(dot2, 23);
+        dot3 = asrl(dot3, 23);
+
+        /* scatter the results down the current output column */
+        dst[0 * MATRIX_DIM4] = (q31_t) dot0;
+        dst[1 * MATRIX_DIM4] = (q31_t) dot1;
+        dst[2 * MATRIX_DIM4] = (q31_t) dot2;
+        dst[3 * MATRIX_DIM4] = (q31_t) dot3;
+
+        /* advance to the next column of B and of the output */
+        colB++;
+        dst++;
+    }
+
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/*
+ * MVE (Helium) implementation.
+ *
+ * Square 1x1 .. 4x4 products are dispatched to dedicated code paths that do
+ * not use pState. For every other shape, B is first transposed into pState
+ * (which must therefore hold numRowsB * numColsB elements), so that every
+ * output element becomes a dot product of two contiguous rows. The output is
+ * then produced in 2x2 blocks, followed by the leftover last column and/or
+ * last row when numColsB / numRowsA are odd.
+ *
+ * All tail loads (numColsA not a multiple of 4) are predicated so that no
+ * element beyond the end of A or of the transposed copy of B is read.
+ */
+arm_status arm_mat_mult_opt_q31(
+    const arm_matrix_instance_q31 * pSrcA,
+    const arm_matrix_instance_q31 * pSrcB,
+    arm_matrix_instance_q31 * pDst,
+    q31_t *pState)
+{
+    q31_t          *pInA = pSrcA->pData;        /* input data matrix pointer A */
+    q31_t          *pInB = pSrcB->pData;        /* input data matrix pointer B */
+    q31_t          *pInA2;
+    q31_t          *pInB2;
+    q31_t          *px;         /* Temporary output data matrix pointer */
+    q31_t          *px2;        /* Temporary output data matrix pointer */
+    uint32_t        numRowsA = pSrcA->numRows;  /* number of rows of input matrix A    */
+    uint32_t        numColsB = pSrcB->numCols;  /* number of columns of input matrix B */
+    uint32_t        numColsA = pSrcA->numCols;  /* number of columns of input matrix A */
+    uint32_t        numRowsB = pSrcB->numRows;  /* number of rows of input matrix B    */
+    uint32_t        col, i, j, row;     /* loop counters */
+    q31_t          *pSrcBT = pState;     /* input data matrix pointer for transpose */
+    uint32_t        blkCnt;     /* loop counters */
+    arm_status      status;                            /* Status of matrix multiplication */
+    arm_matrix_instance_q31 BT;
+#ifdef ARM_MATH_MATRIX_CHECK
+
+    /* Check for matrix mismatch condition */
+    if ((pSrcA->numCols != pSrcB->numRows) ||
+        (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
+        /* Set status as ARM_MATH_SIZE_MISMATCH */
+        status = ARM_MATH_SIZE_MISMATCH;
+    } else
+#endif                          /* #ifdef ARM_MATH_MATRIX_CHECK */
+    {
+
+        /* small squared matrix specialized routines */
+        if (numRowsA == numColsB && numColsB == numColsA) {
+            if (numRowsA == 1)
+            {
+                q63_t sum = (q63_t) *pInA * *pInB;
+                pDst->pData[0] = (q31_t)(sum >> 31);
+                return (ARM_MATH_SUCCESS);
+            }
+            else if (numRowsA == 2)
+                return arm_mat_mult_opt_q31_2x2_mve(pSrcA, pSrcB, pDst);
+            else if (numRowsA == 3)
+                return arm_mat_mult_opt_q31_3x3_mve(pSrcA, pSrcB, pDst);
+            else if (numRowsA == 4)
+                return arm_mat_mult_opt_q31_4x4_mve(pSrcA, pSrcB, pDst);
+        }
+
+
+        /*
+         * Matrix transpose
+         */
+        BT.numRows = numColsB;
+        BT.numCols = numRowsB;
+        BT.pData = pSrcBT;
+
+        arm_mat_trans_q31(pSrcB, &BT);
+
+
+        /*
+         * Reset the variables for the usage in the following multiplication process
+         */
+        i = 0;
+        row = numRowsA >> 1;
+        px = pDst->pData;
+        px2 = px + numColsB;
+
+        /*
+         * main loop
+         * compute 2 x 2 output blocks
+         * with dot products (Matrix A rows * Transposed Matrix B rows)
+         */
+        while (row > 0u) {
+            /*
+             * For every row wise process, the column loop counter is to be initiated
+             * Compute 2 columns and 2 rows in parallel
+             */
+            col = numColsB >> 1;
+            j = 0;
+
+            /*
+             * column pair loop
+             */
+            while (col > 0u) {
+                q31_t const    *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
+                q31x4_t         vecA, vecA2, vecB, vecB2;
+                q63_t           acc0, acc1, acc2, acc3;
+
+                /*
+                 * Initiate the pointers
+                 * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
+                 * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
+                 */
+                pInA = pSrcA->pData + i;
+                pInA2 = pInA + numColsA;
+                pInB = pSrcBT + j;
+                pInB2 = pInB + numRowsB;
+
+
+                pSrcAVec = (q31_t const *) pInA;
+                pSrcA2Vec = (q31_t const *) pInA2;
+                pSrcBVec = (q31_t const *) pInB;
+                pSrcB2Vec = (q31_t const *) pInB2;
+
+                acc0 = 0LL;
+                acc1 = 0LL;
+                acc2 = 0LL;
+                acc3 = 0LL;
+
+                /*
+                 * load scheduling
+                 * (predicated on numColsA: all lanes are active when
+                 * numColsA >= 4, and a very narrow A is not over-read)
+                 */
+                vecA = vld1q_z(pSrcAVec, vctp32q(numColsA));
+                pSrcAVec += 4;
+
+                blkCnt = (numColsA / 4);
+                while (blkCnt > 0U) {
+                    vecB = vld1q(pSrcBVec);
+                    pSrcBVec += 4;
+                    acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+                    vecA2 = vld1q(pSrcA2Vec);
+                    pSrcA2Vec += 4;
+                    acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
+                    vecB2 = vld1q(pSrcB2Vec);
+                    pSrcB2Vec += 4;
+                    acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
+                    /* next block of row A; spills at most into the paired row */
+                    vecA = vld1q(pSrcAVec);
+                    pSrcAVec += 4;
+                    acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
+
+                    blkCnt--;
+                }
+                /*
+                 * tail
+                 * (will be merged thru tail predication)
+                 * vecA has already been fetched by the load scheduling above;
+                 * the remaining loads are predicated to stay inside the buffers
+                 */
+                blkCnt = (numColsA & 3);
+                if (blkCnt > 0U) {
+                    mve_pred16_t    p0 = vctp32q(blkCnt);
+                    vecB = vld1q_z(pSrcBVec, p0);
+                    acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
+                    vecA2 = vld1q_z(pSrcA2Vec, p0);
+                    acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
+                    vecB2 = vld1q_z(pSrcB2Vec, p0);
+                    acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
+                    acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
+                }
+
+                /* Convert to 1.31 */
+                acc0 = asrl(acc0, 23);
+                acc1 = asrl(acc1, 23);
+                acc2 = asrl(acc2, 23);
+                acc3 = asrl(acc3, 23);
+
+                /* Store the results (2 x 2 block) in the destination buffer */
+                *px++ = (q31_t) acc0;
+                *px++ = (q31_t) acc2;
+                *px2++ = (q31_t) acc1;
+                *px2++ = (q31_t) acc3;
+
+                j += numRowsB * 2;
+                /*
+                 * Decrement the column pair loop counter
+                 */
+                col--;
+
+            }
+
+            i = i + numColsA * 2;
+            /* skip the odd last column (handled below) and step over one row */
+            px = px2 + (numColsB & 1u);
+            px2 = px + numColsB;
+            /*
+             * Decrement the row pair loop counter
+             */
+            row--;
+        }
+
+        /*
+         * Compute remaining row and/or column below
+         */
+        if (numColsB & 1u) {
+            /* even number of rows only: an odd last row is covered by the row pass below */
+            row = numRowsA & (~0x1);
+            px = pDst->pData + numColsB - 1;
+            i = 0;
+
+            /*
+             * row loop
+             */
+            while (row > 0) {
+                q31_t const    *pSrcAVec, *pSrcBVec;
+                q31x4_t         vecA, vecB;
+                q63_t           acc0;
+
+                /*
+                 * point to last column in matrix B
+                 */
+                pInB = pSrcBT + numRowsB * (numColsB - 1);
+                pInA = pSrcA->pData + i;
+
+                pSrcAVec = (q31_t const *) pInA;
+                pSrcBVec = (q31_t const *) pInB;
+
+                /* single dot-product */
+                acc0 = 0LL;
+                blkCnt = (numColsA / 4);
+                while (blkCnt > 0U) {
+                    vecA = vld1q(pSrcAVec);
+                    pSrcAVec += 4;
+                    vecB = vld1q(pSrcBVec);
+                    pSrcBVec += 4;
+                    acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+                    blkCnt--;
+                }
+                /*
+                 * tail
+                 * (will be merged thru tail predication)
+                 */
+                blkCnt = (numColsA & 3);
+                if (blkCnt > 0U) {
+                    mve_pred16_t    p0 = vctp32q(blkCnt);
+                    vecA = vld1q_z(pSrcAVec, p0);
+                    vecB = vld1q_z(pSrcBVec, p0);
+                    acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
+                }
+
+                acc0 = asrl(acc0, 23);
+                *px = (q31_t) acc0;
+
+                px += numColsB;
+
+                i += numColsA;
+                /*
+                 * Decrement the row loop counter
+                 */
+                row--;
+            }
+        }
+
+        if (numRowsA & 1u) {
+            col = numColsB;
+            i = 0u;
+            /*
+             * point to last row in output matrix
+             */
+            px = pDst->pData + (numColsB) * (numRowsA - 1);
+            /*
+             * col loop
+             */
+            while (col > 0) {
+                q31_t const    *pSrcAVec, *pSrcBVec;
+                q31x4_t         vecA, vecB;
+                q63_t           acc0;
+
+                /*
+                 * point to last row in matrix A
+                 */
+                pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
+                pInB = pSrcBT + i;
+
+                /*
+                 * Set the variable sum, that acts as accumulator, to zero
+                 */
+                pSrcAVec = (q31_t const *) pInA;
+                pSrcBVec = (q31_t const *) pInB;
+                acc0 = 0LL;
+
+                blkCnt = (numColsA / 4);
+                while (blkCnt > 0U) {
+                    vecA = vld1q(pSrcAVec);
+                    pSrcAVec += 4;
+                    vecB = vld1q(pSrcBVec);
+                    pSrcBVec += 4;
+                    acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+                    blkCnt--;
+                }
+                /*
+                 * tail
+                 * (will be merged thru tail predication)
+                 */
+                blkCnt = (numColsA & 3);
+                if (blkCnt > 0U) {
+                    mve_pred16_t    p0 = vctp32q(blkCnt);
+                    vecA = vld1q_z(pSrcAVec, p0);
+                    vecB = vld1q_z(pSrcBVec, p0);
+                    acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
+                }
+
+                acc0 = asrl(acc0, 23);
+                *px++ = (q31_t) acc0;
+
+                /* next row of the transposed B (numRowsB == numColsA elements long) */
+                i += numColsA;
+                /*
+                 * Decrement the col loop counter
+                 */
+                col--;
+            }
+        }
+        /* Set status as ARM_MATH_SUCCESS */
+        status = ARM_MATH_SUCCESS;
+    }
+    /*
+     * Return to application
+     */
+    return (status);
+}
+
+#else
+/*
+ * Scalar implementation (used when the MVE variant is not built).
+ *
+ * Each output element is the dot product of a row of A with a column of B,
+ * accumulated in 64 bits and converted from 2.62 to 1.31 with a 31-bit right
+ * shift. pState is accepted for API compatibility only and is not used.
+ *
+ * The row and column loops are counted loops, so a matrix with zero rows or
+ * zero columns produces no output instead of running with a wrapped counter.
+ */
+arm_status arm_mat_mult_opt_q31(
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst,
+        q31_t *pState)
+{
+  q31_t *pIn1;                                   /* Walks along the current row of A */
+  q31_t *pIn2;                                   /* Walks down the current column of B */
+  q31_t *pInA = pSrcA->pData;                    /* Start of the current row of A */
+  q31_t *pInB = pSrcB->pData;                    /* Input data matrix pointer B */
+  q31_t *px = pDst->pData;                       /* Next output element (row-major order) */
+  q63_t sum;                                     /* Accumulator */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  uint32_t row, col, colCnt;                     /* Loop counters */
+  arm_status status;                             /* Status of matrix multiplication */
+  (void)pState;
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loops perform the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop */
+    for (row = 0U; row < numRowsA; row++)
+    {
+      /* column loop */
+      for (col = 0U; col < numColsB; col++)
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0;
+
+        /* pIn1 starts at the beginning of the current row of A */
+        pIn1 = pInA;
+
+        /* pIn2 starts at the top of the current column of B */
+        pIn2 = pInB + col;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Loop unrolling: Compute 4 MACs at a time. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* Perform the multiply-accumulates */
+          sum += (q63_t) *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += (q63_t) *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += (q63_t) *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          sum += (q63_t) *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Loop unrolling: Compute remaining MACs */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize colCnt with number of columns */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* Perform the multiply-accumulates */
+          sum += (q63_t) *pIn1++ * *pIn2;
+          pIn2 += numColsB;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Convert result from 2.62 to 1.31 format and store in destination buffer */
+        *px++ = (q31_t) (sum >> 31);
+      }
+
+      /* Update pointer pInA to point to starting address of next row */
+      pInA = pInA + numColsA;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of MatrixMult group
+ */
--- a/Source/MatrixFunctions/arm_mat_mult_q15.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q15.c
@ -42,7 +42,7 @@
  @param[in]     pSrcA      points to the first input matrix structure
  @param[in]     pSrcB      points to the second input matrix structure
  @param[out]    pDst       points to output matrix structure
-  @param[in]     pState     points to the array for storing intermediate results (Unused)
+  @param[in]     pState     points to the array for storing intermediate results
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@ -617,7 +617,7 @@ arm_status arm_mat_mult_q15(
    return (status);
 }

-#else
+#else 
 arm_status arm_mat_mult_q15(
  const arm_matrix_instance_q15 * pSrcA,
  const arm_matrix_instance_q15 * pSrcB,
@ -639,8 +639,8 @@ arm_status arm_mat_mult_q15(
        uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
        arm_status status;                             /* Status of matrix multiplication */

-        q31_t in;                                      /* Temporary variable to hold the input value */
        q31_t inA1, inB1, inA2, inB2;
+        arm_matrix_instance_q15 BT;

 #ifdef ARM_MATH_MATRIX_CHECK

@ -655,89 +655,13 @@ arm_status arm_mat_mult_q15(
  else

 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
  {
-    /* Matrix transpose */
-    do
-    {
-      /* The pointer px is set to starting address of column being processed */
-      px = pSrcBT + i;
-
-      /* Apply loop unrolling and exchange columns with row elements */
-      col = numColsB >> 2U;
-
-      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
-       ** a second loop below computes the remaining 1 to 3 samples. */
-      while (col > 0U)
-      {
-        /* Read two elements from row */
-        in = read_q15x2_ia ((q15_t **) &pInB);
-
-        /* Unpack and store one element in destination */
-#ifndef ARM_MATH_BIG_ENDIAN
-        *px = (q15_t) in;
-#else
-        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* Update pointer px to point to next row of transposed matrix */
-        px += numRowsB;
-
-        /* Unpack and store second element in destination */
-#ifndef ARM_MATH_BIG_ENDIAN
-        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#else
-        *px = (q15_t) in;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-
-        /* Update pointer px to point to next row of transposed matrix */
-        px += numRowsB;
-
-        /* Read two elements from row */
-        in = read_q15x2_ia ((q15_t **) &pInB);
-
-        /* Unpack and store one element in destination */
-#ifndef ARM_MATH_BIG_ENDIAN
-        *px = (q15_t) in;
-#else
-        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-        px += numRowsB;
-
-#ifndef ARM_MATH_BIG_ENDIAN
-        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
-#else
-        *px = (q15_t) in;
-#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
-        px += numRowsB;

-        /* Decrement column loop counter */
-        col--;
-      }
-
-      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
-       ** No loop unrolling is used. */
-      col = numColsB % 0x4U;
-
-      while (col > 0U)
-      {
-        /* Read and store input element in destination */
-        *px = *pInB++;
-
-        /* Update pointer px to point to next row of transposed matrix */
-        px += numRowsB;
-
-        /* Decrement column loop counter */
-        col--;
-      }
-
-      i++;
-
-      /* Decrement row loop counter */
-      row--;
-
-    } while (row > 0U);
+    BT.numRows = numColsB;
+    BT.numCols = numRowsB;
+    BT.pData = pSrcBT;

+    arm_mat_trans_q15(pSrcB,&BT);
    /* Reset variables for usage in following multiplication process */
    row = numRowsA;
    i = 0U;
--- a/Source/MatrixFunctions/arm_mat_mult_q31.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q31.c
@ -3,8 +3,8 @@
 * Title:        arm_mat_mult_q31.c
 * Description:  Q31 matrix multiplication
 *
- * $Date:        3 Nov 2021
- * $Revision:    V1.10.0
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -332,45 +332,44 @@ __STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
    return (ARM_MATH_SUCCESS);
 }

-
 arm_status arm_mat_mult_q31(
-    const arm_matrix_instance_q31 * pSrcA,
-    const arm_matrix_instance_q31 * pSrcB,
-    arm_matrix_instance_q31 * pDst)
+  const arm_matrix_instance_q31 * pSrcA,
+  const arm_matrix_instance_q31 * pSrcB,
+        arm_matrix_instance_q31 * pDst)
 {
-    q31_t          *pInA = pSrcA->pData;        /* input data matrix pointer A */
-    q31_t          *pInB = pSrcB->pData;        /* input data matrix pointer B */
-    q31_t          *pInA2;
-    q31_t          *pInB2;
-    q31_t          *px;         /* Temporary output data matrix pointer */
-    q31_t          *px2;        /* Temporary output data matrix pointer */
-    uint32_t        numRowsA = pSrcA->numRows;  /* number of rows of input matrix A    */
-    uint32_t        numColsB = pSrcB->numCols;  /* number of columns of input matrix B */
-    uint32_t        numColsA = pSrcA->numCols;  /* number of columns of input matrix A */
-    uint32_t        numRowsB = pSrcB->numRows;  /* number of rows of input matrix A    */
-    uint32_t        col, i = 0u, j, row = numRowsB;     /* loop counters */
-    q31_t           State[numRowsB * numColsB * 1];
-    q31_t          *pSrcBT = State;     /* input data matrix pointer for transpose */
-    uint32_t        blkCnt;     /* loop counters */
-    arm_status      status;                            /* Status of matrix multiplication */
-    arm_matrix_instance_q31 BT;
-#ifdef ARM_MATH_MATRIX_CHECK
+    q31_t const *pInB = (q31_t const *)pSrcB->pData;  /* input data matrix pointer B */
+    q31_t const *pInA = (q31_t const *)pSrcA->pData;  /* input data matrix pointer A */
+    q31_t      *pOut = pDst->pData;   /* output data matrix pointer */
+    q31_t      *px;               /* Temporary output data matrix pointer */
+    uint16_t    numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
+    uint16_t    numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
+    uint16_t    numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
+    uint16_t    col, i = 0U, row = numRowsA;  /* loop counters */
+    arm_status  status;          /* status of matrix multiplication */
+    uint32x4_t  vecOffs, vecColBOffs;
+    uint32_t    blkCnt, rowCnt;           /* loop counters */
+
+  #ifdef ARM_MATH_MATRIX_CHECK

-    /* Check for matrix mismatch condition */
-    if ((pSrcA->numCols != pSrcB->numRows) ||
-        (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
-        /* Set status as ARM_MATH_SIZE_MISMATCH */
-        status = ARM_MATH_SIZE_MISMATCH;
-    } else
-#endif                          /* #ifdef ARM_MATH_MATRIX_CHECK */
-    {
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else

-         /* small squared matrix specialized routines */
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+     /* small squared matrix specialized routines */
    if(numRowsA == numColsB && numColsB == numColsA) {
        if (numRowsA == 1)
        {
          q63_t sum =  (q63_t) *pInA * *pInB;
-          pDst->pData[0] = (q31_t)(sum >> 31);
+          pOut[0] = (q31_t)(sum >> 31);
          return (ARM_MATH_SUCCESS);
        }
        else if(numRowsA == 2)
@ -381,263 +380,246 @@ arm_status arm_mat_mult_q31(
            return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
    }

+    vecColBOffs = vidupq_u32((uint32_t)0, 1);
+    vecColBOffs = vecColBOffs * (uint32_t) (numColsB);

+    /*
+     * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+     */
+
+    /*
+     * row loop
+     */
+    rowCnt = row >> 2;
+    while (rowCnt > 0U)
+    {
        /*
-         * Matrix transpose
+         * Output pointer is set to starting address of the row being processed
         */
-        BT.numRows = numColsB;
-        BT.numCols = numRowsB;
-        BT.pData = pSrcBT;
-
-        arm_mat_trans_q31(pSrcB, &BT);
-
-
+        px = pOut + i;
+        i = i + 4 * numColsB;
        /*
-         * Reset the variables for the usage in the following multiplication process
+         * For every row wise process, the column loop counter is to be initiated
         */
-        i = 0;
-        row = numRowsA >> 1;
-        px = pDst->pData;
-        px2 = px + numColsB;
-
+        col = numColsB;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the pSrcB data
+         */
+        pInB = (q31_t const *)pSrcB->pData;
        /*
-         * main loop
-         * compute 2 x 2 output blocks
-         * with dot products (Matrix A rows * Transposed MAtrix B rows)
+         * column loop
         */
-        while (row > 0u) {
+        while (col > 0U)
+        {
+            /*
+             * compute one output element for each of the 4 rows of A (same column of B)
+             */
            /*
-             * For every row wise process, the column loop counter is to be initiated
-             * Compute 2 columns and 2 rows in parrallel
+             * Matrix A columns number of MAC operations are to be performed
             */
-            col = numColsB >> 1;
-            j = 0;
+
+            q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+            q31_t const   *pInA0 = pInA;
+            q31_t const   *pInA1 = pInA0 + numColsA;
+            q31_t const   *pInA2 = pInA1 + numColsA;
+            q31_t const   *pInA3 = pInA2 + numColsA;
+            q63_t          acc0, acc1, acc2, acc3;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+            acc2 = 0LL;
+            acc3 = 0LL;
+
+            pSrcA0Vec = (q31_t const *) pInA0;
+            pSrcA1Vec = (q31_t const *) pInA1;
+            pSrcA2Vec = (q31_t const *) pInA2;
+            pSrcA3Vec = (q31_t const *) pInA3;
+
+            vecOffs = vecColBOffs;
+
+            /* accumulate the 4 x 1 output block, 4 inner-dimension elements per iteration */
+            blkCnt = numColsA >> 2;
+            while (blkCnt > 0U)
+            {
+                q31x4_t vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+                /* move Matrix B read offsets, 4 rows down */
+                vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+                vecA = vld1q(pSrcA0Vec);  pSrcA0Vec += 4;
+                acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+                vecA = vld1q(pSrcA1Vec);  pSrcA1Vec += 4;
+                acc1 = vrmlaldavhaq(acc1, vecA, vecB);
+                vecA = vld1q(pSrcA2Vec);  pSrcA2Vec += 4;
+                acc2 = vrmlaldavhaq(acc2, vecA, vecB);
+                vecA = vld1q(pSrcA3Vec);  pSrcA3Vec += 4;
+                acc3 = vrmlaldavhaq(acc3, vecA, vecB);
+                blkCnt--;
+            }

            /*
-             * column pair loop
+             * tail
+             * (will be merged thru tail predication)
             */
-            while (col > 0u) {
-                q31_t const    *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
-                q31x4_t         vecA, vecA2, vecB, vecB2;
-                q63_t           acc0, acc1, acc2, acc3;
-
-                /*
-                 * Initiate the pointers
-                 * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
-                 * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
-                 */
-                pInA = pSrcA->pData + i;
-                pInA2 = pInA + numColsA;
-                pInB = pSrcBT + j;
-                pInB2 = pInB + numRowsB;
-
-
-                pSrcAVec = (q31_t const *) pInA;
-                pSrcA2Vec = (q31_t const *) pInA2;
-                pSrcBVec = (q31_t const *) pInB;
-                pSrcB2Vec = (q31_t const *) pInB2;
-
-                acc0 = 0LL;
-                acc1 = 0LL;
-                acc2 = 0LL;
-                acc3 = 0LL;
-
-                /* load scheduling */
-                vecA = vld1q(pSrcAVec);
-                pSrcAVec += 4;
-
-                blkCnt = (numColsA / 4);
-                while (blkCnt > 0U) {
-                    vecB = vld1q(pSrcBVec);
-                    pSrcBVec += 4;
-                    acc0 = vrmlaldavhaq(acc0, vecA, vecB);
-                    vecA2 = vld1q(pSrcA2Vec);
-                    pSrcA2Vec += 4;
-                    acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
-                    vecB2 = vld1q(pSrcB2Vec);
-                    pSrcB2Vec += 4;
-                    acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
-                    vecA = vld1q(pSrcAVec);
-                    pSrcAVec += 4;
-                    acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
-
-                    blkCnt--;
-                }
-                /*
-                 * tail
-                 * (will be merged thru tail predication)
-                 */
-                blkCnt = (numColsA & 3);
-                if (blkCnt > 0U) {
-                    mve_pred16_t    p0 = vctp32q(blkCnt);
-                    vecB = vld1q(pSrcBVec);
-                    acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
-                    vecA2 = vld1q(pSrcA2Vec);
-                    acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
-                    vecB2 = vld1q(pSrcB2Vec);
-                    acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
-                    vecA = vld1q(pSrcAVec);
-                    acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
-                }
-
-                /* Convert to 1.31 */
-                acc0 = asrl(acc0, 23);
-                acc1 = asrl(acc1, 23);
-                acc2 = asrl(acc2, 23);
-                acc3 = asrl(acc3, 23);
-
-                /* Store the results (2 x 2 block) in the destination buffer */
-                *px++ = (q31_t) acc0;
-                *px++ = (q31_t) acc2;
-                *px2++ = (q31_t) acc1;
-                *px2++ = (q31_t) acc3;
-
-                j += numRowsB * 2;
-                /*
-                 * Decrement the column pair loop counter
-                 */
-                col--;
-
+            blkCnt = numColsA & 3;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp32q(blkCnt);
+                q31x4_t   vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+                //vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+                vecA = vld1q(pSrcA0Vec);  pSrcA0Vec += 4;
+                acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+                vecA = vld1q(pSrcA1Vec);  pSrcA1Vec += 4;
+                acc1 = vrmlaldavhaq(acc1, vecA, vecB);
+                vecA = vld1q(pSrcA2Vec);  pSrcA2Vec += 4;
+                acc2 = vrmlaldavhaq(acc2, vecA, vecB);
+                vecA = vld1q(pSrcA3Vec);  pSrcA3Vec += 4;
+                acc3 = vrmlaldavhaq(acc3, vecA, vecB);
            }

-            i = i + numColsA * 2;
-            px = px2 + (numColsB & 1u);
-            px2 = px + numColsB;
+            acc0 = asrl(acc0, 23);
+            acc1 = asrl(acc1, 23);
+            acc2 = asrl(acc2, 23);
+            acc3 = asrl(acc3, 23);
+
+            px[0] = (q31_t) acc0;
+            px[1 * numColsB] = (q31_t) acc1;
+            px[2 * numColsB] = (q31_t) acc2;
+            px[3 * numColsB] = (q31_t) acc3;
+            px++;
            /*
-             * Decrement the row pair loop counter
+             * Decrement the column loop counter
             */
-            row--;
+            col--;
+            /*
+             * Update the pointer pInB to point to the  starting address of the next column
+             */
+            pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
        }

        /*
-         * Compute remaining row and/or column below
+         * Update the pointer pInA to point to the  starting address of the next row
+         */
+        pInA += (numColsA * 4);
+        /*
+         * Decrement the row loop counter
+         */
+        rowCnt --;
+
+    }
+    rowCnt = row & 3;
+    while (rowCnt > 0U)
+    {
+        /*
+         * Output pointer is set to starting address of the row being processed
+         */
+        px = pOut + i;
+        i = i + numColsB;
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the pSrcB data
+         */
+        pInB = (q31_t const *)pSrcB->pData;
+        /*
+         * column loop
         */
-        if (numColsB & 1u) {
-            row = numRowsA & (~0x1);    //avoid redundant computation
-            px = pDst->pData + numColsB - 1;
-            i = 0;
+        while (col > 0U)
+        {
+            /*
+             * compute a single output element (current row of A times current column of B)
+             */
+            /*
+             * Matrix A columns number of MAC operations are to be performed
+             */
+
+            q31_t const *pSrcA0Vec;
+            q31_t const   *pInA0 = pInA;
+            q63_t          acc0;
+
+            acc0 = 0LL;
+           
+
+            pSrcA0Vec = (q31_t const *) pInA0;
+           
+            vecOffs = vecColBOffs;
+
+            /* accumulate a single output element, 4 inner-dimension elements per iteration */
+            blkCnt = numColsA >> 2;
+            while (blkCnt > 0U)
+            {
+                q31x4_t vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+                /* move Matrix B read offsets, 4 rows down */
+                vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+                vecA = vld1q(pSrcA0Vec);  pSrcA0Vec += 4;
+                acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+              
+                blkCnt--;
+            }

            /*
-             * row loop
+             * tail
+             * (will be merged thru tail predication)
             */
-            while (row > 0) {
-                q31_t const    *pSrcAVec, *pSrcBVec;
-                q31x4_t         vecA, vecB;
-                q63_t           acc0;
-
-                /*
-                 * point to last column in matrix B
-                 */
-                pInB = pSrcBT + numRowsB * (numColsB - 1);
-                pInA = pSrcA->pData + i;
-
-                pSrcAVec = (q31_t const *) pInA;
-                pSrcBVec = (q31_t const *) pInB;
-
-                /* single dot-product */
-                acc0 = 0LL;
-                blkCnt = (numColsA / 4);
-                while (blkCnt > 0U) {
-                    vecA = vld1q(pSrcAVec);
-                    pSrcAVec += 4;
-                    vecB = vld1q(pSrcBVec);
-                    pSrcBVec += 4;
-                    acc0 = vrmlaldavhaq(acc0, vecA, vecB);
-
-                    blkCnt--;
-                }
-                /*
-                 * tail
-                 * (will be merged thru tail predication)
-                 */
-                blkCnt = (numColsA & 3);
-                if (blkCnt > 0U) {
-                    mve_pred16_t    p0 = vctp32q(blkCnt);
-                    vecA = vld1q(pSrcAVec);
-                    vecB = vld1q(pSrcBVec);
-                    acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
-                }
-
-                acc0 = asrl(acc0, 23);
-                *px = (q31_t) acc0;
-
-                px += numColsB;
-
-                i += numColsA;
-                /*
-                 * Decrement the row loop counter
-                 */
-                row--;
+            blkCnt = numColsA & 3;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp32q(blkCnt);
+                q31x4_t   vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+                //vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+                vecA = vld1q(pSrcA0Vec);  
+                pSrcA0Vec += 4;
+                acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+                
            }
-        }

-        if (numRowsA & 1u) {
-            col = numColsB;
-            i = 0u;
+            acc0 = asrl(acc0, 23);
+           
+
+            px[0] = (q31_t) acc0;
+            px++;
            /*
-             * point to last row in output matrix
+             * Decrement the column loop counter
             */
-            px = pDst->pData + (numColsB) * (numRowsA - 1);
+            col--;
            /*
-             * col loop
+             * Update the pointer pInB to point to the  starting address of the next column
             */
-            while (col > 0) {
-                q31_t const    *pSrcAVec, *pSrcBVec;
-                q31x4_t         vecA, vecB;
-                q63_t           acc0;
-
-                /*
-                 * point to last row in matrix A
-                 */
-                pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
-                pInB = pSrcBT + i;
-
-                /*
-                 * Set the variable sum, that acts as accumulator, to zero
-                 */
-                pSrcAVec = (q31_t const *) pInA;
-                pSrcBVec = (q31_t const *) pInB;
-                acc0 = 0LL;
-
-                blkCnt = (numColsA / 4);
-                while (blkCnt > 0U) {
-                    vecA = vld1q(pSrcAVec);
-                    pSrcAVec += 4;
-                    vecB = vld1q(pSrcBVec);
-                    pSrcBVec += 4;
-                    acc0 = vrmlaldavhaq(acc0, vecA, vecB);
-
-                    blkCnt--;
-                }
-                /*
-                 * tail
-                 * (will be merged thru tail predication)
-                 */
-                blkCnt = (numColsA & 3);
-                if (blkCnt > 0U) {
-                    mve_pred16_t    p0 = vctp32q(blkCnt);
-                    vecA = vld1q(pSrcAVec);
-                    vecB = vld1q(pSrcBVec);
-                    acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
-                }
-
-                acc0 = asrl(acc0, 23);
-                *px++ = (q31_t) acc0;
-
-                i += numColsA;
-                /*
-                 * Decrement the col loop counter
-                 */
-                col--;
-            }
+            pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
        }
-        /* Set status as ARM_MATH_SUCCESS */
-        status = ARM_MATH_SUCCESS;
+
+        /*
+         * Update the pointer pInA to point to the  starting address of the next row
+         */
+        pInA += numColsA;
+        /*
+         * Decrement the row loop counter
+         */
+        rowCnt--;
    }
+
    /*
-     * Return to application
+     * set status as ARM_MATH_SUCCESS
     */
-    return (status);
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
 }

 #else
--- a/Testing/Include/Benchmarks/BinaryQ31.h
+++ b/Testing/Include/Benchmarks/BinaryQ31.h
@ -14,6 +14,7 @@ class BinaryQ31:public Client::Suite
            Client::Pattern<q31_t> input1;
            Client::Pattern<q31_t> input2;
            Client::LocalPattern<q31_t> output;
+            Client::LocalPattern<q31_t> tmp;

            int nbr;
            int nbi;
@ -22,5 +23,6 @@ class BinaryQ31:public Client::Suite
            arm_matrix_instance_q31 in1;
            arm_matrix_instance_q31 in2;
            arm_matrix_instance_q31 out;
+            q31_t *tmpPtr;
            
    };
--- a/Testing/Include/Tests/BinaryTestsQ31.h
+++ b/Testing/Include/Tests/BinaryTestsQ31.h
@ -16,6 +16,8 @@ class BinaryTestsQ31:public Client::Suite
            Client::Pattern<q31_t> ref;
            Client::Pattern<int16_t> dims;
            Client::LocalPattern<q31_t> output;
+            Client::LocalPattern<q31_t> tmp;
+

            /* Local copies of inputs since matrix instance in CMSIS-DSP are not using
               pointers to const.
--- a/Testing/Source/Benchmarks/BinaryQ31.cpp
+++ b/Testing/Source/Benchmarks/BinaryQ31.cpp
@ -17,6 +17,11 @@
      arm_mat_mult_fast_q31(&this->in1,&this->in2,&this->out);
    }

+    /* Benchmark body: issues a single arm_mat_mult_opt_q31 call on in1/in2/out,
+       passing tmpPtr as the pState buffer used for intermediate results.
+       The returned arm_status is not checked here. */
+    void BinaryQ31::test_mat_mult_opt_q31()
+    {     
+      arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,this->tmpPtr);
+    }
+
    
    void BinaryQ31::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
    {
@ -35,6 +40,14 @@
            output.create(2*this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr);
          break;

+          case BinaryQ31::TEST_MAT_MULT_OPT_Q31_4:
+            input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi);
+            input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc);
+            output.create(this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr);
+            tmp.create(this->nbi*this->nbc,BinaryQ31::TMP_Q31_ID,mgr);
+            this->tmpPtr=tmp.ptr();
+          break;
+
          default:
            input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi);
            input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc);
--- a/Testing/Source/Tests/BinaryTestsF32.cpp
+++ b/Testing/Source/Tests/BinaryTestsF32.cpp
@ -16,6 +16,13 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 40

+/* Asserts that the four float32_t values starting at b are all zero.
+   The tests in this file call it with the output pointer right after it has been
+   advanced past the matrix just produced, so it inspects the four elements that
+   follow that result.
+   NOTE(review): presumably meant to catch a kernel writing past its output; this
+   relies on that region still holding zeros - confirm how the buffer is initialised. */
+static void checkInnerTail(float32_t *b)
+{
+    ASSERT_TRUE(b[0] == 0);
+    ASSERT_TRUE(b[1] == 0);
+    ASSERT_TRUE(b[2] == 0);
+    ASSERT_TRUE(b[3] == 0);
+}

 #define LOADDATA2()                          \
      const float32_t *inp1=input1.ptr();    \
@ -68,6 +75,7 @@ a double precision computation.
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

@ -99,6 +107,7 @@ a double precision computation.
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (2*rows * columns);
+          checkInnerTail(outp);

      }

--- a/Testing/Source/Tests/BinaryTestsQ15.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ15.cpp
@ -23,6 +23,19 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 40

+/* Asserts that the eight q15_t values (16 bytes) starting at b are all zero.
+   The tests in this file call it on the position just past the freshly written
+   output matrix and, for arm_mat_mult_q15, on tmpPtr + internal * columns, i.e. just
+   past the part of the scratch buffer that the test zeroes (with 16 extra bytes)
+   before the call.
+   NOTE(review): presumably meant to catch writes beyond those regions - confirm. */
+static void checkInnerTail(q15_t *b)
+{
+    ASSERT_TRUE(b[0] == 0);
+    ASSERT_TRUE(b[1] == 0);
+    ASSERT_TRUE(b[2] == 0);
+    ASSERT_TRUE(b[3] == 0);
+    ASSERT_TRUE(b[4] == 0);
+    ASSERT_TRUE(b[5] == 0);
+    ASSERT_TRUE(b[6] == 0);
+    ASSERT_TRUE(b[7] == 0);
+}
+
+

 #define LOADDATA2()                         \
      const q15_t *inp1=input1.ptr();       \
@ -39,7 +52,7 @@ a double precision computation.
      int i;


-#define PREPAREDATA2()                                                   \
+#define PREPAREDATA2C()                                                   \
      in1.numRows=rows;                                                  \
      in1.numCols=internal;                                               \
      memcpy((void*)ap,(const void*)inp1,2*sizeof(q15_t)*rows*internal);\
@ -54,29 +67,45 @@ a double precision computation.
      out.numCols=columns;                                               \
      out.pData = outp;

-
+/* Sets up the matrix instances for one test iteration:
+   - in1 is rows x internal and points at ap, filled with rows*internal q15_t values from inp1
+   - in2 is internal x columns and points at bp, filled with internal*columns q15_t values from inp2
+   - out is rows x columns and points at outp
+   Unlike PREPAREDATA2C, the copy sizes carry no factor of 2.
+   NOTE(review): the R/C suffixes presumably stand for real/complex data - confirm.
+   Expands to several statements using the caller's locals; not wrapped in do { } while (0). */
+#define PREPAREDATA2R()                                                   \
+      in1.numRows=rows;                                                  \
+      in1.numCols=internal;                                               \
+      memcpy((void*)ap,(const void*)inp1,sizeof(q15_t)*rows*internal);\
+      in1.pData = ap;                                                    \
+                                                                         \
+      in2.numRows=internal;                                                  \
+      in2.numCols=columns;                                               \
+      memcpy((void*)bp,(const void*)inp2,sizeof(q15_t)*internal*columns);\
+      in2.pData = bp;                                                    \
+                                                                         \
+      out.numRows=rows;                                                  \
+      out.numCols=columns;                                               \
+      out.pData = outp;
      
    void BinaryTestsQ15::test_mat_mult_q15()
    {     
      LOADDATA2();
      arm_status status;

+
      for(i=0;i < nbMatrixes ; i ++)
      {
          rows = *dimsp++;
          internal = *dimsp++;
          columns = *dimsp++;

-          PREPAREDATA2();

+          PREPAREDATA2R();
+          memset(tmpPtr,0,sizeof(q15_t)*internal*columns + 16);
          status=arm_mat_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);
+          checkInnerTail(tmpPtr + internal * columns);

      }

-      ASSERT_EMPTY_TAIL(output);

      ASSERT_SNR(output,ref,(q15_t)SNR_LOW_THRESHOLD);

@ -99,17 +128,16 @@ a double precision computation.
          columns = *dimsp++;


-          PREPAREDATA2();
+          PREPAREDATA2C();

          status=arm_mat_cmplx_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (2*rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);
-
      ASSERT_SNR(output,ref,(q15_t)MULT_SNR_THRESHOLD);

      ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15);
--- a/Testing/Source/Tests/BinaryTestsQ31.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ31.cpp
@ -18,6 +18,14 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 40

+/* Asserts that the four q31_t values (16 bytes) starting at b are all zero.
+   The tests in this file call it on the position just past the freshly written
+   output matrix and, for arm_mat_mult_opt_q31, on tmpPtr + internal*columns, i.e. just
+   past the part of the scratch buffer that the test zeroes (with 16 extra bytes)
+   before the call.
+   NOTE(review): presumably meant to catch writes beyond those regions - confirm. */
+static void checkInnerTail(q31_t *b)
+{
+    ASSERT_TRUE(b[0] == 0);
+    ASSERT_TRUE(b[1] == 0);
+    ASSERT_TRUE(b[2] == 0);
+    ASSERT_TRUE(b[3] == 0);
+}
+

 #define LOADDATA2()                          \
      const q31_t *inp1=input1.ptr();    \
@ -68,11 +76,10 @@ a double precision computation.
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);
-
      ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);

      ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31);
@ -98,10 +105,38 @@ a double precision computation.
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);
        
          outp += (2*rows * columns);
-
+          checkInnerTail(outp);
      }

-      ASSERT_EMPTY_TAIL(output);
+      ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);
+
+      ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31);
+
+    } 
+
+    void BinaryTestsQ31::test_mat_mult_opt_q31()
+    {     
+      LOADDATA2();
+      q31_t *tmpPtr=tmp.ptr();      
+
+      arm_status status;
+
+      for(i=0;i < nbMatrixes ; i ++)
+      {
+          rows = *dimsp++;
+          internal = *dimsp++;
+          columns = *dimsp++;
+
+          PREPAREDATA2();
+          memset(tmpPtr,0,sizeof(q31_t)*internal*columns + 16);
+          status=arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,tmpPtr);
+          ASSERT_TRUE(status==ARM_MATH_SUCCESS);
+
+          outp += (rows * columns);
+          checkInnerTail(outp);
+          checkInnerTail(tmpPtr + internal*columns);
+
+      }

      ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD);

@ -141,6 +176,21 @@ a double precision computation.
            b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr);
         break;

+         case TEST_MAT_MULT_OPT_Q31_3:
+            input1.reload(BinaryTestsQ31::INPUTS1_Q31_ID,mgr);
+            input2.reload(BinaryTestsQ31::INPUTS2_Q31_ID,mgr);
+            dims.reload(BinaryTestsQ31::DIMSBINARY1_S16_ID,mgr);
+
+            ref.reload(BinaryTestsQ31::REFMUL1_Q31_ID,mgr);
+
+            output.create(ref.nbSamples(),BinaryTestsQ31::OUT_Q31_ID,mgr);
+            a.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPA_Q31_ID,mgr);
+            b.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr);
+
+            tmp.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPC_Q31_ID,mgr);
+
+         break;
+


    
--- a/Testing/Source/Tests/BinaryTestsQ7.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ7.cpp
@ -19,6 +19,26 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 47

+/* Asserts that the sixteen q7_t values (16 bytes) starting at b are all zero.
+   The arm_mat_mult_q7 test calls it on tmpPtr + internal*columns both before and
+   after the kernel runs (the test zeroes that region, plus 16 bytes, beforehand)
+   and on the position just past the freshly written output matrix.
+   NOTE(review): presumably meant to catch writes beyond those regions - confirm. */
+static void checkInnerTail(q7_t *b)
+{
+    ASSERT_TRUE(b[0] == 0);
+    ASSERT_TRUE(b[1] == 0);
+    ASSERT_TRUE(b[2] == 0);
+    ASSERT_TRUE(b[3] == 0);
+    ASSERT_TRUE(b[4] == 0);
+    ASSERT_TRUE(b[5] == 0);
+    ASSERT_TRUE(b[6] == 0);
+    ASSERT_TRUE(b[7] == 0);
+    ASSERT_TRUE(b[8] == 0);
+    ASSERT_TRUE(b[9] == 0);
+    ASSERT_TRUE(b[10] == 0);
+    ASSERT_TRUE(b[11] == 0);
+    ASSERT_TRUE(b[12] == 0);
+    ASSERT_TRUE(b[13] == 0);
+    ASSERT_TRUE(b[14] == 0);
+    ASSERT_TRUE(b[15] == 0);
+
+}

 #define LOADDATA2()                         \
      const q7_t *inp1=input1.ptr();       \
@ -65,12 +85,15 @@ a double precision computation.
          columns = *dimsp++;

          PREPAREDATA2();
+          memset(tmpPtr,0,sizeof(q7_t)*internal*columns + 16);
+          checkInnerTail(tmpPtr + internal*columns);
          
          status=arm_mat_mult_q7(&this->in1,&this->in2,&this->out,tmpPtr);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
-
+          checkInnerTail(outp);
+          checkInnerTail(tmpPtr + internal*columns);
      }

      ASSERT_EMPTY_TAIL(output);
--- a/Testing/Source/Tests/UnaryTestsF32.cpp
+++ b/Testing/Source/Tests/UnaryTestsF32.cpp
@ -46,6 +46,14 @@ Comparison for Cholesky
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 40

+/* Asserts that the four float32_t values starting at b are all zero.
+   The tests in this file call it with an output pointer that has just been advanced
+   past the result written by the function under test, so it inspects the four
+   elements that follow that result.
+   NOTE(review): as the name suggests, presumably meant to catch a kernel writing past
+   its output; relies on that region still holding zeros - confirm buffer initialisation. */
+static void checkInnerTailOverflow(float32_t *b)
+{
+    ASSERT_TRUE(b[0] == 0);
+    ASSERT_TRUE(b[1] == 0);
+    ASSERT_TRUE(b[2] == 0);
+    ASSERT_TRUE(b[3] == 0);
+}
+
 #define LOADDATA2()                          \
      const float32_t *inp1=input1.ptr();    \
      const float32_t *inp2=input2.ptr();    \
@ -192,6 +200,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32()
          arm_mat_vec_mult_f32(&this->in1, bp, outp);

          outp += rows ;
+          checkInnerTailOverflow(outp);

      }

@ -219,6 +228,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -246,6 +256,7 @@ void UnaryTestsF32::test_mat_sub_f32()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -273,6 +284,7 @@ void UnaryTestsF32::test_mat_scale_f32()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -300,6 +312,7 @@ void UnaryTestsF32::test_mat_trans_f32()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -327,6 +340,7 @@ void UnaryTestsF32::test_mat_cmplx_trans_f32()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += 2*(rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -421,6 +435,7 @@ void UnaryTestsF32::test_mat_inverse_f32()

          outp += (rows * columns);
          inp1 += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -461,6 +476,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
          outp += (rows * columns);
          inp1 += (rows * rows);
          inp2 += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -501,6 +517,7 @@ void UnaryTestsF32::test_mat_inverse_f32()
          outp += (rows * columns);
          inp1 += (rows * rows);
          inp2 += (rows * columns);
+          checkInnerTailOverflow(outp);

      }

@ -668,6 +685,9 @@ void UnaryTestsF32::test_mat_inverse_f32()

          inp1 += (rows * columns);

+          checkInnerTailOverflow(outllp);
+          checkInnerTailOverflow(outdp);
+

      }

--- a/Testing/Source/Tests/UnaryTestsQ15.cpp
+++ b/Testing/Source/Tests/UnaryTestsQ15.cpp
@ -18,6 +18,31 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 40

+static void refInnerTail(q15_t *b)   /* Stamp a fixed 8-element marker pattern into b[0..7]. */
+{                                    /* b must point at 8 or more writable q15_t elements. */
+    b[0] = 1;                        /* The marker alternates +k, -k for k = 1..4, */
+    b[1] = -1;                       /* so none of its elements is zero. */
+    b[2] = 2;
+    b[3] = -2;
+    b[4] = 3;
+    b[5] = -3;
+    b[6] = 4;
+    b[7] = -4;                       /* checkInnerTail() asserts this exact sequence. */
+}
+
+static void checkInnerTail(q15_t *b) /* Assert that b[0..7] holds the pattern written by refInnerTail(). */
+{                                    /* One assertion per element, in index order. */
+    ASSERT_TRUE(b[0] == 1);          /* The tests call this on outp after advancing it past the */
+    ASSERT_TRUE(b[1] == -1);         /* result just produced. */
+    ASSERT_TRUE(b[2] == 2);          /* NOTE(review): presumably meant to catch a kernel writing */
+    ASSERT_TRUE(b[3] == -2);         /* beyond the end of its output - confirm intent. */
+    ASSERT_TRUE(b[4] == 3);
+    ASSERT_TRUE(b[5] == -3);
+    ASSERT_TRUE(b[6] == 4);
+    ASSERT_TRUE(b[7] == -4);
+}
+
+
 #define LOADDATA2()                          \
      const q15_t *inp1=input1.ptr();    \
      const q15_t *inp2=input2.ptr();    \
@ -127,14 +152,14 @@ a double precision computation.
          internal = *dimsp++;

          PREPAREVECDATA2();
-
+          refInnerTail(outp + rows);
          arm_mat_vec_mult_q15(&this->in1, bp, outp);

          outp += rows ;
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);

      ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

@ -153,15 +178,15 @@ a double precision computation.
          columns = *dimsp++;

          PREPAREDATA2();
-
+          refInnerTail(outp + rows * columns);
          status=arm_mat_add_q15(&this->in1,&this->in2,&this->out);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);

      ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

@ -180,15 +205,15 @@ void UnaryTestsQ15::test_mat_sub_q15()
          columns = *dimsp++;

          PREPAREDATA2();
-
+          refInnerTail(outp + rows * columns);
          status=arm_mat_sub_q15(&this->in1,&this->in2,&this->out);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);

      ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

@ -207,15 +232,15 @@ void UnaryTestsQ15::test_mat_scale_q15()
          columns = *dimsp++;

          PREPAREDATA1(false);
-
+          refInnerTail(outp + rows * columns);
          status=arm_mat_scale_q15(&this->in1,ONEHALF,0,&this->out);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);

      ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

@ -234,16 +259,15 @@ void UnaryTestsQ15::test_mat_trans_q15()
          columns = *dimsp++;

          PREPAREDATA1(true);
-
+          refInnerTail(outp + rows * columns);
          status=arm_mat_trans_q15(&this->in1,&this->out);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);
-
      ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

      ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15);
@ -261,15 +285,15 @@ void UnaryTestsQ15::test_mat_cmplx_trans_q15()
          columns = *dimsp++;

          PREPAREDATA1C(true);
-
+          refInnerTail(outp + 2*rows * columns);
          status=arm_mat_cmplx_trans_q15(&this->in1,&this->out);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += 2*(rows * columns);
+          checkInnerTail(outp);

      }

-      ASSERT_EMPTY_TAIL(output);

      ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD);

--- a/Testing/Source/Tests/UnaryTestsQ31.cpp
+++ b/Testing/Source/Tests/UnaryTestsQ31.cpp
@ -18,6 +18,15 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 40

+static void checkInnerTail(q31_t *b) /* Assert that the four elements b[0..3] are all zero. */
+{                                    /* The tests call this on outp right after advancing it past a result. */
+    ASSERT_TRUE(b[0] == 0);          /* NOTE(review): presumably relies on the output buffer being */
+    ASSERT_TRUE(b[1] == 0);          /* zero beyond the data written so far - confirm against the */
+    ASSERT_TRUE(b[2] == 0);          /* buffer setup, which is not visible in this hunk. */
+    ASSERT_TRUE(b[3] == 0);
+}
+
+
 #define LOADDATA2()                          \
      const q31_t *inp1=input1.ptr();    \
      const q31_t *inp2=input2.ptr();    \
@ -129,6 +138,7 @@ a double precision computation.
          arm_mat_vec_mult_q31(&this->in1, bp, outp);

          outp += rows ;
+          checkInnerTail(outp);

      }

@ -156,6 +166,7 @@ a double precision computation.
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

@ -183,6 +194,7 @@ void UnaryTestsQ31::test_mat_sub_q31()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

@ -210,6 +222,7 @@ void UnaryTestsQ31::test_mat_scale_q31()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

@ -237,6 +250,7 @@ void UnaryTestsQ31::test_mat_trans_q31()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

@ -264,6 +278,7 @@ void UnaryTestsQ31::test_mat_cmplx_trans_q31()
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += 2*(rows * columns);
+          checkInnerTail(outp);

      }

--- a/Testing/Source/Tests/UnaryTestsQ7.cpp
+++ b/Testing/Source/Tests/UnaryTestsQ7.cpp
@ -19,6 +19,27 @@ a double precision computation.
 /* Upper bound of maximum matrix dimension used by Python */
 #define MAXMATRIXDIM 47

+static void checkInnerTail(q7_t *b)  /* Assert that the sixteen elements b[0..15] are all zero. */
+{                                    /* The tests call this on outp right after advancing it past a result. */
+    ASSERT_TRUE(b[0] == 0);          /* NOTE(review): presumably relies on the output buffer being */
+    ASSERT_TRUE(b[1] == 0);          /* zero beyond the data written so far - confirm against the */
+    ASSERT_TRUE(b[2] == 0);          /* buffer setup, which is not visible in this hunk. */
+    ASSERT_TRUE(b[3] == 0);
+    ASSERT_TRUE(b[4] == 0);
+    ASSERT_TRUE(b[5] == 0);
+    ASSERT_TRUE(b[6] == 0);
+    ASSERT_TRUE(b[7] == 0);          /* End of the first group of eight checks. */

+    ASSERT_TRUE(b[8] == 0);          /* Second group of eight checks: b[8..15]. */
+    ASSERT_TRUE(b[9] == 0);
+    ASSERT_TRUE(b[10] == 0);
+    ASSERT_TRUE(b[11] == 0);
+    ASSERT_TRUE(b[12] == 0);
+    ASSERT_TRUE(b[13] == 0);
+    ASSERT_TRUE(b[14] == 0);
+    ASSERT_TRUE(b[15] == 0);
+}
+
 #define LOADDATA2()                          \
      const q7_t *inp1=input1.ptr();    \
      const q7_t *inp2=input2.ptr();    \
@ -112,6 +133,7 @@ a double precision computation.
          arm_mat_vec_mult_q7(&this->in1, bp, outp);

          outp += rows ;
+          checkInnerTail(outp);

      }

@ -132,13 +154,13 @@ void UnaryTestsQ7::test_mat_trans_q7()
      {
          rows = *dimsp++;
          columns = *dimsp++;
-
          PREPAREDATA1(true);

          status=arm_mat_trans_q7(&this->in1,&this->out);
          ASSERT_TRUE(status==ARM_MATH_SUCCESS);

          outp += (rows * columns);
+          checkInnerTail(outp);

      }

--- a/Testing/bench.txt
+++ b/Testing/bench.txt
@ -1583,6 +1583,7 @@ group Root {
                Pattern INPUTAC_Q31_ID : InputAC1_q31.txt 
                Pattern INPUTBC_Q31_ID : InputBC1_q31.txt 
                Output  OUT_Q31_ID : Output
+                Output  TMP_Q31_ID : Temp

                Params PARAM1_ID = {
                  NBR = [5,10,40]
@ -1595,6 +1596,7 @@ group Root {
                   Matrix Multiplication:test_mat_mult_q31
                   Complex Matrix Multiplication:test_mat_cmplx_mult_q31
                   Fast Matrix Multiplication:test_mat_mult_fast_q31
+                   Opt Matrix Multiplication:test_mat_mult_opt_q31
                } -> PARAM1_ID
             }

@ -1614,6 +1616,7 @@ group Root {
                Pattern INPUTAC_Q15_ID : InputAC1_q15.txt 
                Pattern INPUTBC_Q15_ID : InputBC1_q15.txt 
                Output  OUT_Q15_ID : Output
+                Output  TMP_Q15_ID : Temp

                Params PARAM1_ID = {
                  NBR = [5,10,40]
--- a/Testing/desc.txt
+++ b/Testing/desc.txt
@ -3459,10 +3459,12 @@ group Root {
                Output  OUT_Q31_ID : Output
                Output  TMPA_Q31_ID : TmpA
                Output  TMPB_Q31_ID : TmpB
+                Output  TMPC_Q31_ID : TmpC

                Functions {
                  test mult:test_mat_mult_q31
                  test complex mult:test_mat_cmplx_mult_q31
+                  test mult opt:test_mat_mult_opt_q31
                }

             }