CMSIS-DSP: Added new matrix functions and MVE version

arm_mat_mult_q7.c arm_mat_trans_q7.c arm_mat_vec_mult_f32.c arm_mat_vec_mult_q15.c arm_mat_vec_mult_q31.c arm_mat_vec_mult_q7.c
6 years ago · 8268b079d5
parent 952a743985
commit 8268b079d5
124 changed files with 976785 additions and 351755 deletions
--- a/Include/arm_helium_utils.h
+++ b/Include/arm_helium_utils.h
@ -104,6 +104,10 @@ Definitions available for MVEI only

 #include "arm_common_tables.h"

+/*
+ * Shift a 64-bit accumulator with the saturating/rounding long-shift
+ * intrinsics and keep the upper 32 bits of the result.
+ * The shift count handed to the intrinsic is -(32 - shift); a negative count
+ * is understood to turn the "shift right" intrinsic into a left shift by
+ * (32 - shift), so the upper word holds acc scaled down by `shift` bits
+ * (NOTE(review): confirm against the SQRSHRL description).
+ * MVE_ASRL_SAT16 uses the 48-bit saturating variant, MVE_ASRL_SAT32 the
+ * default one.
+ * Both arguments are fully parenthesized so that compound expressions
+ * (e.g. `a + b` as shift) expand correctly.
+ */
+#define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48((acc), -(32-(shift))) >> 32) & 0xffffffff)
+#define MVE_ASRL_SAT32(acc, shift)          ((sqrshrl((acc), -(32-(shift))) >> 32) & 0xffffffff)
+
+
 /* Following functions are used to transpose matrix in f32 and q31 cases */
 __STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
    uint32_t * pDataSrc,
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@ -2382,6 +2382,16 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
    float64_t *pData;     /**< points to the data of the matrix. */
  } arm_matrix_instance_f64;

+  /**
+   * @brief Instance structure for the Q7 matrix.
+   *
+   * Describes a matrix of q7_t elements: its dimensions plus a pointer to
+   * the element storage (the storage itself is not owned by this structure).
+   */
+  typedef struct
+  {
+    uint16_t numRows;     /**< number of rows of the matrix.     */
+    uint16_t numCols;     /**< number of columns of the matrix.  */
+    q7_t *pData;         /**< points to the data of the matrix. */
+  } arm_matrix_instance_q7;
+
  /**
   * @brief Instance structure for the Q15 matrix structure.
   */
@ -2503,6 +2513,17 @@ arm_status arm_mat_trans_q15(
  const arm_matrix_instance_q15 * pSrc,
        arm_matrix_instance_q15 * pDst);

+  /**
+   * @brief Q7 matrix transpose.
+   * @param[in]  pSrc  points to the input matrix
+   * @param[out] pDst  points to the output matrix
+   * @return    The function returns either  <code>ARM_MATH_SIZE_MISMATCH</code>
+   * or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_trans_q7(
+  const arm_matrix_instance_q7 * pSrc,
+        arm_matrix_instance_q7 * pDst);
+
  /**
   * @brief Q31 matrix transpose.
   * @param[in]  pSrc  points to the input matrix
@ -2527,6 +2548,43 @@ arm_status arm_mat_mult_f32(
  const arm_matrix_instance_f32 * pSrcB,
        arm_matrix_instance_f32 * pDst);

+  /**
+   * @brief Floating-point matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_f32(
+  const arm_matrix_instance_f32 *pSrcMat, 
+  const float32_t *pVec, 
+  float32_t *pDst);
+
+  /**
+   * @brief Q7 matrix multiplication
+   * @param[in]  pSrcA   points to the first input matrix structure
+   * @param[in]  pSrcB   points to the second input matrix structure
+   * @param[out] pDst    points to output matrix structure
+   * @param[in]  pState  points to the array for storing intermediate results
+   * @return     The function returns either
+   * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+   */
+arm_status arm_mat_mult_q7(
+  const arm_matrix_instance_q7 * pSrcA,
+  const arm_matrix_instance_q7 * pSrcB,
+        arm_matrix_instance_q7 * pDst,
+        q7_t * pState);
+
+  /**
+   * @brief Q7 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q7(
+  const arm_matrix_instance_q7 *pSrcMat, 
+  const q7_t *pVec, 
+  q7_t *pDst);
+
  /**
   * @brief Q15 matrix multiplication
   * @param[in]  pSrcA   points to the first input matrix structure
@ -2542,6 +2600,17 @@ arm_status arm_mat_mult_q15(
        arm_matrix_instance_q15 * pDst,
        q15_t * pState);

+  /**
+   * @brief Q15 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q15(
+  const arm_matrix_instance_q15 *pSrcMat, 
+  const q15_t *pVec, 
+  q15_t *pDst);
+
  /**
   * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
   * @param[in]  pSrcA   points to the first input matrix structure
@ -2570,6 +2639,17 @@ arm_status arm_mat_mult_q31(
  const arm_matrix_instance_q31 * pSrcB,
        arm_matrix_instance_q31 * pDst);

+  /**
+   * @brief Q31 matrix and vector multiplication
+   * @param[in]  pSrcMat  points to the input matrix structure
+   * @param[in]  pVec     points to vector
+   * @param[out] pDst     points to output vector
+   */
+void arm_mat_vec_mult_q31(
+  const arm_matrix_instance_q31 *pSrcMat, 
+  const q31_t *pVec, 
+  q31_t *pDst);
+
  /**
   * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
   * @param[in]  pSrcA  points to the first input matrix structure
--- a/Source/MatrixFunctions/MatrixFunctions.c
+++ b/Source/MatrixFunctions/MatrixFunctions.c
@ -40,6 +40,7 @@
 #include "arm_mat_mult_f32.c"
 #include "arm_mat_mult_fast_q15.c"
 #include "arm_mat_mult_fast_q31.c"
+#include "arm_mat_mult_q7.c"
 #include "arm_mat_mult_q15.c"
 #include "arm_mat_mult_q31.c"
 #include "arm_mat_scale_f32.c"
@ -49,5 +50,10 @@
 #include "arm_mat_sub_q15.c"
 #include "arm_mat_sub_q31.c"
 #include "arm_mat_trans_f32.c"
+#include "arm_mat_trans_q7.c"
 #include "arm_mat_trans_q15.c"
 #include "arm_mat_trans_q31.c"
+#include "arm_mat_vec_mult_f32.c"
+#include "arm_mat_vec_mult_q31.c"
+#include "arm_mat_vec_mult_q15.c"
+#include "arm_mat_vec_mult_q7.c"
--- a/Source/MatrixFunctions/arm_mat_mult_q7.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q7.c
@ -0,0 +1,676 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_mult_q7.c
+ * Description:  Q7 matrix multiplication
+ *
+ * $Date:        06. July 2020
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixMult
+  @{
+ */
+
+/**
+ * @brief Q7 matrix multiplication
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @param[in]       *pState points to the array for storing intermediate results (used by the MVE implementation as scratch for the transpose of pSrcB; unused by the scalar implementation)
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
+ *
+ *
+ */
+#if defined(ARM_MATH_MVEI)
+/*
+ * Specialized 2x2 Q7 matrix product (pDst = pSrcA * pSrcB) for MVE.
+ *
+ * Both rows of A are loaded once; each column of B is gathered with a stride
+ * of MATRIX_DIM bytes under a predicate that enables only the first
+ * MATRIX_DIM lanes (disabled lanes are zeroed by the _z gather, so they do
+ * not contribute to the dot products). Each dot product is shifted right by
+ * 7 and saturated to 8 bits before being stored.
+ *
+ * NOTE(review): the row loads of A are full 16-lane loads, i.e. they read
+ * more bytes than a 2x2 matrix holds - confirm the buffers tolerate this.
+ *
+ * Always returns ARM_MATH_SUCCESS.
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst)
+{
+    const uint32_t  MATRIX_DIM = 2;
+    q7_t const     *pColB = (q7_t const *) pSrcB->pData;   /* top of the current B column */
+    q7_t           *pRowA = pSrcA->pData;                   /* first row of A */
+    q7_t           *pRes = pDst->pData;                     /* output matrix */
+    mve_pred16_t    predDim = vctp8q(MATRIX_DIM);           /* enables the first MATRIX_DIM lanes */
+    uint8x16_t      strideOffs;                             /* byte offsets 0, 2, 4, ... */
+    q7x16_t         rowA0, rowA1, colB;
+    q31_t           dot0, dot1;
+    uint32_t        col;
+
+    strideOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
+
+    rowA0 = vldrbq_s8(pRowA);
+    rowA1 = vldrbq_s8(pRowA + MATRIX_DIM);
+
+    /* one pass per column of B / column of the result */
+    for (col = 0; col < MATRIX_DIM; col++)
+    {
+        colB = vldrbq_gather_offset_z(pColB, strideOffs, predDim);
+
+        dot0 = vmladavq(rowA0, colB);
+        dot1 = vmladavq(rowA1, colB);
+
+        pRes[0 * MATRIX_DIM + col] = (q7_t) __SSAT(dot0 >> 7, 8);
+        pRes[1 * MATRIX_DIM + col] = (q7_t) __SSAT(dot1 >> 7, 8);
+
+        /* move to next B column */
+        pColB = pColB + 1;
+    }
+
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/*
+ * Specialized 3x3 Q7 matrix product (pDst = pSrcA * pSrcB) for MVE.
+ *
+ * The three rows of A are loaded once; each column of B is gathered with a
+ * stride of MATRIX_DIM bytes under a predicate that enables only the first
+ * MATRIX_DIM lanes (disabled lanes are zeroed by the _z gather). Each dot
+ * product is shifted right by 7 and saturated to 8 bits before being stored.
+ *
+ * NOTE(review): the row loads of A are full 16-lane loads, i.e. they read
+ * more bytes than a 3x3 matrix holds - confirm the buffers tolerate this.
+ *
+ * Always returns ARM_MATH_SUCCESS.
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst)
+{
+    const uint32_t  MATRIX_DIM = 3;
+    q7_t const     *pColB = (q7_t const *) pSrcB->pData;   /* top of the current B column */
+    q7_t           *pRowA = pSrcA->pData;                   /* first row of A */
+    q7_t           *pRes = pDst->pData;                     /* output matrix */
+    mve_pred16_t    predDim = vctp8q(MATRIX_DIM);           /* enables the first MATRIX_DIM lanes */
+    uint8x16_t      strideOffs;                             /* byte offsets 0, 3, 6, ... */
+    q7x16_t         rowA0, rowA1, rowA2, colB;
+    q31_t           dot0, dot1, dot2;
+    uint32_t        col;
+
+    /* 3 is not a valid vidup step, so build 0,1,2,... and scale it */
+    strideOffs = vidupq_u8((uint32_t)0, 1);
+    strideOffs = strideOffs * MATRIX_DIM;
+
+    rowA0 = vldrbq_s8(pRowA);
+    rowA1 = vldrbq_s8(pRowA + MATRIX_DIM);
+    rowA2 = vldrbq_s8(pRowA + 2 * MATRIX_DIM);
+
+    /* one pass per column of B / column of the result */
+    for (col = 0; col < MATRIX_DIM; col++)
+    {
+        colB = vldrbq_gather_offset_z(pColB, strideOffs, predDim);
+
+        dot0 = vmladavq(rowA0, colB);
+        dot1 = vmladavq(rowA1, colB);
+        dot2 = vmladavq(rowA2, colB);
+
+        pRes[0 * MATRIX_DIM + col] = (q7_t) __SSAT(dot0 >> 7, 8);
+        pRes[1 * MATRIX_DIM + col] = (q7_t) __SSAT(dot1 >> 7, 8);
+        pRes[2 * MATRIX_DIM + col] = (q7_t) __SSAT(dot2 >> 7, 8);
+
+        /* move to next B column */
+        pColB = pColB + 1;
+    }
+
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/*
+ * Specialized 4x4 Q7 matrix product (pDst = pSrcA * pSrcB) for MVE.
+ *
+ * The four rows of A are loaded once; each column of B is gathered with a
+ * stride of MATRIX_DIM bytes under a predicate that enables only the first
+ * MATRIX_DIM lanes (disabled lanes are zeroed by the _z gather). Each dot
+ * product is shifted right by 7 and saturated to 8 bits before being stored.
+ *
+ * NOTE(review): the row loads of A are full 16-lane loads; the loads of the
+ * later rows read past the 16 bytes of a 4x4 matrix - confirm the buffers
+ * tolerate this.
+ *
+ * Always returns ARM_MATH_SUCCESS.
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst)
+{
+    const uint32_t  MATRIX_DIM = 4;
+    q7_t const     *pColB = (q7_t const *) pSrcB->pData;   /* top of the current B column */
+    q7_t           *pRowA = pSrcA->pData;                   /* first row of A */
+    q7_t           *pRes = pDst->pData;                     /* output matrix */
+    mve_pred16_t    predDim = vctp8q(MATRIX_DIM);           /* enables the first MATRIX_DIM lanes */
+    uint8x16_t      strideOffs;                             /* byte offsets 0, 4, 8, ... */
+    q7x16_t         rowA0, rowA1, rowA2, rowA3, colB;
+    q31_t           dot0, dot1, dot2, dot3;
+    uint32_t        col;
+
+    strideOffs = vidupq_u8((uint32_t)0, 4);
+
+    rowA0 = vldrbq_s8(pRowA);
+    rowA1 = vldrbq_s8(pRowA + MATRIX_DIM);
+    rowA2 = vldrbq_s8(pRowA + 2 * MATRIX_DIM);
+    rowA3 = vldrbq_s8(pRowA + 3 * MATRIX_DIM);
+
+    /* one pass per column of B / column of the result */
+    for (col = 0; col < MATRIX_DIM; col++)
+    {
+        colB = vldrbq_gather_offset_z(pColB, strideOffs, predDim);
+
+        dot0 = vmladavq(rowA0, colB);
+        dot1 = vmladavq(rowA1, colB);
+        dot2 = vmladavq(rowA2, colB);
+        dot3 = vmladavq(rowA3, colB);
+
+        pRes[0 * MATRIX_DIM + col] = (q7_t) __SSAT(dot0 >> 7, 8);
+        pRes[1 * MATRIX_DIM + col] = (q7_t) __SSAT(dot1 >> 7, 8);
+        pRes[2 * MATRIX_DIM + col] = (q7_t) __SSAT(dot2 >> 7, 8);
+        pRes[3 * MATRIX_DIM + col] = (q7_t) __SSAT(dot3 >> 7, 8);
+
+        /* move to next B column */
+        pColB = pColB + 1;
+    }
+
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+/*
+ * MVE implementation of the Q7 matrix product pDst = pSrcA * pSrcB.
+ *
+ * - Square 2x2, 3x3 and 4x4 inputs are dispatched to the specialized
+ *   routines above (pState is not touched in that case).
+ * - Otherwise B is transposed into pState (numColsB x numRowsB elements), so
+ *   that every dot product reads two contiguous byte streams. The output is
+ *   then produced in 2x2 tiles (two rows of A against two rows of B^T),
+ *   followed by a pass for the last column when numColsB is odd and a pass
+ *   for the last row when numRowsA is odd.
+ * - Every dot product is accumulated in 32 bits, shifted right by 7 and
+ *   saturated to 8 bits.
+ *
+ * Returns ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is defined and
+ * the dimensions are inconsistent, ARM_MATH_SUCCESS otherwise.
+ *
+ * NOTE(review): the vector loads are not predicated, so up to 15 bytes past
+ * the end of a row of A or of B^T may be read (the extra lanes are masked
+ * out of the accumulation in the tail, not out of the load) - confirm the
+ * buffers tolerate this.
+ */
+arm_status arm_mat_mult_q7(
+    const arm_matrix_instance_q7 * pSrcA,
+    const arm_matrix_instance_q7 * pSrcB,
+    arm_matrix_instance_q7 * pDst,
+    q7_t * pState)
+{
+    q7_t    *pInA = pSrcA->pData;  /* input data matrix pointer A of Q7 type */
+    q7_t    *pInB = pSrcB->pData;  /* input data matrix pointer B of Q7 type */
+    q7_t    *pInA2;
+    q7_t    *pInB2;
+    q7_t    *px;               /* Temporary output data matrix pointer */
+    q7_t    *px2;              /* Temporary output data matrix pointer */
+    uint32_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
+    uint32_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
+    uint32_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
+    uint32_t  numRowsB = pSrcB->numRows;    /* number of rows of input matrix B    */
+    uint32_t  col, i = 0u, j, row = numRowsB;   /* loop counters */
+    q7_t    *pSrcBT = pState;   /* input data matrix pointer for transpose */
+    uint32_t  blkCnt;           /* loop counters */
+    arm_status status;                            /* status of matrix multiplication */
+    arm_matrix_instance_q7 BT;
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    /* small squared matrix specialized routines */
+    if(numRowsA == numColsB && numColsB == numColsA) {
+        if(numRowsA == 2)
+            return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
+        else if(numRowsA == 3)
+            return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
+        else if (numRowsA == 4)
+            return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
+    }
+    /*
+     * Matrix transpose: B^T is written to the caller supplied pState buffer
+     */
+
+    BT.numRows = numColsB;
+    BT.numCols = numRowsB;
+    BT.pData = pSrcBT;
+
+    arm_mat_trans_q7(pSrcB, &BT);
+
+    /*
+     * Reset the variables for the usage in the following multiplication process
+     */
+    i = 0;
+    row = numRowsA >> 1;
+    px = pDst->pData;
+    px2 = px + numColsB;
+
+    /*
+     * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB,
+     * two rows and two columns at a time
+     */
+
+    /*
+     * row loop
+     */
+    while (row > 0u)
+    {
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB >> 1;
+        /*
+         * For every row wise process, the B pointers are set
+         * to the starting address of the transposed pSrcB data
+         */
+        pInB = pSrcBT;
+        pInB2 = pInB + numRowsB;
+        j = 0;
+
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+            q7_t const     *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
+            q7x16_t        vecA, vecA2, vecB, vecB2;
+            q31_t           acc0, acc1, acc2, acc3;
+
+            /*
+             * Point to the two rows of A and the two rows of B^T (columns of B) being processed
+             */
+            pInA = pSrcA->pData + i;
+            pInA2 = pInA + numColsA;
+            pInB = pSrcBT + j;
+            pInB2 = pInB + numRowsB;
+
+            pSrcAVec = (q7_t const *) pInA;
+            pSrcA2Vec = (q7_t const *)pInA2;
+            pSrcBVec = (q7_t const *) pInB;
+            pSrcB2Vec = (q7_t const *)pInB2;
+
+            acc0 = 0L;
+            acc1 = 0L;
+            acc2 = 0L;
+            acc3 = 0L;
+
+            /* first vector of row A is pre-loaded; the loop re-loads it one iteration ahead */
+            vecA = vld1q(pSrcAVec);  
+            pSrcAVec += 16;
+
+            blkCnt = numColsA >> 4;
+            while (blkCnt > 0U)
+            {
+                vecB = vld1q(pSrcBVec);  
+                pSrcBVec += 16;
+                acc0 = vmladavaq(acc0, vecA, vecB);
+                vecA2 = vld1q(pSrcA2Vec);  
+                pSrcA2Vec += 16;
+                acc1 = vmladavaq(acc1, vecA2, vecB);
+                vecB2 = vld1q(pSrcB2Vec);  
+                pSrcB2Vec += 16;
+                acc2 = vmladavaq(acc2, vecA, vecB2);
+                vecA = vld1q(pSrcAVec);  
+                pSrcAVec += 16;
+                acc3 = vmladavaq(acc3, vecA2, vecB2);
+
+                blkCnt--;
+            }
+            /*
+             * tail
+             * (will be merged thru tail predication)
+             */
+            blkCnt = numColsA & 0xF;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp8q(blkCnt);
+                vecB = vld1q(pSrcBVec);
+                acc0 = vmladavaq_p(acc0, vecA, vecB, p0);
+                vecA2 = vld1q(pSrcA2Vec);
+                acc1 = vmladavaq_p(acc1, vecA2, vecB, p0);
+                vecB2 = vld1q(pSrcB2Vec);
+                acc2 = vmladavaq_p(acc2, vecA, vecB2, p0);
+                /* no further look-ahead load of vecA here: it would be unused and read past the row */
+                acc3 = vmladavaq_p(acc3, vecA2, vecB2, p0);
+            }
+
+            *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
+            *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
+            *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
+            *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
+            j += numRowsB * 2;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+
+        }
+
+        i = i + numColsA * 2;
+        /* skip the last (odd) column, it is handled below, and move two output rows down */
+        px = px2 + (numColsB & 1u);
+        px2 = px + numColsB;
+        /*
+         * Decrement the row loop counter
+         */
+        row--;
+    }
+
+    /*
+     * Compute remaining row and/or column below
+     */
+
+    if (numColsB & 1u)
+    {
+        row = numRowsA & (~0x1);    /* avoid redundant computation */
+        px = pDst->pData + numColsB - 1;
+        i = 0;
+
+        /*
+         * row loop
+         */
+        while (row > 0)
+        {
+            q7_t const   *pSrcAVec, *pSrcBVec;
+            q7x16_t       vecA, vecB;
+            q31_t           acc0;
+
+            /*
+             * point to last column in matrix B
+             */
+            pInB = pSrcBT + numRowsB * (numColsB - 1);
+            pInA = pSrcA->pData + i;
+
+            pSrcAVec = (q7_t const *) pInA;
+            pSrcBVec = (q7_t const *) pInB;
+
+            acc0 = 0L;
+            blkCnt = (numColsA) >> 4;
+            while (blkCnt > 0U)
+            {
+                vecA = vld1q(pSrcAVec);  
+                pSrcAVec += 16;
+                vecB = vld1q(pSrcBVec);  
+                pSrcBVec += 16;
+                acc0 = vmladavaq(acc0, vecA, vecB);
+
+                blkCnt--;
+            }
+            /*
+             * tail
+             * (will be merged thru tail predication)
+             */
+            blkCnt = numColsA & 0xF;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp8q(blkCnt);
+                vecA = vld1q(pSrcAVec);
+                vecB = vld1q(pSrcBVec);
+                acc0 = vmladavaq_p(acc0, vecA, vecB, p0);
+            }
+
+            *px = (q7_t) __SSAT(acc0 >> 7, 8);
+
+            px += numColsB;
+
+            i += numColsA;
+            /*
+             * Decrement the row loop counter
+             */
+            row--;
+        }
+    }
+
+    if (numRowsA & 1u)
+    {
+        col = numColsB;
+        i = 0u;
+        /*
+         * point to last row in output matrix
+         */
+        px = pDst->pData + (numColsB) * (numRowsA - 1);
+        /*
+         * col loop
+         */
+        while (col > 0)
+        {
+            q7_t const    *pSrcAVec, *pSrcBVec;
+            q7x16_t       vecA, vecB;
+            q31_t           acc0;
+
+            /*
+             * point to last row in matrix A
+             */
+            pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
+            pInB = pSrcBT + i;
+
+            /*
+             * Set the accumulator to zero
+             */
+            pSrcAVec = (q7_t const *) pInA;
+            pSrcBVec = (q7_t const *) pInB;
+            acc0 = 0L;
+
+            blkCnt = (numColsA) >> 4;
+            while (blkCnt > 0U)
+            {
+                vecA = vld1q(pSrcAVec); 
+                pSrcAVec += 16;
+                vecB = vld1q(pSrcBVec); 
+                pSrcBVec += 16;
+                acc0 = vmladavaq(acc0, vecA, vecB);
+
+                blkCnt--;
+            }
+            /*
+             * tail
+             * (will be merged thru tail predication)
+             */
+            blkCnt = numColsA & 0xF;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp8q(blkCnt);
+                vecA = vld1q(pSrcAVec);
+                vecB = vld1q(pSrcBVec);
+                acc0 = vmladavaq_p(acc0, vecA, vecB, p0);
+            }
+
+            *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
+
+            /* next row of B^T (numRowsB == numColsA for a valid product) */
+            i += numColsA;
+
+            /*
+             * Decrement the col loop counter
+             */
+            col--;
+        }
+    }
+    /*
+     * Return to application
+     */
+     status = ARM_MATH_SUCCESS;
+    }
+    return(status);
+}
+#else
+/*
+ * Scalar implementation of the Q7 matrix product pDst = pSrcA * pSrcB.
+ *
+ * Each output element is the dot product of one row of A with one column of
+ * B, accumulated in 32 bits, shifted right by 7 and saturated to 8 bits.
+ * pState is not used by this implementation.
+ *
+ * The output is written through a running pointer (row after row) instead of
+ * a 16-bit element offset, so destinations with more than 65535 elements are
+ * addressed correctly, and the loops simply do nothing when a dimension is 0.
+ *
+ * Returns ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is defined and
+ * the dimensions are inconsistent, ARM_MATH_SUCCESS otherwise.
+ */
+arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
+{
+    const q7_t *pRowA = pSrcA->pData;             /* start of the row of A being processed */
+    const q7_t *pInB = pSrcB->pData;              /* start of matrix B */
+    q7_t *px = pDst->pData;                       /* running output pointer */
+    uint16_t numColsB = pSrcB->numCols;           /* number of columns of input matrix B */
+    uint16_t numColsA = pSrcA->numCols;           /* number of columns of input matrix A */
+    uint16_t numRowsA = pSrcA->numRows;           /* number of rows of input matrix A    */
+    uint16_t row, col, k;                         /* loop counters */
+    arm_status status;                            /* status of matrix multiplication */
+
+    (void)pState;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows)  ||
+      (pSrcB->numCols != pDst->numCols)    )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+    {
+        /* row loop: one row of A produces one row of the output */
+        for (row = 0U; row < numRowsA; row++) {
+            /* column loop: one column of B produces one output element */
+            for (col = 0U; col < numColsB; col++) {
+                q31_t sum = 0; /* accumulator */
+
+                /* c(m,n) = a(m,1)*b(1,n) + a(m,2)*b(2,n) + .... + a(m,p)*b(p,n) */
+                for (k = 0U; k < numColsA; k++) {
+                    /* B is walked down its column: stride of numColsB elements */
+                    sum += (q31_t)pRowA[k] * pInB[(uint32_t)k * numColsB + col];
+                }
+
+                /* The products carry 14 fractional bits: shift back to 7 fractional
+                 * bits, then saturate to 8 bits and store the result */
+                *px++ = (q7_t)__SSAT((sum >> 7), 8);
+            }
+
+            /* Move to the starting address of the next row of A */
+            pRowA += numColsA;
+        }
+
+        /* set status as ARM_MATH_SUCCESS */
+        status = ARM_MATH_SUCCESS;
+    }
+
+    /* Return to application */
+    return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of MatrixMult group
+ */
--- a/Source/MatrixFunctions/arm_mat_trans_q7.c
+++ b/Source/MatrixFunctions/arm_mat_trans_q7.c
@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_trans_q7.c
+ * Description:  Q7 matrix transpose
+ *
+ * $Date:        06. July 2020
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupMatrix
+ */
+
+/**
+  @addtogroup MatrixTrans
+  @{
+ */
+
+/**
+  @brief         Q7 matrix transpose.
+  @param[in]     pSrc      points to input matrix
+  @param[out]    pDst      points to output matrix
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI)
+/*
+ * MVE implementation of the Q7 matrix transpose.
+ *
+ * For every column of the source, eight elements at a time are gathered
+ * with a stride of numCols bytes (the gather widens each byte to a 16-bit
+ * lane) and stored back as contiguous bytes, forming one row of the
+ * destination. The last numRows % 8 elements of each column are handled
+ * with a predicated gather and store, so neither the read nor the write
+ * goes past the end of the column.
+ *
+ * NOTE(review): the offsets live in 16-bit lanes, so 7 * numCols has to fit
+ * in 16 bits - confirm this limit is acceptable for the intended sizes.
+ *
+ * Returns ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is defined and
+ * the destination dimensions are not the transposed source dimensions,
+ * ARM_MATH_SUCCESS otherwise.
+ */
+arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
+{
+
+    uint16x8_t    vecOffs;
+    uint32_t        i;
+    uint32_t        blkCnt;
+    uint8_t const  *pDataC;
+    uint8_t        *pDataDestR;
+    uint16x8_t    vecIn;
+
+    const uint8_t   * pDataSrc=(const uint8_t  *)pSrc->pData;
+    uint8_t   * pDataDst=(uint8_t  *)pDst->pData;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+    /* Check for matrix mismatch condition */
+    if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+    {
+        /* Report the mismatch without touching the destination */
+        return (ARM_MATH_SIZE_MISMATCH);
+    }
+#endif
+
+    /* byte offsets of eight consecutive elements of one source column */
+    vecOffs = vidupq_u16((uint32_t)0, 1);
+    vecOffs = vecOffs * pSrc->numCols;
+
+    /* one pass per source column (= destination row); nothing to do when numCols is 0 */
+    for (i = pSrc->numCols; i > 0U; i--)
+    {
+        pDataC = (uint8_t const *) pDataSrc;
+        pDataDestR = (uint8_t*)pDataDst;
+
+        blkCnt = pSrc->numRows >> 3;
+        while (blkCnt > 0U)
+        {
+            /* widened loads */
+            vecIn = vldrbq_gather_offset_u16(pDataC, vecOffs);
+            vstrbq_u16(pDataDestR, vecIn);  
+            pDataDestR += 8;
+            pDataC = pDataC + pSrc->numCols * 8;
+            /*
+             * Decrement the blockSize loop counter
+             */
+            blkCnt--;
+        }
+
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = pSrc->numRows & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+            /* predicated gather: lanes beyond the last row are not read */
+            vecIn = vldrbq_gather_offset_z_u16(pDataC, vecOffs, p0);
+            vstrbq_p_u16(pDataDestR, vecIn, p0);
+        }
+        pDataSrc += 1;
+        pDataDst += pSrc->numRows;
+    }
+
+    return (ARM_MATH_SUCCESS);
+}
+#else
+/*
+ * Scalar implementation of the Q7 matrix transpose.
+ *
+ * The source is read sequentially, row after row; every element of source
+ * row r is written to column r of the destination, i.e. with a stride of
+ * numRows elements between consecutive writes.
+ *
+ * Returns ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is defined and
+ * the destination dimensions are not the transposed source dimensions,
+ * ARM_MATH_SUCCESS otherwise.
+ */
+arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
+{
+    const uint16_t rowCount = pSrc->numRows;   /* rows of the source matrix */
+    const uint16_t colCount = pSrc->numCols;   /* columns of the source matrix */
+    q7_t *pRead = pSrc->pData;                 /* sequential read pointer */
+    q7_t *pWrite;                              /* strided write pointer */
+    uint16_t rowsLeft = rowCount;              /* source rows still to process */
+    uint16_t colsLeft;                         /* elements left in the current row */
+    uint16_t dstCol = 0U;                      /* destination column being filled */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+    /* Reject destinations whose shape is not the transposed source shape */
+    if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) {
+        return (ARM_MATH_SIZE_MISMATCH);
+    }
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+
+    do {
+        /* Source row number dstCol becomes destination column number dstCol */
+        pWrite = pDst->pData + dstCol;
+
+        for (colsLeft = colCount; colsLeft > 0U; colsLeft--) {
+            /* Copy one element, then step to the next row of the transposed matrix */
+            *pWrite = *pRead++;
+            pWrite += rowCount;
+        }
+
+        dstCol++;
+
+    } while (--rowsLeft > 0U);
+
+    /* Return to application */
+    return (ARM_MATH_SUCCESS);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+/**
+  @} end of MatrixTrans group
+ */
--- a/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
@ -0,0 +1,389 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_f32.c
+ * Description:  Floating-point matrix and vector multiplication
+ *
+ * $Date:        07. July 2020
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to the input vector
+ * @param[out]      *pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) matrix-vector product: pDstVec[r] = sum over c of
+ * pSrcMat(r, c) * pSrcVec[c].
+ *
+ * pSrcMat  row-major matrix (numRows x numCols).
+ * pSrcVec  input vector, numCols elements are read.
+ * pDstVec  output vector, numRows elements are written.
+ *
+ * Rows are processed in groups of 4, then an optional pair, then an optional
+ * single row. Columns are consumed 4 at a time; the remaining 1..3 columns
+ * are handled with predicated loads so that no lane beyond the end of a row
+ * (or of the vector) is read. Predicating the matrix loads matters for
+ * floats: an unpredicated load could bring in an Inf/NaN bit pattern, and
+ * 0 * Inf would turn the accumulator into NaN.
+ */
+void arm_mat_vec_mult_f32(
+    const arm_matrix_instance_f32   *pSrcMat,
+    const float32_t                 *pSrcVec,
+    float32_t                       *pDstVec)
+{
+    uint32_t         numRows = pSrcMat->numRows;
+    uint32_t         numCols = pSrcMat->numCols;
+    const float32_t *pSrcA = pSrcMat->pData;   /* first row not yet processed */
+    float32_t       *px = pDstVec;             /* next output element */
+    int32_t          row = numRows;            /* rows still to process */
+    uint32_t         blkCnt;                   /* loop counter */
+
+    /*
+     * compute 4 rows in parallel
+     */
+    while (row >= 4)
+    {
+        /* One read pointer per row, plus one for the input vector */
+        float32_t const *pSrcA0Vec = pSrcA;
+        float32_t const *pSrcA1Vec = pSrcA0Vec + numCols;
+        float32_t const *pSrcA2Vec = pSrcA1Vec + numCols;
+        float32_t const *pSrcA3Vec = pSrcA2Vec + numCols;
+        float32_t const *pInVec = pSrcVec;
+        f32x4_t          vecIn, vecA;
+        /* Per-row accumulators, 4 partial sums each */
+        f32x4_t          acc0 = vdupq_n_f32(0.0f);
+        f32x4_t          acc1 = vdupq_n_f32(0.0f);
+        f32x4_t          acc2 = vdupq_n_f32(0.0f);
+        f32x4_t          acc3 = vdupq_n_f32(0.0f);
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 4;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 4;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: 1..3 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        *px++ = vecAddAcrossF32Mve(acc0);
+        *px++ = vecAddAcrossF32Mve(acc1);
+        *px++ = vecAddAcrossF32Mve(acc2);
+        *px++ = vecAddAcrossF32Mve(acc3);
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2)
+    {
+        float32_t const *pSrcA0Vec = pSrcA;
+        float32_t const *pSrcA1Vec = pSrcA0Vec + numCols;
+        float32_t const *pInVec = pSrcVec;
+        f32x4_t          vecIn, vecA;
+        f32x4_t          acc0 = vdupq_n_f32(0.0f);
+        f32x4_t          acc1 = vdupq_n_f32(0.0f);
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: 1..3 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        *px++ = vecAddAcrossF32Mve(acc0);
+        *px++ = vecAddAcrossF32Mve(acc1);
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    /*
+     * last single row, if any
+     */
+    if (row >= 1)
+    {
+        float32_t const *pSrcA0Vec = pSrcA;
+        float32_t const *pInVec = pSrcVec;
+        f32x4_t          vecIn, vecA;
+        f32x4_t          acc0 = vdupq_n_f32(0.0f);
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail: 1..3 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        *px++ = vecAddAcrossF32Mve(acc0);
+    }
+}
+#else
+/*
+ * Scalar matrix-vector product: pDst[r] = sum over c of
+ * pSrcMat(r, c) * pVec[c].
+ *
+ * pSrcMat  row-major matrix (numRows x numCols).
+ * pVec     input vector, numCols elements are read.
+ * pDst     output vector, numRows elements are written.
+ *
+ * Rows are processed four at a time so each vector element is loaded once
+ * for four multiply-accumulates; the remaining 0..3 rows are processed one
+ * by one with the column loop unrolled by two.
+ */
+void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_t *pVec, float32_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const float32_t *pSrcA = pSrcMat->pData;
+    const float32_t *pInA1;      /* read pointer into matrix row 1 of the group */
+    const float32_t *pInA2;      /* read pointer into matrix row 2 of the group */
+    const float32_t *pInA3;      /* read pointer into matrix row 3 of the group */
+    const float32_t *pInA4;      /* read pointer into matrix row 4 of the group */
+    const float32_t *pInVec;     /* read pointer into the input vector */
+    float32_t *px;               /* write pointer into the output vector */
+    uint32_t i;                  /* element offset of the current row in pSrcA;
+                                    32-bit because numRows * numCols can exceed 65535 */
+    uint32_t row, colCnt;        /* loop counters */
+    float32_t matData, matData2, vecData, vecData2;
+
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every group of rows, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        float32_t sum1 = 0.0f;
+        float32_t sum2 = 0.0f;
+        float32_t sum3 = 0.0f;
+        float32_t sum4 = 0.0f;
+
+        /* One column per iteration */
+        colCnt = numCols;
+
+        /* Initialize pointers to the start of the four rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+
+        /* Main loop: matrix-vector multiplication */
+        while (colCnt > 0u) {
+            /* Read one value from the vector */
+            vecData = *(pInVec)++;
+            /* Read one value from each of the 4 rows and multiply-accumulate */
+            matData = *(pInA1)++;
+            sum1 += matData * vecData;
+            matData = *(pInA2)++;
+            sum2 += matData * vecData;
+            matData = *(pInA3)++;
+            sum3 += matData * vecData;
+            matData = *(pInA4)++;
+            sum4 += matData * vecData;
+
+            /* Decrement the loop counter */
+            colCnt--;
+        }
+
+        /* Store the results in the destination buffer */
+        *px++ = sum1;
+        *px++ = sum2;
+        *px++ = sum3;
+        *px++ = sum4;
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        float32_t sum = 0.0f;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        /* Loop unrolling: process 2 columns per iteration */
+        colCnt = numCols >> 1;
+
+        while (colCnt > 0) {
+            vecData = *(pInVec)++;
+            vecData2 = *(pInVec)++;
+            matData = *(pInA1)++;
+            matData2 = *(pInA1)++;
+            sum += matData * vecData;
+            sum += matData2 * vecData2;
+            colCnt--;
+        }
+        /* process remainder of row */
+        colCnt = numCols & 1u;
+        while (colCnt > 0) {
+            sum += *pInA1++ * *pInVec++;
+            colCnt--;
+        }
+
+        *px++ = sum;
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixMult group
+ */
--- a/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
@ -0,0 +1,386 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_q15.c
+ * Description:  Q15 matrix and vector multiplication
+ *
+ * $Date:        07. July 2020
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Q15 matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to the input vector
+ * @param[out]      *pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) Q15 matrix-vector product.
+ *
+ * pSrcMat  row-major Q15 matrix (numRows x numCols).
+ * pSrcVec  input vector, numCols elements are read.
+ * pDstVec  output vector, numRows elements are written.
+ *
+ * Each row is accumulated in a 64-bit accumulator with vmlaldavaq and then
+ * narrowed with MVE_ASRL_SAT16(acc, 15). Rows are processed in groups of 3,
+ * then an optional pair, then an optional single row. Columns are consumed
+ * 8 at a time; the remaining 1..7 columns use predicated loads for both the
+ * vector and the matrix rows so nothing past the end of a row is read.
+ */
+void arm_mat_vec_mult_q15(
+    const arm_matrix_instance_q15 * pSrcMat,
+    const q15_t     *pSrcVec,
+    q15_t           *pDstVec)
+{
+    const q15_t *pMatSrc = pSrcMat->pData;     /* first row not yet processed */
+    uint32_t     numRows = pSrcMat->numRows;
+    uint32_t     numCols = pSrcMat->numCols;
+    q15_t       *px = pDstVec;                 /* next output element */
+    int32_t      row = numRows;                /* rows still to process */
+    uint16_t     blkCnt;                       /* loop counter */
+
+    /*
+     * compute 3x64-bit accumulators per loop
+     */
+    while (row >= 3)
+    {
+        /* One read pointer per row, plus one for the input vector */
+        q15_t const *pMat0Vec = pMatSrc;
+        q15_t const *pMat1Vec = pMat0Vec + numCols;
+        q15_t const *pMat2Vec = pMat1Vec + numCols;
+        q15_t const *pVec = pSrcVec;
+        q63_t        acc0 = 0LL;
+        q63_t        acc1 = 0LL;
+        q63_t        acc2 = 0LL;
+        q15x8_t      vecMatA0, vecMatA1, vecMatA2, vecIn;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 8;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 8;
+            vecMatA2 = vld1q(pMat2Vec);
+            pMat2Vec += 8;
+            vecIn = vld1q(pVec);
+            pVec += 8;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+
+            blkCnt--;
+        }
+        /*
+         * tail: 1..7 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+
+            vecMatA0 = vldrhq_z_s16(pMat0Vec, p0);
+            vecMatA1 = vldrhq_z_s16(pMat1Vec, p0);
+            vecMatA2 = vldrhq_z_s16(pMat2Vec, p0);
+            vecIn = vldrhq_z_s16(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+        }
+
+        *px++ = MVE_ASRL_SAT16(acc0, 15);
+        *px++ = MVE_ASRL_SAT16(acc1, 15);
+        *px++ = MVE_ASRL_SAT16(acc2, 15);
+
+        pMatSrc += numCols * 3;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 3;
+    }
+
+    /*
+     * process any remaining rows pair
+     */
+    if (row >= 2)
+    {
+        q15_t const *pMat0Vec = pMatSrc;
+        q15_t const *pMat1Vec = pMat0Vec + numCols;
+        q15_t const *pVec = pSrcVec;
+        q63_t        acc0 = 0LL;
+        q63_t        acc1 = 0LL;
+        q15x8_t      vecMatA0, vecMatA1, vecIn;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 8;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 8;
+            vecIn = vld1q(pVec);
+            pVec += 8;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+
+            blkCnt--;
+        }
+
+        /*
+         * tail: 1..7 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+
+            vecMatA0 = vldrhq_z_s16(pMat0Vec, p0);
+            vecMatA1 = vldrhq_z_s16(pMat1Vec, p0);
+            vecIn = vldrhq_z_s16(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+        }
+
+        *px++ = MVE_ASRL_SAT16(acc0, 15);
+        *px++ = MVE_ASRL_SAT16(acc1, 15);
+
+        pMatSrc += numCols * 2;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 2;
+    }
+
+    /*
+     * last single row, if any
+     */
+    if (row >= 1)
+    {
+        q15_t const *pMat0Vec = pMatSrc;
+        q15_t const *pVec = pSrcVec;
+        q63_t        acc0 = 0LL;
+        q15x8_t      vecMatA0, vecIn;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 8;
+            vecIn = vld1q(pVec);
+            pVec += 8;
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            blkCnt--;
+        }
+        /*
+         * tail: 1..7 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp16q(blkCnt);
+
+            vecMatA0 = vldrhq_z_s16(pMat0Vec, p0);
+            vecIn = vldrhq_z_s16(pVec, p0);
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+        }
+        *px++ = MVE_ASRL_SAT16(acc0, 15);
+    }
+}
+
+#else
+/*
+ * Scalar/DSP-extension Q15 matrix-vector product.
+ *
+ * pSrcMat  row-major Q15 matrix (numRows x numCols).
+ * pVec     input vector, numCols elements are read.
+ * pDst     output vector, numRows elements are written.
+ *
+ * Each row is accumulated in a 64-bit accumulator, shifted right by 15 and
+ * saturated to 16 bits with __SSAT. Rows are processed four at a time with
+ * two columns per iteration (packed reads + __SMLALD); the remaining 0..3
+ * rows are processed one by one with four columns per iteration.
+ */
+void arm_mat_vec_mult_q15(const arm_matrix_instance_q15 *pSrcMat, const q15_t *pVec, q15_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const q15_t *pSrcA = pSrcMat->pData;
+    const q15_t *pInA1;      /* read pointer into matrix row 1 of the group */
+    const q15_t *pInA2;      /* read pointer into matrix row 2 of the group */
+    const q15_t *pInA3;      /* read pointer into matrix row 3 of the group */
+    const q15_t *pInA4;      /* read pointer into matrix row 4 of the group */
+    const q15_t *pInVec;     /* read pointer into the input vector */
+    q15_t *px;               /* write pointer into the output vector */
+    uint32_t i;              /* element offset of the current row in pSrcA;
+                                32-bit because numRows * numCols can exceed 65535 */
+    uint32_t row, colCnt;    /* loop counters */
+    q31_t matData, matData2, vecData, vecData2;
+
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every group of rows, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        q63_t sum1 = 0;
+        q63_t sum2 = 0;
+        q63_t sum3 = 0;
+        q63_t sum4 = 0;
+
+        /* Loop unrolling: process 2 columns per iteration */
+        colCnt = numCols >> 1;
+
+        /* Initialize pointers to the start of the four rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+        /* Main loop: matrix-vector multiplication */
+        while (colCnt > 0u) {
+            /* Read 2 values from vector */
+            vecData = read_q15x2_ia ((q15_t **) &pInVec);
+
+            /* Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate */
+            matData =  read_q15x2_ia ((q15_t **) &pInA1);
+            sum1 = __SMLALD(matData, vecData, sum1);
+            matData = read_q15x2_ia ((q15_t **) &pInA2);
+            sum2 = __SMLALD(matData, vecData, sum2);
+            matData = read_q15x2_ia ((q15_t **) &pInA3);
+            sum3 = __SMLALD(matData, vecData, sum3);
+            matData = read_q15x2_ia ((q15_t **) &pInA4);
+            sum4 = __SMLALD(matData, vecData, sum4);
+
+            /* Decrement the loop counter */
+            colCnt--;
+        }
+
+        /* process the last column when numCols is odd */
+        if (numCols & 1u) {
+            vecData = *pInVec++;
+            sum1 += (q63_t)*pInA1++ * vecData;
+            sum2 += (q63_t)*pInA2++ * vecData;
+            sum3 += (q63_t)*pInA3++ * vecData;
+            sum4 += (q63_t)*pInA4++ * vecData;
+        }
+
+        /* Saturate and store the result in the destination buffer */
+        *px++ = (q15_t)(__SSAT((sum1 >> 15), 16));
+        *px++ = (q15_t)(__SSAT((sum2 >> 15), 16));
+        *px++ = (q15_t)(__SSAT((sum3 >> 15), 16));
+        *px++ = (q15_t)(__SSAT((sum4 >> 15), 16));
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        q63_t sum = 0;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        /* loop unrolling - process 4 elements at a time */
+        colCnt = numCols >> 2;
+
+        while (colCnt > 0) {
+            vecData = read_q15x2_ia ((q15_t **) &pInVec);
+            vecData2 = read_q15x2_ia ((q15_t **) &pInVec);
+            matData = read_q15x2_ia ((q15_t **) &pInA1);
+            matData2 = read_q15x2_ia ((q15_t **) &pInA1);
+            /* 64-bit accumulate (__SMLALD): the 32-bit __SMLAD would truncate
+             * the q63_t accumulator and drop its sign */
+            sum = __SMLALD(matData, vecData, sum);
+            sum = __SMLALD(matData2, vecData2, sum);
+            colCnt--;
+        }
+
+        /* process remainder of row */
+        colCnt = numCols & 3u;
+        while (colCnt > 0) {
+            sum += (q63_t)*pInA1++ * *pInVec++;
+            colCnt--;
+        }
+        *px++ = (q15_t)(__SSAT((sum >> 15), 16));
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixMult group
+ */
--- a/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
@ -0,0 +1,374 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_q31.c
+ * Description:  Q31 matrix and vector multiplication
+ *
+ * $Date:        07. July 2020
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec points to the input vector
+ * @param[out]      *pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEI)
+/*
+ * Helium (MVE) Q31 matrix-vector product.
+ *
+ * pSrcMat  row-major Q31 matrix (numRows x numCols).
+ * pSrcVec  input vector, numCols elements are read.
+ * pDstVec  output vector, numRows elements are written.
+ *
+ * Each row is accumulated in a 64-bit accumulator with vmlaldavaq; the
+ * result of asrl(acc, 31) is then stored to the q31_t output (no explicit
+ * saturation is applied here). Rows are processed in groups of 3, then an
+ * optional pair, then an optional single row. Columns are consumed 4 at a
+ * time; the remaining 1..3 columns use predicated loads for both the vector
+ * and the matrix rows so nothing past the end of a row is read.
+ */
+void arm_mat_vec_mult_q31(
+    const arm_matrix_instance_q31 * pSrcMat,
+    const q31_t     *pSrcVec,
+    q31_t           *pDstVec)
+{
+    const q31_t *pMatSrc = pSrcMat->pData;     /* first row not yet processed */
+    uint32_t     numRows = pSrcMat->numRows;
+    uint32_t     numCols = pSrcMat->numCols;
+    q31_t       *px = pDstVec;                 /* next output element */
+    int32_t      row = numRows;                /* rows still to process */
+    uint16_t     blkCnt;                       /* loop counter */
+
+    /*
+     * compute 3x64-bit accumulators per loop
+     */
+    while (row >= 3)
+    {
+        /* One read pointer per row, plus one for the input vector */
+        q31_t const *pMat0Vec = pMatSrc;
+        q31_t const *pMat1Vec = pMat0Vec + numCols;
+        q31_t const *pMat2Vec = pMat1Vec + numCols;
+        q31_t const *pVec = pSrcVec;
+        q63_t        acc0 = 0LL;
+        q63_t        acc1 = 0LL;
+        q63_t        acc2 = 0LL;
+        q31x4_t      vecMatA0, vecMatA1, vecMatA2, vecIn;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 4;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 4;
+            vecMatA2 = vld1q(pMat2Vec);
+            pMat2Vec += 4;
+            vecIn = vld1q(pVec);
+            pVec += 4;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+
+            blkCnt--;
+        }
+        /*
+         * tail: 1..3 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecMatA0 = vldrwq_z_s32(pMat0Vec, p0);
+            vecMatA1 = vldrwq_z_s32(pMat1Vec, p0);
+            vecMatA2 = vldrwq_z_s32(pMat2Vec, p0);
+            vecIn = vldrwq_z_s32(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+        }
+
+        *px++ = asrl(acc0, 31);
+        *px++ = asrl(acc1, 31);
+        *px++ = asrl(acc2, 31);
+
+        pMatSrc += numCols * 3;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 3;
+    }
+
+    /*
+     * process any remaining rows pair
+     */
+    if (row >= 2)
+    {
+        q31_t const *pMat0Vec = pMatSrc;
+        q31_t const *pMat1Vec = pMat0Vec + numCols;
+        q31_t const *pVec = pSrcVec;
+        q63_t        acc0 = 0LL;
+        q63_t        acc1 = 0LL;
+        q31x4_t      vecMatA0, vecMatA1, vecIn;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 4;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 4;
+            vecIn = vld1q(pVec);
+            pVec += 4;
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+
+            blkCnt--;
+        }
+
+        /*
+         * tail: 1..3 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecMatA0 = vldrwq_z_s32(pMat0Vec, p0);
+            vecMatA1 = vldrwq_z_s32(pMat1Vec, p0);
+            vecIn = vldrwq_z_s32(pVec, p0);
+
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+        }
+
+        *px++ = asrl(acc0, 31);
+        *px++ = asrl(acc1, 31);
+
+        pMatSrc += numCols * 2;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 2;
+    }
+
+    /*
+     * last single row, if any
+     */
+    if (row >= 1)
+    {
+        q31_t const *pMat0Vec = pMatSrc;
+        q31_t const *pVec = pSrcVec;
+        q63_t        acc0 = 0LL;
+        q31x4_t      vecMatA0, vecIn;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 4;
+            vecIn = vld1q(pVec);
+            pVec += 4;
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+            blkCnt--;
+        }
+        /*
+         * tail: 1..3 remaining columns, every load is predicated
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp32q(blkCnt);
+
+            vecMatA0 = vldrwq_z_s32(pMat0Vec, p0);
+            vecIn = vldrwq_z_s32(pVec, p0);
+            acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+        }
+
+        *px++ = asrl(acc0, 31);
+    }
+}
+#else
+/*
+ * Scalar Q31 matrix-vector product.
+ *
+ * pSrcMat  row-major Q31 matrix (numRows x numCols).
+ * pVec     input vector, numCols elements are read.
+ * pDst     output vector, numRows elements are written.
+ *
+ * Each row is accumulated in a 64-bit accumulator, shifted right by 31 and
+ * cast to q31_t (no saturation is applied). Rows are processed four at a
+ * time so each vector element is loaded once for four multiply-accumulates;
+ * the remaining 0..3 rows are processed one by one with the column loop
+ * unrolled by two.
+ */
+void arm_mat_vec_mult_q31(const arm_matrix_instance_q31 *pSrcMat, const q31_t *pVec, q31_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const q31_t *pSrcA = pSrcMat->pData;
+    const q31_t *pInA1;      /* read pointer into matrix row 1 of the group */
+    const q31_t *pInA2;      /* read pointer into matrix row 2 of the group */
+    const q31_t *pInA3;      /* read pointer into matrix row 3 of the group */
+    const q31_t *pInA4;      /* read pointer into matrix row 4 of the group */
+    const q31_t *pInVec;     /* read pointer into the input vector */
+    q31_t *px;               /* write pointer into the output vector */
+    uint32_t i;              /* element offset of the current row in pSrcA;
+                                32-bit because numRows * numCols can exceed 65535 */
+    uint32_t row, colCnt;    /* loop counters */
+    q31_t matData, matData2, vecData, vecData2;
+
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every group of rows, the pInVec pointer is set
+         ** to the starting address of the vector */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        q63_t sum1 = 0;
+        q63_t sum2 = 0;
+        q63_t sum3 = 0;
+        q63_t sum4 = 0;
+
+        /* One column per iteration */
+        colCnt = numCols;
+
+        /* Initialize pointers to the start of the four rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+
+        /* Main loop: matrix-vector multiplication */
+        while (colCnt > 0u) {
+            /* Read one value from the vector */
+            vecData = *(pInVec)++;
+
+            /* Read one value from each of the 4 rows and multiply-accumulate */
+            matData = *(pInA1)++;
+            sum1 += (q63_t)matData * vecData;
+            matData = *(pInA2)++;
+            sum2 += (q63_t)matData * vecData;
+            matData = *(pInA3)++;
+            sum3 += (q63_t)matData * vecData;
+            matData = *(pInA4)++;
+            sum4 += (q63_t)matData * vecData;
+
+            /* Decrement the loop counter */
+            colCnt--;
+        }
+
+        /* Scale back to Q31 and store the result in the destination buffer */
+        *px++ = (q31_t)(sum1 >> 31);
+        *px++ = (q31_t)(sum2 >> 31);
+        *px++ = (q31_t)(sum3 >> 31);
+        *px++ = (q31_t)(sum4 >> 31);
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        q63_t sum = 0;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        /* Loop unrolling: process 2 columns per iteration */
+        colCnt = numCols >> 1;
+
+        while (colCnt > 0) {
+            vecData = *(pInVec)++;
+            vecData2 = *(pInVec)++;
+            matData = *(pInA1)++;
+            matData2 = *(pInA1)++;
+            sum += (q63_t)matData * vecData;
+            sum += (q63_t)matData2 * vecData2;
+            colCnt--;
+        }
+
+        /* process remainder of row */
+        colCnt = numCols & 1u;
+        while (colCnt > 0) {
+            sum += (q63_t)*pInA1++ * *pInVec++;
+            colCnt--;
+        }
+
+        *px++ = (q31_t)(sum >> 31);
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixMult group
+ */
--- a/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
@ -0,0 +1,413 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mat_vec_mult_q7.c
+ * Description:  Q7 matrix and vector multiplication
+ *
+ * $Date:        07. July 2020
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Q7 matrix and vector multiplication.
+ * @param[in]       *pSrcMat points to the input matrix structure
+ * @param[in]       *pVec    points to the input vector
+ * @param[out]      *pDst    points to the output vector
+ */
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium (MVE) implementation of the Q7 matrix-by-vector product.
+ *
+ * Each output element is the dot product of one matrix row with the input
+ * vector: products are accumulated in a 32-bit accumulator, shifted right
+ * by 7 and saturated to 8 bits. Rows are consumed four at a time, then as
+ * a pair, then as a single row, so that every load of the input vector
+ * feeds as many accumulators as possible.
+ *
+ * pSrcMat : input matrix instance; rows are numCols elements apart
+ * pSrcVec : input vector, numCols elements are read
+ * pDstVec : output vector, numRows elements are written
+ */
+void arm_mat_vec_mult_q7(
+    const arm_matrix_instance_q7 * pSrcMat,
+    const q7_t     *pSrcVec,
+    q7_t           *pDstVec)
+{
+    const q7_t *pMatSrc = pSrcMat->pData;   /* start of the next unprocessed row */
+    uint32_t    numRows = pSrcMat->numRows;
+    uint32_t    numCols = pSrcMat->numCols;
+    q7_t       *px = pDstVec;               /* output write pointer */
+    int32_t     row = numRows;              /* rows still to process */
+    uint16_t    blkCnt;                     /* loop counter */
+
+    /*
+     * Four rows per iteration, one 32-bit accumulator per row
+     */
+    while (row >= 4)
+    {
+        const q7_t  *pMat0Vec = pMatSrc;
+        const q7_t  *pMat1Vec = pMat0Vec + numCols;
+        const q7_t  *pMat2Vec = pMat1Vec + numCols;
+        const q7_t  *pMat3Vec = pMat2Vec + numCols;
+        const q7_t  *pVec = pSrcVec;        /* restart at the head of the vector */
+        q31_t        acc0 = 0L;
+        q31_t        acc1 = 0L;
+        q31_t        acc2 = 0L;
+        q31_t        acc3 = 0L;
+        q7x16_t      vecMatA0, vecMatA1, vecMatA2, vecMatA3, vecIn;
+
+        /* full blocks of 16 columns */
+        blkCnt = numCols >> 4;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 16;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 16;
+            vecMatA2 = vld1q(pMat2Vec);
+            pMat2Vec += 16;
+            vecMatA3 = vld1q(pMat3Vec);
+            pMat3Vec += 16;
+            vecIn = vld1q(pVec);
+            pVec += 16;
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmladavaq(acc2, vecIn, vecMatA2);
+            acc3 = vmladavaq(acc3, vecIn, vecMatA3);
+
+            blkCnt--;
+        }
+
+        /*
+         * Tail: 1..15 remaining columns.
+         * Every load is predicated on the remaining column count so that
+         * lanes past the end of the row are zero-filled rather than fetched.
+         * An unpredicated vld1q of the matrix here would read up to 15 bytes
+         * beyond the end of the matrix buffer for its last rows.
+         */
+        blkCnt = numCols & 0xF;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp8q(blkCnt);
+
+            vecMatA0 = vldrbq_z_s8(pMat0Vec, p0);
+            vecMatA1 = vldrbq_z_s8(pMat1Vec, p0);
+            vecMatA2 = vldrbq_z_s8(pMat2Vec, p0);
+            vecMatA3 = vldrbq_z_s8(pMat3Vec, p0);
+            vecIn = vldrbq_z_s8(pVec, p0);
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+            acc2 = vmladavaq(acc2, vecIn, vecMatA2);
+            acc3 = vmladavaq(acc3, vecIn, vecMatA3);
+        }
+
+        /* scale the accumulators back down and saturate to 8 bits */
+        *px++ = __SSAT(acc0 >> 7, 8);
+        *px++ = __SSAT(acc1 >> 7, 8);
+        *px++ = __SSAT(acc2 >> 7, 8);
+        *px++ = __SSAT(acc3 >> 7, 8);
+
+        pMatSrc += numCols * 4;
+        row -= 4;
+    }
+
+    /*
+     * Remaining pair of rows, if any
+     */
+    if (row >= 2)
+    {
+        const q7_t  *pMat0Vec = pMatSrc;
+        const q7_t  *pMat1Vec = pMat0Vec + numCols;
+        const q7_t  *pVec = pSrcVec;        /* restart at the head of the vector */
+        q31_t        acc0 = 0;
+        q31_t        acc1 = 0;
+        q7x16_t      vecMatA0, vecMatA1, vecIn;
+
+        /* full blocks of 16 columns */
+        blkCnt = numCols >> 4;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 16;
+            vecMatA1 = vld1q(pMat1Vec);
+            pMat1Vec += 16;
+            vecIn = vld1q(pVec);
+            pVec += 16;
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+
+            blkCnt--;
+        }
+
+        /* tail: predicated loads keep every access inside the row */
+        blkCnt = numCols & 0xF;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp8q(blkCnt);
+
+            vecMatA0 = vldrbq_z_s8(pMat0Vec, p0);
+            vecMatA1 = vldrbq_z_s8(pMat1Vec, p0);
+            vecIn = vldrbq_z_s8(pVec, p0);
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+        }
+
+        *px++ = __SSAT(acc0 >> 7, 8);
+        *px++ = __SSAT(acc1 >> 7, 8);
+
+        pMatSrc += numCols * 2;
+        row -= 2;
+    }
+
+    /*
+     * Last single row, if any
+     */
+    if (row >= 1)
+    {
+        const q7_t  *pMat0Vec = pMatSrc;
+        const q7_t  *pVec = pSrcVec;        /* restart at the head of the vector */
+        q31_t        acc0 = 0;
+        q7x16_t      vecMatA0, vecIn;
+
+        /* full blocks of 16 columns */
+        blkCnt = numCols >> 4;
+        while (blkCnt > 0U)
+        {
+            vecMatA0 = vld1q(pMat0Vec);
+            pMat0Vec += 16;
+            vecIn = vld1q(pVec);
+            pVec += 16;
+
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+            blkCnt--;
+        }
+
+        /* tail: predicated loads keep every access inside the row */
+        blkCnt = numCols & 0xF;
+        if (blkCnt > 0U)
+        {
+            mve_pred16_t p0 = vctp8q(blkCnt);
+
+            vecMatA0 = vldrbq_z_s8(pMat0Vec, p0);
+            vecIn = vldrbq_z_s8(pVec, p0);
+            acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+        }
+        *px++ = __SSAT(acc0 >> 7, 8);
+    }
+}
+
+#else
+/*
+ * Scalar / DSP-extension implementation of the Q7 matrix-by-vector product.
+ *
+ * Every output element is the dot product of one matrix row with pVec,
+ * accumulated in 32 bits, shifted right by 7 and saturated to 8 bits.
+ * Rows are processed in groups of four first, then one by one; within a
+ * row, columns are processed four at a time with a scalar loop for the
+ * leftover columns.
+ */
+void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec, q7_t *pDst)
+{
+    const uint32_t nRows  = pSrcMat->numRows;
+    const uint32_t nCols  = pSrcMat->numCols;
+    const uint32_t nQuads = nCols >> 2;      /* number of 4-column groups per row */
+    const uint32_t nTail  = nCols & 3u;      /* columns left after the groups     */
+    const q7_t *pRowBase = pSrcMat->pData;   /* first element of the next row     */
+    const q7_t *pR0;                         /* read pointers into up to 4 rows   */
+    const q7_t *pR1;
+    const q7_t *pR2;
+    const q7_t *pR3;
+    const q7_t *pV;                          /* read pointer into the vector      */
+    q7_t *pOut = pDst;                       /* write pointer into the result     */
+    uint32_t rowsLeft, k;                    /* loop counters                     */
+
+    q31_t m02, m13, v02, v13;                /* sign-extended byte pairs          */
+    q31_t word;                              /* packed 4xQ7 word / scalar operand */
+
+    /* ---- groups of four rows ---- */
+    for (rowsLeft = nRows >> 2; rowsLeft > 0; rowsLeft--) {
+        q31_t acc0 = 0;
+        q31_t acc1 = 0;
+        q31_t acc2 = 0;
+        q31_t acc3 = 0;
+
+        /* the vector is re-read from its start for each group of rows */
+        pV = pVec;
+
+        /* one read pointer per row of the group */
+        pR0 = pRowBase;
+        pR1 = pR0 + nCols;
+        pR2 = pR1 + nCols;
+        pR3 = pR2 + nCols;
+
+        /* four columns per pass: unpack to 16-bit pairs, dual multiply-accumulate */
+        for (k = nQuads; k > 0u; k--) {
+            /* four vector values, split into bytes (1,3) and bytes (0,2) */
+            word = read_q7x4_ia ((q7_t **) &pV);
+            v13 = __SXTB16(__ROR(word, 8));
+            v02 = __SXTB16(word);
+
+            /* row 0 */
+            word = read_q7x4_ia ((q7_t **) &pR0);
+            m13 = __SXTB16(__ROR(word, 8));
+            m02 = __SXTB16(word);
+            acc0 = __SMLAD(m02, v02, acc0);
+            acc0 = __SMLAD(m13, v13, acc0);
+
+            /* row 1 */
+            word = read_q7x4_ia ((q7_t **) &pR1);
+            m13 = __SXTB16(__ROR(word, 8));
+            m02 = __SXTB16(word);
+            acc1 = __SMLAD(m02, v02, acc1);
+            acc1 = __SMLAD(m13, v13, acc1);
+
+            /* row 2 */
+            word = read_q7x4_ia ((q7_t **) &pR2);
+            m13 = __SXTB16(__ROR(word, 8));
+            m02 = __SXTB16(word);
+            acc2 = __SMLAD(m02, v02, acc2);
+            acc2 = __SMLAD(m13, v13, acc2);
+
+            /* row 3 */
+            word = read_q7x4_ia ((q7_t **) &pR3);
+            m13 = __SXTB16(__ROR(word, 8));
+            m02 = __SXTB16(word);
+            acc3 = __SMLAD(m02, v02, acc3);
+            acc3 = __SMLAD(m13, v13, acc3);
+        }
+
+        /* leftover columns, one at a time */
+        for (k = nTail; k > 0; k--) {
+            word = *pV++;
+            acc0 += *pR0++ * word;
+            acc1 += *pR1++ * word;
+            acc2 += *pR2++ * word;
+            acc3 += *pR3++ * word;
+        }
+
+        /* scale back, saturate to 8 bits and store the four results */
+        *pOut++ = (q7_t)(__SSAT((acc0 >> 7), 8));
+        *pOut++ = (q7_t)(__SSAT((acc1 >> 7), 8));
+        *pOut++ = (q7_t)(__SSAT((acc2 >> 7), 8));
+        *pOut++ = (q7_t)(__SSAT((acc3 >> 7), 8));
+
+        /* step over the four rows just consumed */
+        pRowBase += nCols * 4;
+    }
+
+    /* ---- up to three leftover rows, one at a time ---- */
+    for (rowsLeft = nRows & 3u; rowsLeft > 0; rowsLeft--) {
+        q31_t acc = 0;
+
+        pV = pVec;
+        pR0 = pRowBase;
+
+        /* four columns per pass */
+        for (k = nQuads; k > 0; k--) {
+            word = read_q7x4_ia ((q7_t **) &pV);
+            v13 = __SXTB16(__ROR(word, 8));
+            v02 = __SXTB16(word);
+            word = read_q7x4_ia ((q7_t **) &pR0);
+            m13 = __SXTB16(__ROR(word, 8));
+            m02 = __SXTB16(word);
+            acc = __SMLAD(m02, v02, acc);
+            acc = __SMLAD(m13, v13, acc);
+        }
+
+        /* leftover columns of this row */
+        for (k = nTail; k > 0; k--) {
+            acc += *pR0++ * *pV++;
+        }
+
+        *pOut++ = (q7_t)(__SSAT((acc >> 7), 8));
+        pRowBase += nCols;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixMult group
+ */
--- a/Testing/CMakeLists.txt
+++ b/Testing/CMakeLists.txt
@ -2,6 +2,7 @@ cmake_minimum_required (VERSION 3.14)
 cmake_policy(SET CMP0077 NEW)
 include(CMakePrintHelpers)

+
 # The tests are assuming that MATRIX_CHECK is enabled when building
 # CMSIS-DSP.

@ -170,9 +171,11 @@ set (NNSRC
   Source/Benchmarks/UnaryF64.cpp
   Source/Benchmarks/UnaryQ31.cpp
   Source/Benchmarks/UnaryQ15.cpp
+   Source/Benchmarks/UnaryQ7.cpp
   Source/Benchmarks/BinaryF32.cpp
   Source/Benchmarks/BinaryQ31.cpp
   Source/Benchmarks/BinaryQ15.cpp
+   Source/Benchmarks/BinaryQ7.cpp
   Source/Benchmarks/TransformF32.cpp
   Source/Benchmarks/TransformQ31.cpp
   Source/Benchmarks/TransformQ15.cpp
@ -197,47 +200,32 @@ if (EXTERNAL)
 endif()

 if (STANDARDTEST)
-set(TESTSRC 
-  Source/Tests/BasicTestsF32.cpp
+
+if (BASICMATH)
+  set(BASICMATHSRC Source/Tests/BasicTestsF32.cpp
  Source/Tests/BasicTestsQ31.cpp
  Source/Tests/BasicTestsQ15.cpp
-  Source/Tests/BasicTestsQ7.cpp
-  Source/Tests/ComplexTestsF32.cpp
+  Source/Tests/BasicTestsQ7.cpp)
+endif()
+
+if (COMPLEXMATH)
+  set(COMPLEXMATHSRC Source/Tests/ComplexTestsF32.cpp
  Source/Tests/ComplexTestsQ31.cpp
-  Source/Tests/ComplexTestsQ15.cpp
-  Source/Tests/SVMF32.cpp
-  Source/Tests/BayesF32.cpp
-  Source/Tests/TransformCF64.cpp
-  Source/Tests/TransformCF32.cpp
-  Source/Tests/TransformRF64.cpp
-  Source/Tests/TransformRF32.cpp
-  Source/Tests/TransformCQ31.cpp
-  Source/Tests/TransformRQ31.cpp
-  Source/Tests/TransformCQ15.cpp
-  Source/Tests/TransformRQ15.cpp
-  Source/Tests/StatsTestsF32.cpp
-  Source/Tests/StatsTestsF64.cpp
-  Source/Tests/StatsTestsQ31.cpp
-  Source/Tests/StatsTestsQ15.cpp
-  Source/Tests/StatsTestsQ7.cpp
-  Source/Tests/FastMathF32.cpp
+  Source/Tests/ComplexTestsQ15.cpp)
+endif()
+
+if (CONTROLLER)
+  set(CONTROLLERSRC )
+endif()
+
+if (FASTMATH)
+  set(FASTMATHSRC Source/Tests/FastMathF32.cpp
  Source/Tests/FastMathQ31.cpp
-  Source/Tests/FastMathQ15.cpp
-  Source/Tests/SupportTestsF32.cpp
-  Source/Tests/SupportTestsQ31.cpp
-  Source/Tests/SupportTestsQ15.cpp
-  Source/Tests/SupportTestsQ7.cpp
-  Source/Tests/SupportBarTestsF32.cpp
-  Source/Tests/DistanceTestsF32.cpp
-  Source/Tests/DistanceTestsU32.cpp
-  Source/Tests/UnaryTestsQ31.cpp
-  Source/Tests/UnaryTestsQ15.cpp
-  Source/Tests/UnaryTestsF32.cpp
-  Source/Tests/UnaryTestsF64.cpp
-  Source/Tests/BinaryTestsF32.cpp
-  Source/Tests/BinaryTestsQ31.cpp
-  Source/Tests/BinaryTestsQ15.cpp
-  Source/Tests/DECIMF32.cpp
+  Source/Tests/FastMathQ15.cpp)
+endif()
+
+if (FILTERING)
+  set(FILTERINGSRC Source/Tests/DECIMF32.cpp
  Source/Tests/DECIMQ31.cpp
  Source/Tests/DECIMQ15.cpp
  Source/Tests/MISCF32.cpp
@ -251,15 +239,82 @@ set(TESTSRC
  Source/Tests/BIQUADF64.cpp
  Source/Tests/BIQUADF32.cpp
  Source/Tests/BIQUADQ31.cpp
-  Source/Tests/BIQUADQ15.cpp
-  Source/Tests/InterpolationTestsF32.cpp
-  Source/Tests/InterpolationTestsQ31.cpp
-  Source/Tests/InterpolationTestsQ15.cpp
-  Source/Tests/InterpolationTestsQ7.cpp
-  Source/Tests/ExampleCategoryF32.cpp
-  Source/Tests/ExampleCategoryQ31.cpp
-  Source/Tests/ExampleCategoryQ15.cpp
-  Source/Tests/ExampleCategoryQ7.cpp
+  Source/Tests/BIQUADQ15.cpp)
+endif()
+
+if (MATRIX)
+  set(MATRIXSRC  Source/Tests/UnaryTestsQ31.cpp
+  Source/Tests/UnaryTestsQ15.cpp
+  Source/Tests/UnaryTestsQ7.cpp
+  Source/Tests/UnaryTestsF32.cpp
+  Source/Tests/UnaryTestsF64.cpp
+  Source/Tests/BinaryTestsF32.cpp
+  Source/Tests/BinaryTestsQ31.cpp
+  Source/Tests/BinaryTestsQ15.cpp
+  Source/Tests/BinaryTestsQ7.cpp)
+endif()
+
+if (STATISTICS)
+  set(STATISTICSSRC Source/Tests/StatsTestsF32.cpp
+  Source/Tests/StatsTestsF64.cpp
+  Source/Tests/StatsTestsQ31.cpp
+  Source/Tests/StatsTestsQ15.cpp
+  Source/Tests/StatsTestsQ7.cpp)
+endif()
+
+if (SUPPORT)
+  set(SUPPORTSRC Source/Tests/SupportTestsF32.cpp
+  Source/Tests/SupportTestsQ31.cpp
+  Source/Tests/SupportTestsQ15.cpp
+  Source/Tests/SupportTestsQ7.cpp
+  Source/Tests/SupportBarTestsF32.cpp)
+endif()
+
+if (TRANSFORM)
+  set(TRANSFORMSRC Source/Tests/TransformCF64.cpp
+  Source/Tests/TransformCF32.cpp
+  Source/Tests/TransformRF64.cpp
+  Source/Tests/TransformRF32.cpp
+  Source/Tests/TransformCQ31.cpp
+  Source/Tests/TransformRQ31.cpp
+  Source/Tests/TransformCQ15.cpp
+  Source/Tests/TransformRQ15.cpp)
+endif()
+
+if (SVM)
+  set(SVMSRC Source/Tests/SVMF32.cpp)
+endif()
+
+if (BAYES)
+  set(BAYESSRC Source/Tests/BayesF32.cpp)
+endif()
+
+if (DISTANCE)
+  set(DISTANCESRC Source/Tests/DistanceTestsF32.cpp
+  Source/Tests/DistanceTestsU32.cpp)
+endif()
+
+set(TESTSRC 
+     ${BASICMATHSRC}
+     ${COMPLEXMATHSRC}
+     ${CONTROLLERSRC}
+     ${FASTMATHSRC}
+     ${FILTERINGSRC}
+     ${MATRIXSRC}
+     ${STATISTICSSRC}
+     ${SUPPORTSRC}
+     ${TRANSFORMSRC}
+     ${SVMSRC}
+     ${BAYESSRC}
+     ${DISTANCESRC}
+     Source/Tests/InterpolationTestsF32.cpp
+     Source/Tests/InterpolationTestsQ31.cpp
+     Source/Tests/InterpolationTestsQ15.cpp
+     Source/Tests/InterpolationTestsQ7.cpp
+     Source/Tests/ExampleCategoryF32.cpp
+     Source/Tests/ExampleCategoryQ31.cpp
+     Source/Tests/ExampleCategoryQ15.cpp
+     Source/Tests/ExampleCategoryQ7.cpp
  )


--- a/Testing/Include/Benchmarks/BinaryQ7.h
+++ b/Testing/Include/Benchmarks/BinaryQ7.h
@ -0,0 +1,26 @@
+#include "Test.h"
+#include "Pattern.h"
+class BinaryQ7:public Client::Suite /* Q7 two-operand matrix suite; this header lives under Include/Benchmarks */
+    {
+        public:
+            BinaryQ7(Testing::testID_t id); /* built from the test identifier */
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr); /* receives test id, parameter list and pattern manager */
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr); /* receives test id and pattern manager */
+        private:
+            #include "BinaryQ7_decl.h" /* member declarations pulled in from a separate header (not visible here) */
+            Client::Pattern<q7_t> input1; /* first Q7 input pattern */
+            Client::Pattern<q7_t> input2; /* second Q7 input pattern */
+            Client::LocalPattern<q7_t> output; /* local Q7 buffer; presumably receives the result */
+            Client::LocalPattern<q7_t> state; /* local Q7 buffer; presumably scratch storage behind pState - TODO confirm */
+
+            int nbr; /* presumably number of rows - TODO confirm in BinaryQ7.cpp */
+            int nbi; /* presumably inner (shared) dimension - TODO confirm */
+            int nbc; /* presumably number of columns - TODO confirm */
+
+            arm_matrix_instance_q7 in1; /* Q7 matrix instance, first operand */
+            arm_matrix_instance_q7 in2; /* Q7 matrix instance, second operand */
+            arm_matrix_instance_q7 out; /* Q7 matrix instance, result */
+
+            q7_t *pState; /* raw Q7 pointer; presumably points into `state` - TODO confirm */
+            
+    };
--- a/Testing/Include/Benchmarks/UnaryF32.h
+++ b/Testing/Include/Benchmarks/UnaryF32.h
@ -9,11 +9,15 @@ class UnaryF32:public Client::Suite
        private:
            #include "UnaryF32_decl.h"
            Client::Pattern<float32_t> input1;
+            Client::Pattern<float32_t> vec;
+
            Client::LocalPattern<float32_t> output;

            int nbr;
            int nbc;

+            float32_t *vecp;
+            float32_t *outp;
            arm_matrix_instance_f32 in1;
            arm_matrix_instance_f32 out;
    };
--- a/Testing/Include/Benchmarks/UnaryQ15.h
+++ b/Testing/Include/Benchmarks/UnaryQ15.h
@ -9,11 +9,14 @@ class UnaryQ15:public Client::Suite
        private:
            #include "UnaryQ15_decl.h"
            Client::Pattern<q15_t> input1;
+            Client::Pattern<q15_t> vec;
            Client::LocalPattern<q15_t> output;

            int nbr;
            int nbc;

+            q15_t *vecp;
+            q15_t *outp;
            arm_matrix_instance_q15 in1;
            arm_matrix_instance_q15 out;
    };
--- a/Testing/Include/Benchmarks/UnaryQ31.h
+++ b/Testing/Include/Benchmarks/UnaryQ31.h
@ -9,11 +9,15 @@ class UnaryQ31:public Client::Suite
        private:
            #include "UnaryQ31_decl.h"
            Client::Pattern<q31_t> input1;
+            Client::Pattern<q31_t> vec;
+
            Client::LocalPattern<q31_t> output;

            int nbr;
            int nbc;

+            q31_t *vecp;
+            q31_t *outp;
            arm_matrix_instance_q31 in1;
            arm_matrix_instance_q31 out;
    };
--- a/Testing/Include/Benchmarks/UnaryQ7.h
+++ b/Testing/Include/Benchmarks/UnaryQ7.h
@ -0,0 +1,22 @@
+#include "Test.h"
+#include "Pattern.h"
+class UnaryQ7:public Client::Suite /* Q7 single-matrix suite; this header lives under Include/Benchmarks */
+    {
+        public:
+            UnaryQ7(Testing::testID_t id); /* built from the test identifier */
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr); /* receives test id, parameter list and pattern manager */
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr); /* receives test id and pattern manager */
+        private:
+            #include "UnaryQ7_decl.h" /* member declarations pulled in from a separate header (not visible here) */
+            Client::Pattern<q7_t> input1; /* Q7 input pattern; presumably the matrix data */
+            Client::Pattern<q7_t> vec; /* Q7 input pattern; presumably the vector operand for matrix-vector tests */
+            Client::LocalPattern<q7_t> output; /* local Q7 buffer; presumably receives the result */
+
+            int nbr; /* presumably number of rows - TODO confirm in UnaryQ7.cpp */
+            int nbc; /* presumably number of columns - TODO confirm */
+
+            q7_t *vecp; /* raw Q7 pointer; presumably into `vec` - TODO confirm */
+            q7_t *outp; /* raw Q7 pointer; presumably into `output` - TODO confirm */
+            arm_matrix_instance_q7 in1; /* Q7 matrix instance, input */
+            arm_matrix_instance_q7 out; /* Q7 matrix instance, result */
+    };
--- a/Testing/Include/Tests/BinaryTestsQ7.h
+++ b/Testing/Include/Tests/BinaryTestsQ7.h
@ -0,0 +1,30 @@
+#include "Test.h"
+#include "Pattern.h"
+class BinaryTestsQ7:public Client::Suite /* Q7 two-operand matrix suite; this header lives under Include/Tests */
+    {
+        public:
+            BinaryTestsQ7(Testing::testID_t id); /* built from the test identifier */
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr); /* receives test id, parameter list and pattern manager */
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr); /* receives test id and pattern manager */
+        private:
+            #include "BinaryTestsQ7_decl.h" /* member declarations pulled in from a separate header (not visible here) */
+            Client::Pattern<q7_t> input1; /* first Q7 input pattern */
+            Client::Pattern<q7_t> input2; /* second Q7 input pattern */
+            Client::Pattern<q7_t> ref; /* Q7 pattern; presumably the expected (reference) output */
+            Client::Pattern<int16_t> dims; /* 16-bit pattern; presumably the matrix dimensions of each test case */
+            Client::LocalPattern<q7_t> output; /* local Q7 buffer; presumably receives the computed result */
+            Client::LocalPattern<q7_t> tmp; /* local Q7 buffer; presumably scratch storage - TODO confirm */
+
+            /* Local copies of inputs since matrix instance in CMSIS-DSP are not using
+               pointers to const.
+            */
+            Client::LocalPattern<q7_t> a; /* writable copy of an input */
+            Client::LocalPattern<q7_t> b; /* writable copy of an input */
+
+            int nbr; /* presumably number of rows - TODO confirm in BinaryTestsQ7.cpp */
+            int nbc; /* presumably number of columns - TODO confirm */
+
+            arm_matrix_instance_q7 in1; /* Q7 matrix instance, first operand */
+            arm_matrix_instance_q7 in2; /* Q7 matrix instance, second operand */
+            arm_matrix_instance_q7 out; /* Q7 matrix instance, result */
+    };
--- a/Testing/Include/Tests/UnaryTestsQ7.h
+++ b/Testing/Include/Tests/UnaryTestsQ7.h
@ -0,0 +1,29 @@
+#include "Test.h"
+#include "Pattern.h"
+class UnaryTestsQ7:public Client::Suite /* Q7 single-matrix suite; this header lives under Include/Tests */
+    {
+        public:
+            UnaryTestsQ7(Testing::testID_t id); /* built from the test identifier */
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr); /* receives test id, parameter list and pattern manager */
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr); /* receives test id and pattern manager */
+        private:
+            #include "UnaryTestsQ7_decl.h" /* member declarations pulled in from a separate header (not visible here) */
+            Client::Pattern<q7_t> input1; /* first Q7 input pattern */
+            Client::Pattern<q7_t> input2; /* second Q7 input pattern */
+            Client::Pattern<q7_t> ref; /* Q7 pattern; presumably the expected (reference) output */
+            Client::Pattern<int16_t> dims; /* 16-bit pattern; presumably the matrix dimensions of each test case */
+            Client::LocalPattern<q7_t> output; /* local Q7 buffer; presumably receives the computed result */
+
+            /* Local copies of inputs since matrix instance in CMSIS-DSP are not using
+               pointers to const.
+            */
+            Client::LocalPattern<q7_t> a; /* writable copy of an input */
+            Client::LocalPattern<q7_t> b; /* writable copy of an input */
+
+            int nbr; /* presumably number of rows - TODO confirm in UnaryTestsQ7.cpp */
+            int nbc; /* presumably number of columns - TODO confirm */
+
+            arm_matrix_instance_q7 in1; /* Q7 matrix instance, first operand */
+            arm_matrix_instance_q7 in2; /* Q7 matrix instance, second operand */
+            arm_matrix_instance_q7 out; /* Q7 matrix instance, result */
+    };
--- a/Testing/PatternGeneration/Matrix.py
+++ b/Testing/PatternGeneration/Matrix.py
@ -14,9 +14,9 @@ def cartesian(*somelists):
 # Those patterns are used for tests and benchmarks.
 # For tests, there is the need to add tests for saturation

-NBA = 40
-NBI = 40
-NBB = 40
+NBA = 47
+NBI = 47
+NBB = 47

 def randComplex(nb):
    data = np.random.randn(2*nb)
@ -45,6 +45,7 @@ def writeBinaryTests(config,format):
    config.writeInput(1, data1,"InputA")
    config.writeInput(1, data2,"InputB")

+
    config.writeInput(1, asReal(data1C),"InputAC")
    config.writeInput(1, asReal(data2C),"InputBC")

@ -72,6 +73,8 @@ def writeBinaryTests(config,format):
       vals = vals + r
    config.writeReference(1, vals,"RefMul")

+   
+
    vals=[] 
    for (a,b,c) in binarySizes:
       ma = np.copy(data1C[0:a*b]).reshape(a,b)
@ -592,15 +595,25 @@ def getInvertibleMatrix(d):
 def writeUnaryTests(config,format):
    # For benchmarks
    NBSAMPLES=NBA*NBB
+    NBVECSAMPLES = NBB

    data1=np.random.randn(NBSAMPLES)
    data1 = Tools.normalize(data1)
+    if format == Tools.Q7:
+       data1 = data1 / 4.0

    data2=np.random.randn(NBSAMPLES)
-    data2 = Tools.normalize(data2)
+    data2 = Tools.normalize(data2) 
+
+    vecdata=np.random.randn(NBVECSAMPLES)
+    vecdata = Tools.normalize(vecdata)
+    if format == Tools.Q7:
+       vecdata = vecdata / 4.0
+

    config.writeInput(1, data1,"InputA")
    config.writeInput(1, data2,"InputB")
+    config.writeInput(1, vecdata,"InputVec")

    # For tests
    NA=[1,2,3,4,Tools.loopnb(format,Tools.TAILONLY),
@ -625,6 +638,15 @@ def writeUnaryTests(config,format):
       vals = vals + r
    config.writeReference(1, vals,"RefAdd")

+    vals=[] 
+    for (a,b) in unarySizes:
+       ma = np.copy(data1[0:a*b]).reshape(a,b)
+       v = np.copy(vecdata[0:b])
+       r = ma.dot(v)
+       r = list(r.reshape(a))
+       vals = vals + r
+    config.writeReference(1, vals,"RefVecMul")
+
    vals = []
    for (a,b) in unarySizes:
       ma = np.copy(data1[0:a*b]).reshape(a,b)
@ -681,12 +703,14 @@ def generatePatterns():
    configBinaryf32=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"f32")
    configBinaryq31=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"q31")
    configBinaryq15=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"q15")
+    configBinaryq7=Tools.Config(PATTERNBINDIR,PARAMBINDIR,"q7")
+
    
    
-    
-    writeBinaryTests(configBinaryf32,0)
-    writeBinaryTests(configBinaryq31,31)
-    writeBinaryTests(configBinaryq15,15)
+    writeBinaryTests(configBinaryf32,Tools.F32)
+    writeBinaryTests(configBinaryq31,Tools.Q31)
+    writeBinaryTests(configBinaryq15,Tools.Q15)
+    writeBinaryTests(configBinaryq7,Tools.Q7)
    
    PATTERNUNDIR = os.path.join("Patterns","DSP","Matrix","Unary","Unary")
    PARAMUNDIR = os.path.join("Parameters","DSP","Matrix","Unary","Unary")
@ -695,12 +719,14 @@ def generatePatterns():
    configUnaryf32=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"f32")
    configUnaryq31=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"q31")
    configUnaryq15=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"q15")
+    configUnaryq7=Tools.Config(PATTERNUNDIR,PARAMUNDIR,"q7")
    
    
-    writeUnaryTests(configUnaryf64,0)
-    writeUnaryTests(configUnaryf32,0)
-    writeUnaryTests(configUnaryq31,31)
-    writeUnaryTests(configUnaryq15,15)
+    writeUnaryTests(configUnaryf64,Tools.F64)
+    writeUnaryTests(configUnaryf32,Tools.F32)
+    writeUnaryTests(configUnaryq31,Tools.Q31)
+    writeUnaryTests(configUnaryq15,Tools.Q15)
+    writeUnaryTests(configUnaryq7,Tools.Q7)

 if __name__ == '__main__':
  generatePatterns()
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/DimsBinary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/DimsBinary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputA1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputA1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputAC1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputAC1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputB1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputB1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputBC1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/InputBC1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/RefCmplxMul1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/RefCmplxMul1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/RefMul1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryF32/RefMul1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/DimsBinary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/DimsBinary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputA1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputA1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputAC1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputAC1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputB1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputB1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputBC1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/InputBC1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/RefCmplxMul1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/RefCmplxMul1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/RefMul1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ15/RefMul1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/DimsBinary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/DimsBinary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputA1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputA1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputAC1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputAC1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputB1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputB1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputBC1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/InputBC1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/RefCmplxMul1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/RefCmplxMul1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/RefMul1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ31/RefMul1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/DimsBinary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/DimsBinary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputA1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputA1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputAC1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputAC1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputB1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputB1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputBC1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/InputBC1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/RefCmplxMul1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/RefCmplxMul1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/RefMul1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Binary/BinaryQ7/RefMul1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/DimsInvert1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/DimsInvert1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/DimsUnary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/DimsUnary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputA1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputA1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputB1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputB1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputInvert1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputInvert1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputVec1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/InputVec1_f32.txt
@ -0,0 +1,96 @@
+W
+47
+// -0.107875
+0xbddced80
+// 0.110208
+0x3de1b4a6
+// -0.186117
+0xbe3e956f
+// -0.179879
+0xbe383222
+// 0.457523
+0x3eea4071
+// -0.389640
+0xbec77ef0
+// -0.087725
+0xbdb3a93f
+// 0.009642
+0x3c1df7e8
+// 0.121087
+0x3df7fc88
+// 0.360831
+0x3eb8bee3
+// 0.145014
+0x3e147ea9
+// 0.208232
+0x3e553ac6
+// -0.001926
+0xbafc741d
+// -0.337274
+0xbeacaf2b
+// -0.105656
+0xbdd861f2
+// -0.263624
+0xbe86f9b1
+// 0.421856
+0x3ed7fd79
+// -0.413779
+0xbed3dae3
+// -0.247986
+0xbe7defea
+// 0.118261
+0x3df232c4
+// -0.031828
+0xbd025e56
+// -0.161765
+0xbe25a5b9
+// -0.204676
+0xbe51967a
+// -0.329348
+0xbea8a05a
+// 0.046422
+0x3d3e24b5
+// 0.247452
+0x3e7d6425
+// 0.009540
+0x3c1c4e80
+// -0.439287
+0xbee0ea32
+// 1.000000
+0x3f800000
+// -0.240757
+0xbe7688fb
+// 0.016640
+0x3c884fe6
+// 0.403801
+0x3ecebf0a
+// -0.275932
+0xbe8d4700
+// 0.065854
+0x3d86dea4
+// -0.568089
+0xbf116e4d
+// 0.235354
+0x3e71009e
+// -0.677409
+0xbf2d6aaa
+// 0.575935
+0x3f137079
+// 0.579105
+0x3f14403c
+// -0.444585
+0xbee3a0b3
+// -0.440784
+0xbee1ae66
+// 0.411888
+0x3ed2e305
+// 0.182672
+0x3e3b0e58
+// 0.288879
+0x3e93e7ff
+// -0.138646
+0xbe0df928
+// 0.937707
+0x3f700d98
+// -0.342960
+0xbeaf9884
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefAdd1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefAdd1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefInvert1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefInvert1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefScale1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefScale1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefSub1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefSub1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefTranspose1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefTranspose1_f32.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefVecMul1_f32.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF32/RefVecMul1_f32.txt
@ -0,0 +1,450 @@
+W
+224
+// 0.057904
+0x3d6d2d31
+// 0.091881
+0x3dbc2bec
+// 0.035213
+0x3d103bef
+// 0.042564
+0x3d2e581c
+// 0.035213
+0x3d103bef
+// 0.016754
+0x3c893fc2
+// -0.061413
+0xbd7b8c04
+// 0.057904
+0x3d6d2d31
+// -0.033257
+0xbd08387a
+// 0.091881
+0x3dbc2bec
+// -0.037349
+0xbd18fae4
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.042564
+0x3d2e581c
+// -0.025281
+0xbccf198b
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.016754
+0x3c893fc2
+// -0.016257
+0xbc852dfa
+// -0.061413
+0xbd7b8c04
+// -0.091457
+0xbdbb4daf
+// 0.057904
+0x3d6d2d31
+// -0.033257
+0xbd08387a
+// -0.032845
+0xbd068842
+// 0.091881
+0x3dbc2bec
+// -0.037349
+0xbd18fae4
+// 0.009869
+0x3c21b3aa
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// 0.042564
+0x3d2e581c
+// -0.025281
+0xbccf198b
+// -0.052167
+0xbd55ad26
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// 0.016754
+0x3c893fc2
+// -0.016257
+0xbc852dfa
+// 0.127319
+0x3e025ffa
+// -0.061413
+0xbd7b8c04
+// -0.091457
+0xbdbb4daf
+// -0.054669
+0xbd5fecb2
+// 0.057904
+0x3d6d2d31
+// -0.033257
+0xbd08387a
+// -0.032845
+0xbd068842
+// 0.004409
+0x3b907548
+// 0.091881
+0x3dbc2bec
+// -0.037349
+0xbd18fae4
+// 0.009869
+0x3c21b3aa
+// 0.027031
+0x3cdd70e4
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// -0.040042
+0xbd2402b1
+// 0.042564
+0x3d2e581c
+// -0.025281
+0xbccf198b
+// -0.052167
+0xbd55ad26
+// 0.029801
+0x3cf4217e
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// -0.040042
+0xbd2402b1
+// 0.016754
+0x3c893fc2
+// -0.016257
+0xbc852dfa
+// 0.127319
+0x3e025ffa
+// 0.087774
+0x3db3c2bb
+// -0.061413
+0xbd7b8c04
+// -0.091457
+0xbdbb4daf
+// -0.054669
+0xbd5fecb2
+// -0.112814
+0xbde70ae0
+// 0.057904
+0x3d6d2d31
+// -0.033257
+0xbd08387a
+// -0.032845
+0xbd068842
+// 0.091881
+0x3dbc2bec
+// -0.037349
+0xbd18fae4
+// 0.009869
+0x3c21b3aa
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// 0.042564
+0x3d2e581c
+// -0.025281
+0xbccf198b
+// -0.052167
+0xbd55ad26
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// 0.016754
+0x3c893fc2
+// -0.016257
+0xbc852dfa
+// 0.127319
+0x3e025ffa
+// -0.061413
+0xbd7b8c04
+// -0.091457
+0xbdbb4daf
+// -0.054669
+0xbd5fecb2
+// 0.057904
+0x3d6d2d31
+// -0.033257
+0xbd08387a
+// -0.032845
+0xbd068842
+// 0.004409
+0x3b907548
+// -0.006772
+0xbbddeae3
+// -0.016290
+0xbc8571c0
+// 0.002672
+0x3b2f1595
+// -0.023844
+0xbcc354c3
+// 0.091881
+0x3dbc2bec
+// -0.037349
+0xbd18fae4
+// 0.009869
+0x3c21b3aa
+// 0.027031
+0x3cdd70e4
+// -0.014340
+0xbc6af1a0
+// 0.037672
+0x3d1a4e49
+// -0.002634
+0xbb2c9fc7
+// 0.019277
+0x3c9dea2a
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// -0.040042
+0xbd2402b1
+// 0.030201
+0x3cf76888
+// 0.068547
+0x3d8c6283
+// 0.047040
+0x3d40ace7
+// 0.050729
+0x3d4fc911
+// 0.042564
+0x3d2e581c
+// -0.025281
+0xbccf198b
+// -0.052167
+0xbd55ad26
+// 0.029801
+0x3cf4217e
+// -0.049061
+0xbd48f45e
+// -0.050509
+0xbd4ee2fa
+// 0.057598
+0x3d6bec2a
+// 0.130159
+0x3e054862
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// -0.040042
+0xbd2402b1
+// 0.030201
+0x3cf76888
+// 0.068547
+0x3d8c6283
+// 0.047040
+0x3d40ace7
+// 0.050729
+0x3d4fc911
+// 0.016754
+0x3c893fc2
+// -0.016257
+0xbc852dfa
+// 0.127319
+0x3e025ffa
+// 0.087774
+0x3db3c2bb
+// 0.027246
+0x3cdf327a
+// 0.087103
+0x3db262ff
+// -0.174586
+0xbe32c6b0
+// -0.127342
+0xbe026601
+// -0.061413
+0xbd7b8c04
+// -0.091457
+0xbdbb4daf
+// -0.054669
+0xbd5fecb2
+// -0.112814
+0xbde70ae0
+// 0.310927
+0x3e9f31df
+// 0.282749
+0x3e90c479
+// 0.283499
+0x3e9126ca
+// -0.036028
+0xbd1391c4
+// 0.057904
+0x3d6d2d31
+// -0.033257
+0xbd08387a
+// -0.032845
+0xbd068842
+// 0.004409
+0x3b907548
+// -0.006772
+0xbbddeae3
+// -0.016290
+0xbc8571c0
+// 0.002672
+0x3b2f1595
+// -0.023844
+0xbcc354c3
+// 0.004946
+0x3ba20e3d
+// 0.018877
+0x3c9aa437
+// 0.007047
+0x3be6ebbb
+// 0.091881
+0x3dbc2bec
+// -0.037349
+0xbd18fae4
+// 0.009869
+0x3c21b3aa
+// 0.027031
+0x3cdd70e4
+// -0.014340
+0xbc6af1a0
+// 0.037672
+0x3d1a4e49
+// -0.002634
+0xbb2c9fc7
+// 0.019277
+0x3c9dea2a
+// -0.052170
+0xbd55b00d
+// 0.059300
+0x3d72e409
+// -0.049408
+0xbd4a6019
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// -0.040042
+0xbd2402b1
+// 0.030201
+0x3cf76888
+// 0.068547
+0x3d8c6283
+// 0.047040
+0x3d40ace7
+// 0.050729
+0x3d4fc911
+// -0.005363
+0xbbafbeee
+// 0.025814
+0x3cd3789a
+// -0.043460
+0xbd32032c
+// 0.042564
+0x3d2e581c
+// -0.025281
+0xbccf198b
+// -0.052167
+0xbd55ad26
+// 0.029801
+0x3cf4217e
+// -0.049061
+0xbd48f45e
+// -0.050509
+0xbd4ee2fa
+// 0.057598
+0x3d6bec2a
+// 0.130159
+0x3e054862
+// -0.036673
+0xbd1635f1
+// 0.006511
+0x3bd55cde
+// 0.023652
+0x3cc1c1fb
+// 0.035213
+0x3d103bef
+// -0.016777
+0xbc897033
+// 0.035564
+0x3d11ab7e
+// -0.040042
+0xbd2402b1
+// 0.030201
+0x3cf76888
+// 0.068547
+0x3d8c6283
+// 0.047040
+0x3d40ace7
+// 0.050729
+0x3d4fc911
+// -0.005363
+0xbbafbeee
+// 0.025814
+0x3cd3789a
+// -0.043460
+0xbd32032c
+// 0.016754
+0x3c893fc2
+// -0.016257
+0xbc852dfa
+// 0.127319
+0x3e025ffa
+// 0.087774
+0x3db3c2bb
+// 0.027246
+0x3cdf327a
+// 0.087103
+0x3db262ff
+// -0.174586
+0xbe32c6b0
+// -0.127342
+0xbe026601
+// -0.177080
+0xbe355492
+// -0.018097
+0xbc9440fd
+// 0.233290
+0x3e6ee380
+// -0.061413
+0xbd7b8c04
+// -0.091457
+0xbdbb4daf
+// -0.054669
+0xbd5fecb2
+// -0.112814
+0xbde70ae0
+// 0.310927
+0x3e9f31df
+// 0.282749
+0x3e90c479
+// 0.283499
+0x3e9126ca
+// -0.036028
+0xbd1391c4
+// 0.183665
+0x3e3c12c5
+// -0.180983
+0xbe3953ba
+// 0.038990
+0x3d1fb479
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/DimsInvert1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/DimsInvert1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/DimsUnary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/DimsUnary1_s16.txt
@ -18,16 +18,16 @@ H
 0x0004
 // 1
 0x0001
-// 3
-0x0003
 // 1
 0x0001
-// 8
-0x0008
 // 1
 0x0001
-// 11
-0x000B
+// 4
+0x0004
+// 1
+0x0001
+// 5
+0x0005
 // 2
 0x0002
 // 1
@ -46,16 +46,16 @@ H
 0x0004
 // 2
 0x0002
-// 3
-0x0003
+// 1
+0x0001
 // 2
 0x0002
-// 8
-0x0008
+// 4
+0x0004
 // 2
 0x0002
-// 11
-0x000B
+// 5
+0x0005
 // 3
 0x0003
 // 1
@ -74,16 +74,16 @@ H
 0x0004
 // 3
 0x0003
+// 1
+0x0001
 // 3
 0x0003
+// 4
+0x0004
 // 3
 0x0003
-// 8
-0x0008
-// 3
-0x0003
-// 11
-0x000B
+// 5
+0x0005
 // 4
 0x0004
 // 1
@ -102,97 +102,97 @@ H
 0x0004
 // 4
 0x0004
-// 3
-0x0003
+// 1
+0x0001
 // 4
 0x0004
-// 8
-0x0008
 // 4
 0x0004
-// 11
-0x000B
-// 3
-0x0003
+// 4
+0x0004
+// 5
+0x0005
+// 1
+0x0001
+// 1
+0x0001
 // 1
 0x0001
-// 3
-0x0003
 // 2
 0x0002
+// 1
+0x0001
 // 3
 0x0003
-// 3
-0x0003
-// 3
-0x0003
+// 1
+0x0001
+// 4
+0x0004
+// 1
+0x0001
+// 1
+0x0001
+// 1
+0x0001
+// 4
+0x0004
+// 1
+0x0001
+// 5
+0x0005
 // 4
 0x0004
-// 3
-0x0003
-// 3
-0x0003
-// 3
-0x0003
-// 8
-0x0008
-// 3
-0x0003
-// 11
-0x000B
-// 8
-0x0008
 // 1
 0x0001
-// 8
-0x0008
+// 4
+0x0004
 // 2
 0x0002
-// 8
-0x0008
-// 3
-0x0003
-// 8
-0x0008
 // 4
 0x0004
-// 8
-0x0008
 // 3
 0x0003
-// 8
-0x0008
-// 8
-0x0008
-// 8
-0x0008
-// 11
-0x000B
-// 11
-0x000B
-// 1
-0x0001
-// 11
-0x000B
+// 4
+0x0004
+// 4
+0x0004
+// 4
+0x0004
+// 1
+0x0001
+// 4
+0x0004
+// 4
+0x0004
+// 4
+0x0004
+// 5
+0x0005
+// 5
+0x0005
+// 1
+0x0001
+// 5
+0x0005
 // 2
 0x0002
-// 11
-0x000B
+// 5
+0x0005
 // 3
 0x0003
-// 11
-0x000B
+// 5
+0x0005
 // 4
 0x0004
-// 11
-0x000B
-// 3
-0x0003
-// 11
-0x000B
-// 8
-0x0008
-// 11
-0x000B
-// 11
-0x000B
+// 5
+0x0005
+// 1
+0x0001
+// 5
+0x0005
+// 4
+0x0004
+// 5
+0x0005
+// 5
+0x0005
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputA1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputA1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputB1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputB1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputInvert1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputInvert1_f64.txt
@ -5,9 +5,9 @@ D
 // 0.707107
 0x3fe6a09e667f3bcd
 // 0.707107
-0x3fe6a09e667f3bcd
+0x3fe6a09e667f3bcc
 // -0.707107
-0xbfe6a09e667f3bcd
+0xbfe6a09e667f3bcc
 // 0.707107
 0x3fe6a09e667f3bcd
 // 0.804738
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputVec1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/InputVec1_f64.txt
@ -0,0 +1,96 @@
+D
+47
+// 0.188070
+0x3fc812b160473e1b
+// 0.900343
+0x3feccf9ba1a393e4
+// -0.479540
+0xbfdeb0c7a7b7ac57
+// -0.135886
+0xbfc164b5653789d8
+// -0.399584
+0xbfd992c86340e09a
+// 0.633560
+0x3fe4461f1a220351
+// 0.315810
+0x3fd4363b7f35b178
+// -0.665093
+0xbfe5487129b30263
+// -0.267212
+0xbfd119fe9f8f52a4
+// 0.294500
+0x3fd2d914e3c18019
+// 0.258936
+0x3fd09267a8b06fd6
+// -0.345642
+0xbfd61f018991af05
+// 0.047554
+0x3fa858f1ef89c0f0
+// 0.909382
+0x3fed19a8617f4ddb
+// 0.127969
+0x3fc0614aa5fe143f
+// -0.210006
+0xbfcae178d0e47aee
+// 0.582779
+0x3fe2a61fb1ff1a00
+// -0.535823
+0xbfe12576c8a85075
+// 0.602721
+0x3fe3497e9563a433
+// -0.582924
+0xbfe2a75104bef715
+// 0.275512
+0x3fd1a1fde225f517
+// 0.982241
+0x3fef6e841aa8e96f
+// 0.693574
+0x3fe631c28027aceb
+// 0.223552
+0x3fcc9d5c9990d30f
+// 0.691720
+0x3fe6229120bef17e
+// 0.690738
+0x3fe61a8681d9b83f
+// 0.930976
+0x3fedca8d8989eb5e
+// -0.018398
+0xbf92d6f4a29e7b45
+// -0.964702
+0xbfeeded605b56aef
+// -0.008462
+0xbf8154c953943c75
+// -0.097706
+0xbfb90347155e3950
+// 0.281097
+0x3fd1fd7db03383a0
+// 0.460984
+0x3fdd80c1e0096e1c
+// 0.189543
+0x3fc842f09dbf27f7
+// 0.167661
+0x3fc575eafb8da04b
+// -0.265913
+0xbfd104b98b65f5c4
+// -0.723246
+0xbfe724d42ae816ac
+// -0.118634
+0xbfbe5ec7b6b921c6
+// 0.017836
+0x3f9243a3213547bc
+// 0.489443
+0x3fdf5307621b5f53
+// 0.878651
+0x3fec1de9150d1181
+// 0.025390
+0x3f99ffd50acbc46d
+// -0.262164
+0xbfd0c74d27aa14b2
+// 0.205937
+0x3fca5c210787fc0e
+// -1.000000
+0xbff0000000000000
+// -0.254239
+0xbfd04574c0f2cc71
+// -0.117934
+0xbfbe30e967d219c2
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefAdd1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefAdd1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefInvert1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefInvert1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefScale1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefScale1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefSub1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefSub1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefTranspose1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefTranspose1_f64.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefVecMul1_f64.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryF64/RefVecMul1_f64.txt
@ -0,0 +1,282 @@
+D
+140
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.403118
+0x3fd9ccb072b85c1b
+// 0.384922
+0x3fd8a2905b33385f
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.384922
+0x3fd8a2905b33385f
+// 0.549950
+0x3fe1992fe5dda6b9
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.020352
+0x3f94d713938a2f7c
+// 0.403118
+0x3fd9ccb072b85c1b
+// -0.337749
+0xbfd59daf332556b7
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// 0.549950
+0x3fe1992fe5dda6b9
+// 0.067945
+0x3fb164d9e81351be
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.020352
+0x3f94d713938a2f7c
+// -0.094395
+0xbfb82a4a6149b211
+// 0.403118
+0x3fd9ccb072b85c1b
+// -0.337749
+0xbfd59daf332556b7
+// 0.339251
+0x3fd5b64a49b98e01
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// 0.549950
+0x3fe1992fe5dda6b9
+// 0.067945
+0x3fb164d9e81351be
+// -0.060547
+0xbfaefff457782058
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.025184
+0x3f99c9d7a97ba749
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.020352
+0x3f94d713938a2f7c
+// -0.094395
+0xbfb82a4a6149b211
+// 0.291503
+0x3fd2a7fbe5912c42
+// 0.403118
+0x3fd9ccb072b85c1b
+// -0.337749
+0xbfd59daf332556b7
+// 0.339251
+0x3fd5b64a49b98e01
+// -0.156555
+0xbfc409ff7260bdb3
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.103209
+0xbfba6bee66224ef0
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.025184
+0x3f99c9d7a97ba749
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.103209
+0xbfba6bee66224ef0
+// 0.549950
+0x3fe1992fe5dda6b9
+// 0.067945
+0x3fb164d9e81351be
+// -0.060547
+0xbfaefff457782058
+// 0.314512
+0x3fd420f902511d42
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.403118
+0x3fd9ccb072b85c1b
+// 0.384922
+0x3fd8a2905b33385f
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.384922
+0x3fd8a2905b33385f
+// 0.549950
+0x3fe1992fe5dda6b9
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.025184
+0x3f99c9d7a97ba749
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.020352
+0x3f94d713938a2f7c
+// -0.094395
+0xbfb82a4a6149b211
+// 0.291503
+0x3fd2a7fbe5912c42
+// 0.403118
+0x3fd9ccb072b85c1b
+// -0.337749
+0xbfd59daf332556b7
+// 0.339251
+0x3fd5b64a49b98e01
+// -0.156555
+0xbfc409ff7260bdb3
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.103209
+0xbfba6bee66224ef0
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.025184
+0x3f99c9d7a97ba749
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.103209
+0xbfba6bee66224ef0
+// 0.549950
+0x3fe1992fe5dda6b9
+// 0.067945
+0x3fb164d9e81351be
+// -0.060547
+0xbfaefff457782058
+// 0.314512
+0x3fd420f902511d42
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.025184
+0x3f99c9d7a97ba749
+// -0.077673
+0xbfb3e25d18db8a4e
+// 0.147602
+0x3fc2e49d51d2a4d4
+// 0.020352
+0x3f94d713938a2f7c
+// -0.094395
+0xbfb82a4a6149b211
+// 0.291503
+0x3fd2a7fbe5912c42
+// -0.045680
+0xbfa763521c682a20
+// 0.403118
+0x3fd9ccb072b85c1b
+// -0.337749
+0xbfd59daf332556b7
+// 0.339251
+0x3fd5b64a49b98e01
+// -0.156555
+0xbfc409ff7260bdb3
+// -0.078934
+0xbfb43502d75cf28d
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.103209
+0xbfba6bee66224ef0
+// -0.084109
+0xbfb58827f39fe59d
+// -0.072982
+0xbfb2aeedb4a13520
+// 0.046077
+0x3fa7976cc1c8311b
+// -0.100211
+0xbfb9a76b80226704
+// 0.025184
+0x3f99c9d7a97ba749
+// -0.077673
+0xbfb3e25d18db8a4e
+// 0.384922
+0x3fd8a2905b33385f
+// -0.229119
+0xbfcd53c8a53fe1f5
+// -0.013239
+0xbf8b1cdf2bcdbe78
+// -0.103209
+0xbfba6bee66224ef0
+// -0.084109
+0xbfb58827f39fe59d
+// 0.549950
+0x3fe1992fe5dda6b9
+// 0.067945
+0x3fb164d9e81351be
+// -0.060547
+0xbfaefff457782058
+// 0.314512
+0x3fd420f902511d42
+// -0.679441
+0xbfe5bdfab6930964
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/DimsInvert1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/DimsInvert1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/DimsUnary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/DimsUnary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputA1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputA1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputB1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputB1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputInvert1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputInvert1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputVec1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/InputVec1_q15.txt
@ -0,0 +1,96 @@
+H
+47
+// -0.465348
+0xC46F
+// -0.647177
+0xAD29
+// 0.273109
+0x22F5
+// 0.239502
+0x1EA8
+// -0.128441
+0xEF8F
+// 0.124161
+0x0FE5
+// 0.031394
+0x0405
+// 0.429857
+0x3706
+// -0.611894
+0xB1AD
+// -0.330260
+0xD5BA
+// 0.217499
+0x1BD7
+// 0.182294
+0x1755
+// 0.182626
+0x1760
+// -0.750689
+0x9FE9
+// 0.139218
+0x11D2
+// 0.310518
+0x27BF
+// -0.443683
+0xC735
+// 0.159164
+0x145F
+// -0.375996
+0xCFDF
+// 0.330905
+0x2A5B
+// -0.154334
+0xEC3F
+// 0.638196
+0x51B0
+// -0.375128
+0xCFFC
+// -0.475561
+0xC321
+// 0.222805
+0x1C85
+// 0.859294
+0x6DFD
+// -0.524683
+0xBCD7
+// -0.258695
+0xDEE3
+// -0.317873
+0xD750
+// 0.851911
+0x6D0B
+// 0.431010
+0x372B
+// -1.000000
+0x8000
+// 0.196129
+0x191B
+// -0.004486
+0xFF6D
+// -0.320706
+0xD6F3
+// -0.238188
+0xE183
+// 0.027007
+0x0375
+// -0.255580
+0xDF49
+// -0.660830
+0xAB6A
+// -0.006613
+0xFF27
+// 0.273233
+0x22F9
+// -0.160475
+0xEB76
+// 0.158323
+0x1444
+// 0.170421
+0x15D0
+// 0.899525
+0x7324
+// -0.143121
+0xEDAE
+// 0.739937
+0x5EB6
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefAdd1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefAdd1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefInvert1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefInvert1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefScale1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefScale1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefSub1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefSub1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefTranspose1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefTranspose1_q15.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefVecMul1_q15.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ15/RefVecMul1_q15.txt
@ -0,0 +1,786 @@
+H
+392
+// -0.284706
+0xDB8F
+// -0.634417
+0xAECB
+// -0.565603
+0xB79A
+// -0.521330
+0xBD45
+// -0.527687
+0xBC75
+// 0.029040
+0x03B8
+// 0.100193
+0x0CD3
+// -0.284706
+0xDB8F
+// -0.251457
+0xDFD0
+// -0.634417
+0xAECB
+// -0.236885
+0xE1AE
+// -0.565603
+0xB79A
+// 0.057759
+0x0765
+// -0.521330
+0xBD45
+// 0.401426
+0x3362
+// -0.527687
+0xBC75
+// -0.452416
+0xC617
+// 0.029040
+0x03B8
+// -0.012550
+0xFE65
+// 0.100193
+0x0CD3
+// -1.073115
+0x8000
+// -0.284706
+0xDB8F
+// -0.251457
+0xDFD0
+// -0.117251
+0xF0FE
+// -0.634417
+0xAECB
+// -0.236885
+0xE1AE
+// 0.407645
+0x342E
+// -0.565603
+0xB79A
+// 0.057759
+0x0765
+// 0.043445
+0x0590
+// -0.521330
+0xBD45
+// 0.401426
+0x3362
+// 0.034580
+0x046D
+// -0.527687
+0xBC75
+// -0.452416
+0xC617
+// -0.357922
+0xD230
+// 0.029040
+0x03B8
+// -0.012550
+0xFE65
+// 0.268676
+0x2264
+// 0.100193
+0x0CD3
+// -1.073115
+0x8000
+// -0.625507
+0xAFEF
+// -0.284706
+0xDB8F
+// -0.251457
+0xDFD0
+// -0.117251
+0xF0FE
+// -0.086022
+0xF4FD
+// -0.634417
+0xAECB
+// -0.236885
+0xE1AE
+// 0.407645
+0x342E
+// -0.067085
+0xF76A
+// -0.565603
+0xB79A
+// 0.057759
+0x0765
+// 0.043445
+0x0590
+// -0.046226
+0xFA15
+// -0.521330
+0xBD45
+// 0.401426
+0x3362
+// 0.034580
+0x046D
+// 0.573953
+0x4977
+// -0.527687
+0xBC75
+// -0.452416
+0xC617
+// -0.357922
+0xD230
+// 0.416492
+0x3550
+// 0.029040
+0x03B8
+// -0.012550
+0xFE65
+// 0.268676
+0x2264
+// -0.402059
+0xCC89
+// 0.100193
+0x0CD3
+// -1.073115
+0x8000
+// -0.625507
+0xAFEF
+// 0.782695
+0x642F
+// -0.284706
+0xDB8F
+// -0.251457
+0xDFD0
+// -0.117251
+0xF0FE
+// -0.086022
+0xF4FD
+// 0.174216
+0x164D
+// 0.167846
+0x157C
+// 0.143186
+0x1254
+// -0.634417
+0xAECB
+// -0.236885
+0xE1AE
+// 0.407645
+0x342E
+// -0.067085
+0xF76A
+// -0.012304
+0xFE6D
+// -0.062337
+0xF805
+// 0.428129
+0x36CD
+// -0.565603
+0xB79A
+// 0.057759
+0x0765
+// 0.043445
+0x0590
+// -0.046226
+0xFA15
+// 0.524927
+0x4331
+// 0.112396
+0x0E63
+// 0.590338
+0x4B90
+// -0.521330
+0xBD45
+// 0.401426
+0x3362
+// 0.034580
+0x046D
+// 0.573953
+0x4977
+// -0.092854
+0xF41D
+// -0.150489
+0xECBD
+// 0.152085
+0x1378
+// -0.527687
+0xBC75
+// -0.452416
+0xC617
+// -0.357922
+0xD230
+// 0.416492
+0x3550
+// 0.106044
+0x0D93
+// -0.193382
+0xE73F
+// 0.475416
+0x3CDA
+// 0.029040
+0x03B8
+// -0.012550
+0xFE65
+// 0.268676
+0x2264
+// -0.402059
+0xCC89
+// 0.221121
+0x1C4E
+// -0.193017
+0xE74B
+// -0.672793
+0xA9E2
+// 0.100193
+0x0CD3
+// -1.073115
+0x8000
+// -0.625507
+0xAFEF
+// 0.782695
+0x642F
+// -0.393145
+0xCDAD
+// -0.453314
+0xC5FA
+// -0.270996
+0xDD50
+// -0.284706
+0xDB8F
+// -0.251457
+0xDFD0
+// -0.117251
+0xF0FE
+// -0.086022
+0xF4FD
+// 0.174216
+0x164D
+// 0.167846
+0x157C
+// 0.143186
+0x1254
+// -0.151194
+0xECA6
+// -0.188331
+0xE7E5
+// 0.126571
+0x1033
+// -0.109836
+0xF1F1
+// 0.034154
+0x045F
+// 0.046762
+0x05FC
+// 0.274220
+0x231A
+// -0.164933
+0xEAE3
+// -0.095257
+0xF3CF
+// -0.634417
+0xAECB
+// -0.236885
+0xE1AE
+// 0.407645
+0x342E
+// -0.067085
+0xF76A
+// -0.012304
+0xFE6D
+// -0.062337
+0xF805
+// 0.428129
+0x36CD
+// -0.297410
+0xD9EE
+// 0.106542
+0x0DA3
+// 0.442383
+0x38A0
+// -0.252465
+0xDFAF
+// -0.291942
+0xDAA2
+// -0.041080
+0xFABE
+// -0.303190
+0xD931
+// -0.000858
+0xFFE4
+// -0.182104
+0xE8B1
+// -0.565603
+0xB79A
+// 0.057759
+0x0765
+// 0.043445
+0x0590
+// -0.046226
+0xFA15
+// 0.524927
+0x4331
+// 0.112396
+0x0E63
+// 0.590338
+0x4B90
+// 0.173630
+0x1639
+// 0.178073
+0x16CB
+// 0.207389
+0x1A8C
+// -0.305510
+0xD8E5
+// -0.110668
+0xF1D6
+// 0.278231
+0x239D
+// 0.192257
+0x189C
+// 0.338035
+0x2B45
+// 0.170111
+0x15C6
+// -0.521330
+0xBD45
+// 0.401426
+0x3362
+// 0.034580
+0x046D
+// 0.573953
+0x4977
+// -0.092854
+0xF41D
+// -0.150489
+0xECBD
+// 0.152085
+0x1378
+// 0.033223
+0x0441
+// 0.171157
+0x15E8
+// 0.266822
+0x2227
+// -0.019437
+0xFD83
+// 0.155754
+0x13F0
+// -0.089832
+0xF480
+// 0.342389
+0x2BD3
+// -0.477839
+0xC2D6
+// 0.158258
+0x1442
+// -0.527687
+0xBC75
+// -0.452416
+0xC617
+// -0.357922
+0xD230
+// 0.416492
+0x3550
+// 0.106044
+0x0D93
+// -0.193382
+0xE73F
+// 0.475416
+0x3CDA
+// -0.112868
+0xF18E
+// -0.399115
+0xCCEA
+// 0.114840
+0x0EB3
+// -0.065647
+0xF799
+// -0.388258
+0xCE4E
+// -0.365147
+0xD143
+// -0.125584
+0xEFED
+// 0.052454
+0x06B7
+// -0.085717
+0xF507
+// 0.029040
+0x03B8
+// -0.012550
+0xFE65
+// 0.268676
+0x2264
+// -0.402059
+0xCC89
+// 0.221121
+0x1C4E
+// -0.193017
+0xE74B
+// -0.672793
+0xA9E2
+// -0.214537
+0xE48A
+// -0.376556
+0xCFCD
+// 0.474829
+0x3CC7
+// 0.361444
+0x2E44
+// -0.778224
+0x9C63
+// 0.376189
+0x3027
+// -0.985972
+0x81CC
+// -0.062216
+0xF809
+// 0.817140
+0x6898
+// 0.100193
+0x0CD3
+// -1.073115
+0x8000
+// -0.625507
+0xAFEF
+// 0.782695
+0x642F
+// -0.393145
+0xCDAD
+// -0.453314
+0xC5FA
+// -0.270996
+0xDD50
+// -0.615754
+0xB12F
+// 0.209266
+0x1AC9
+// -0.890081
+0x8E12
+// 0.254230
+0x208B
+// -0.547540
+0xB9EA
+// 0.246804
+0x1F97
+// -0.139674
+0xEE1F
+// -0.267184
+0xDDCD
+// 0.650795
+0x534D
+// -0.284706
+0xDB8F
+// -0.251457
+0xDFD0
+// -0.117251
+0xF0FE
+// -0.086022
+0xF4FD
+// 0.174216
+0x164D
+// 0.167846
+0x157C
+// 0.143186
+0x1254
+// -0.151194
+0xECA6
+// -0.188331
+0xE7E5
+// 0.126571
+0x1033
+// -0.109836
+0xF1F1
+// 0.034154
+0x045F
+// 0.046762
+0x05FC
+// 0.274220
+0x231A
+// -0.164933
+0xEAE3
+// -0.095257
+0xF3CF
+// 0.139355
+0x11D6
+// -0.023594
+0xFCFB
+// 0.164569
+0x1511
+// 0.199761
+0x1992
+// -0.252099
+0xDFBB
+// -0.000263
+0xFFF7
+// 0.027965
+0x0394
+// -0.634417
+0xAECB
+// -0.236885
+0xE1AE
+// 0.407645
+0x342E
+// -0.067085
+0xF76A
+// -0.012304
+0xFE6D
+// -0.062337
+0xF805
+// 0.428129
+0x36CD
+// -0.297410
+0xD9EE
+// 0.106542
+0x0DA3
+// 0.442383
+0x38A0
+// -0.252465
+0xDFAF
+// -0.291942
+0xDAA2
+// -0.041080
+0xFABE
+// -0.303190
+0xD931
+// -0.000858
+0xFFE4
+// -0.182104
+0xE8B1
+// 0.148905
+0x130F
+// -0.033985
+0xFBA6
+// 0.235383
+0x1E21
+// -0.042180
+0xFA9A
+// 0.170576
+0x15D5
+// 0.363560
+0x2E89
+// -0.041988
+0xFAA0
+// -0.565603
+0xB79A
+// 0.057759
+0x0765
+// 0.043445
+0x0590
+// -0.046226
+0xFA15
+// 0.524927
+0x4331
+// 0.112396
+0x0E63
+// 0.590338
+0x4B90
+// 0.173630
+0x1639
+// 0.178073
+0x16CB
+// 0.207389
+0x1A8C
+// -0.305510
+0xD8E5
+// -0.110668
+0xF1D6
+// 0.278231
+0x239D
+// 0.192257
+0x189C
+// 0.338035
+0x2B45
+// 0.170111
+0x15C6
+// -0.154852
+0xEC2E
+// 0.106125
+0x0D95
+// -0.269366
+0xDD85
+// -0.199706
+0xE670
+// 0.091273
+0x0BAF
+// 0.067247
+0x089C
+// 0.070451
+0x0905
+// -0.521330
+0xBD45
+// 0.401426
+0x3362
+// 0.034580
+0x046D
+// 0.573953
+0x4977
+// -0.092854
+0xF41D
+// -0.150489
+0xECBD
+// 0.152085
+0x1378
+// 0.033223
+0x0441
+// 0.171157
+0x15E8
+// 0.266822
+0x2227
+// -0.019437
+0xFD83
+// 0.155754
+0x13F0
+// -0.089832
+0xF480
+// 0.342389
+0x2BD3
+// -0.477839
+0xC2D6
+// 0.158258
+0x1442
+// -0.003307
+0xFF94
+// -0.318688
+0xD735
+// 0.277172
+0x237A
+// 0.426033
+0x3688
+// -0.035852
+0xFB69
+// -0.351541
+0xD301
+// -0.082220
+0xF57A
+// -0.527687
+0xBC75
+// -0.452416
+0xC617
+// -0.357922
+0xD230
+// 0.416492
+0x3550
+// 0.106044
+0x0D93
+// -0.193382
+0xE73F
+// 0.475416
+0x3CDA
+// -0.112868
+0xF18E
+// -0.399115
+0xCCEA
+// 0.114840
+0x0EB3
+// -0.065647
+0xF799
+// -0.388258
+0xCE4E
+// -0.365147
+0xD143
+// -0.125584
+0xEFED
+// 0.052454
+0x06B7
+// -0.085717
+0xF507
+// -0.161882
+0xEB47
+// 0.051194
+0x068E
+// -0.569899
+0xB70E
+// 0.003946
+0x0081
+// -0.321326
+0xD6DF
+// -0.050596
+0xF986
+// -0.188283
+0xE7E6
+// 0.029040
+0x03B8
+// -0.012550
+0xFE65
+// 0.268676
+0x2264
+// -0.402059
+0xCC89
+// 0.221121
+0x1C4E
+// -0.193017
+0xE74B
+// -0.672793
+0xA9E2
+// -0.214537
+0xE48A
+// -0.376556
+0xCFCD
+// 0.474829
+0x3CC7
+// 0.361444
+0x2E44
+// -0.778224
+0x9C63
+// 0.376189
+0x3027
+// -0.985972
+0x81CC
+// -0.062216
+0xF809
+// 0.817140
+0x6898
+// -0.219376
+0xE3EB
+// 0.202463
+0x19EA
+// -0.558866
+0xB877
+// 0.478804
+0x3D49
+// 0.133731
+0x111E
+// 0.008904
+0x0124
+// 0.073654
+0x096E
+// 0.100193
+0x0CD3
+// -1.073115
+0x8000
+// -0.625507
+0xAFEF
+// 0.782695
+0x642F
+// -0.393145
+0xCDAD
+// -0.453314
+0xC5FA
+// -0.270996
+0xDD50
+// -0.615754
+0xB12F
+// 0.209266
+0x1AC9
+// -0.890081
+0x8E12
+// 0.254230
+0x208B
+// -0.547540
+0xB9EA
+// 0.246804
+0x1F97
+// -0.139674
+0xEE1F
+// -0.267184
+0xDDCD
+// 0.650795
+0x534D
+// 0.021816
+0x02CB
+// -1.377729
+0x8000
+// -0.496112
+0xC07F
+// -0.951900
+0x8628
+// 0.183783
+0x1786
+// -0.118802
+0xF0CB
+// -0.347255
+0xD38D
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/DimsInvert1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/DimsInvert1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/DimsUnary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/DimsUnary1_s16.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputA1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputA1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputB1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputB1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputInvert1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputInvert1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputVec1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/InputVec1_q31.txt
@ -0,0 +1,96 @@
+W
+47
+// -0.152242
+0xEC835427
+// -0.304523
+0xD9056312
+// 0.752115
+0x6045505B
+// -0.074632
+0xF6727991
+// -0.311527
+0xD81FDE2D
+// -0.268299
+0xDDA85FD6
+// 0.169938
+0x15C08AEA
+// 0.035375
+0x04872EFF
+// 0.553913
+0x46E6A156
+// 0.255991
+0x20C4526C
+// 0.378421
+0x30701B79
+// 0.335550
+0x2AF34957
+// 0.081517
+0x0A6F2769
+// 0.721452
+0x5C588D3E
+// 0.352178
+0x2D142C92
+// 0.121595
+0x0F9068C3
+// -0.104935
+0xF2917B09
+// 0.582863
+0x4A9B4299
+// 0.134666
+0x113CBD7A
+// 0.171217
+0x15EA72A9
+// 0.171319
+0x15EDC67F
+// 0.389106
+0x31CE3640
+// 0.067756
+0x08AC3C05
+// -0.235464
+0xE1DC51C2
+// 0.086284
+0x0B0B5E54
+// -0.036278
+0xFB5B3A7E
+// 0.226389
+0x1CFA5297
+// -0.455548
+0xC5B09D63
+// 0.083920
+0x0ABDE516
+// -0.610345
+0xB1E03B18
+// -0.265371
+0xDE085472
+// 0.038156
+0x04E24D3A
+// 0.482177
+0x3DB7FB7F
+// -0.003396
+0xFF90BBA2
+// -1.000000
+0x80000000
+// -0.100406
+0xF325E8DF
+// -0.424809
+0xC99FDE36
+// 0.461861
+0x3B1E3F9B
+// 0.462910
+0x3B40A5A9
+// -0.201606
+0xE631C49A
+// 0.255992
+0x20C45999
+// 0.161009
+0x149BF0DA
+// -0.732425
+0xA23FE6B7
+// -0.131945
+0xEF1C6AA0
+// 0.269249
+0x2276C433
+// -0.655274
+0xAC1FFDDC
+// 0.266515
+0x221D29CA
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefAdd1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefAdd1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefInvert1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefInvert1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefScale1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefScale1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefSub1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefSub1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefTranspose1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefTranspose1_q31.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefVecMul1_q31.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ31/RefVecMul1_q31.txt
@ -0,0 +1,450 @@
+W
+224
+// -0.012688
+0xFE603DCA
+// 0.019012
+0x026EF83C
+// 0.363476
+0x2E866520
+// 0.364784
+0x2EB13BB4
+// 0.363476
+0x2E866520
+// 0.410001
+0x347AE754
+// 0.398063
+0x32F3BE56
+// -0.012688
+0xFE603DCA
+// 0.015848
+0x02074C4B
+// 0.019012
+0x026EF83C
+// -0.064392
+0xF7C2027A
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// 0.364784
+0x2EB13BB4
+// 0.226368
+0x1CF99D3B
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// 0.410001
+0x347AE754
+// -0.219985
+0xE3D78878
+// 0.398063
+0x32F3BE56
+// -0.033895
+0xFBA95266
+// -0.012688
+0xFE603DCA
+// 0.015848
+0x02074C4B
+// -0.069726
+0xF71336ED
+// 0.019012
+0x026EF83C
+// -0.064392
+0xF7C2027A
+// -0.003738
+0xFF85853E
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.364784
+0x2EB13BB4
+// 0.226368
+0x1CF99D3B
+// -0.199963
+0xE6679903
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.410001
+0x347AE754
+// -0.219985
+0xE3D78878
+// 0.136775
+0x1181D4FA
+// 0.398063
+0x32F3BE56
+// -0.033895
+0xFBA95266
+// 0.026296
+0x035DABF3
+// -0.012688
+0xFE603DCA
+// 0.015848
+0x02074C4B
+// -0.069726
+0xF71336ED
+// 0.002667
+0x005762EE
+// 0.019012
+0x026EF83C
+// -0.064392
+0xF7C2027A
+// -0.003738
+0xFF85853E
+// -0.073292
+0xF69E5A1A
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.031909
+0x041598A2
+// 0.364784
+0x2EB13BB4
+// 0.226368
+0x1CF99D3B
+// -0.199963
+0xE6679903
+// -0.094819
+0xF3DCF7C9
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.031909
+0x041598A2
+// 0.410001
+0x347AE754
+// -0.219985
+0xE3D78878
+// 0.136775
+0x1181D4FA
+// -0.282357
+0xDBDBB826
+// 0.398063
+0x32F3BE56
+// -0.033895
+0xFBA95266
+// 0.026296
+0x035DABF3
+// -0.104912
+0xF2923C41
+// -0.012688
+0xFE603DCA
+// 0.015848
+0x02074C4B
+// -0.069726
+0xF71336ED
+// 0.019012
+0x026EF83C
+// -0.064392
+0xF7C2027A
+// -0.003738
+0xFF85853E
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.364784
+0x2EB13BB4
+// 0.226368
+0x1CF99D3B
+// -0.199963
+0xE6679903
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.410001
+0x347AE754
+// -0.219985
+0xE3D78878
+// 0.136775
+0x1181D4FA
+// 0.398063
+0x32F3BE56
+// -0.033895
+0xFBA95266
+// 0.026296
+0x035DABF3
+// -0.012688
+0xFE603DCA
+// 0.015848
+0x02074C4B
+// -0.069726
+0xF71336ED
+// 0.002667
+0x005762EE
+// -0.006735
+0xFF234F46
+// 0.001498
+0x00311962
+// -0.047840
+0xF9E05ECD
+// -0.012724
+0xFE5F0B3E
+// 0.019012
+0x026EF83C
+// -0.064392
+0xF7C2027A
+// -0.003738
+0xFF85853E
+// -0.073292
+0xF69E5A1A
+// -0.040512
+0xFAD07E4A
+// 0.040550
+0x0530C18A
+// 0.041407
+0x054CD4AF
+// 0.180525
+0x171B7101
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.031909
+0x041598A2
+// -0.130532
+0xEF4ABA75
+// 0.106603
+0x0DA5272F
+// 0.219981
+0x1C285579
+// 0.362146
+0x2E5ACAC0
+// 0.364784
+0x2EB13BB4
+// 0.226368
+0x1CF99D3B
+// -0.199963
+0xE6679903
+// -0.094819
+0xF3DCF7C9
+// 0.235048
+0x1E160D72
+// -0.384188
+0xCED2EF93
+// -0.375134
+0xCFFB9E44
+// 0.018096
+0x0250F6D8
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.031909
+0x041598A2
+// -0.130532
+0xEF4ABA75
+// 0.106603
+0x0DA5272F
+// 0.219981
+0x1C285579
+// 0.362146
+0x2E5ACAC0
+// 0.410001
+0x347AE754
+// -0.219985
+0xE3D78878
+// 0.136775
+0x1181D4FA
+// -0.282357
+0xDBDBB826
+// -0.110660
+0xF1D5E424
+// -0.008943
+0xFEDAF8D8
+// 0.527696
+0x438B872E
+// 0.132354
+0x10F0FC99
+// 0.398063
+0x32F3BE56
+// -0.033895
+0xFBA95266
+// 0.026296
+0x035DABF3
+// -0.104912
+0xF2923C41
+// -0.349996
+0xD333536A
+// 0.036937
+0x04BA5ACD
+// 0.118190
+0x0F20DB79
+// 0.450730
+0x39B185EB
+// -0.012688
+0xFE603DCA
+// 0.015848
+0x02074C4B
+// -0.069726
+0xF71336ED
+// 0.002667
+0x005762EE
+// -0.006735
+0xFF234F46
+// 0.001498
+0x00311962
+// -0.047840
+0xF9E05ECD
+// -0.012724
+0xFE5F0B3E
+// -0.012584
+0xFE63A90B
+// -0.013963
+0xFE36798A
+// 0.032667
+0x042E6E6C
+// 0.019012
+0x026EF83C
+// -0.064392
+0xF7C2027A
+// -0.003738
+0xFF85853E
+// -0.073292
+0xF69E5A1A
+// -0.040512
+0xFAD07E4A
+// 0.040550
+0x0530C18A
+// 0.041407
+0x054CD4AF
+// 0.180525
+0x171B7101
+// 0.050335
+0x06716258
+// 0.044560
+0x05B4241A
+// 0.027548
+0x0386B0FD
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.031909
+0x041598A2
+// -0.130532
+0xEF4ABA75
+// 0.106603
+0x0DA5272F
+// 0.219981
+0x1C285579
+// 0.362146
+0x2E5ACAC0
+// -0.370423
+0xD095F746
+// 0.071175
+0x091C4572
+// 0.104430
+0x0D5DF82D
+// 0.364784
+0x2EB13BB4
+// 0.226368
+0x1CF99D3B
+// -0.199963
+0xE6679903
+// -0.094819
+0xF3DCF7C9
+// 0.235048
+0x1E160D72
+// -0.384188
+0xCED2EF93
+// -0.375134
+0xCFFB9E44
+// 0.018096
+0x0250F6D8
+// 0.037120
+0x04C057F0
+// 0.076411
+0x09C7D778
+// 0.001489
+0x0030CDF6
+// 0.363476
+0x2E866520
+// -0.018207
+0xFDAB631E
+// -0.011126
+0xFE936A65
+// 0.031909
+0x041598A2
+// -0.130532
+0xEF4ABA75
+// 0.106603
+0x0DA5272F
+// 0.219981
+0x1C285579
+// 0.362146
+0x2E5ACAC0
+// -0.370423
+0xD095F746
+// 0.071175
+0x091C4572
+// 0.104430
+0x0D5DF82D
+// 0.410001
+0x347AE754
+// -0.219985
+0xE3D78878
+// 0.136775
+0x1181D4FA
+// -0.282357
+0xDBDBB826
+// -0.110660
+0xF1D5E424
+// -0.008943
+0xFEDAF8D8
+// 0.527696
+0x438B872E
+// 0.132354
+0x10F0FC99
+// -0.048568
+0xF9C8841B
+// -0.049726
+0xF9A29605
+// -0.639715
+0xAE1DD1ED
+// 0.398063
+0x32F3BE56
+// -0.033895
+0xFBA95266
+// 0.026296
+0x035DABF3
+// -0.104912
+0xF2923C41
+// -0.349996
+0xD333536A
+// 0.036937
+0x04BA5ACD
+// 0.118190
+0x0F20DB79
+// 0.450730
+0x39B185EB
+// 0.561376
+0x47DB2E41
+// -0.572353
+0xB6BD220E
+// -0.246312
+0xE078DB44
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/DimsInvert1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/DimsInvert1_s16.txt
@ -0,0 +1,28 @@
+H
+13
+// 1
+0x0001
+// 2
+0x0002
+// 3
+0x0003
+// 4
+0x0004
+// 7
+0x0007
+// 8
+0x0008
+// 9
+0x0009
+// 15
+0x000F
+// 16
+0x0010
+// 17
+0x0011
+// 32
+0x0020
+// 33
+0x0021
+// 2
+0x0002
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/DimsUnary1_s16.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/DimsUnary1_s16.txt
@ -0,0 +1,198 @@
+H
+98
+// 1
+0x0001
+// 1
+0x0001
+// 1
+0x0001
+// 2
+0x0002
+// 1
+0x0001
+// 3
+0x0003
+// 1
+0x0001
+// 4
+0x0004
+// 1
+0x0001
+// 15
+0x000F
+// 1
+0x0001
+// 32
+0x0020
+// 1
+0x0001
+// 47
+0x002F
+// 2
+0x0002
+// 1
+0x0001
+// 2
+0x0002
+// 2
+0x0002
+// 2
+0x0002
+// 3
+0x0003
+// 2
+0x0002
+// 4
+0x0004
+// 2
+0x0002
+// 15
+0x000F
+// 2
+0x0002
+// 32
+0x0020
+// 2
+0x0002
+// 47
+0x002F
+// 3
+0x0003
+// 1
+0x0001
+// 3
+0x0003
+// 2
+0x0002
+// 3
+0x0003
+// 3
+0x0003
+// 3
+0x0003
+// 4
+0x0004
+// 3
+0x0003
+// 15
+0x000F
+// 3
+0x0003
+// 32
+0x0020
+// 3
+0x0003
+// 47
+0x002F
+// 4
+0x0004
+// 1
+0x0001
+// 4
+0x0004
+// 2
+0x0002
+// 4
+0x0004
+// 3
+0x0003
+// 4
+0x0004
+// 4
+0x0004
+// 4
+0x0004
+// 15
+0x000F
+// 4
+0x0004
+// 32
+0x0020
+// 4
+0x0004
+// 47
+0x002F
+// 15
+0x000F
+// 1
+0x0001
+// 15
+0x000F
+// 2
+0x0002
+// 15
+0x000F
+// 3
+0x0003
+// 15
+0x000F
+// 4
+0x0004
+// 15
+0x000F
+// 15
+0x000F
+// 15
+0x000F
+// 32
+0x0020
+// 15
+0x000F
+// 47
+0x002F
+// 32
+0x0020
+// 1
+0x0001
+// 32
+0x0020
+// 2
+0x0002
+// 32
+0x0020
+// 3
+0x0003
+// 32
+0x0020
+// 4
+0x0004
+// 32
+0x0020
+// 15
+0x000F
+// 32
+0x0020
+// 32
+0x0020
+// 32
+0x0020
+// 47
+0x002F
+// 47
+0x002F
+// 1
+0x0001
+// 47
+0x002F
+// 2
+0x0002
+// 47
+0x002F
+// 3
+0x0003
+// 47
+0x002F
+// 4
+0x0004
+// 47
+0x002F
+// 15
+0x000F
+// 47
+0x002F
+// 32
+0x0020
+// 47
+0x002F
+// 47
+0x002F
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputA1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputA1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputB1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputB1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputInvert1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputInvert1_q7.txt
--- a/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputVec1_q7.txt
+++ b/Testing/Patterns/DSP/Matrix/Unary/UnaryQ7/InputVec1_q7.txt
@ -0,0 +1,96 @@
+B
+47
+// 0.046062
+0x06
+// -0.185349
+0xE8
+// -0.187892
+0xE8
+// 0.036878
+0x05
+// -0.250000
+0xE0
+// -0.146929
+0xED
+// -0.009667
+0xFF
+// 0.050554
+0x06
+// 0.063652
+0x08
+// 0.049425
+0x06
+// -0.033825
+0xFC
+// -0.193500
+0xE7
+// 0.061112
+0x08
+// 0.105142
+0x0D
+// 0.055576
+0x07
+// -0.052231
+0xF9
+// -0.187179
+0xE8
+// 0.093307
+0x0C
+// 0.047570
+0x06
+// -0.002171
+0x00
+// -0.099437
+0xF3
+// 0.043236
+0x06
+// 0.016710
+0x02
+// 0.075348
+0x0A
+// 0.089907
+0x0C
+// 0.016701
+0x02
+// -0.128165
+0xF0
+// -0.038441
+0xFB
+// -0.116075
+0xF1
+// -0.065239
+0xF8
+// -0.084529
+0xF5
+// -0.077332
+0xF6
+// 0.119233
+0x0F
+// 0.092500
+0x0C
+// -0.044774
+0xFA
+// 0.231531
+0x1E
+// 0.008593
+0x01
+// 0.065556
+0x08
+// 0.169027
+0x16
+// 0.022974
+0x03
+// 0.081319
+0x0A
+// 0.020806
+0x03
+// -0.006479
+0xFF
+// 0.109865
+0x0E
+// -0.158170
+0xEC
+// 0.156490
+0x14
+// -0.142323
+0xEE
--- a/Show More
+++ b/Show More