CMSIS-DSP: Mean square error for q15, q31, f16, f32, f64.

Reworked q7 to have a bit more accuracy.
4 years ago · 8dcdb350a4
parent 47a987217f
commit 8dcdb350a4
62 changed files with 8788 additions and 7398 deletions
--- a/Include/dsp/statistics_functions.h
+++ b/Include/dsp/statistics_functions.h
@ -910,6 +910,66 @@ void arm_mse_q7(
        uint32_t blockSize,
        q7_t * pResult);
 /**
  @brief         Mean square error between two Q15 vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in each input vector (the accumulated sum is divided by this value)
  @param[out]    pResult     points to the location where the mean square error is stored, in Q15 format
  @return        none
 */
 void arm_mse_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t blockSize,
        q15_t * pResult);
 /**
  @brief         Mean square error between two Q31 vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in each input vector (the accumulated sum is divided by this value)
  @param[out]    pResult     points to the location where the mean square error is stored, in Q31 format
  @return        none
 */
 void arm_mse_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t blockSize,
        q31_t * pResult);
 /**
  @brief         Mean square error between two single precision float vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in each input vector (the accumulated sum is divided by this value)
  @param[out]    pResult     points to the location where the mean square error is stored
  @return        none
 */
 void arm_mse_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        uint32_t blockSize,
        float32_t * pResult);
 /**
  @brief         Mean square error between two double precision float vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in each input vector (the accumulated sum is divided by this value)
  @param[out]    pResult     points to the location where the mean square error is stored
  @return        none
 */
 void arm_mse_f64(
  const float64_t * pSrcA,
  const float64_t * pSrcB,
        uint32_t blockSize,
        float64_t * pResult);
 #ifdef   __cplusplus
 }
 #endif
--- a/Include/dsp/statistics_functions_f16.h
+++ b/Include/dsp/statistics_functions_f16.h
@ -243,6 +243,21 @@ float16_t arm_kullback_leibler_f16(const float16_t * pSrcA
      uint32_t   blockSize,
      float16_t *pResult);
 /**
  @brief         Mean square error between two half precision float vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in each input vector (the accumulated sum is divided by this value)
  @param[out]    pResult     points to the location where the mean square error is stored
  @return        none
 */
 void arm_mse_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        uint32_t blockSize,
        float16_t * pResult);
 #endif /*defined(ARM_FLOAT16_SUPPORTED)*/
 #ifdef   __cplusplus
 }
--- a/Source/StatisticsFunctions/CMakeLists.txt
+++ b/Source/StatisticsFunctions/CMakeLists.txt
@ -81,6 +81,11 @@ target_sources(CMSISDSPStatistics PRIVATE arm_absmin_no_idx_q15.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_absmin_no_idx_q31.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_absmin_no_idx_q7.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_mse_q7.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_mse_q15.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_mse_q31.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_mse_f16.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_mse_f32.c)
 target_sources(CMSISDSPStatistics PRIVATE arm_mse_f64.c)
 configLib(CMSISDSPStatistics ${ROOT})
 configDsp(CMSISDSPStatistics ${ROOT})
--- a/Source/StatisticsFunctions/StatisticsFunctions.c
+++ b/Source/StatisticsFunctions/StatisticsFunctions.c
@ -94,3 +94,7 @@
 #include "arm_absmin_no_idx_q31.c"
 #include "arm_absmin_no_idx_q7.c"
 #include "arm_mse_q7.c"
 #include "arm_mse_q15.c"
 #include "arm_mse_q31.c"
 #include "arm_mse_f32.c"
 #include "arm_mse_f64.c"
--- a/Source/StatisticsFunctions/StatisticsFunctionsF16.c
+++ b/Source/StatisticsFunctions/StatisticsFunctionsF16.c
@ -43,3 +43,4 @@
 #include "arm_absmin_f16.c"
 #include "arm_absmax_no_idx_f16.c"
 #include "arm_absmin_no_idx_f16.c"
 #include "arm_mse_f16.c"
--- a/Source/StatisticsFunctions/arm_mse_f16.c
+++ b/Source/StatisticsFunctions/arm_mse_f16.c
@ -0,0 +1,203 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mse_f16.c
 * Description:  Half floating point mean square error
 *
 * $Date:        05 April 2022
 * $Revision:    V1.10.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "dsp/statistics_functions_f16.h"
 /**
  @ingroup groupStats
 */
 /**
  @addtogroup MSE
  @{
 */
 /**
  @brief         Mean square error between two half floating point vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in input vector
  @param[out]    result      mean square error
  @return        none
 */
 #if !defined(ARM_MATH_AUTOVECTORIZE)
 #if defined(ARM_MATH_MVE_FLOAT16)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) half precision variant.
  *
  * Accumulates (A[i] - B[i])^2 in an 8-lane vector accumulator, reduces the
  * lanes to a scalar and divides by blockSize. pSrcA / pSrcB each hold
  * blockSize samples; the mean is written to *result.
  */
 void arm_mse_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    uint32_t    blockSize,
    float16_t * result)
 {
    float16x8_t vecA, vecB;
    float16x8_t vecSum;
    uint32_t blkCnt;
    _Float16 sum = 0.0f16;
    vecSum = vdupq_n_f16(0.0f16);
    /* Main loop: 8 samples per iteration */
    blkCnt = (blockSize) >> 3;
    while (blkCnt > 0U)
    {
        vecA = vld1q(pSrcA);
        pSrcA += 8;
        vecB = vld1q(pSrcB);
        pSrcB += 8;
        vecA = vsubq(vecA, vecB);
        vecSum = vfmaq(vecSum, vecA, vecA);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt --;
    }
    /* Tail: 1 to 7 remaining samples */
    blkCnt = (blockSize) & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) loads: only the active lanes are fetched, so
         * nothing is read beyond the end of the input buffers.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        vecA = vsubq(vecA, vecB);
        vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
    }
    sum = vecAddAcrossF16Mve(vecSum);
    /* Store result in destination buffer */
    *result = (_Float16)sum / (_Float16)blockSize;
 }
 #endif
 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
 #if defined(ARM_FLOAT16_SUPPORTED)
 #if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE)
 /*
  * Scalar half precision variant: sums (A[i] - B[i])^2 in sample order and
  * divides by blockSize. The mean is written to *result.
  */
 void arm_mse_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    uint32_t    blockSize,
    float16_t * result)
 {
  _Float16 acc = 0.0f16;                         /* Running sum of squared differences */
  _Float16 diff;                                 /* Current A[i] - B[i] */
  uint32_t remaining;                            /* Samples left for the generic loop */
 #if defined (ARM_MATH_LOOPUNROLL)
  uint32_t groups;                               /* Number of full groups of 8 samples */
  for (groups = (blockSize) >> 3; groups > 0U; groups--)
  {
    diff = (_Float16)pSrcA[0] - (_Float16)pSrcB[0];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[1] - (_Float16)pSrcB[1];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[2] - (_Float16)pSrcB[2];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[3] - (_Float16)pSrcB[3];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[4] - (_Float16)pSrcB[4];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[5] - (_Float16)pSrcB[5];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[6] - (_Float16)pSrcB[6];
    acc += (_Float16)diff * (_Float16)diff;
    diff = (_Float16)pSrcA[7] - (_Float16)pSrcB[7];
    acc += (_Float16)diff * (_Float16)diff;
    /* Advance both inputs past the group just consumed */
    pSrcA += 8;
    pSrcB += 8;
  }
  /* Samples that did not fill a group of 8 */
  remaining = (blockSize) & 7;
 #else
  /* No unrolling: the generic loop handles every sample */
  remaining = blockSize;
 #endif
  for (; remaining > 0U; remaining--)
  {
    diff = (_Float16)(*pSrcA++) - (_Float16)(*pSrcB++);
    acc += (_Float16)diff * (_Float16)diff;
  }
  /* Mean of the squared differences */
  *result = (_Float16)acc / (_Float16)blockSize;
 }
 #endif /* end of test for vector instruction availability */
 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
 /**
  @} end of MSE group
 */
--- a/Source/StatisticsFunctions/arm_mse_f32.c
+++ b/Source/StatisticsFunctions/arm_mse_f32.c
@ -0,0 +1,246 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mse_f32.c
 * Description:  Floating point mean square error
 *
 * $Date:        05 April 2022
 * $Revision:    V1.10.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "dsp/statistics_functions.h"
 /**
  @ingroup groupStats
 */
 /**
  @addtogroup MSE
  @{
 */
 /**
  @brief         Mean square error between two floating point vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in input vector
  @param[out]    result      mean square error
  @return        none
 */
 #if !defined(ARM_MATH_AUTOVECTORIZE)
 #if defined(ARM_MATH_MVEF)
 #include "arm_helium_utils.h"
 /*
  * Helium (MVE) single precision variant.
  *
  * Accumulates (A[i] - B[i])^2 in a 4-lane vector accumulator, reduces the
  * lanes to a scalar and divides by blockSize. pSrcA / pSrcB each hold
  * blockSize samples; the mean is written to *result.
  */
 void arm_mse_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t    blockSize,
    float32_t * result)
 {
    float32x4_t vecA, vecB;
    float32x4_t vecSum;
    uint32_t blkCnt;
    float32_t sum = 0.0f;
    vecSum = vdupq_n_f32(0.0f);
    /* Compute 4 outputs at a time */
    blkCnt = (blockSize) >> 2;
    while (blkCnt > 0U)
    {
        vecA = vld1q(pSrcA);
        pSrcA += 4;
        vecB = vld1q(pSrcB);
        pSrcB += 4;
        vecA = vsubq(vecA, vecB);
        vecSum = vfmaq(vecSum, vecA, vecA);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt --;
    }
    /* Tail: 1 to 3 remaining samples */
    blkCnt = (blockSize) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) loads: only the active lanes are fetched, so
         * nothing is read beyond the end of the input buffers.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);
        vecA = vsubq(vecA, vecB);
        vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
    }
    sum = vecAddAcrossF32Mve(vecSum);
    /* Store result in destination buffer */
    *result = sum / blockSize;
 }
 #endif
 #if defined(ARM_MATH_NEON) 
 /*
  * Neon single precision variant.
  *
  * Processes 4 samples per iteration with a vector accumulator, reduces it to
  * a scalar, then handles the remaining (blockSize & 3) samples one by one.
  * The mean of the squared differences is written to *result.
  */
 void arm_mse_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t    blockSize,
    float32_t * result)
 {
    float32x4_t vecA, vecB;
    float32x4_t vecSum;
    uint32_t blkCnt;
    float32_t sum = 0.0f;
    float32_t inA, inB;              /* Scalar temporaries for the residual loop */
    vecSum = vdupq_n_f32(0.0f);
 #if !defined(__aarch64__)
    f32x2_t tmp = vdup_n_f32(0.0f);
 #endif
    /* Compute 4 outputs at a time */
    blkCnt = (blockSize) >> 2;
    while (blkCnt > 0U)
    {
        vecA = vld1q_f32(pSrcA);
        pSrcA += 4;
        vecB = vld1q_f32(pSrcB);
        pSrcB += 4;
        vecA = vsubq_f32(vecA, vecB);
        vecSum = vfmaq_f32(vecSum, vecA, vecA);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt --;
    }
    /* Horizontal add of the four partial sums */
 #if defined(__aarch64__)
    sum = vpadds_f32(vpadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum)));
 #else
    tmp = vpadd_f32(vget_low_f32(vecSum), vget_high_f32(vecSum));
    sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
 #endif
    /* Residual samples that did not fill a vector */
    blkCnt = (blockSize) & 3;
    while (blkCnt > 0U)
    {
        /* Accumulate the squared difference of the current sample pair. */
        inA = *pSrcA++;
        inB = *pSrcB++;
        inA = inA - inB;
        sum += inA * inA;
        /* Decrement loop counter */
        blkCnt--;
    }
    /* Store result in destination buffer */
    *result = sum / blockSize;
 }
 #endif
 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
 #if (!defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON)) || defined(ARM_MATH_AUTOVECTORIZE)
 /*
  * Scalar single precision variant: sums (A[i] - B[i])^2 in sample order and
  * divides by blockSize. The mean is written to *result.
  */
 void arm_mse_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t    blockSize,
    float32_t * result)
 {
  float32_t acc = 0.0f;                          /* Running sum of squared differences */
  float32_t diff;                                /* Current A[i] - B[i] */
  uint32_t remaining;                            /* Samples left for the generic loop */
 #if defined (ARM_MATH_LOOPUNROLL)
  uint32_t groups;                               /* Number of full groups of 4 samples */
  /* Unrolled part: 4 samples per iteration; the loop further down
   ** takes care of the 0 to 3 samples left over. */
  for (groups = (blockSize) >> 2; groups > 0U; groups--)
  {
    diff = pSrcA[0] - pSrcB[0];
    acc += diff * diff;
    diff = pSrcA[1] - pSrcB[1];
    acc += diff * diff;
    diff = pSrcA[2] - pSrcB[2];
    acc += diff * diff;
    diff = pSrcA[3] - pSrcB[3];
    acc += diff * diff;
    /* Advance both inputs past the group just consumed */
    pSrcA += 4;
    pSrcB += 4;
  }
  /* Samples that did not fill a group of 4 */
  remaining = (blockSize) & 3;
 #else
  /* No unrolling: the generic loop handles every sample */
  remaining = blockSize;
 #endif
  for (; remaining > 0U; remaining--)
  {
    diff = *pSrcA++ - *pSrcB++;
    acc += diff * diff;
  }
  /* Mean of the squared differences */
  *result = acc / blockSize;
 }
 #endif /* end of test for vector instruction availability */
 /**
  @} end of MSE group
 */
--- a/Source/StatisticsFunctions/arm_mse_f64.c
+++ b/Source/StatisticsFunctions/arm_mse_f64.c
@ -0,0 +1,110 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mse_f64.c
 * Description:  Double floating point mean square error
 *
 * $Date:        05 April 2022
 * $Revision:    V1.10.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "dsp/statistics_functions.h"
 /**
  @ingroup groupStats
 */
 /**
  @addtogroup MSE
  @{
 */
 /**
  @brief         Mean square error between two double floating point vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in input vector
  @param[out]    result      mean square error
  @return        none
 */
 /*
  * Double precision variant: sums (A[i] - B[i])^2 in sample order and
  * divides by blockSize. The mean is written to *result.
  */
 void arm_mse_f64(
    const float64_t * pSrcA,
    const float64_t * pSrcB,
    uint32_t    blockSize,
    float64_t * result)
 {
  float64_t acc = 0.0;                           /* Running sum of squared differences */
  float64_t diff;                                /* Current A[i] - B[i] */
  uint32_t remaining;                            /* Samples left for the generic loop */
 #if defined (ARM_MATH_LOOPUNROLL)
  uint32_t pairs;                                /* Number of full groups of 2 samples */
  for (pairs = (blockSize) >> 1; pairs > 0U; pairs--)
  {
    diff = pSrcA[0] - pSrcB[0];
    acc += diff * diff;
    diff = pSrcA[1] - pSrcB[1];
    acc += diff * diff;
    /* Advance both inputs past the pair just consumed */
    pSrcA += 2;
    pSrcB += 2;
  }
  /* At most one sample is left over */
  remaining = (blockSize) & 1;
 #else
  /* No unrolling: the generic loop handles every sample */
  remaining = blockSize;
 #endif
  for (; remaining > 0U; remaining--)
  {
    diff = *pSrcA++ - *pSrcB++;
    acc += diff * diff;
  }
  /* Mean of the squared differences */
  *result = acc / blockSize;
 }
 /**
  @} end of MSE group
 */
--- a/Source/StatisticsFunctions/arm_mse_q15.c
+++ b/Source/StatisticsFunctions/arm_mse_q15.c
@ -0,0 +1,175 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mse_q15.c
 * Description:  Mean square error between two Q15 vectors
 *
 * $Date:        04 April 2022
 * $Revision:    V1.10.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "dsp/statistics_functions.h"
 /**
  @ingroup groupStats
 */
 /**
  @addtogroup MSE
  @{
 */
 /**
  @brief         Mean square error between two Q15 vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize   number of samples in input vector
  @param[out]    pResult     mean square error
  @return        none
 */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 /*
  * Helium (MVE) Q15 variant.
  *
  * Each input is shifted right by one bit, the two are subtracted with
  * saturation, and the lane products of the difference with itself are
  * accumulated in a 64-bit sum. The sum is divided by blockSize, shifted right
  * by 13 and saturated to 16 bits. An empty input (blockSize == 0) yields 0
  * instead of dividing by zero.
  */
 void arm_mse_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t blockSize,
        q15_t * pResult)
 {
    uint32_t  blkCnt;           /* loop counters */
    q15x8_t vecSrcA,vecSrcB;
    q63_t   sum = 0LL;
    /* Nothing to average: avoid the integer division by zero below */
    if (blockSize == 0U)
    {
        *pResult = 0;
        return;
    }
    /* Main loop: 8 samples per iteration */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vecSrcA = vshrq(vecSrcA,1);
        vecSrcB = vshrq(vecSrcB,1);
        vecSrcA = vqsubq(vecSrcA,vecSrcB);
        /*
         * sum lanes
         */
        sum = vmlaldavaq(sum, vecSrcA, vecSrcA);
        blkCnt--;
        pSrcA += 8;
        pSrcB += 8;
    }
    /*
     * tail
     */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);
        /*
         * Predicated (zeroing) loads: only the active lanes are fetched, so
         * nothing is read beyond the end of the input buffers.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vecSrcA = vshrq(vecSrcA,1);
        vecSrcB = vshrq(vecSrcB,1);
        vecSrcA = vqsubq(vecSrcA,vecSrcB);
        sum = vmlaldavaq_p(sum, vecSrcA, vecSrcA, p0);
    }
    *pResult = (q15_t) __SSAT((q31_t) (sum / blockSize)>>13, 16);
 }
 #else
 /*
  * Scalar Q15 variant.
  *
  * Each input sample is shifted right by one bit, the difference is saturated
  * to 16 bits, squared and accumulated in a 64-bit sum. The sum is divided by
  * blockSize, shifted right by 13 and saturated to 16 bits. An empty input
  * (blockSize == 0) yields 0 instead of dividing by zero.
  */
 void arm_mse_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        uint32_t blockSize,
        q15_t * pResult)
 {
        uint32_t blkCnt;                               /* Loop counter */
        q63_t sum = 0;                                 /* Temporary result storage */
        q15_t inA,inB;                                       /* Temporary variable to store input value */
  /* Nothing to average: avoid the integer division by zero below */
  if (blockSize == 0U)
  {
    *pResult = 0;
    return;
  }
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
  while (blkCnt > 0U)
  {
    inA = *pSrcA++ >> 1;
    inB = *pSrcB++ >> 1;
    inA = (q15_t) __SSAT(((q31_t) inA - (q31_t)inB), 16);
    sum += (q63_t)((q31_t) inA * inA);
    inA = *pSrcA++ >> 1;
    inB = *pSrcB++ >> 1;
    inA = (q15_t) __SSAT(((q31_t) inA - (q31_t)inB), 16);
    sum += (q63_t)((q31_t) inA * inA);
    inA = *pSrcA++ >> 1;
    inB = *pSrcB++ >> 1;
    inA = (q15_t) __SSAT(((q31_t) inA - (q31_t)inB), 16);
    sum += (q63_t)((q31_t) inA * inA);
    inA = *pSrcA++ >> 1;
    inB = *pSrcB++ >> 1;
    inA = (q15_t) __SSAT(((q31_t) inA - (q31_t)inB), 16);
    sum += (q63_t)((q31_t) inA * inA);
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;
 #else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  while (blkCnt > 0U)
  {
    inA = *pSrcA++ >> 1;
    inB = *pSrcB++ >> 1;
    inA = (q15_t) __SSAT(((q31_t) inA - (q31_t)inB), 16);
    sum += (q63_t)((q31_t) inA * inA);
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Store result in q15 format */
  *pResult = (q15_t) __SSAT((q31_t) (sum / blockSize)>>13, 16);
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of MSE group
 */
--- a/Source/StatisticsFunctions/arm_mse_q31.c
+++ b/Source/StatisticsFunctions/arm_mse_q31.c
@ -0,0 +1,176 @@
 /* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mse_q31.c
 * Description:  Mean square error between two Q31 vectors
 *
 * $Date:        04 April 2022
 * $Revision:    V1.10.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
 /*
 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "dsp/statistics_functions.h"
 /**
  @ingroup groupStats
 */
 /**
  @addtogroup MSE
  @{
 */
 /**
  @brief         Mean square error between two Q31 vectors.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     blockSize  number of samples in input vector
  @param[out]    pResult    mean square error
  @return        none
 */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 /*
  * Helium (MVE) Q31 variant.
  *
  * Each input is shifted right by one bit, the two are subtracted with
  * saturation, and the lane products of the difference with itself are
  * accumulated in a 64-bit sum. The sum is divided by blockSize and shifted
  * right by 21. An empty input (blockSize == 0) yields 0 instead of dividing
  * by zero.
  */
 void arm_mse_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t blockSize,
        q31_t * pResult)
 {
    uint32_t  blkCnt;           /* loop counters */
    q31x4_t vecSrcA,vecSrcB;
    q63_t   sum = 0LL;
    /* Nothing to average: avoid the integer division by zero below */
    if (blockSize == 0U)
    {
        *pResult = 0;
        return;
    }
   /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vecSrcA = vshrq(vecSrcA,1);
        vecSrcB = vshrq(vecSrcB,1);
        vecSrcA = vqsubq(vecSrcA,vecSrcB);
        /*
         * sum lanes
         */
        sum = vrmlaldavhaq(sum, vecSrcA, vecSrcA);
        blkCnt--;
        pSrcA += 4;
        pSrcB += 4;
    }
    /*
     * tail
     */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /*
         * Predicated (zeroing) loads: only the active lanes are fetched, so
         * nothing is read beyond the end of the input buffers.
         */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vecSrcA = vshrq(vecSrcA,1);
        vecSrcB = vshrq(vecSrcB,1);
        vecSrcA = vqsubq(vecSrcA,vecSrcB);
        sum = vrmlaldavhaq_p(sum, vecSrcA, vecSrcA, p0);
    }
    *pResult = (q31_t) ((sum / blockSize)>>21);
 }
 #else
 /*
  * Scalar Q31 variant.
  *
  * Each input sample is shifted right by one bit, the two are subtracted with
  * saturation (__QSUB), and the 64-bit square of the difference is shifted
  * right by 14 before being accumulated. The sum is divided by blockSize and
  * shifted right by 15. An empty input (blockSize == 0) yields 0 instead of
  * dividing by zero.
  */
 void arm_mse_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        uint32_t blockSize,
        q31_t * pResult)
 {
        uint32_t blkCnt;                               /* Loop counter */
        q63_t sum = 0;                                 /* Temporary result storage */
        q31_t inA32,inB32;                                    /* Temporary variables to store input values */
  /* Nothing to average: avoid the integer division by zero below */
  if (blockSize == 0U)
  {
    *pResult = 0;
    return;
  }
 #if defined (ARM_MATH_LOOPUNROLL)
  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
  while (blkCnt > 0U)
  {
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Loop unrolling: Compute remaining outputs */
  blkCnt = blockSize % 0x4U;
 #else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  while (blkCnt > 0U)
  {
    inA32 = *pSrcA++ >> 1;
    inB32 = *pSrcB++ >> 1;
    inA32 = __QSUB(inA32, inB32);
    sum += ((q63_t) inA32 * inA32) >> 14U;
    /* Decrement loop counter */
    blkCnt--;
  }
  /* Store result in q31 format */
  *pResult = (q31_t) ((sum / blockSize)>>15);
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of MSE group
 */
--- a/Source/StatisticsFunctions/arm_mse_q7.c
+++ b/Source/StatisticsFunctions/arm_mse_q7.c
@ -33,14 +33,14 @@
 */
 /**
-  @defgroup mse Mean Square Error
+  @defgroup MSE Mean Square Error
  Calculates the mean square error between two vectors.
 */
 /**
-  @addtogroup mse
+  @addtogroup MSE
  @{
 */
@ -51,8 +51,7 @@
  @param[in]     blockSize   number of samples in input vector
  @param[out]    pResult     mean square error
  @return        none
-
+ */
 */
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_mse_q7(
  const q7_t * pSrcA,
@ -63,14 +62,16 @@ void arm_mse_q7(
    uint32_t  blkCnt;           /* loop counters */
    q7x16_t vecSrcA,vecSrcB;
    q31_t   sum = 0LL;
    q7_t inA,inB;
   /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;
    while (blkCnt > 0U)
    {
-        vecSrcA = vldrbq_s8(pSrcA);
+        vecSrcA = vld1q(pSrcA);
-        vecSrcB = vldrbq_s8(pSrcB);
+        vecSrcB = vld1q(pSrcB);
        vecSrcA = vshrq(vecSrcA,1);
        vecSrcB = vshrq(vecSrcB,1);
        vecSrcA = vqsubq(vecSrcA,vecSrcB);
        /*
@ -87,23 +88,21 @@ void arm_mse_q7(
     * tail
     */
    blkCnt = blockSize & 0xF;
-    while (blkCnt > 0U)
+    if (blkCnt > 0U)
    {
-       /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+        mve_pred16_t p0 = vctp8q(blkCnt);
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
-       /* Compute Power and store result in a temporary variable, sum. */
+        vecSrcA = vshrq(vecSrcA,1);
-       inA = *pSrcA++;
+        vecSrcB = vshrq(vecSrcB,1);
       inB = *pSrcB++;
-       inA = (q7_t) __SSAT((q15_t) inA - (q15_t)inB, 8);
+        vecSrcA = vqsubq(vecSrcA,vecSrcB);
       sum += ((q15_t) inA * inA);
-       /* Decrement loop counter */
+        sum = vmladavaq_p(sum, vecSrcA, vecSrcA, p0);
       blkCnt--;
    }
-    *pResult = (q7_t) __SSAT((q15_t) (sum / blockSize)>>7, 8);
+    *pResult = (q7_t) __SSAT((q15_t) (sum / blockSize)>>5, 8);
 }
 #else
 void arm_mse_q7(
@ -116,10 +115,6 @@ void arm_mse_q7(
        q31_t sum = 0;                                 /* Temporary result storage */
        q7_t inA,inB;                                       /* Temporary variable to store input value */
 #if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
        q31_t inA32,inB32;                                    /* Temporary variable to store packed input value */
        q31_t in1, in2;                                /* Temporary variables to store input value */
 #endif
 #if defined (ARM_MATH_LOOPUNROLL)
@ -128,42 +123,25 @@ void arm_mse_q7(
  while (blkCnt > 0U)
  {
-    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+    inA = *pSrcA++ >> 1;
-
+    inB = *pSrcB++ >> 1;
    /* Compute Power and store result in a temporary variable, sum. */
 #if defined (ARM_MATH_DSP)
    inA32 = read_q7x4_ia ((q7_t **) &pSrcA);
    inB32 = read_q7x4_ia ((q7_t **) &pSrcB);
    inA32 = __QSUB8(inA32, inB32);
    in1 = __SXTB16(__ROR(inA32, 8));
    in2 = __SXTB16(inA32);
    /* calculate power and accumulate to accumulator */
    sum = __SMLAD(in1, in1, sum);
    sum = __SMLAD(in2, in2, sum);
 #else
    inA = *pSrcA++;
    inB = *pSrcB++;
    inA = (q7_t) __SSAT((q15_t) inA - (q15_t)inB, 8);
    sum += ((q15_t) inA * inA);
-    inA = *pSrcA++;
+    inA = *pSrcA++ >> 1;
-    inB = *pSrcB++;
+    inB = *pSrcB++ >> 1;
    inA = (q7_t) __SSAT((q15_t) inA - (q15_t)inB, 8);
    sum += ((q15_t) inA * inA);
-    inA = *pSrcA++;
+    inA = *pSrcA++ >> 1;
-    inB = *pSrcB++;
+    inB = *pSrcB++ >> 1;
    inA = (q7_t) __SSAT((q15_t) inA - (q15_t)inB, 8);
    sum += ((q15_t) inA * inA);
-    inA = *pSrcA++;
+    inA = *pSrcA++ >> 1;
-    inB = *pSrcB++;
+    inB = *pSrcB++ >> 1;
    inA = (q7_t) __SSAT((q15_t) inA - (q15_t)inB, 8);
    sum += ((q15_t) inA * inA);
 #endif /* #if defined (ARM_MATH_DSP) */
    /* Decrement loop counter */
    blkCnt--;
@ -181,11 +159,8 @@ void arm_mse_q7(
  while (blkCnt > 0U)
  {
-    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+    inA = *pSrcA++ >> 1;
-
+    inB = *pSrcB++ >> 1;
    /* Compute Power and store result in a temporary variable, sum. */
    inA = *pSrcA++;
    inB = *pSrcB++;
    inA = (q7_t) __SSAT((q15_t) inA - (q15_t)inB, 8);
    sum += ((q15_t) inA * inA);
@ -195,10 +170,10 @@ void arm_mse_q7(
  }
  /* Store result in q7 format */
-  *pResult = (q7_t) __SSAT((q15_t) (sum / blockSize)>>7, 8);;
+  *pResult = (q7_t) __SSAT((q15_t) (sum / blockSize)>>5, 8);;
 }
 #endif /* defined(ARM_MATH_MVEI) */
 /**
-  @} end of power group
+  @} end of MSE group
 */
--- a/Testing/PatternGeneration/Stats.py
+++ b/Testing/PatternGeneration/Stats.py
@ -477,7 +477,8 @@ def writeTests(config,nb,format):
 # So new tests have to be added after existing ones
 def writeNewsTests(config,nb,format):
    NBSAMPLES = 300
-    #config.setOverwrite(True)
+    if format==Tools.F16:
       config.setOverwrite(True)
    data1=np.random.randn(NBSAMPLES)
    data1 = Tools.normalize(data1)
@ -491,7 +492,7 @@ def writeNewsTests(config,nb,format):
    config.writeInput(2, data2,"InputNew")
    nb=generateOperatorTests(config,nb,format,data1,data2,mseTest,"MSEVals")
-    #config.setOverwrite(False)
+    config.setOverwrite(False)
 def generateBenchmark(config,format):
--- a/Testing/Patterns/DSP/Stats/StatsF16/AbsMaxIndexes26_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/AbsMaxIndexes26_s16.txt
@ -1,8 +1,8 @@
 H
 3
-// 4
+// 6
-0x0004
+0x0006
-// 4
+// 6
-0x0004
+0x0006
-// 4
+// 18
-0x0004
+0x0012
--- a/Testing/Patterns/DSP/Stats/StatsF16/AbsMaxVals26_f16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/AbsMaxVals26_f16.txt
@ -1,8 +1,8 @@
 H
 3
-// 0.423138
+// 0.640755
-0x36c5
+0x3920
-// 0.423138
+// 0.640755
-0x36c5
+0x3920
-// 0.423138
+// 0.887109
-0x36c5
+0x3b19
--- a/Testing/Patterns/DSP/Stats/StatsF16/AbsMinIndexes27_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/AbsMinIndexes27_s16.txt
@ -2,7 +2,7 @@ H
 3
 // 0
 0x0000
-// 15
+// 7
-0x000F
+0x0007
-// 15
+// 19
-0x000F
+0x0013
--- a/Testing/Patterns/DSP/Stats/StatsF16/AbsMinVals27_f16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/AbsMinVals27_f16.txt
@ -1,8 +1,8 @@
 H
 3
-// 0.027578
+// 0.107198
-0x270f
+0x2edc
-// 0.007974
+// 0.021092
-0x2015
+0x2566
-// 0.007974
+// 0.002011
-0x2015
+0x181e
--- a/Testing/Patterns/DSP/Stats/StatsF16/InputNew1_f16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/InputNew1_f16.txt
--- a/Testing/Patterns/DSP/Stats/StatsF16/InputNew2_f16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/InputNew2_f16.txt
--- a/Testing/Patterns/DSP/Stats/StatsF16/MSEVals28_f16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF16/MSEVals28_f16.txt
@ -1,10 +1,10 @@
 H
 4
-// 0.038705
+// 0.211855
-0x28f4
+0x32c8
-// 0.092517
+// 0.182973
-0x2dec
+0x31db
-// 0.106867
+// 0.268630
-0x2ed7
+0x344c
-// 0.225679
+// 0.234421
-0x3339
+0x3380
--- a/Testing/Patterns/DSP/Stats/StatsF32/AbsMaxIndexes26_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/AbsMaxIndexes26_s16.txt
@ -1,8 +1,8 @@
 H
 3
-// 1
+// 0
-0x0001
+0x0000
-// 1
+// 7
-0x0001
+0x0007
-// 1
+// 7
-0x0001
+0x0007
--- a/Testing/Patterns/DSP/Stats/StatsF32/AbsMaxVals26_f32.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/AbsMaxVals26_f32.txt
@ -1,8 +1,8 @@
 W
 3
-// 0.476185
+// 0.725166
-0x3ef3ce78
+0x3f39a47a
-// 0.476185
+// 0.817687
-0x3ef3ce78
+0x3f5153ed
-// 0.476185
+// 0.817687
-0x3ef3ce78
+0x3f5153ed
--- a/Testing/Patterns/DSP/Stats/StatsF32/AbsMinIndexes27_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/AbsMinIndexes27_s16.txt
@ -1,8 +1,8 @@
 H
 3
-// 0
+// 1
-0x0000
+0x0001
-// 7
+// 5
-0x0007
+0x0005
-// 7
+// 9
-0x0007
+0x0009
--- a/Testing/Patterns/DSP/Stats/StatsF32/AbsMinVals27_f32.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/AbsMinVals27_f32.txt
@ -1,8 +1,8 @@
 W
 3
-// 0.184919
+// 0.198876
-0x3e3d5b69
+0x3e4ba63c
-// 0.008792
+// 0.035481
-0x3c100d1c
+0x3d1154a3
-// 0.008792
+// 0.034200
-0x3c100d1c
+0x3d0c1510
--- a/Testing/Patterns/DSP/Stats/StatsF32/InputNew1_f32.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/InputNew1_f32.txt
--- a/Testing/Patterns/DSP/Stats/StatsF32/InputNew2_f32.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/InputNew2_f32.txt
--- a/Testing/Patterns/DSP/Stats/StatsF32/MSEVals28_f32.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF32/MSEVals28_f32.txt
@ -1,10 +1,10 @@
 W
 4
-// 0.125231
+// 0.072747
-0x3e003c73
+0x3d94fc3e
-// 0.122919
+// 0.176808
-0x3dfbbceb
+0x3e350d0d
-// 0.145740
+// 0.207669
-0x3e153cd2
+0x3e54a726
-// 0.189820
+// 0.183645
-0x3e426031
+0x3e3c0d87
--- a/Testing/Patterns/DSP/Stats/StatsF64/AbsMaxIndexes26_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/AbsMaxIndexes26_s16.txt
@ -2,7 +2,7 @@ H
 3
 // 1
 0x0001
-// 2
+// 3
-0x0002
+0x0003
-// 2
+// 3
-0x0002
+0x0003
--- a/Testing/Patterns/DSP/Stats/StatsF64/AbsMaxVals26_f64.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/AbsMaxVals26_f64.txt
@ -1,8 +1,8 @@
 D
 3
-// 0.203055
+// 0.579795
-0x3fc9fdb6e0c81ee0
+0x3fe28dad67519d3d
-// 0.360222
+// 0.783610
-0x3fd70de0df777efb
+0x3fe91356237f16f6
-// 0.360222
+// 0.783610
-0x3fd70de0df777efb
+0x3fe91356237f16f6
--- a/Testing/Patterns/DSP/Stats/StatsF64/AbsMinIndexes27_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/AbsMinIndexes27_s16.txt
@ -4,5 +4,5 @@ H
 0x0000
 // 0
 0x0000
-// 0
+// 4
-0x0000
+0x0004
--- a/Testing/Patterns/DSP/Stats/StatsF64/AbsMinVals27_f64.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/AbsMinVals27_f64.txt
@ -1,8 +1,8 @@
 D
 3
-// 0.003692
+// 0.310923
-0x3f6e3f80ef9e8a83
+0x3fd3e6286ed8195c
-// 0.003692
+// 0.310923
-0x3f6e3f80ef9e8a83
+0x3fd3e6286ed8195c
-// 0.003692
+// 0.150640
-0x3f6e3f80ef9e8a83
+0x3fc34828d25e0053
--- a/Testing/Patterns/DSP/Stats/StatsF64/InputNew1_f64.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/InputNew1_f64.txt
--- a/Testing/Patterns/DSP/Stats/StatsF64/InputNew2_f64.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/InputNew2_f64.txt
--- a/Testing/Patterns/DSP/Stats/StatsF64/MSEVals28_f64.txt
+++ b/Testing/Patterns/DSP/Stats/StatsF64/MSEVals28_f64.txt
@ -1,10 +1,10 @@
 D
 4
-// 0.001072
+// 0.221944
-0x3f518f8a7ed015a2
+0x3fcc68ab519cbb08
-// 0.073015
+// 0.487606
-0x3fb2b11b5caa023a
+0x3fdf34ef9e2840ea
-// 0.060567
+// 0.411797
-0x3faf02a5beb935ad
+0x3fda5ae1181a5066
-// 0.198414
+// 0.186577
-0x3fc9659ffa60ff3b
+0x3fc7e1bdbcffc958
--- a/Testing/Patterns/DSP/Stats/StatsQ15/AbsMaxIndexes8_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/AbsMaxIndexes8_s16.txt
@ -1,8 +1,8 @@
 H
 3
-// 4
+// 5
-0x0004
+0x0005
-// 4
+// 15
-0x0004
+0x000F
-// 18
+// 15
-0x0012
+0x000F
--- a/Testing/Patterns/DSP/Stats/StatsQ15/AbsMaxVals8_q15.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/AbsMaxVals8_q15.txt
@ -1,8 +1,8 @@
 H
 3
-// 0.540886
+// 0.511444
-0x453C
+0x4177
-// 0.540886
+// 0.572485
-0x453C
+0x4947
-// 0.701466
+// 0.572485
-0x59CA
+0x4947
--- a/Testing/Patterns/DSP/Stats/StatsQ15/AbsMinIndexes9_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/AbsMinIndexes9_s16.txt
@ -1,8 +1,8 @@
 H
 3
-// 6
+// 1
-0x0006
+0x0001
-// 6
+// 1
-0x0006
+0x0001
-// 6
+// 1
-0x0006
+0x0001
--- a/Testing/Patterns/DSP/Stats/StatsQ15/AbsMinVals9_q15.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/AbsMinVals9_q15.txt
@ -1,8 +1,8 @@
 H
 3
-// 0.003012
+// 0.065882
-0x0063
+0x086F
-// 0.003012
+// 0.065882
-0x0063
+0x086F
-// 0.003012
+// 0.065882
-0x0063
+0x086F
--- a/Testing/Patterns/DSP/Stats/StatsQ15/InputNew1_q15.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/InputNew1_q15.txt
--- a/Testing/Patterns/DSP/Stats/StatsQ15/InputNew2_q15.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/InputNew2_q15.txt
--- a/Testing/Patterns/DSP/Stats/StatsQ15/MSEVals10_q15.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ15/MSEVals10_q15.txt
@ -1,10 +1,10 @@
 H
 4
-// 0.291384
+// 0.123046
-0x254C
+0x0FC0
-// 0.326840
+// 0.134261
-0x29D6
+0x112F
-// 0.266990
+// 0.135165
-0x222D
+0x114D
-// 0.278624
+// 0.237464
-0x23AA
+0x1E65
--- a/Testing/Patterns/DSP/Stats/StatsQ31/AbsMaxIndexes8_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/AbsMaxIndexes8_s16.txt
@ -1,8 +1,8 @@
 H
 3
-// 1
+// 2
-0x0001
+0x0002
-// 3
+// 7
-0x0003
+0x0007
-// 8
+// 7
-0x0008
+0x0007
--- a/Testing/Patterns/DSP/Stats/StatsQ31/AbsMaxVals8_q31.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/AbsMaxVals8_q31.txt
@ -1,8 +1,8 @@
 W
 3
-// 0.352374
+// 0.254671
-0x2D1A96B5
+0x20990B68
-// 0.530170
+// 0.516980
-0x43DC9BE7
+0x422C699D
-// 0.634745
+// 0.516980
-0x513F5458
+0x422C699D
--- a/Testing/Patterns/DSP/Stats/StatsQ31/AbsMinIndexes9_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/AbsMinIndexes9_s16.txt
@ -2,7 +2,7 @@ H
 3
 // 0
 0x0000
-// 7
+// 4
-0x0007
+0x0004
-// 7
+// 4
-0x0007
+0x0004
--- a/Testing/Patterns/DSP/Stats/StatsQ31/AbsMinVals9_q31.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/AbsMinVals9_q31.txt
@ -1,8 +1,8 @@
 W
 3
-// 0.132805
+// 0.053227
-0x10FFBE95
+0x06D0231F
-// 0.003898
+// 0.003305
-0x007FB95F
+0x006C4DD3
-// 0.003898
+// 0.003305
-0x007FB95F
+0x006C4DD3
--- a/Testing/Patterns/DSP/Stats/StatsQ31/InputNew1_q31.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/InputNew1_q31.txt
--- a/Testing/Patterns/DSP/Stats/StatsQ31/InputNew2_q31.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/InputNew2_q31.txt
--- a/Testing/Patterns/DSP/Stats/StatsQ31/MSEVals10_q31.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ31/MSEVals10_q31.txt
@ -1,10 +1,10 @@
 W
 4
-// 0.066580
+// 0.153783
-0x0885AD96
+0x13AF2B40
-// 0.089078
+// 0.209919
-0x0B66E9B3
+0x1ADE9F11
-// 0.168307
+// 0.155268
-0x158B15E2
+0x13DFD01C
-// 0.196400
+// 0.248101
-0x19239FC7
+0x1FC1C512
--- a/Testing/Patterns/DSP/Stats/StatsQ7/AbsMaxIndexes8_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/AbsMaxIndexes8_s16.txt
@ -1,10 +1,10 @@
 H
 4
-// 7
+// 1
-0x0007
+0x0001
-// 7
+// 25
-0x0007
+0x0019
-// 46
+// 25
-0x002E
+0x0019
 // 279
 0x0117
--- a/Testing/Patterns/DSP/Stats/StatsQ7/AbsMaxVals8_q7.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/AbsMaxVals8_q7.txt
@ -1,10 +1,10 @@
 B
 4
-// 0.807620
+// 0.619484
 0x4F
 // 0.802797
 0x67
-// 0.807620
+// 0.802797
 0x67
 // 0.984827
 0x7E
 // 0.900000
 0x73
--- a/Testing/Patterns/DSP/Stats/StatsQ7/AbsMinIndexes9_s16.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/AbsMinIndexes9_s16.txt
@ -1,10 +1,10 @@
 H
 4
-// 13
+// 8
-0x000D
+0x0008
-// 13
+// 18
-0x000D
+0x0012
-// 13
+// 18
-0x000D
+0x0012
 // 279
 0x0117
--- a/Testing/Patterns/DSP/Stats/StatsQ7/AbsMinVals9_q7.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/AbsMinVals9_q7.txt
@ -1,10 +1,10 @@
 B
 4
-// 0.008109
+// 0.008779
 0x01
 // 0.008109
 0x01
 // 0.008109
 0x01
 // 0.000193
 0x00
 // 0.000193
 0x00
 // 0.000000
 0x00
--- a/Testing/Patterns/DSP/Stats/StatsQ7/InputNew1_q7.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/InputNew1_q7.txt
--- a/Testing/Patterns/DSP/Stats/StatsQ7/InputNew2_q7.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/InputNew2_q7.txt
--- a/Testing/Patterns/DSP/Stats/StatsQ7/MSEVals10_q7.txt
+++ b/Testing/Patterns/DSP/Stats/StatsQ7/MSEVals10_q7.txt
@ -1,10 +1,10 @@
 B
 4
-// 0.191272
+// 0.092336
-0x18
+0x0C
-// 0.159547
+// 0.121537
-0x14
+0x10
-// 0.205092
+// 0.162974
-0x1A
+0x15
-// 0.257902
+// 0.148534
-0x21
+0x13
--- a/Testing/Source/Tests/StatsTestsF16.cpp
+++ b/Testing/Source/Tests/StatsTestsF16.cpp
@ -441,7 +441,28 @@ a double precision computation.
      ASSERT_REL_ERROR(ref,output,REL_ERROR);
    } 
    void StatsTestsF16::test_mse_f16()
    {
        const float16_t *inpA  = inputA.ptr();
        const float16_t *inpB  = inputB.ptr();
        float16_t result;
        float16_t *refp  = ref.ptr();
        float16_t *outp  = output.ptr();
        arm_mse_f16(inpA,inpB,
              inputA.nbSamples(),
              &result);
        outp[0] = result;
        ASSERT_SNR(result,refp[this->refOffset],(float16_t)SNR_THRESHOLD);
        ASSERT_REL_ERROR(result,refp[this->refOffset],(float16_t)REL_ERROR);
    }
    void StatsTestsF16::setUp(Testing::testID_t id,std::vector<Testing::param_t>& paramsArgs,Client::PatternMgr *mgr)
    {
@ -1032,6 +1053,58 @@ a double precision computation.
               refOffset = 2;
            }
            break;
             case StatsTestsF16::TEST_MSE_F16_49:
            {
               inputA.reload(StatsTestsF16::INPUTNEW1_F16_ID,mgr,7);
               inputB.reload(StatsTestsF16::INPUTNEW2_F16_ID,mgr,7);
               ref.reload(StatsTestsF16::MSE_F16_ID,mgr);
               output.create(1,StatsTestsF16::OUT_F16_ID,mgr);
               refOffset = 0;
            }
            break;
            case StatsTestsF16::TEST_MSE_F16_50:
            {
               inputA.reload(StatsTestsF16::INPUTNEW1_F16_ID,mgr,16);
               inputB.reload(StatsTestsF16::INPUTNEW2_F16_ID,mgr,16);
               ref.reload(StatsTestsF16::MSE_F16_ID,mgr);
               output.create(1,StatsTestsF16::OUT_F16_ID,mgr);
               refOffset = 1;
            }
            break;
            case StatsTestsF16::TEST_MSE_F16_51:
            {
               inputA.reload(StatsTestsF16::INPUTNEW1_F16_ID,mgr,23);
               inputB.reload(StatsTestsF16::INPUTNEW2_F16_ID,mgr,23);
               ref.reload(StatsTestsF16::MSE_F16_ID,mgr);
               output.create(1,StatsTestsF16::OUT_F16_ID,mgr);
               refOffset = 2;
            }
            break;
            case StatsTestsF16::TEST_MSE_F16_52:
            {
               inputA.reload(StatsTestsF16::INPUTNEW1_F16_ID,mgr,100);
               inputB.reload(StatsTestsF16::INPUTNEW2_F16_ID,mgr,100);
               ref.reload(StatsTestsF16::MSE_F16_ID,mgr);
               output.create(1,StatsTestsF16::OUT_F16_ID,mgr);
               refOffset = 3;
            }
            break;
        }
    }
--- a/Testing/Source/Tests/StatsTestsF32.cpp
+++ b/Testing/Source/Tests/StatsTestsF32.cpp
@ -436,6 +436,29 @@ a double precision computation.
    } 
    void StatsTestsF32::test_mse_f32()
    {
        const float32_t *inpA  = inputA.ptr();
        const float32_t *inpB  = inputB.ptr();
        float32_t result;
        float32_t *refp  = ref.ptr();
        float32_t *outp  = output.ptr();
        arm_mse_f32(inpA,inpB,
              inputA.nbSamples(),
              &result);
        outp[0] = result;
        ASSERT_SNR(result,refp[this->refOffset],(float32_t)SNR_THRESHOLD);
        ASSERT_REL_ERROR(result,refp[this->refOffset],(float32_t)REL_ERROR);
    }
    void StatsTestsF32::setUp(Testing::testID_t id,std::vector<Testing::param_t>& paramsArgs,Client::PatternMgr *mgr)
    {
@ -1027,6 +1050,58 @@ a double precision computation.
            }
            break;
            case StatsTestsF32::TEST_MSE_F32_49:
            {
               inputA.reload(StatsTestsF32::INPUTNEW1_F32_ID,mgr,3);
               inputB.reload(StatsTestsF32::INPUTNEW2_F32_ID,mgr,3);
               ref.reload(StatsTestsF32::MSE_F32_ID,mgr);
               output.create(1,StatsTestsF32::OUT_F32_ID,mgr);
               refOffset = 0;
            }
            break;
            case StatsTestsF32::TEST_MSE_F32_50:
            {
               inputA.reload(StatsTestsF32::INPUTNEW1_F32_ID,mgr,8);
               inputB.reload(StatsTestsF32::INPUTNEW2_F32_ID,mgr,8);
               ref.reload(StatsTestsF32::MSE_F32_ID,mgr);
               output.create(1,StatsTestsF32::OUT_F32_ID,mgr);
               refOffset = 1;
            }
            break;
            case StatsTestsF32::TEST_MSE_F32_51:
            {
               inputA.reload(StatsTestsF32::INPUTNEW1_F32_ID,mgr,11);
               inputB.reload(StatsTestsF32::INPUTNEW2_F32_ID,mgr,11);
               ref.reload(StatsTestsF32::MSE_F32_ID,mgr);
               output.create(1,StatsTestsF32::OUT_F32_ID,mgr);
               refOffset = 2;
            }
            break;
            case StatsTestsF32::TEST_MSE_F32_52:
            {
               inputA.reload(StatsTestsF32::INPUTNEW1_F32_ID,mgr,100);
               inputB.reload(StatsTestsF32::INPUTNEW2_F32_ID,mgr,100);
               ref.reload(StatsTestsF32::MSE_F32_ID,mgr);
               output.create(1,StatsTestsF32::OUT_F32_ID,mgr);
               refOffset = 3;
            }
            break;
        }
--- a/Testing/Source/Tests/StatsTestsF64.cpp
+++ b/Testing/Source/Tests/StatsTestsF64.cpp
@ -439,6 +439,29 @@ a double precision computation.
 */
    void StatsTestsF64::test_mse_f64()
    {
        const float64_t *inpA  = inputA.ptr();
        const float64_t *inpB  = inputB.ptr();
        float64_t result;
        float64_t *refp  = ref.ptr();
        float64_t *outp  = output.ptr();
        arm_mse_f64(inpA,inpB,
              inputA.nbSamples(),
              &result);
        outp[0] = result;
        ASSERT_SNR(result,refp[this->refOffset],(float64_t)SNR_THRESHOLD);
        ASSERT_REL_ERROR(result,refp[this->refOffset],(float64_t)REL_ERROR);
    }
    void StatsTestsF64::setUp(Testing::testID_t id,std::vector<Testing::param_t>& paramsArgs,Client::PatternMgr *mgr)
    {
        (void)paramsArgs;
@ -1030,6 +1053,58 @@ a double precision computation.
            }
            break;
            case StatsTestsF64::TEST_MSE_F64_49:
            {
               inputA.reload(StatsTestsF64::INPUTNEW1_F64_ID,mgr,2);
               inputB.reload(StatsTestsF64::INPUTNEW2_F64_ID,mgr,2);
               ref.reload(StatsTestsF64::MSE_F64_ID,mgr);
               output.create(1,StatsTestsF64::OUT_F64_ID,mgr);
               refOffset = 0;
            }
            break;
            case StatsTestsF64::TEST_MSE_F64_50:
            {
               inputA.reload(StatsTestsF64::INPUTNEW1_F64_ID,mgr,4);
               inputB.reload(StatsTestsF64::INPUTNEW2_F64_ID,mgr,4);
               ref.reload(StatsTestsF64::MSE_F64_ID,mgr);
               output.create(1,StatsTestsF64::OUT_F64_ID,mgr);
               refOffset = 1;
            }
            break;
            case StatsTestsF64::TEST_MSE_F64_51:
            {
               inputA.reload(StatsTestsF64::INPUTNEW1_F64_ID,mgr,5);
               inputB.reload(StatsTestsF64::INPUTNEW2_F64_ID,mgr,5);
               ref.reload(StatsTestsF64::MSE_F64_ID,mgr);
               output.create(1,StatsTestsF64::OUT_F64_ID,mgr);
               refOffset = 2;
            }
            break;
            case StatsTestsF64::TEST_MSE_F64_52:
            {
               inputA.reload(StatsTestsF64::INPUTNEW1_F64_ID,mgr,100);
               inputB.reload(StatsTestsF64::INPUTNEW2_F64_ID,mgr,100);
               ref.reload(StatsTestsF64::MSE_F64_ID,mgr);
               output.create(1,StatsTestsF64::OUT_F64_ID,mgr);
               refOffset = 3;
            }
            break;
        }
--- a/Testing/Source/Tests/StatsTestsQ15.cpp
+++ b/Testing/Source/Tests/StatsTestsQ15.cpp
@ -6,6 +6,8 @@
 //#include <cstdio>
 #define SNR_THRESHOLD 50
 #define SNR_THRESHOLD_MSE 50
 /* 
 Reference patterns are generated with
@ -13,6 +15,8 @@ a double precision computation.
 */
 #define ABS_ERROR_Q15 ((q15_t)100)
 #define ABS_ERROR_Q15_MSE ((q15_t)100)
 #define ABS_ERROR_Q63 (1<<17)
    void StatsTestsQ15::test_max_q15()
@ -310,6 +314,29 @@ a double precision computation.
    }
    void StatsTestsQ15::test_mse_q15()
    {
        const q15_t *inpA  = inputA.ptr();
        const q15_t *inpB  = inputB.ptr();
        q15_t result;
        q15_t *refp  = ref.ptr();
        q15_t *outp  = output.ptr();
        arm_mse_q15(inpA,inpB,
              inputA.nbSamples(),
              &result);
        outp[0] = result;
        ASSERT_SNR(result,refp[this->refOffset],(float32_t)SNR_THRESHOLD_MSE);
        ASSERT_NEAR_EQ(result,refp[this->refOffset],(q15_t)ABS_ERROR_Q15_MSE);
    }
    void StatsTestsQ15::setUp(Testing::testID_t id,std::vector<Testing::param_t>& paramsArgs,Client::PatternMgr *mgr)
    {
@ -808,6 +835,58 @@ a double precision computation.
            }
            break;
            case StatsTestsQ15::TEST_MSE_Q15_40:
            {
               inputA.reload(StatsTestsQ15::INPUTNEW1_Q15_ID,mgr,7);
               inputB.reload(StatsTestsQ15::INPUTNEW2_Q15_ID,mgr,7);
               ref.reload(StatsTestsQ15::MSE_Q15_ID,mgr);
               output.create(1,StatsTestsQ15::OUT_Q15_ID,mgr);
               refOffset = 0;
            }
            break;
            case StatsTestsQ15::TEST_MSE_Q15_41:
            {
               inputA.reload(StatsTestsQ15::INPUTNEW1_Q15_ID,mgr,16);
               inputB.reload(StatsTestsQ15::INPUTNEW2_Q15_ID,mgr,16);
               ref.reload(StatsTestsQ15::MSE_Q15_ID,mgr);
               output.create(1,StatsTestsQ15::OUT_Q15_ID,mgr);
               refOffset = 1;
            }
            break;
            case StatsTestsQ15::TEST_MSE_Q15_42:
            {
               inputA.reload(StatsTestsQ15::INPUTNEW1_Q15_ID,mgr,23);
               inputB.reload(StatsTestsQ15::INPUTNEW2_Q15_ID,mgr,23);
               ref.reload(StatsTestsQ15::MSE_Q15_ID,mgr);
               output.create(1,StatsTestsQ15::OUT_Q15_ID,mgr);
               refOffset = 2;
            }
            break;
            case StatsTestsQ15::TEST_MSE_Q15_43:
            {
               inputA.reload(StatsTestsQ15::INPUTNEW1_Q15_ID,mgr,100);
               inputB.reload(StatsTestsQ15::INPUTNEW2_Q15_ID,mgr,100);
               ref.reload(StatsTestsQ15::MSE_Q15_ID,mgr);
               output.create(1,StatsTestsQ15::OUT_Q15_ID,mgr);
               refOffset = 3;
            }
            break;
        }
--- a/Testing/Source/Tests/StatsTestsQ31.cpp
+++ b/Testing/Source/Tests/StatsTestsQ31.cpp
@ -6,6 +6,8 @@
 //#include <cstdio>
 #define SNR_THRESHOLD 100
 #define SNR_THRESHOLD_MSE 100
 /* 
 Reference patterns are generated with
@ -13,6 +15,8 @@ a double precision computation.
 */
 #define ABS_ERROR_Q31 ((q31_t)(100))
 #define ABS_ERROR_Q31_MSE ((q31_t)(100))
 #define ABS_ERROR_Q63 ((q63_t)(1<<18))
    void StatsTestsQ31::test_max_q31()
@ -309,6 +313,29 @@ a double precision computation.
    }
    void StatsTestsQ31::test_mse_q31()
    {
        const q31_t *inpA  = inputA.ptr();
        const q31_t *inpB  = inputB.ptr();
        q31_t result;
        q31_t *refp  = ref.ptr();
        q31_t *outp  = output.ptr();
        arm_mse_q31(inpA,inpB,
              inputA.nbSamples(),
              &result);
        outp[0] = result;
        ASSERT_SNR(result,refp[this->refOffset],(float32_t)SNR_THRESHOLD_MSE);
        ASSERT_NEAR_EQ(result,refp[this->refOffset],(q31_t)ABS_ERROR_Q31_MSE);
    }
    void StatsTestsQ31::setUp(Testing::testID_t id,std::vector<Testing::param_t>& paramsArgs,Client::PatternMgr *mgr)
@ -808,6 +835,58 @@ a double precision computation.
            }
            break;
            case StatsTestsQ31::TEST_MSE_Q31_40:
            {
               inputA.reload(StatsTestsQ31::INPUTNEW1_Q31_ID,mgr,3);
               inputB.reload(StatsTestsQ31::INPUTNEW2_Q31_ID,mgr,3);
               ref.reload(StatsTestsQ31::MSE_Q31_ID,mgr);
               output.create(1,StatsTestsQ31::OUT_Q31_ID,mgr);
               refOffset = 0;
            }
            break;
            case StatsTestsQ31::TEST_MSE_Q31_41:
            {
               inputA.reload(StatsTestsQ31::INPUTNEW1_Q31_ID,mgr,8);
               inputB.reload(StatsTestsQ31::INPUTNEW2_Q31_ID,mgr,8);
               ref.reload(StatsTestsQ31::MSE_Q31_ID,mgr);
               output.create(1,StatsTestsQ31::OUT_Q31_ID,mgr);
               refOffset = 1;
            }
            break;
            case StatsTestsQ31::TEST_MSE_Q31_42:
            {
               inputA.reload(StatsTestsQ31::INPUTNEW1_Q31_ID,mgr,11);
               inputB.reload(StatsTestsQ31::INPUTNEW2_Q31_ID,mgr,11);
               ref.reload(StatsTestsQ31::MSE_Q31_ID,mgr);
               output.create(1,StatsTestsQ31::OUT_Q31_ID,mgr);
               refOffset = 2;
            }
            break;
            case StatsTestsQ31::TEST_MSE_Q31_43:
            {
               inputA.reload(StatsTestsQ31::INPUTNEW1_Q31_ID,mgr,100);
               inputB.reload(StatsTestsQ31::INPUTNEW2_Q31_ID,mgr,100);
               ref.reload(StatsTestsQ31::MSE_Q31_ID,mgr);
               output.create(1,StatsTestsQ31::OUT_Q31_ID,mgr);
               refOffset = 3;
            }
            break;
        }
--- a/Testing/Source/Tests/StatsTestsQ7.cpp
+++ b/Testing/Source/Tests/StatsTestsQ7.cpp
@ -6,7 +6,7 @@
 //#include <cstdio>
 #define SNR_THRESHOLD 20
-#define SNR_THRESHOLD_MSE 14
+#define SNR_THRESHOLD_MSE 20
 /* 
--- a/Testing/desc.txt
+++ b/Testing/desc.txt
@ -16,6 +16,7 @@ group Root {
              Pattern INPUT1_F64_ID : Input1_f64.txt 
              Pattern INPUTNEW1_F64_ID : InputNew1_f64.txt 
              Pattern INPUTNEW2_F64_ID : InputNew2_f64.txt 
              Pattern INPUT2_F64_ID : Input2_f64.txt 
              Pattern MAXINDEXES_S16_ID : MaxIndexes1_s16.txt
@ -52,6 +53,8 @@ group Root {
              Pattern ABSMININDEXES_S16_ID : AbsMinIndexes27_s16.txt
              Pattern ABSMINVALS_F64_ID : AbsMinVals27_f64.txt
              Pattern MSE_F64_ID : MSEVals28_f64.txt
              Output  OUT_F64_ID : Output
              Output  OUT_S16_ID : Index
@ -122,6 +125,11 @@ group Root {
                Test nb=2n   arm_absmin_no_idx_f64:test_absmin_no_idx_f64
                Test nb=2n+1 arm_absmin_no_idx_f64:test_absmin_no_idx_f64
                Test nb=2    arm_mse_f64:test_mse_f64
                Test nb=2n   arm_mse_f64:test_mse_f64
                Test nb=2n+1 arm_mse_f64:test_mse_f64
                Test long    arm_mse_f64:test_mse_f64
              }
@ -133,6 +141,7 @@ group Root {
              Pattern INPUT1_F32_ID : Input1_f32.txt 
              Pattern INPUTNEW1_F32_ID : InputNew1_f32.txt 
              Pattern INPUTNEW2_F32_ID : InputNew2_f32.txt
              Pattern INPUT2_F32_ID : Input2_f32.txt 
              Pattern MAXINDEXES_S16_ID : MaxIndexes1_s16.txt
@ -169,6 +178,8 @@ group Root {
              Pattern ABSMININDEXES_S16_ID : AbsMinIndexes27_s16.txt
              Pattern ABSMINVALS_F32_ID : AbsMinVals27_f32.txt
              Pattern MSE_F32_ID : MSEVals28_f32.txt
              Output  OUT_F32_ID : Output
              Output  OUT_S16_ID : Index
@ -231,13 +242,18 @@ group Root {
                Test nb=4n   arm_min_no_idx_f32:test_min_no_idx_f32
                Test nb=4n+1 arm_min_no_idx_f32:test_min_no_idx_f32
-                Test nb=2    arm_absmax_no_idx_f32:test_absmax_no_idx_f32
+                Test nb=3    arm_absmax_no_idx_f32:test_absmax_no_idx_f32
-                Test nb=2n   arm_absmax_no_idx_f32:test_absmax_no_idx_f32
+                Test nb=4n   arm_absmax_no_idx_f32:test_absmax_no_idx_f32
-                Test nb=2n+1 arm_absmax_no_idx_f32:test_absmax_no_idx_f32
+                Test nb=4n+1 arm_absmax_no_idx_f32:test_absmax_no_idx_f32
                Test nb=3    arm_absmin_no_idx_f32:test_absmin_no_idx_f32
                Test nb=4n   arm_absmin_no_idx_f32:test_absmin_no_idx_f32
                Test nb=4n+1 arm_absmin_no_idx_f32:test_absmin_no_idx_f32
-                Test nb=2    arm_absmin_no_idx_f32:test_absmin_no_idx_f32
+                Test nb=3    arm_mse_f32:test_mse_f32
-                Test nb=2n   arm_absmin_no_idx_f32:test_absmin_no_idx_f32
+                Test nb=4n   arm_mse_f32:test_mse_f32
-                Test nb=2n+1 arm_absmin_no_idx_f32:test_absmin_no_idx_f32
+                Test nb=4n+1 arm_mse_f32:test_mse_f32
                Test long    arm_mse_f32:test_mse_f32
              }
@ -252,6 +268,7 @@ group Root {
              Pattern INPUT1_Q31_ID : Input1_q31.txt 
              Pattern INPUTNEW1_Q31_ID : InputNew1_q31.txt 
              Pattern INPUTNEW2_Q31_ID : InputNew2_q31.txt 
              Pattern INPUT2_Q31_ID : Input2_q31.txt 
              Pattern MAXINDEXES_S16_ID : MaxIndexes1_s16.txt
@ -270,6 +287,8 @@ group Root {
              Pattern ABSMININDEXES_S16_ID : AbsMinIndexes9_s16.txt
              Pattern ABSMINVALS_Q31_ID : AbsMinVals9_q31.txt
              Pattern MSE_Q31_ID : MSEVals10_q31.txt
              Output  OUT_Q31_ID : Output
              Output  OUT_Q63_ID : Output
              Output  OUT_S16_ID : Index
@ -320,13 +339,18 @@ group Root {
                Test nb=4n   arm_min_no_idx_q31:test_min_no_idx_q31
                Test nb=4n+1 arm_min_no_idx_q31:test_min_no_idx_q31
-                Test nb=2    arm_absmax_no_idx_q31:test_absmax_no_idx_q31
+                Test nb=3    arm_absmax_no_idx_q31:test_absmax_no_idx_q31
-                Test nb=2n   arm_absmax_no_idx_q31:test_absmax_no_idx_q31
+                Test nb=4n   arm_absmax_no_idx_q31:test_absmax_no_idx_q31
-                Test nb=2n+1 arm_absmax_no_idx_q31:test_absmax_no_idx_q31
+                Test nb=4n+1 arm_absmax_no_idx_q31:test_absmax_no_idx_q31
                Test nb=3    arm_absmin_no_idx_q31:test_absmin_no_idx_q31
                Test nb=4n   arm_absmin_no_idx_q31:test_absmin_no_idx_q31
                Test nb=4n+1 arm_absmin_no_idx_q31:test_absmin_no_idx_q31
-                Test nb=2    arm_absmin_no_idx_q31:test_absmin_no_idx_q31
+                Test nb=3    arm_mse_q31:test_mse_q31
-                Test nb=2n   arm_absmin_no_idx_q31:test_absmin_no_idx_q31
+                Test nb=4n   arm_mse_q31:test_mse_q31
-                Test nb=2n+1 arm_absmin_no_idx_q31:test_absmin_no_idx_q31
+                Test nb=4n+1 arm_mse_q31:test_mse_q31
                Test long    arm_mse_q31:test_mse_q31
              }
@ -338,6 +362,7 @@ group Root {
              Pattern INPUT1_Q15_ID : Input1_q15.txt 
              Pattern INPUTNEW1_Q15_ID : InputNew1_q15.txt
              Pattern INPUTNEW2_Q15_ID : InputNew2_q15.txt
              Pattern INPUT2_Q15_ID : Input2_q15.txt 
              Pattern MAXINDEXES_S16_ID : MaxIndexes1_s16.txt
@ -356,7 +381,7 @@ group Root {
              Pattern ABSMININDEXES_S16_ID : AbsMinIndexes9_s16.txt
              Pattern ABSMINVALS_Q15_ID : AbsMinVals9_q15.txt
-
+              Pattern MSE_Q15_ID : MSEVals10_q15.txt
              Output  OUT_Q15_ID : Output
              Output  OUT_Q63_ID : Output
@ -392,29 +417,34 @@ group Root {
                Test nb=8n   arm_var_q15:test_var_q15
                Test nb=8n+1 arm_var_q15:test_var_q15
-                Test nb=3    arm_absmax_q15:test_absmax_q15
+                Test nb=7    arm_absmax_q15:test_absmax_q15
-                Test nb=4n   arm_absmax_q15:test_absmax_q15
+                Test nb=8n   arm_absmax_q15:test_absmax_q15
-                Test nb=4n+1 arm_absmax_q15:test_absmax_q15
+                Test nb=8n+1 arm_absmax_q15:test_absmax_q15
                Test nb=7    arm_absmin_q15:test_absmin_q15
                Test nb=8n   arm_absmin_q15:test_absmin_q15
                Test nb=8n+1 arm_absmin_q15:test_absmin_q15
-                Test nb=3    arm_absmin_q15:test_absmin_q15
+                Test nb=7    arm_max_no_idx_q15:test_max_no_idx_q15
-                Test nb=4n   arm_absmin_q15:test_absmin_q15
+                Test nb=8n   arm_max_no_idx_q15:test_max_no_idx_q15
-                Test nb=4n+1 arm_absmin_q15:test_absmin_q15
+                Test nb=8n+1 arm_max_no_idx_q15:test_max_no_idx_q15
-                Test nb=3    arm_max_no_idx_q15:test_max_no_idx_q15
+                Test nb=7    arm_min_no_idx_q15:test_min_no_idx_q15
-                Test nb=4n   arm_max_no_idx_q15:test_max_no_idx_q15
+                Test nb=8n   arm_min_no_idx_q15:test_min_no_idx_q15
-                Test nb=4n+1 arm_max_no_idx_q15:test_max_no_idx_q15
+                Test nb=8n+1 arm_min_no_idx_q15:test_min_no_idx_q15
-                Test nb=3    arm_min_no_idx_q15:test_min_no_idx_q15
+                Test nb=7    arm_absmax_no_idx_q15:test_absmax_no_idx_q15
-                Test nb=4n   arm_min_no_idx_q15:test_min_no_idx_q15
+                Test nb=8n   arm_absmax_no_idx_q15:test_absmax_no_idx_q15
-                Test nb=4n+1 arm_min_no_idx_q15:test_min_no_idx_q15
+                Test nb=8n+1 arm_absmax_no_idx_q15:test_absmax_no_idx_q15
-                Test nb=2    arm_absmax_no_idx_q15:test_absmax_no_idx_q15
+                Test nb=7    arm_absmin_no_idx_q15:test_absmin_no_idx_q15
-                Test nb=2n   arm_absmax_no_idx_q15:test_absmax_no_idx_q15
+                Test nb=8n   arm_absmin_no_idx_q15:test_absmin_no_idx_q15
-                Test nb=2n+1 arm_absmax_no_idx_q15:test_absmax_no_idx_q15
+                Test nb=8n+1 arm_absmin_no_idx_q15:test_absmin_no_idx_q15
-                Test nb=2    arm_absmin_no_idx_q15:test_absmin_no_idx_q15
+                Test nb=7    arm_mse_q15:test_mse_q15
-                Test nb=2n   arm_absmin_no_idx_q15:test_absmin_no_idx_q15
+                Test nb=8n   arm_mse_q15:test_mse_q15
-                Test nb=2n+1 arm_absmin_no_idx_q15:test_absmin_no_idx_q15
+                Test nb=8n+1 arm_mse_q15:test_mse_q15
                Test long    arm_mse_q15:test_mse_q15
              }
@ -479,32 +509,32 @@ group Root {
                Test big index  arm_max_q7:test_max_q7
                Test big index  arm_min_q7:test_min_q7
-                Test nb=3    arm_absmax_q7:test_absmax_q7
+                Test nb=15    arm_absmax_q7:test_absmax_q7
-                Test nb=4n   arm_absmax_q7:test_absmax_q7
+                Test nb=16n   arm_absmax_q7:test_absmax_q7
-                Test nb=4n+1 arm_absmax_q7:test_absmax_q7
+                Test nb=16n+1 arm_absmax_q7:test_absmax_q7
-                Test nb=3    arm_absmin_q7:test_absmin_q7
+                Test nb=15    arm_absmin_q7:test_absmin_q7
-                Test nb=4n   arm_absmin_q7:test_absmin_q7
+                Test nb=16n   arm_absmin_q7:test_absmin_q7
-                Test nb=4n+1 arm_absmin_q7:test_absmin_q7
+                Test nb=16n+1 arm_absmin_q7:test_absmin_q7
                Test big index  arm_absmax_q7:test_absmax_q7
                Test big index  arm_absmin_q7:test_absmin_q7
-                Test nb=3    arm_max_no_idx_q7:test_max_no_idx_q7
+                Test nb=15    arm_max_no_idx_q7:test_max_no_idx_q7
-                Test nb=4n   arm_max_no_idx_q7:test_max_no_idx_q7
+                Test nb=16n   arm_max_no_idx_q7:test_max_no_idx_q7
-                Test nb=4n+1 arm_max_no_idx_q7:test_max_no_idx_q7
+                Test nb=16n+1 arm_max_no_idx_q7:test_max_no_idx_q7
-                Test nb=3    arm_min_no_idx_q7:test_min_no_idx_q7
+                Test nb=15    arm_min_no_idx_q7:test_min_no_idx_q7
-                Test nb=4n   arm_min_no_idx_q7:test_min_no_idx_q7
+                Test nb=16n   arm_min_no_idx_q7:test_min_no_idx_q7
-                Test nb=4n+1 arm_min_no_idx_q7:test_min_no_idx_q7
+                Test nb=16n+1 arm_min_no_idx_q7:test_min_no_idx_q7
-                Test nb=2    arm_absmax_no_idx_q7:test_absmax_no_idx_q7
+                Test nb=15    arm_absmax_no_idx_q7:test_absmax_no_idx_q7
-                Test nb=2n   arm_absmax_no_idx_q7:test_absmax_no_idx_q7
+                Test nb=16n   arm_absmax_no_idx_q7:test_absmax_no_idx_q7
-                Test nb=2n+1 arm_absmax_no_idx_q7:test_absmax_no_idx_q7
+                Test nb=16n+1 arm_absmax_no_idx_q7:test_absmax_no_idx_q7
-                Test nb=2    arm_absmin_no_idx_q7:test_absmin_no_idx_q7
+                Test nb=15    arm_absmin_no_idx_q7:test_absmin_no_idx_q7
-                Test nb=2n   arm_absmin_no_idx_q7:test_absmin_no_idx_q7
+                Test nb=16n   arm_absmin_no_idx_q7:test_absmin_no_idx_q7
-                Test nb=2n+1 arm_absmin_no_idx_q7:test_absmin_no_idx_q7
+                Test nb=16n+1 arm_absmin_no_idx_q7:test_absmin_no_idx_q7
                Test nb=15    arm_mse_q7:test_mse_q7
                Test nb=16n   arm_mse_q7:test_mse_q7
--- a/Testing/desc_f16.txt
+++ b/Testing/desc_f16.txt
@ -15,6 +15,7 @@ group Root {
              Pattern INPUT1_F16_ID : Input1_f16.txt 
              Pattern INPUTNEW1_F16_ID : InputNew1_f16.txt 
              Pattern INPUTNEW2_F16_ID : InputNew2_f16.txt 
              Pattern INPUT2_F16_ID : Input2_f16.txt 
              Pattern MAXINDEXES_S16_ID : MaxIndexes1_s16.txt
@ -51,6 +52,8 @@ group Root {
              Pattern ABSMININDEXES_S16_ID : AbsMinIndexes27_s16.txt
              Pattern ABSMINVALS_F16_ID : AbsMinVals27_f16.txt
              Pattern MSE_F16_ID : MSEVals28_f16.txt
              Output  OUT_F16_ID : Output
              Output  OUT_S16_ID : Index
              Output  TMP_F16_ID : Temp
@ -100,25 +103,30 @@ group Root {
                Test stability  arm_std_f16:test_std_stability_f16
-                Test nb=3    arm_absmax_f16:test_absmax_f16
+                Test nb=7    arm_absmax_f16:test_absmax_f16
-                Test nb=4n   arm_absmax_f16:test_absmax_f16
+                Test nb=8n   arm_absmax_f16:test_absmax_f16
-                Test nb=4n+1 arm_absmax_f16:test_absmax_f16
+                Test nb=8n+1 arm_absmax_f16:test_absmax_f16
-                Test nb=3    arm_absmin_f16:test_absmin_f16
+                Test nb=7    arm_absmin_f16:test_absmin_f16
-                Test nb=4n   arm_absmin_f16:test_absmin_f16
+                Test nb=8n   arm_absmin_f16:test_absmin_f16
-                Test nb=4n+1 arm_absmin_f16:test_absmin_f16
+                Test nb=8n+1 arm_absmin_f16:test_absmin_f16
                Test nb=7    arm_min_no_idx_f16:test_min_no_idx_f16
                Test nb=8n   arm_min_no_idx_f16:test_min_no_idx_f16
                Test nb=8n+1 arm_min_no_idx_f16:test_min_no_idx_f16
-                Test nb=2    arm_absmax_no_idx_f16:test_absmax_no_idx_f16
+                Test nb=7    arm_absmax_no_idx_f16:test_absmax_no_idx_f16
-                Test nb=2n   arm_absmax_no_idx_f16:test_absmax_no_idx_f16
+                Test nb=8n   arm_absmax_no_idx_f16:test_absmax_no_idx_f16
-                Test nb=2n+1 arm_absmax_no_idx_f16:test_absmax_no_idx_f16
+                Test nb=8n+1 arm_absmax_no_idx_f16:test_absmax_no_idx_f16
                Test nb=7    arm_absmin_no_idx_f16:test_absmin_no_idx_f16
                Test nb=8n   arm_absmin_no_idx_f16:test_absmin_no_idx_f16
                Test nb=8n+1 arm_absmin_no_idx_f16:test_absmin_no_idx_f16
-                Test nb=2    arm_absmin_no_idx_f16:test_absmin_no_idx_f16
+                Test nb=7    arm_mse_f16:test_mse_f16
-                Test nb=2n   arm_absmin_no_idx_f16:test_absmin_no_idx_f16
+                Test nb=8n   arm_mse_f16:test_mse_f16
-                Test nb=2n+1 arm_absmin_no_idx_f16:test_absmin_no_idx_f16
+                Test nb=8n+1 arm_mse_f16:test_mse_f16
                Test long    arm_mse_f16:test_mse_f16
              }
           }
        }
 H
-// 4
+// 6
-x0004
+x0006
-// 4
+// 6
-x0004
+x0006
-// 4
+// 18
-x0004
+x0012
 // 0
 x0000
-// 15
+// 7
-x000F
+x0007
-// 15
+// 19
-x000F
+x0013
 H
-// 1
+// 0
-x0001
+x0000
-// 1
+// 7
-x0001
+x0007
-// 1
+// 7
-x0001
+x0007
 H
-// 0
+// 1
-x0000
+x0001
-// 7
+// 5
-x0007
+x0005
-// 7
+// 9
-x0007
+x0009
 // 1
 x0001
-// 2
+// 3
-x0002
+x0003
-// 2
+// 3
-x0002
+x0003
 x0000
 // 0
 x0000
-// 0
+// 4
-x0000
+x0004
 H
-// 4
+// 5
-x0004
+x0005
-// 4
+// 15
-x0004
+x000F
-// 18
+// 15
-x0012
+x000F
 H
-// 6
+// 1
-x0006
+x0001
-// 6
+// 1
-x0006
+x0001
-// 6
+// 1
-x0006
+x0001
 H
-// 1
+// 2
-x0001
+x0002
-// 3
+// 7
-x0003
+x0007
-// 8
+// 7
-x0008
+x0007
 // 0
 x0000
-// 7
+// 4
-x0007
+x0004
-// 7
+// 4
-x0007
+x0004