CMSIS-DSP: Added Helium support for functions in BasicMaths.

Added test patterns for functions in BasicMaths. Corrected github issue 622 (clarified comment for lms_norm_f32). Added support for q63_t to the test framework.
7 years ago · 56a540336a
parent 2d3a2166d6
commit 56a540336a
162 changed files with 19729 additions and 13437 deletions
--- a/Include/arm_helium_utils.h
+++ b/Include/arm_helium_utils.h
@ -36,35 +36,6 @@ Definitions available for MVEF and MVEI
 ***************************************/
 #if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)

-#define nbLanes(sz)             (128/sz)
-
-#define VEC_LANES_F32       nbLanes(32)
-#define VEC_LANES_F16       nbLanes(16)
-#define VEC_LANES_Q63       nbLanes(64)
-#define VEC_LANES_Q31       nbLanes(32)
-#define VEC_LANES_Q15       nbLanes(16)
-#define VEC_LANES_Q7        nbLanes(8)
-
-#define nb_vec_lanes(ptr) _Generic((ptr), \
-               uint32_t *: VEC_LANES_Q31, \
-               uint16_t *: VEC_LANES_Q15, \
-                uint8_t *: VEC_LANES_Q7,  \
-                  q31_t *: VEC_LANES_Q31, \
-                  q15_t *: VEC_LANES_Q15, \
-                   q7_t *: VEC_LANES_Q7,  \
-               float32_t*: VEC_LANES_F32, \
-               float16_t*: VEC_LANES_F16, \
-            const q31_t *: VEC_LANES_Q31, \
-            const q15_t *: VEC_LANES_Q15, \
-             const q7_t *: VEC_LANES_Q7,  \
-         const float32_t*: VEC_LANES_F32, \
-         const float16_t*: VEC_LANES_F16, \
-                  default: "err")
-
-
-
-#define post_incr_vec_size(ptr)         ptr += nb_vec_lanes(ptr)
-
 #endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */

 /***************************************
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@ -386,10 +386,22 @@ extern "C"
  #define ARM_MATH_DSP                   1
 #endif

-#if defined(__ARM_NEON)
+#if defined(ARM_MATH_NEON)
 #include <arm_neon.h>
 #endif

+#if defined (ARM_MATH_HELIUM) && !defined(ARM_MATH_MVEF)
+  #define ARM_MATH_MVEF /* ARM_MATH_HELIUM turns on the float vector code; guard avoids redefining a user-supplied macro */
+#endif
+
+#if defined (ARM_MATH_MVEF) && !defined(ARM_MATH_MVEI)
+  #define ARM_MATH_MVEI /* ARM_MATH_MVEF also turns on the integer vector code; guard avoids redefining a user-supplied macro */
+#endif
+
+#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+#include <arm_mve.h>
+#endif
+

  /**
   * @brief Macros required for reciprocal calculation in Normalized LMS
@ -466,6 +478,308 @@ extern "C"
   */
  typedef double float64_t;

+  /**
+   * @brief vector types
+   */
+#if defined(ARM_MATH_NEON) || defined (ARM_MATH_MVEI)
+  /**
+   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
+   */
+  typedef int64x2_t q63x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 1.31 format.
+   */
+  typedef int32x4_t q31x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector data type with 16-bit alignment in 1.15 format.
+   */
+  typedef __ALIGNED(2) int16x8_t q15x8_t;
+
+ /**
+   * @brief 8-bit fractional 128-bit vector data type with 8-bit alignment in 1.7 format.
+   */
+  typedef __ALIGNED(1) int8x16_t q7x16_t;
+
+    /**
+   * @brief 32-bit fractional 128-bit vector pair data type in 1.31 format.
+   */
+  typedef int32x4x2_t q31x4x2_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector quadruplet data type in 1.31 format.
+   */
+  typedef int32x4x4_t q31x4x4_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector pair data type in 1.15 format.
+   */
+  typedef int16x8x2_t q15x8x2_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector quadruplet data type in 1.15 format.
+   */
+  typedef int16x8x4_t q15x8x4_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector pair data type in 1.7 format.
+   */
+  typedef int8x16x2_t q7x16x2_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector quadruplet data type in 1.7 format.
+   */
+   typedef int8x16x4_t q7x16x4_t;
+
+  /**
+   * @brief 32-bit fractional data type in 9.23 format.
+   */
+  typedef int32_t q23_t;
+
+  /**
+   * @brief 32-bit fractional 128-bit vector data type in 9.23 format.
+   */
+  typedef int32x4_t q23x4_t;
+
+  /**
+   * @brief 64-bit status 128-bit vector data type.
+   */
+  typedef int64x2_t status64x2_t;
+
+  /**
+   * @brief 32-bit status 128-bit vector data type.
+   */
+  typedef int32x4_t status32x4_t;
+
+  /**
+   * @brief 16-bit status 128-bit vector data type.
+   */
+  typedef int16x8_t status16x8_t;
+
+  /**
+   * @brief 8-bit status 128-bit vector data type.
+   */
+  typedef int8x16_t status8x16_t;
+
+
+#endif
+
+
+#if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF) /* floating point vector*/
+  /**
+   * @brief 32-bit floating-point 128-bit vector type
+   */
+  typedef float32x4_t f32x4_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector data type
+   */
+  typedef __ALIGNED(2) float16x8_t f16x8_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector pair data type
+   */
+  typedef float32x4x2_t f32x4x2_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float32x4x4_t f32x4x4_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector pair data type
+   */
+  typedef float16x8x2_t f16x8x2_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector quadruplet data type
+   */
+  typedef float16x8x4_t f16x8x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any32x4_t
+  {
+      float32x4_t     f;
+      int32x4_t       i;
+  } any32x4_t;
+
+  /**
+   * @brief 16-bit ubiquitous 128-bit vector data type
+   */
+  typedef union _any16x8_t
+  {
+      float16x8_t     f;
+      int16x8_t       i;
+  } any16x8_t;
+
+#endif
+
+#if defined(ARM_MATH_NEON)
+  /**
+   * @brief 32-bit fractional 64-bit vector data type in 1.31 format.
+   */
+  typedef int32x2_t  q31x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector data type in 1.15 format.
+   */
+  typedef  __ALIGNED(2) int16x4_t q15x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector data type in 1.7 format.
+   */
+  typedef  __ALIGNED(1) int8x8_t q7x8_t;
+
+  /**
+   * @brief 32-bit float 64-bit vector data type.
+   */
+  typedef float32x2_t  f32x2_t;
+
+  /**
+   * @brief 16-bit float 64-bit vector data type.
+   */
+  typedef  __ALIGNED(2) float16x4_t f16x4_t;
+
+  /**
+   * @brief 32-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float32x4x3_t f32x4x3_t;
+
+  /**
+   * @brief 16-bit floating-point 128-bit vector triplet data type
+   */
+  typedef float16x8x3_t f16x8x3_t;
+
+
+  /**
+   * @brief 32-bit fractional 128-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x4x3_t q31x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 128-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x8x3_t q15x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 128-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x16x3_t q7x16x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector pair data type
+   */
+  typedef float32x2x2_t f32x2x2_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float32x2x3_t f32x2x3_t;
+
+  /**
+   * @brief 32-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float32x2x4_t f32x2x4_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector pair data type
+   */
+  typedef float16x4x2_t f16x4x2_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector triplet data type
+   */
+  typedef float16x4x3_t f16x4x3_t;
+
+  /**
+   * @brief 16-bit floating-point 64-bit vector quadruplet data type
+   */
+  typedef float16x4x4_t f16x4x4_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector pair data type in 1.31 format
+   */
+  typedef int32x2x2_t q31x2x2_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector triplet data type in 1.31 format
+   */
+  typedef int32x2x3_t q31x2x3_t;
+
+  /**
+   * @brief 32-bit fractional 64-bit vector quadruplet data type in 1.31 format
+   */
+  typedef int32x2x4_t q31x2x4_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector pair data type in 1.15 format
+   */
+  typedef int16x4x2_t q15x4x2_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector triplet data type in 1.15 format
+   */
+  typedef int16x4x3_t q15x4x3_t;
+
+  /**
+   * @brief 16-bit fractional 64-bit vector quadruplet data type in 1.15 format
+   */
+  typedef int16x4x4_t q15x4x4_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector pair data type in 1.7 format
+   */
+  typedef int8x8x2_t q7x8x2_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector triplet data type in 1.7 format
+   */
+  typedef int8x8x3_t q7x8x3_t;
+
+  /**
+   * @brief 8-bit fractional 64-bit vector quadruplet data type in 1.7 format
+   */
+  typedef int8x8x4_t q7x8x4_t;
+
+  /**
+   * @brief 32-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any32x2_t
+  {
+      float32x2_t     f;
+      int32x2_t       i;
+  } any32x2_t;
+
+  /**
+   * @brief 16-bit ubiquitous 64-bit vector data type
+   */
+  typedef union _any16x4_t
+  {
+      float16x4_t     f;
+      int16x4_t       i;
+  } any16x4_t;
+
+  /**
+   * @brief 32-bit status 64-bit vector data type.
+   */
+  typedef int32x2_t status32x2_t;
+
+  /**
+   * @brief 16-bit status 64-bit vector data type.
+   */
+  typedef int16x4_t status16x4_t;
+
+  /**
+   * @brief 8-bit status 64-bit vector data type.
+   */
+  typedef int8x8_t status8x8_t;
+
+#endif
+
+

 /**
  @brief definition to read/write two 16 bit values.
--- a/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/Source/BasicMathFunctions/arm_abs_f32.c
@ -60,6 +60,55 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = |A| */
+
+        /* Calculate absolute values and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vabsq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 elements left over after the full vectors */
+    blkCnt = blockSize & 0x3;
+
+    /* Predicate both the load and the store so no lane beyond blockSize is accessed */
+    if (blkCnt > 0U)
+    {
+      /* C = |A| */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q_z(pSrc, p0);
+      vstrwq_p(pDst, vabsq(vec1), p0);
+    }
+
+}
+
+#else
 void arm_abs_f32(
  const float32_t * pSrc,
        float32_t * pDst,
@ -67,9 +116,9 @@ void arm_abs_f32(
 {
        uint32_t blkCnt;                               /* Loop counter */

-#if defined(ARM_MATH_NEON)
-    float32x4_t vec1;
-    float32x4_t res;
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -95,7 +144,7 @@ void arm_abs_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -140,7 +189,7 @@ void arm_abs_f32(
  }

 }
-
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicAbs group
 */
--- a/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/Source/BasicMathFunctions/arm_abs_q15.c
@ -49,6 +49,51 @@
                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q15(
+    const q15_t * pSrc,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q15x8_t vecSrc;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail: 1 to 7 elements; load and store are both predicated so no lane beyond blockSize is accessed
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrhq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
 void arm_abs_q15(
  const q15_t * pSrc,
        q15_t * pDst,
@ -126,6 +171,7 @@ void arm_abs_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicAbs group
--- a/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/Source/BasicMathFunctions/arm_abs_q31.c
@ -49,6 +49,51 @@
                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q31(
+    const q31_t * pSrc,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q31x4_t vecSrc;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail: 1 to 3 elements; load and store are both predicated so no lane beyond blockSize is accessed
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrwq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
 void arm_abs_q31(
  const q31_t * pSrc,
        q31_t * pDst,
@ -126,7 +171,7 @@ void arm_abs_q31(
  }

 }
-
+#endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAbs group
 */
--- a/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/Source/BasicMathFunctions/arm_abs_q7.c
@ -51,6 +51,51 @@
                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_abs_q7(
+    const q7_t * pSrc,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q7x16_t vecSrc;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = |A|
+         * Calculate absolute and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqabsq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail: 1 to 15 elements; load and store are both predicated so no lane beyond blockSize is accessed
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrbq_p(pDst, vqabsq(vecSrc), p0);
+    }
+}
+
+#else
 void arm_abs_q7(
  const q7_t * pSrc,
        q7_t * pDst,
@ -128,6 +173,7 @@ void arm_abs_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicAbs group
--- a/Source/BasicMathFunctions/arm_add_f32.c
+++ b/Source/BasicMathFunctions/arm_add_f32.c
@ -58,6 +58,59 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_add_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A + B */
+
+        /* Add and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vaddq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 elements; loads and store are predicated so no lane beyond blockSize is accessed */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A + B */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q_z(pSrcA, p0);
+      vec2 = vld1q_z(pSrcB, p0);
+      vstrwq_p(pDst, vaddq(vec1,vec2), p0);
+    }
+
+}
+
+#else
 void arm_add_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
@ -66,10 +119,10 @@ void arm_add_f32(
 {
        uint32_t blkCnt;                               /* Loop counter */

-#if defined(ARM_MATH_NEON)
-    float32x4_t vec1;
-    float32x4_t vec2;
-    float32x4_t res;
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -97,7 +150,7 @@ void arm_add_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -139,6 +192,7 @@ void arm_add_f32(
  }

 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

 /**
  @} end of BasicAdd group
--- a/Source/BasicMathFunctions/arm_add_q15.c
+++ b/Source/BasicMathFunctions/arm_add_q15.c
@ -50,6 +50,56 @@
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q15x8_t vecA;
+    q15x8_t vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * tail: 1 to 7 elements; loads and store are predicated so no lane beyond blockSize is accessed
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrhq_p(pDst, vqaddq(vecA, vecB), p0);
+    }
+}
+
+#else
 void arm_add_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
@ -120,7 +170,7 @@ void arm_add_q15(
  }

 }
-
+#endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/Source/BasicMathFunctions/arm_add_q31.c
+++ b/Source/BasicMathFunctions/arm_add_q31.c
@ -50,6 +50,56 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;            /* loop counter */
+    q31x4_t vecA;
+    q31x4_t vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * tail: 1 to 3 elements; loads and store are predicated so no lane beyond blockSize is accessed
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrwq_p(pDst, vqaddq(vecA, vecB), p0);
+    }
+}
+
+#else
 void arm_add_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
@ -103,6 +153,7 @@ void arm_add_q31(

 }

+#endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/Source/BasicMathFunctions/arm_add_q7.c
+++ b/Source/BasicMathFunctions/arm_add_q7.c
@ -50,6 +50,55 @@
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_add_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q7x16_t vecA;
+    q7x16_t vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + B
+         * Add and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqaddq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * tail: 1 to 15 elements; loads and store are predicated so no lane beyond blockSize is accessed
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrbq_p(pDst, vqaddq(vecA, vecB), p0);
+    }
+}
+#else
 void arm_add_q7(
  const q7_t * pSrcA,
  const q7_t * pSrcB,
@ -103,7 +152,7 @@ void arm_add_q7(
  }

 }
-
+#endif /* defined(ARM_MATH_MVEI) */
 /**
  @} end of BasicAdd group
 */
--- a/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c
@ -59,42 +59,62 @@
  @return        none
 */

-#if defined (ARM_MATH_HELIUM)
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

-#include "arm_mve.h"
 #include "arm_helium_utils.h"

+
 void arm_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t    blockSize,
    float32_t * result)
 {
-    float32x4_t vecA, vecB;
-    float32x4_t vecSum;
+    f32x4_t vecA, vecB;
+    f32x4_t vecSum;
+    uint32_t blkCnt;
+    float32_t sum = 0.0f;
    vecSum = vdupq_n_f32(0.0);

-    do {
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
        /*
         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
         * Calculate dot product and then store the result in a temporary buffer.
+         * and advance vector source and destination pointers
         */
-        mve_pred16_t p = vctp32q(blockSize);
+        vecA = vld1q(pSrcA);
+        pSrcA += 4;
+
+        vecB = vld1q(pSrcB);
+        pSrcB += 4;

-        vecA = vldrwq_z_f32(pSrcA, p);
-        vecB = vldrwq_z_f32(pSrcB, p);
-        vecSum = vfmaq_m(vecSum, vecA, vecB, p);
+        vecSum = vfmaq(vecSum, vecA, vecB);
        /*
         * Decrement the blockSize loop counter
-         * Advance vector source and destination pointers
         */
-        post_incr_vec_size(pSrcA);
-        post_incr_vec_size(pSrcB);
-        blockSize -= VEC_LANES_F32;
+        blkCnt --;
    }
-    while ((int32_t) blockSize > 0);

-    *result = vecAddAcrossF32Mve(vecSum);
+
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        /* Tail (1 to 3 elements): predicated loads and accumulate, so no lane beyond blockSize is read */
+
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
+    }
+
+    sum = vecAddAcrossF32Mve(vecSum);
+
+    /* Store result in destination buffer */
+    *result = sum;
+
 }

 #else
@ -108,11 +128,11 @@ void arm_dot_prod_f32(
        uint32_t blkCnt;                               /* Loop counter */
        float32_t sum = 0.0f;                          /* Temporary return variable */

-#if defined(ARM_MATH_NEON)
-    float32x4_t vec1;
-    float32x4_t vec2;
-    float32x4_t res;
-    float32x4_t accum = vdupq_n_f32(0);    
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+    f32x4_t accum = vdupq_n_f32(0);    

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -125,7 +145,7 @@ void arm_dot_prod_f32(
        /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
        /* Calculate dot product and then store the result in a temporary buffer. */
        
-	accum = vmlaq_f32(accum, vec1, vec2);
+	      accum = vmlaq_f32(accum, vec1, vec2);
 	
        /* Increment pointers */
        pSrcA += 4;
@ -148,7 +168,7 @@ void arm_dot_prod_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -198,7 +218,7 @@ void arm_dot_prod_f32(
  *result = sum;
 }

-#endif /* ARM_MATH_HELIUM */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 /**
  @} end of BasicDotProd group
 */
--- a/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_q15.c
@ -52,7 +52,58 @@
                   there is no risk of overflow.
                   The return result is in 34.30 format.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    uint32_t blockSize,
+    q63_t * result)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q15x8_t vecA;
+    q15x8_t vecB;
+    q63_t     sum = 0LL;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmlaldavaq(sum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source pointers
+         */
+        pSrcA += 8;
+        pSrcB += 8;
+    }
+    /*
+     * tail: 1 to 7 elements; loads and accumulate are predicated so no lane beyond blockSize is read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        sum = vmlaldavaq_p(sum, vecA, vecB, p0);
+    }
+
+    *result = sum;
+}
+
+#else
 void arm_dot_prod_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
@ -114,6 +165,7 @@ void arm_dot_prod_q15(
  /* Store result in destination buffer in 34.30 format */
  *result = sum;
 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicDotProd group
--- a/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_q31.c
@ -54,6 +54,64 @@
                   The return result is in 16.48 format.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    uint32_t blockSize,
+    q63_t * result)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q31x4_t vecA;
+    q31x4_t vecB;
+    q63_t     sum = 0LL;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vrmlaldavhaq(sum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source pointers
+         */
+        pSrcA += 4;
+        pSrcB += 4;
+    }
+    /*
+     * tail: 1 to 3 elements; loads and accumulate are predicated so no lane beyond blockSize is read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
+    }
+
+    /*
+     * vrmlaldavhaq provides extra intermediate accumulator headroom,
+     * limiting the need for intermediate scaling.
+     * The scalar variant uses a 2.48 accumulator format by right shifting each product by 14.
+     * Here the 16.48 output conversion is performed once, outside the loop, by shifting the accumulator right by 6 (14 - 8).
+     */
+    *result = asrl(sum, (14 - 8));
+}
+
+#else
 void arm_dot_prod_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
@ -109,6 +167,7 @@ void arm_dot_prod_q31(
  /* Store result in destination buffer in 16.48 format */
  *result = sum;
 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicDotProd group
--- a/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_q7.c
@ -53,6 +53,57 @@
                   The return result is in 18.14 format.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    uint32_t blockSize,
+    q31_t * result)
+{
+    uint32_t  blkCnt;           /* loop counter */
+    q7x16_t vecA;
+    q7x16_t vecB;
+    q31_t     sum = 0;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * Calculate dot product and then store the result in a temporary buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        sum = vmladavaq(sum, vecA, vecB);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source pointers
+         */
+        pSrcA += 16;
+        pSrcB += 16;
+    }
+    /*
+     * tail: 1 to 15 elements; loads and accumulate are predicated so no lane beyond blockSize is read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        sum = vmladavaq_p(sum, vecA, vecB, p0);
+    }
+
+    *result = sum;
+}
+#else
 void arm_dot_prod_q7(
  const q7_t * pSrcA,
  const q7_t * pSrcB,
@ -133,6 +184,7 @@ void arm_dot_prod_q7(
  /* Store result in destination buffer in 18.14 format */
  *result = sum;
 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicDotProd group
--- a/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/Source/BasicMathFunctions/arm_mult_f32.c
@ -58,6 +58,57 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mult_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = A * B */
+
+        /* Multiply and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vmulq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 elements; loads and store are predicated so no lane beyond blockSize is accessed */
+    blkCnt = blockSize & 0x3;
+    if (blkCnt > 0U)
+    {
+      /* C = A * B */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q_z(pSrcA, p0);
+      vec2 = vld1q_z(pSrcB, p0);
+      vstrwq_p(pDst, vmulq(vec1,vec2), p0);
+    }
+
+}
+
+#else
 void arm_mult_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
@ -66,10 +117,10 @@ void arm_mult_f32(
 {
    uint32_t blkCnt;                               /* Loop counter */

-#if defined(ARM_MATH_NEON)
-    float32x4_t vec1;
-    float32x4_t vec2;
-    float32x4_t res;
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -97,7 +148,7 @@ void arm_mult_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -142,6 +193,7 @@ void arm_mult_f32(
  }

 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

 /**
  @} end of BasicMult group
--- a/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/Source/BasicMathFunctions/arm_mult_q15.c
@ -49,7 +49,55 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_mult_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q15x8_t vecA, vecB;
+
+    /* Helium (MVE) variant of the Q15 multiply: compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * Multiply the inputs (vqdmulhq) and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * tail: zeroing predicated loads so lanes past the end of the buffers are not read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
 void arm_mult_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
@ -137,6 +185,7 @@ void arm_mult_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicMult group
--- a/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/Source/BasicMathFunctions/arm_mult_q31.c
@ -49,7 +49,55 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_mult_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q31x4_t vecA, vecB;
+
+    /* Helium (MVE) variant of the Q31 multiply: compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * Multiply the inputs (vqdmulhq) and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * tail: zeroing predicated loads so lanes past the end of the buffers are not read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
 void arm_mult_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
@ -113,6 +161,7 @@ void arm_mult_q31(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicMult group
--- a/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/Source/BasicMathFunctions/arm_mult_q7.c
@ -49,7 +49,55 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_mult_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q7x16_t vecA, vecB;
+
+    /* Helium (MVE) variant of the Q7 multiply: compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * B
+         * Multiply the inputs (vqdmulhq) and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqdmulhq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * tail: zeroing predicated loads so lanes past the end of the buffers are not read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
+    }
+}
+
+#else
 void arm_mult_q7(
  const q7_t * pSrcA,
  const q7_t * pSrcB,
@ -113,6 +161,7 @@ void arm_mult_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicMult group
--- a/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/Source/BasicMathFunctions/arm_negate_f32.c
@ -59,6 +59,52 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Helium (MVE) variant: pDst[n] = -pSrc[n] for n in [0, blockSize). */
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = -A */
+
+        /* Negate and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vnegq(vec1);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 leftover elements, handled with a lane predicate */
+    blkCnt = blockSize & 0x3;
+    if (blkCnt > 0U)
+    {
+      /* C = -A; zeroing predicated load does not read lanes past the end of the buffer */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q_z(pSrc, p0);
+      vstrwq_p(pDst, vnegq(vec1), p0);
+    }
+
+}
+
+#else
 void arm_negate_f32(
  const float32_t * pSrc,
        float32_t * pDst,
@ -66,9 +112,9 @@ void arm_negate_f32(
 {
        uint32_t blkCnt;                               /* Loop counter */

-#if defined(ARM_MATH_NEON_EXPERIMENTAL)
-    float32x4_t vec1;
-    float32x4_t res;
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -94,7 +140,7 @@ void arm_negate_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -139,6 +185,7 @@ void arm_negate_f32(
  }

 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

 /**
  @} end of BasicNegate group
--- a/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/Source/BasicMathFunctions/arm_negate_q15.c
@ -50,7 +50,51 @@
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_negate_q15(
+    const q15_t  * pSrc,
+    q15_t  * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q15x8_t vecSrc;
+
+    /* Helium (MVE) variant of the Q15 negate: compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * Negate (vqnegq) and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrhq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
 void arm_negate_q15(
  const q15_t * pSrc,
        q15_t * pDst,
@ -120,6 +164,7 @@ void arm_negate_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicNegate group
--- a/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/Source/BasicMathFunctions/arm_negate_q31.c
@ -49,6 +49,51 @@
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_negate_q31(
+    const q31_t * pSrc,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q31x4_t vecSrc;
+
+    /* Helium (MVE) variant of the Q31 negate: compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * Negate (vqnegq) and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrwq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
 void arm_negate_q31(
  const q31_t * pSrc,
        q31_t * pDst,
@ -126,6 +171,7 @@ void arm_negate_q31(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicNegate group
--- a/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/Source/BasicMathFunctions/arm_negate_q7.c
@ -48,7 +48,51 @@
                   The function uses saturating arithmetic.
                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_negate_q7(
+    const q7_t   * pSrc,
+    q7_t   * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q7x16_t vecSrc;
+
+    /* Helium (MVE) variant of the Q7 negate: compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = -A
+         * Negate (vqnegq) and then store the results in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq(vecSrc));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrbq_p(pDst, vqnegq(vecSrc), p0);
+    }
+}
+
+#else
 void arm_negate_q7(
  const q7_t * pSrc,
        q7_t * pDst,
@ -110,7 +154,7 @@ void arm_negate_q7(
    in = *pSrc++;

 #if defined (ARM_MATH_DSP)
-    *pDst++ = (q7_t) __QSUB(0, in);
+    *pDst++ = (q7_t) __QSUB8(0, in);
 #else
    *pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
 #endif
@ -120,6 +164,7 @@ void arm_negate_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicNegate group
--- a/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/Source/BasicMathFunctions/arm_offset_f32.c
@ -60,6 +60,54 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_f32(
+  const float32_t * pSrc,
+        float32_t offset,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    /* Helium (MVE) variant: pDst[n] = pSrc[n] + offset for n in [0, blockSize). */
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* C = A + offset */
+
+        /* Add offset and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vaddq(vec1,offset);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 leftover elements, handled with a lane predicate */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vec1 = vld1q_z(pSrc, p0);   /* zeroing predicated load: no read past the end of pSrc */
+        vstrwq_p(pDst, vaddq(vec1, offset), p0);
+    }
+
+
+}
+
+#else
 void arm_offset_f32(
  const float32_t * pSrc,
        float32_t offset,
@ -68,9 +116,9 @@ void arm_offset_f32(
 {
        uint32_t blkCnt;                               /* Loop counter */

-#if defined(ARM_MATH_NEON_EXPERIMENTAL)
-    float32x4_t vec1;
-    float32x4_t res;
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -96,7 +144,7 @@ void arm_offset_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -141,6 +189,7 @@ void arm_offset_f32(
  }

 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

 /**
  @} end of BasicOffset group
--- a/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/Source/BasicMathFunctions/arm_offset_q15.c
@ -49,7 +49,53 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_offset_q15(
+    const q15_t * pSrc,
+    q15_t   offset,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q15x8_t vecSrc;
+
+    /* Helium (MVE) variant of the Q15 offset: compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Add offset (vqaddq) and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrhq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+
+#else
 void arm_offset_q15(
  const q15_t * pSrc,
        q15_t offset,
@ -115,6 +161,7 @@ void arm_offset_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicOffset group
--- a/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/Source/BasicMathFunctions/arm_offset_q31.c
@ -50,6 +50,52 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_offset_q31(
+    const q31_t * pSrc,
+    q31_t   offset,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q31x4_t vecSrc;
+
+    /* Helium (MVE) variant of the Q31 offset: compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Add offset (vqaddq) and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrwq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+#else
 void arm_offset_q31(
  const q31_t * pSrc,
        q31_t offset,
@ -122,6 +168,7 @@ void arm_offset_q31(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicOffset group
--- a/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/Source/BasicMathFunctions/arm_offset_q7.c
@ -49,7 +49,52 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_offset_q7(
+    const q7_t * pSrc,
+    q7_t   offset,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q7x16_t vecSrc;
+
+    /* Helium (MVE) variant of the Q7 offset: compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A + offset
+         * Add offset (vqaddq) and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqaddq(vecSrc, offset));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vstrbq_p(pDst, vqaddq(vecSrc, offset), p0);
+    }
+}
+
+#else
 void arm_offset_q7(
  const q7_t * pSrc,
        q7_t offset,
@ -110,6 +155,7 @@ void arm_offset_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicOffset group
--- a/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/Source/BasicMathFunctions/arm_scale_f32.c
@ -73,6 +73,55 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_f32(
+  const float32_t * pSrc,
+        float32_t scale,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    /* Helium (MVE) variant: pDst[n] = pSrc[n] * scale for n in [0, blockSize). */
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A * scale */
+
+        /* Scale the input and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrc);
+        res = vmulq(vec1,scale);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 leftover elements, handled with a lane predicate */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vec1 = vld1q_z(pSrc, p0);   /* zeroing predicated load: no read past the end of pSrc */
+        vstrwq_p(pDst, vmulq(vec1, scale), p0);
+    }
+
+
+}
+
+#else
 void arm_scale_f32(
  const float32_t *pSrc,
        float32_t scale,
@ -81,8 +130,8 @@ void arm_scale_f32(
 {
  uint32_t blkCnt;                               /* Loop counter */
 #if defined(ARM_MATH_NEON_EXPERIMENTAL)
-    float32x4_t vec1;
-    float32x4_t res;
+    f32x4_t vec1;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -153,6 +202,7 @@ void arm_scale_f32(
  }

 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

 /**
  @} end of BasicScale group
--- a/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/Source/BasicMathFunctions/arm_scale_q15.c
@ -51,6 +51,62 @@
                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_q15(
+    const q15_t * pSrc,
+    q15_t   scaleFract,
+    int8_t  shift,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q15x8_t vecSrc;
+    q15x8_t vecDst;
+
+    /* Helium (MVE) variant of the Q15 scale: vmulhq by scaleFract, then vqshlq_r by shift + 1 */
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Scale the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vecDst = vmulhq(vecSrc, vdupq_n_s16(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+
+}
+
+
+#else
 void arm_scale_q15(
  const q15_t *pSrc,
        q15_t scaleFract,
@ -138,6 +194,7 @@ void arm_scale_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicScale group
--- a/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/Source/BasicMathFunctions/arm_scale_q31.c
@ -51,6 +51,58 @@
                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_scale_q31(
+    const q31_t * pSrc,
+    q31_t   scaleFract,
+    int8_t  shift,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q31x4_t vecSrc;
+    q31x4_t vecDst;
+
+    /* Helium (MVE) Q31 scale (vmulhq, then vqshlq_r by shift + 1): compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Scale the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vecDst = vmulhq(vecSrc, vdupq_n_s32(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrwq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
 void arm_scale_q31(
  const q31_t *pSrc,
        q31_t scaleFract,
@ -185,6 +237,7 @@ void arm_scale_q31(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicScale group
--- a/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/Source/BasicMathFunctions/arm_scale_q7.c
@ -51,6 +51,62 @@
                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+
+void arm_scale_q7(
+    const q7_t * pSrc,
+    q7_t   scaleFract,
+    int8_t  shift,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q7x16_t vecSrc;
+    q7x16_t vecDst;
+
+    /* Helium (MVE) variant of the Q7 scale: vmulhq by scaleFract, then vqshlq_r by shift + 1 */
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A * scale
+         * Scale the input and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vecDst = vmulhq(vecSrc, vdupq_n_s8(scaleFract));
+        vecDst = vqshlq_r(vecDst, shift + 1);
+        vstrbq_p(pDst, vecDst, p0);
+    }
+
+}
+
+#else
 void arm_scale_q7(
  const q7_t * pSrc,
        q7_t scaleFract,
@ -123,6 +179,7 @@ void arm_scale_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicScale group
--- a/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/Source/BasicMathFunctions/arm_shift_q15.c
@ -50,6 +50,55 @@
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q15(
+    const q15_t * pSrc,
+    int8_t shiftBits,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q15x8_t vecSrc;
+    q15x8_t vecDst;
+
+    /* Helium (MVE) variant of the Q15 shift: compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input (vqshlq_r) and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
 void arm_shift_q15(
  const q15_t * pSrc,
        int8_t shiftBits,
@ -195,6 +244,7 @@ void arm_shift_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicShift group
--- a/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/Source/BasicMathFunctions/arm_shift_q31.c
@ -67,6 +67,56 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q31(
+    const q31_t * pSrc,
+    int8_t shiftBits,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q31x4_t vecSrc;
+    q31x4_t vecDst;
+
+    /* Helium (MVE) variant of the Q31 shift: compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input (vqshlq_r) and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrwq_p(pDst, vecDst, p0);
+    }
+}
+
+
+#else
 void arm_shift_q31(
  const q31_t * pSrc,
        int8_t shiftBits,
@ -175,6 +225,7 @@ void arm_shift_q31(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicShift group
--- a/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/Source/BasicMathFunctions/arm_shift_q7.c
@ -52,6 +52,55 @@
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_shift_q7(
+    const q7_t * pSrc,
+    int8_t shiftBits,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* vector count, then leftover element count */
+    q7x16_t vecSrc;
+    q7x16_t vecDst;
+
+    /* Helium (MVE) variant of the Q7 shift: compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A (>> or <<) shiftBits
+         * Shift the input (vqshlq_r) and then store the result in the destination buffer.
+         */
+        vecSrc = vld1q(pSrc);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+    }
+    /*
+     * tail: zeroing predicated load so lanes past the end of the buffer are not read
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecSrc = vld1q_z(pSrc, p0);
+        vecDst = vqshlq_r(vecSrc, shiftBits);
+        vstrbq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
 void arm_shift_q7(
  const q7_t * pSrc,
        int8_t shiftBits,
@ -169,6 +218,7 @@ void arm_shift_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicShift group
--- a/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/Source/BasicMathFunctions/arm_sub_f32.c
@ -58,6 +58,59 @@
  @return        none
 */

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    /* Helium (MVE) variant: pDst[n] = pSrcA[n] - pSrcB[n] for n in [0, blockSize). */
+    uint32_t blkCnt;                               /* Loop counter */
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* C = A - B */
+
+        /* Subtract and then store the results in the destination buffer. */
+        vec1 = vld1q(pSrcA);
+        vec2 = vld1q(pSrcB);
+        res = vsubq(vec1, vec2);
+        vst1q(pDst, res);
+
+        /* Increment pointers */
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: 1 to 3 leftover elements, handled with a lane predicate */
+    blkCnt = blockSize & 0x3;
+
+    if (blkCnt > 0U)
+    {
+      /* C = A - B; zeroing predicated loads do not read lanes past the end of the buffers */
+      mve_pred16_t p0 = vctp32q(blkCnt);
+      vec1 = vld1q_z(pSrcA, p0);
+      vec2 = vld1q_z(pSrcB, p0);
+      vstrwq_p(pDst, vsubq(vec1,vec2), p0);
+    }
+
+}
+
+#else
 void arm_sub_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
@ -66,10 +119,10 @@ void arm_sub_f32(
 {
        uint32_t blkCnt;                               /* Loop counter */

-#if defined(ARM_MATH_NEON)
-    float32x4_t vec1;
-    float32x4_t vec2;
-    float32x4_t res;
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+    f32x4_t vec1;
+    f32x4_t vec2;
+    f32x4_t res;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
@ -97,7 +150,7 @@ void arm_sub_f32(
    blkCnt = blockSize & 0x3;

 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = blockSize >> 2U;
@ -142,6 +195,7 @@ void arm_sub_f32(
  }

 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

 /**
  @} end of BasicSub group
--- a/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/Source/BasicMathFunctions/arm_sub_q15.c
@ -50,6 +50,57 @@
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecA;
+    q15x8_t vecB;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Subtract (with saturation) and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 8;
+        pSrcB  += 8;
+        pDst   += 8;
+    }
+    /*
+     * tail: 1 to 7 remaining samples.
+     * Loads and store are all predicated so that nothing is read or written
+     * beyond the end of the buffers (disabled lanes are loaded as zero).
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrhq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+
+
+#else
 void arm_sub_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
@ -120,6 +171,7 @@ void arm_sub_q15(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicSub group
--- a/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/Source/BasicMathFunctions/arm_sub_q31.c
@ -50,6 +50,56 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_sub_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;            /* loop counter */
+    q31x4_t vecA;
+    q31x4_t vecB;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Subtract (with saturation) and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 4;
+        pSrcB  += 4;
+        pDst   += 4;
+    }
+    /*
+     * tail: 1 to 3 remaining samples.
+     * Loads and store are all predicated so that nothing is read or written
+     * beyond the end of the buffers (disabled lanes are loaded as zero).
+     */
+    blkCnt = blockSize & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrwq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+
+#else
 void arm_sub_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
@ -102,6 +152,7 @@ void arm_sub_q31(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicSub group
--- a/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/Source/BasicMathFunctions/arm_sub_q7.c
@ -49,7 +49,55 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
+#if defined(ARM_MATH_MVEI)

+#include "arm_helium_utils.h"
+
+void arm_sub_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecA;
+    q7x16_t vecB;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C = A - B
+         * Subtract (with saturation) and then store the results in the destination buffer.
+         */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        vst1q(pDst, vqsubq(vecA, vecB));
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        /*
+         * advance vector source and destination pointers
+         */
+        pSrcA  += 16;
+        pSrcB  += 16;
+        pDst   += 16;
+    }
+    /*
+     * tail: 1 to 15 remaining samples.
+     * Loads and store are all predicated so that nothing is read or written
+     * beyond the end of the buffers (disabled lanes are loaded as zero).
+     */
+    blkCnt = blockSize & 0xF;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp8q(blkCnt);
+        vecA = vld1q_z(pSrcA, p0);
+        vecB = vld1q_z(pSrcB, p0);
+        vstrbq_p(pDst, vqsubq(vecA, vecB), p0);
+    }
+}
+#else
 void arm_sub_q7(
  const q7_t * pSrcA,
  const q7_t * pSrcB,
@ -103,6 +151,7 @@ void arm_sub_q7(
  }

 }
+#endif /* defined(ARM_MATH_MVEI) */

 /**
  @} end of BasicSub group
--- a/Source/FilteringFunctions/arm_lms_norm_f32.c
+++ b/Source/FilteringFunctions/arm_lms_norm_f32.c
@ -126,7 +126,7 @@
                   - Scaling of coefficients
                   - Overflow and saturation

- @par            Scaling of Coefficients
+ @par            Scaling of Coefficients (fixed point versions)
                   Filter coefficients are represented as fractional values and
                   coefficients are restricted to lie in the range <code>[-1 +1)</code>.
                   The fixed-point functions have an additional scaling parameter <code>postShift</code>.
@ -135,7 +135,7 @@
                   allows the filter coefficients to exceed the range <code>[+1 -1)</code>.
                   The value of <code>postShift</code> is set by the user based on the expected gain through the system being modeled.

- @par            Overflow and Saturation
+ @par            Overflow and Saturation (fixed point versions)
                   Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
                   described separately as part of the function specific documentation below.
 */
--- a/Testing/CMakeLists.txt
+++ b/Testing/CMakeLists.txt
@ -139,6 +139,9 @@ else()

 set(TESTSRC 
  Source/Tests/BasicTestsF32.cpp
+  Source/Tests/BasicTestsQ31.cpp
+  Source/Tests/BasicTestsQ15.cpp
+  Source/Tests/BasicTestsQ7.cpp
  Source/Tests/SVMF32.cpp
  Source/Tests/BayesF32.cpp
  Source/Tests/TransformF32.cpp
--- a/Testing/FrameworkInclude/FPGA.h
+++ b/Testing/FrameworkInclude/FPGA.h
@ -59,6 +59,7 @@ FPGA driver. Used to read a C array describing how to drive the test.

      virtual void ImportPattern_f64(Testing::PatternID_t,char*,Testing::nbSamples_t nb);
      virtual void ImportPattern_f32(Testing::PatternID_t,char*,Testing::nbSamples_t nb);
+      virtual void ImportPattern_q63(Testing::PatternID_t,char*,Testing::nbSamples_t nb);
      virtual void ImportPattern_q31(Testing::PatternID_t,char*,Testing::nbSamples_t nb);
      virtual void ImportPattern_q15(Testing::PatternID_t,char*,Testing::nbSamples_t nb);
      virtual void ImportPattern_q7(Testing::PatternID_t,char*,Testing::nbSamples_t nb);
@ -73,6 +74,7 @@ FPGA driver. Used to read a C array describing how to drive the test.

      virtual void DumpPattern_f64(Testing::outputID_t,Testing::nbSamples_t nb, float64_t* data);
      virtual void DumpPattern_f32(Testing::outputID_t,Testing::nbSamples_t nb, float32_t* data);
+      virtual void DumpPattern_q63(Testing::outputID_t,Testing::nbSamples_t nb, q63_t* data);
      virtual void DumpPattern_q31(Testing::outputID_t,Testing::nbSamples_t nb, q31_t* data);
      virtual void DumpPattern_q15(Testing::outputID_t,Testing::nbSamples_t nb, q15_t* data);
      virtual void DumpPattern_q7(Testing::outputID_t,Testing::nbSamples_t nb, q7_t* data);
--- a/Testing/FrameworkInclude/Pattern.h
+++ b/Testing/FrameworkInclude/Pattern.h
@ -45,6 +45,9 @@ float64_t *loadPattern(Testing::PatternID_t id, PatternMgr *mgr,Testing::nbSampl
 template <>
 float32_t *loadPattern(Testing::PatternID_t id, PatternMgr *mgr,Testing::nbSamples_t &nb, Testing::nbSamples_t maxSamples);

+template <>
+q63_t *loadPattern(Testing::PatternID_t id, PatternMgr *mgr,Testing::nbSamples_t &nb, Testing::nbSamples_t maxSamples);
+
 template <>
 q31_t *loadPattern(Testing::PatternID_t id, PatternMgr *mgr,Testing::nbSamples_t &nb, Testing::nbSamples_t maxSamples);

@ -75,6 +78,9 @@ float64_t *localPattern(Testing::nbSamples_t nb, PatternMgr *mgr);
 template <>
 float32_t *localPattern(Testing::nbSamples_t nb, PatternMgr *mgr);

+template <>
+q63_t *localPattern(Testing::nbSamples_t nb, PatternMgr *mgr);
+
 template <>
 q31_t *localPattern(Testing::nbSamples_t nb, PatternMgr *mgr);

@ -95,6 +101,7 @@ uint8_t *localPattern(Testing::nbSamples_t nb, PatternMgr *mgr);

 extern void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t nb,float64_t* data,PatternMgr *mgr);
 extern void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t,float32_t*,PatternMgr *);
+extern void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t,q63_t*,PatternMgr *);
 extern void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t,q31_t*,PatternMgr *);
 extern void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t,q15_t*,PatternMgr *);
 extern void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t,q7_t*,PatternMgr *);
--- a/Testing/FrameworkInclude/Semihosting.h
+++ b/Testing/FrameworkInclude/Semihosting.h
@ -63,6 +63,7 @@ Semihosting driver. Used to read a text file describing how to drive the test.
      
      virtual void ImportPattern_f64(Testing::PatternID_t,char*,Testing::nbSamples_t nb=0);
      virtual void ImportPattern_f32(Testing::PatternID_t,char*,Testing::nbSamples_t nb=0);
+      virtual void ImportPattern_q63(Testing::PatternID_t,char*,Testing::nbSamples_t nb=0);
      virtual void ImportPattern_q31(Testing::PatternID_t,char*,Testing::nbSamples_t nb=0);
      virtual void ImportPattern_q15(Testing::PatternID_t,char*,Testing::nbSamples_t nb=0);
      virtual void ImportPattern_q7(Testing::PatternID_t,char*,Testing::nbSamples_t nb=0);
@ -78,6 +79,7 @@ Semihosting driver. Used to read a text file describing how to drive the test.

      virtual void DumpPattern_f64(Testing::outputID_t,Testing::nbSamples_t nb, float64_t*);
      virtual void DumpPattern_f32(Testing::outputID_t,Testing::nbSamples_t nb, float32_t*);
+      virtual void DumpPattern_q63(Testing::outputID_t,Testing::nbSamples_t nb, q63_t*);
      virtual void DumpPattern_q31(Testing::outputID_t,Testing::nbSamples_t nb, q31_t*);
      virtual void DumpPattern_q15(Testing::outputID_t,Testing::nbSamples_t nb, q15_t*);
      virtual void DumpPattern_q7(Testing::outputID_t,Testing::nbSamples_t nb, q7_t*);
--- a/Testing/FrameworkInclude/Test.h
+++ b/Testing/FrameworkInclude/Test.h
@ -280,6 +280,7 @@ API of Memory managers used in the test framework
      */
      virtual void ImportPattern_f64(Testing::PatternID_t,char*,Testing::nbSamples_t nb=MAX_NB_SAMPLES)=0;
      virtual void ImportPattern_f32(Testing::PatternID_t,char*,Testing::nbSamples_t nb=MAX_NB_SAMPLES)=0;
+      virtual void ImportPattern_q63(Testing::PatternID_t,char*,Testing::nbSamples_t nb=MAX_NB_SAMPLES)=0;
      virtual void ImportPattern_q31(Testing::PatternID_t,char*,Testing::nbSamples_t nb=MAX_NB_SAMPLES)=0;
      virtual void ImportPattern_q15(Testing::PatternID_t,char*,Testing::nbSamples_t nb=MAX_NB_SAMPLES)=0;
      virtual void ImportPattern_q7(Testing::PatternID_t,char*,Testing::nbSamples_t nb=MAX_NB_SAMPLES)=0;
@ -308,6 +309,7 @@ API of Memory managers used in the test framework
      */
      virtual void DumpPattern_f64(Testing::outputID_t,Testing::nbSamples_t nb, float64_t*)=0;
      virtual void DumpPattern_f32(Testing::outputID_t,Testing::nbSamples_t nb, float32_t*)=0;
+      virtual void DumpPattern_q63(Testing::outputID_t,Testing::nbSamples_t nb, q63_t*)=0;
      virtual void DumpPattern_q31(Testing::outputID_t,Testing::nbSamples_t nb, q31_t*)=0;
      virtual void DumpPattern_q15(Testing::outputID_t,Testing::nbSamples_t nb, q15_t*)=0;
      virtual void DumpPattern_q7(Testing::outputID_t,Testing::nbSamples_t nb, q7_t*)=0;
@ -386,6 +388,7 @@ public:
    */
    float64_t *load_f64(Testing::PatternID_t,Testing::nbSamples_t&,Testing::nbSamples_t maxSamples=MAX_NB_SAMPLES);
    float32_t *load_f32(Testing::PatternID_t,Testing::nbSamples_t&,Testing::nbSamples_t maxSamples=MAX_NB_SAMPLES);
+    q63_t *load_q63(Testing::PatternID_t,Testing::nbSamples_t&,Testing::nbSamples_t maxSamples=MAX_NB_SAMPLES);
    q31_t *load_q31(Testing::PatternID_t,Testing::nbSamples_t&,Testing::nbSamples_t maxSamples=MAX_NB_SAMPLES);
    q15_t *load_q15(Testing::PatternID_t,Testing::nbSamples_t&,Testing::nbSamples_t maxSamples=MAX_NB_SAMPLES);
    q7_t *load_q7(Testing::PatternID_t,Testing::nbSamples_t&,Testing::nbSamples_t maxSamples=MAX_NB_SAMPLES);
@ -402,6 +405,7 @@ public:
    */
    float64_t *local_f64(Testing::nbSamples_t);
    float32_t *local_f32(Testing::nbSamples_t);
+    q63_t *local_q63(Testing::nbSamples_t);
    q31_t *local_q31(Testing::nbSamples_t);
    q15_t *local_q15(Testing::nbSamples_t);
    q7_t *local_q7(Testing::nbSamples_t);
@ -416,6 +420,7 @@ public:
    void dumpPattern_f64(Testing::outputID_t,Testing::nbSamples_t,float64_t*);
    void dumpPattern_f32(Testing::outputID_t,Testing::nbSamples_t,float32_t*);

+    void dumpPattern_q63(Testing::outputID_t,Testing::nbSamples_t,q63_t*);
    void dumpPattern_q31(Testing::outputID_t,Testing::nbSamples_t,q31_t*);
    void dumpPattern_q15(Testing::outputID_t,Testing::nbSamples_t,q15_t*);
    void dumpPattern_q7(Testing::outputID_t,Testing::nbSamples_t,q7_t*);
--- a/Testing/FrameworkSource/Error.cpp
+++ b/Testing/FrameworkSource/Error.cpp
@ -187,8 +187,8 @@ float arm_snr_q15(q15_t *pRef, q15_t *pTest, uint32_t buffSize)
 {
  float EnergySignal = 0.0, EnergyError = 0.0;
  uint32_t i;
-  float SNR;
- 
+  float SNR; 
+
  float32_t testVal,refVal;

  for (i = 0; i < buffSize; i++)
@ -331,6 +331,7 @@ void assert_snr_error(unsigned long nb,AnyPattern<q15_t> &pa,AnyPattern<q15_t> &

   snr = arm_snr_q15(ptrA, ptrB, pa.nbSamples());

+   //printf("SNR = %f\n",snr);

   if (snr < threshold)
   {
@ -353,6 +354,7 @@ void assert_snr_error(unsigned long nb,AnyPattern<q7_t> &pa,AnyPattern<q7_t> &pb

   snr = arm_snr_q7(ptrA, ptrB, pa.nbSamples());

+   //printf("SNR = %f\n",snr);

   if (snr < threshold)
   {
--- a/Testing/FrameworkSource/FPGA.cpp
+++ b/Testing/FrameworkSource/FPGA.cpp
@ -96,6 +96,10 @@ namespace Client
      delete(this->outputNames);
    }

+    /** Read word 64 from C array
+
+    */
+
    /** Read word 32 from C array

    */
@ -531,6 +535,25 @@ namespace Client

    }

+    /** Copy nb q63_t samples of pattern id from the pattern array into p.
+
+        Nothing is copied when p is a null pointer.
+    */
+    void FPGA::ImportPattern_q63(Testing::PatternID_t id,char* p,Testing::nbSamples_t nb)
+    {
+        unsigned long offset,i;
+
+        // The offset is looked up even when there is no destination buffer
+        offset=this->getPatternOffset(id);
+
+        const char *patternStart = this->m_patterns + offset;
+        const q63_t *samples = (const q63_t*)patternStart;
+        q63_t *out = (q63_t*)p;
+
+        if (!out)
+        {
+           return;
+        }
+
+        for(i=0; i < nb; i++)
+        {
+            out[i] = samples[i];
+        }
+    }
+
    void FPGA::ImportPattern_q31(Testing::PatternID_t id,char* p,Testing::nbSamples_t nb)
    {
        unsigned long offset,i;
@ -691,6 +714,25 @@ namespace Client
        }
    }

+    /** Print nb q63_t samples on the standard output.
+
+        The output path of id is printed first, then one 64 bit hexadecimal
+        word per sample, then an END marker. Every line has the "D: " prefix.
+        Nothing is printed when data is a null pointer.
+    */
+    void FPGA::DumpPattern_q63(Testing::outputID_t id,Testing::nbSamples_t nb, q63_t* data)
+    {
+        std::string fileName = this->getOutputPath(id);
+        if (data)
+        {
+           printf("D: %s\n",fileName.c_str());
+           Testing::nbSamples_t i=0;
+           for(i=0; i < nb; i++)
+           {
+              // %llx expects an unsigned long long and uint64_t may be another
+              // type (unsigned long on LP64 hosts), hence the explicit cast.
+              printf("D: 0x%016llx\n",(unsigned long long)data[i]);
+           }
+           printf("D: END\n");
+        }
+    }
+
    void FPGA::DumpPattern_q31(Testing::outputID_t id,Testing::nbSamples_t nb, q31_t* data)
    {
        std::string fileName = this->getOutputPath(id); 
--- a/Testing/FrameworkSource/Pattern.cpp
+++ b/Testing/FrameworkSource/Pattern.cpp
@ -45,6 +45,12 @@ float32_t *loadPattern(Testing::PatternID_t id, Client::PatternMgr *mgr,Testing:
    return(mgr->load_f32(id,nb,maxSamples));
 }

+// q63_t flavour of loadPattern: the request is forwarded to PatternMgr::load_q63.
+template <>
+q63_t *loadPattern(Testing::PatternID_t id, Client::PatternMgr *mgr,Testing::nbSamples_t &nb, Testing::nbSamples_t maxSamples)
+{
+    q63_t *samples = mgr->load_q63(id,nb,maxSamples);
+    return samples;
+}
+
 template <> 
 q31_t *loadPattern(Testing::PatternID_t id, Client::PatternMgr *mgr,Testing::nbSamples_t &nb, Testing::nbSamples_t maxSamples)
 {
@ -94,6 +100,12 @@ float32_t *localPattern(Testing::PatternID_t id, Client::PatternMgr *mgr)
    return(mgr->local_f32(id));
 }

+// q63_t flavour of localPattern: the request is forwarded to PatternMgr::local_q63.
+// NOTE(review): the argument is named id here, but PatternMgr::local_q63 is
+// declared with a Testing::nbSamples_t argument - confirm it is a sample count.
+template <>
+q63_t *localPattern(Testing::PatternID_t id, Client::PatternMgr *mgr)
+{
+    q63_t *buffer = mgr->local_q63(id);
+    return buffer;
+}
+
 template <> 
 q31_t *localPattern(Testing::PatternID_t id, Client::PatternMgr *mgr)
 {
@ -140,6 +152,11 @@ void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t nbSamples,float32_t
  mgr->dumpPattern_f32(id,nbSamples,data);
 }

+// q63_t overload of dumpPattern: the samples are handed to PatternMgr::dumpPattern_q63.
+void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t nbSamples,q63_t* data,PatternMgr *mgr)
+{
+    mgr->dumpPattern_q63(id, nbSamples, data);
+}
+
 void dumpPattern(Testing::outputID_t id,Testing::nbSamples_t nbSamples,q31_t* data,PatternMgr *mgr)
 {
  mgr->dumpPattern_q31(id,nbSamples,data);
--- a/Testing/FrameworkSource/PatternMgr.cpp
+++ b/Testing/FrameworkSource/PatternMgr.cpp
@ -46,6 +46,7 @@ TYPE *PatternMgr::local_##EXT(Testing::nbSamples_t nbSamples) \

 LOCAL(float64_t,f64)
 LOCAL(float32_t,f32)
+LOCAL(q63_t,q63)
 LOCAL(q31_t,q31)
 LOCAL(q15_t,q15)
 LOCAL(q7_t,q7)
@ -90,6 +91,24 @@ float32_t *PatternMgr::load_f32(Testing::PatternID_t id,Testing::nbSamples_t& nb
   
 }

+/* Load pattern id as q63_t samples into a buffer obtained from the memory manager.
+
+   nbSamples receives the number of samples of the pattern, limited to
+   maxSamples when a limit other than MAX_NB_SAMPLES is given.
+   The result is NULL when no buffer could be obtained.
+*/
+q63_t *PatternMgr::load_q63(Testing::PatternID_t id,Testing::nbSamples_t& nbSamples,Testing::nbSamples_t maxSamples)
+{
+    nbSamples = m_io->GetPatternSize(id);
+
+    const bool truncate = (maxSamples != MAX_NB_SAMPLES) && (maxSamples < nbSamples);
+    if (truncate)
+    {
+        nbSamples = maxSamples;
+    }
+
+    char *storage = m_mem->NewBuffer(sizeof(q63_t)*nbSamples);
+    if (storage == NULL)
+    {
+        return NULL;
+    }
+
+    m_io->ImportPattern_q63(id,storage,nbSamples);
+    return (q63_t*)storage;
+}
+
+
 q31_t *PatternMgr::load_q31(Testing::PatternID_t id,Testing::nbSamples_t& nbSamples,Testing::nbSamples_t maxSamples)
 {
    nbSamples=m_io->GetPatternSize(id);
@ -203,6 +222,11 @@ void PatternMgr::dumpPattern_f32(Testing::outputID_t id,Testing::nbSamples_t nbS
   m_io->DumpPattern_f32(id,nbSamples,data);
 }

+/* Dump nbSamples q63_t samples under output id through the IO driver. */
+void PatternMgr::dumpPattern_q63(Testing::outputID_t id,Testing::nbSamples_t nbSamples,q63_t* data)
+{
+    m_io->DumpPattern_q63(id, nbSamples, data);
+}
+
 void PatternMgr::dumpPattern_q31(Testing::outputID_t id,Testing::nbSamples_t nbSamples,q31_t* data)
 {
   m_io->DumpPattern_q31(id,nbSamples,data);
--- a/Testing/FrameworkSource/Semihosting.cpp
+++ b/Testing/FrameworkSource/Semihosting.cpp
@ -663,6 +663,44 @@ namespace Client
          
      }

+      /** Read q63_t samples of pattern id from its text file into p.
+
+          The file starts with a word size line (ignored) and a line giving
+          the number of samples. Each sample is then a comment line followed
+          by a 64 bit hexadecimal word. At most nb samples are read when nb
+          is not MAX_NB_SAMPLES. Nothing is stored when p is a null pointer.
+      */
+      void Semihosting::ImportPattern_q63(Testing::PatternID_t id,char* p,Testing::nbSamples_t nb)
+      {
+          char tmp[256];
+          Testing::nbSamples_t len;
+          Testing::nbSamples_t i=0;
+
+          // %llX stores through an unsigned long long*, which is not
+          // guaranteed to be the same type as uint64_t* (LP64 hosts).
+          unsigned long long raw = 0;
+          uint64_t val;
+          q63_t *ptr=(q63_t*)p;
+
+          std::string fileName = this->getPatternPath(id);
+          // NOTE(review): the result of fopen is not checked - a missing
+          // pattern file gives a null stream to fgets. Confirm the wanted policy.
+          FILE *pattern=fopen(fileName.c_str(), "r");
+          // Ignore word size format
+          fgets(tmp,256,pattern);
+          // Get nb of samples
+          fgets(tmp,256,pattern);
+          len=atoi(tmp);
+
+          if ((nb != MAX_NB_SAMPLES) && (nb < len))
+          {
+             len = nb;
+          }
+
+          if (ptr)
+          {
+             for(i=0;i<len;i++)
+             {
+               // Ignore comment
+                fgets(tmp,256,pattern);
+                fscanf(pattern,"0x%016llX\n",&raw);
+                val = (uint64_t)raw;
+                *ptr = TOTYP(q63_t,val);
+                ptr++;
+             }
+          }
+
+          fclose(pattern);
+
+      }
+
      void Semihosting::ImportPattern_q31(Testing::PatternID_t id,char* p,Testing::nbSamples_t nb)
      {
          char tmp[256];
@ -928,6 +966,24 @@ namespace Client
               fclose(f);
            }
      }
+
+      /** Write nb q63_t samples in the output file of id.
+
+          One 64 bit hexadecimal word is written per line.
+          Nothing is written when data is a null pointer.
+      */
+      void Semihosting::DumpPattern_q63(Testing::outputID_t id,Testing::nbSamples_t nb, q63_t* data)
+      {
+            std::string fileName = this->getOutputPath(id);
+            if (data)
+            {
+                // NOTE(review): the result of fopen is not checked - confirm
+                // the wanted policy when the output file cannot be created.
+                FILE *f = fopen(fileName.c_str(),"w");
+                Testing::nbSamples_t i=0;
+                for(i=0; i < nb; i++)
+                {
+                   // %llx expects an unsigned long long and uint64_t may be another
+                   // type (unsigned long on LP64 hosts), hence the explicit cast.
+                   fprintf(f,"0x%016llx\n",(unsigned long long)data[i]);
+                }
+                fclose(f);
+            }
+      }
+
      void Semihosting::DumpPattern_q31(Testing::outputID_t id,Testing::nbSamples_t nb, q31_t* data)
      {
            std::string fileName = this->getOutputPath(id);
--- a/Testing/Include/Benchmarks/BasicMathsBenchmarksF32.h
+++ b/Testing/Include/Benchmarks/BasicMathsBenchmarksF32.h
@ -12,10 +12,15 @@ class BasicMathsBenchmarksF32:public Client::Suite
            Client::Pattern<float32_t> input2;
            Client::LocalPattern<float32_t> output;

+            Client::RefPattern<float32_t> ref;
+
+
            int nb;

            float32_t *inp1;
            float32_t *inp2;
            float32_t *outp;
+
+            float32_t *refp;
            
    };
--- a/Testing/Include/Tests/BasicTestsQ15.h
+++ b/Testing/Include/Tests/BasicTestsQ15.h
@ -0,0 +1,25 @@
+#include "Test.h"
+#include "Pattern.h"
+// Test suite of the basic maths functions for the q15_t data type.
+class BasicTestsQ15:public Client::Suite
+    {
+        public:
+            BasicTestsQ15(Testing::testID_t id);
+            // NOTE(review): presumably called by the framework before and after each test - confirm
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
+        private:
+            // Declarations coming from BasicTestsQ15_decl.h (content not visible here)
+            #include "BasicTestsQ15_decl.h"
+            
+            // The two q15_t input patterns
+            Client::Pattern<q15_t> input1;
+            Client::Pattern<q15_t> input2;
+
+            // Result buffers: q15_t samples, and q63_t for dotOutput
+            // (the pattern generator writes the q15 dot product references in Q63)
+            Client::LocalPattern<q15_t> output;
+            Client::LocalPattern<q63_t> dotOutput;
+
+            // Reference patterns are not loaded when we are in dump mode
+            Client::RefPattern<q15_t> ref;
+
+            // q63_t reference matching dotOutput
+            Client::RefPattern<q63_t> dotRef;
+
+            /* Offset or scale value */
+            q15_t scalar;
+    };
--- a/Testing/Include/Tests/BasicTestsQ31.h
+++ b/Testing/Include/Tests/BasicTestsQ31.h
@ -0,0 +1,25 @@
+#include "Test.h"
+#include "Pattern.h"
+// Test suite of the basic maths functions for the q31_t data type.
+class BasicTestsQ31:public Client::Suite
+    {
+        public:
+            BasicTestsQ31(Testing::testID_t id);
+            // NOTE(review): presumably called by the framework before and after each test - confirm
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
+        private:
+            // Declarations coming from BasicTestsQ31_decl.h (content not visible here)
+            #include "BasicTestsQ31_decl.h"
+            
+            // The two q31_t input patterns
+            Client::Pattern<q31_t> input1;
+            Client::Pattern<q31_t> input2;
+
+            // Result buffers: q31_t samples, and q63_t for dotOutput
+            // (the pattern generator writes the q31 dot product references in Q63)
+            Client::LocalPattern<q31_t> output;
+            Client::LocalPattern<q63_t> dotOutput;
+
+            // Reference patterns are not loaded when we are in dump mode
+            Client::RefPattern<q31_t> ref;
+
+            // q63_t reference matching dotOutput
+            Client::RefPattern<q63_t> dotRef;
+
+            /* Offset or scale value */
+            q31_t scalar;
+    };
--- a/Testing/Include/Tests/BasicTestsQ7.h
+++ b/Testing/Include/Tests/BasicTestsQ7.h
@ -0,0 +1,25 @@
+#include "Test.h"
+#include "Pattern.h"
+// Test suite of the basic maths functions for the q7_t data type.
+class BasicTestsQ7:public Client::Suite
+    {
+        public:
+            BasicTestsQ7(Testing::testID_t id);
+            // NOTE(review): presumably called by the framework before and after each test - confirm
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
+        private:
+            // Declarations coming from BasicTestsQ7_decl.h (content not visible here)
+            #include "BasicTestsQ7_decl.h"
+            
+            // The two q7_t input patterns
+            Client::Pattern<q7_t> input1;
+            Client::Pattern<q7_t> input2;
+
+            // Result buffers: q7_t samples, and q31_t for dotOutput
+            // (the pattern generator writes the q7 dot product references in Q31)
+            Client::LocalPattern<q7_t> output;
+            Client::LocalPattern<q31_t> dotOutput;
+
+            // Reference patterns are not loaded when we are in dump mode
+            Client::RefPattern<q7_t> ref;
+
+            // q31_t reference matching dotOutput
+            Client::RefPattern<q31_t> dotRef;
+
+            /* Offset or scale value */
+            q7_t scalar;
+    };
--- a/Testing/PatternGeneration/BasicMaths.py
+++ b/Testing/PatternGeneration/BasicMaths.py
@ -7,7 +7,7 @@ import Tools
 # Those patterns are used for tests and benchmarks.
 # For tests, there is the need to add tests for saturation

-def writeTests(config):
+def writeTests(config,format):
    NBSAMPLES=256

    data1=np.random.randn(NBSAMPLES)
@ -39,16 +39,31 @@ def writeTests(config):
    config.writeReference(6, ref)
    
    nb = 3
-    ref = np.array([np.dot(data1[0:nb] ,data2[0:nb])])
-    config.writeReference(7, ref)
+    ref = np.array([np.dot(data1[0:nb] ,data2[0:nb])]) / 2**15
+    if format == 31 or format == 15:
+       config.writeReferenceQ63(7, ref)
+    elif format == 7:
+       config.writeReferenceQ31(7, ref)
+    else:
+       config.writeReference(7, ref)
    
    nb = 8
-    ref = np.array([np.dot(data1[0:nb] ,data2[0:nb])])
-    config.writeReference(8, ref)
+    ref = np.array([np.dot(data1[0:nb] ,data2[0:nb])]) / 2**15
+    if format == 31 or format == 15:
+       config.writeReferenceQ63(8, ref)
+    elif format == 7:
+       config.writeReferenceQ31(8, ref)
+    else:
+       config.writeReference(8, ref)
    
    nb = 9
-    ref = np.array([np.dot(data1[0:nb] ,data2[0:nb])])
-    config.writeReference(9, ref)
+    ref = np.array([np.dot(data1[0:nb] ,data2[0:nb])]) / 2**15
+    if format == 31 or format == 15:
+       config.writeReferenceQ63(9, ref)
+    elif format == 7:
+       config.writeReferenceQ31(9, ref)
+    else:
+       config.writeReference(9, ref)
    
    ref = abs(data1)
    config.writeReference(10, ref)
@ -56,6 +71,88 @@ def writeTests(config):
    ref = np.array([np.dot(data1 ,data2)])
    config.writeReference(11, ref)

+    return(11)
+
+
+def writeTestsWithSat(config,format):
+    """ Write the common test patterns followed by the saturation and shift patterns
+
+    The patterns of writeTests are generated first. Then integer inputs with
+    extreme values ("MaxPosInput", "MaxNegInput", "MaxNeg2Input"), a small
+    random input and the references named "PosSat", "NegSat" and "Shift"
+    are written.
+
+      Args:
+        config (Config): configuration used to write the pattern files
+        format (int): number of fractional bits of the fixed point format (31, 15 or 7)
+      Raises:
+        ValueError: if format is not 31, 15 or 7
+      Returns:
+        Nothing
+    """
+    # NOTE(review): presumably two full 128 bit vectors plus one tail sample
+    # for each data type - confirm against the vectorized kernels.
+    if format == 31:
+        NBSAMPLES=9
+    elif format == 15:
+        NBSAMPLES=17
+    elif format == 7:
+        NBSAMPLES=33
+    else:
+        # Fail early with a clear message instead of an unbound NBSAMPLES below
+        raise ValueError("Unsupported fixed point format: %s" % format)
+
+    nb = writeTests(config,format)
+
+    # Integer patterns alternating between an extreme value and a small value
+    data1 = np.full(NBSAMPLES, 2**format - 1)
+    data1[1::2] = 2
+    data2 = np.full(NBSAMPLES, -2**format)
+    data2[1::2] = -2
+
+    # Normalize with the largest magnitude so that negative samples are bounded
+    # too: every sample is in [-1/3, 1/3] and cannot saturate when doubled below.
+    datar=np.random.randn(NBSAMPLES)
+    datar = datar/np.max(np.abs(datar))
+    datar = datar / 3.0 # Because used to test shift of 2 without saturation
+
+    config.writeInput(12, datar)
+
+    if format == 31:
+        config.writeInputS32(12,data1-1,"MaxPosInput")
+        config.writeInputS32(12,data2+1,"MaxNegInput")
+        config.writeInputS32(12,data2,"MaxNeg2Input")
+    elif format == 15:
+        config.writeInputS16(12,data1-1,"MaxPosInput")
+        config.writeInputS16(12,data2+1,"MaxNegInput")
+        config.writeInputS16(12,data2,"MaxNeg2Input")
+    else:
+        config.writeInputS8(12,data1-1,"MaxPosInput")
+        config.writeInputS8(12,data2+1,"MaxNegInput")
+        config.writeInputS8(12,data2,"MaxNeg2Input")
+
+    # Same values as the integer inputs, scaled to fractional values
+    d1 = 1.0*(data1-1) / 2**format
+    d2 = 1.0*(data2+1) / 2**format
+    d3 = 1.0*(data2) / 2**format
+
+    # Additions of extreme values
+    ref = d1 + d1
+    config.writeReference(nb+1, ref,"PosSat")
+    ref = d2 + d2
+    config.writeReference(nb+2, ref,"NegSat")
+
+    # Subtractions of extreme values
+    ref = d1 - d2
+    config.writeReference(nb+3, ref,"PosSat")
+
+    ref = d2 - d1
+    config.writeReference(nb+4, ref,"NegSat")
+
+    # Product and negation of the most negative value
+    ref = d3*d3
+    config.writeReference(nb+5, ref,"PosSat")
+
+    ref = -d3
+    config.writeReference(nb+6, ref,"PosSat")
+
+    # Offsets
+    ref = d1 + 0.9
+    config.writeReference(nb+7, ref,"PosSat")
+    ref = d2 - 0.9
+    config.writeReference(nb+8, ref,"NegSat")
+
+    # Scaling by the first sample
+    ref = d3 * d3[0]
+    config.writeReference(nb+9, ref,"PosSat")
+
+    # Multiplications by 2
+    ref = datar * 2.0
+    config.writeReference(nb+10, ref,"Shift")
+
+    ref = d1 * 2.0
+    config.writeReference(nb+11, ref,"Shift")
+
+    ref = d2 * 2.0
+    config.writeReference(nb+12, ref,"Shift")
+
+

 PATTERNDIR = os.path.join("Patterns","DSP","BasicMaths","BasicMaths")
 PARAMDIR = os.path.join("Parameters","DSP","BasicMaths","BasicMaths")
@ -67,10 +164,10 @@ configq7=Tools.Config(PATTERNDIR,PARAMDIR,"q7")



-writeTests(configf32)
-writeTests(configq31)
-writeTests(configq15)
-writeTests(configq7)
+#writeTests(configf32,0)
+writeTestsWithSat(configq31,31)
+writeTestsWithSat(configq15,15)
+writeTestsWithSat(configq7,7)

 # Params just as example
 someLists=[[1,3,5],[1,3,5],[1,3,5]]
--- a/Testing/PatternGeneration/Tools.py
+++ b/Testing/PatternGeneration/Tools.py
@ -52,6 +52,14 @@ def float64_to_hex(f):
    """
    return hex(struct.unpack('<Q', struct.pack('<d', f))[0])

+def to_q63(v):
+    """ Hexadecimal text of v converted to Q63
+
+    v is scaled by 2**63, rounded and saturated to the signed 64 bit range.
+    The result is "0x" followed by the 16 upper case hexadecimal digits of
+    the two's complement representation.
+    """
+    r = int(round(v * 2**63))
+    # Saturation to the signed 64 bit range
+    r = max(-0x8000000000000000, min(0x7FFFFFFFFFFFFFFF, r))
+    # The mask gives the unsigned value having the same 64 bit pattern
+    return ("0x%016X" % (r & 0xFFFFFFFFFFFFFFFF))
+
 def to_q31(v):
    r = int(round(v * 2**31))
    if (r > 0x07FFFFFFF):
@ -113,6 +121,21 @@ class Config:
        else:
          return(os.path.join(self._patternDir,"Input%d_%s.txt" % (i,self._ext)))

+    def inputS32P(self,i,name=None):
+        """ Path to an input pattern file with the "s32" suffix
+      
+        Args:
+          i (int): ID of the input pattern
+          name (str): optional file name prefix used instead of "Input"
+        Raises:
+          Nothing 
+        Returns:
+          str : path to the file where to generate the pattern data
+        """
+        prefix = name if name else "Input"
+        fileName = "%s%d_%s.txt" % (prefix,i,"s32")
+        return(os.path.join(self._patternDir,fileName))
+
    def inputS16P(self,i,name=None):
        """ Path to a reference pattern from the ID
      
@ -128,6 +151,21 @@ class Config:
        else:
          return(os.path.join(self._patternDir,"Input%d_%s.txt" % (i,"s16")))

+    def inputS8P(self,i,name=None):
+        """ Path to an input pattern file with the "s8" suffix
+      
+        Args:
+          i (int): ID of the input pattern
+          name (str): optional file name prefix used instead of "Input"
+        Raises:
+          Nothing 
+        Returns:
+          str : path to the file where to generate the pattern data
+        """
+        prefix = name if name else "Input"
+        fileName = "%s%d_%s.txt" % (prefix,i,"s8")
+        return(os.path.join(self._patternDir,fileName))
+
    def inputQ31P(self,i,name=None):
        """ Path to a reference pattern from the ID
      
@ -248,6 +286,36 @@ class Config:
        else:
          return(os.path.join(self._patternDir,"Reference%d_%s.txt" % (i,"s32")))

+    def refQ63P(self,i,name=None):
+        """ Path to a reference pattern file with the "q63" suffix
+      
+        Args:
+          i (int): ID of the reference pattern
+          name (str): optional file name prefix used instead of "Reference"
+        Raises:
+          Nothing 
+        Returns:
+          str : path to the file where to generate the pattern data
+        """
+        prefix = name if name else "Reference"
+        fileName = "%s%d_%s.txt" % (prefix,i,"q63")
+        return(os.path.join(self._patternDir,fileName))
+
+    def refQ31P(self,i,name=None):
+        """ Path to a reference pattern file with the "q31" suffix
+      
+        Args:
+          i (int): ID of the reference pattern
+          name (str): optional file name prefix used instead of "Reference"
+        Raises:
+          Nothing 
+        Returns:
+          str : path to the file where to generate the pattern data
+        """
+        prefix = name if name else "Reference"
+        fileName = "%s%d_%s.txt" % (prefix,i,"q31")
+        return(os.path.join(self._patternDir,fileName))
+
    def refF32P(self,i,name=None):
        """ Path to a reference pattern from the ID
      
@ -328,6 +396,31 @@ class Config:
                f.write("// %f\n" % v)
                f.write("%s\n" % float_to_hex(v))

+    def _writeVectorQ63(self,i,data):
+        """ Write pattern data
+        
+        The format is recognized by the test framework script.
+        First line is the sample width (D for 64 bits)
+        Second line is number of samples
+        Other lines are hexadecimal representation of the samples in format
+        which can be read on big endian ARM.
+        
+          Args:
+            i (str): Path of the pattern file to write
+            data (array): Vector containing the data
+          Raises:
+            Nothing 
+          Returns:
+            Nothing
+        """
+        with open(i,"w") as f:
+            # Write sample dimension nb sample header
+            #np.savetxt(i, data, newline="\n", header="W\n%d" % len(data),comments ="" )
+            f.write("D\n%d\n" % len(data))
+            for v in data:
+                f.write("// %f\n" % v)
+                f.write("%s\n" % to_q63(v))
+
    def _writeVectorQ31(self,i,data):
        """ Write pattern data
        
@ -508,6 +601,8 @@ class Config:
          self._writeVectorF64(self.refP(j,name),data)
        if (self._ext == "f32"):
          self._writeVectorF32(self.refP(j,name),data)
+        if (self._ext == "q63"):
+          self._writeVectorQ63(self.refP(j,name),data)
        if (self._ext == "q31"):
          self._writeVectorQ31(self.refP(j,name),data)
        if (self._ext == "q15"):
@ -519,6 +614,12 @@ class Config:
        if (self._ext == "s8"):
          self._writeVectorS8(self.refP(j,name),data)

+    def writeReferenceQ63(self,j,data,name=None):
+        self._writeVectorQ63(self.refQ63P(j,name),data)
+
+    def writeReferenceQ31(self,j,data,name=None):
+        self._writeVectorQ31(self.refQ31P(j,name),data)
+
    def writeReferenceS8(self,j,data,name=None):
        self._writeVectorS8(self.refS8P(j,name),data)

@ -556,9 +657,15 @@ class Config:
    def writeInputQ7(self,j,data,name=None):
        self._writeVectorQ7(self.inputQ7P(j,name),data)

+    def writeInputS32(self,j,data,name=None):
+        self._writeVectorS32(self.inputS32P(j,name),data)
+
    def writeInputS16(self,j,data,name=None):
        self._writeVectorS16(self.inputS16P(j,name),data)

+    def writeInputS8(self,j,data,name=None):
+        self._writeVectorS8(self.inputS8P(j,name),data)
+
    def writeInputU32(self,j,data,name=None):
        self._writeVectorU32(self.inputU32P(j,name),data)

--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Input12_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Input12_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -0.127026
+0xEFBE
+// 0.135563
+0x115A
+// -0.055957
+0xF8D6
+// 0.005012
+0x00A4
+// 0.049539
+0x0657
+// 0.143211
+0x1255
+// 0.041455
+0x054E
+// -0.054525
+0xF905
+// 0.016068
+0x020F
+// -0.120403
+0xF097
+// 0.097939
+0x0C89
+// -0.110690
+0xF1D5
+// 0.333333
+0x2AAB
+// 0.004649
+0x0098
+// 0.090070
+0x0B87
+// 0.027590
+0x0388
+// 0.058612
+0x0781
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Input1_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Input1_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Input2_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Input2_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/MaxNeg2Input12_s16.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/MaxNeg2Input12_s16.txt
@ -0,0 +1,36 @@
+H
+17
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
+// -2
+0xFFFE
+// -32768
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/MaxNegInput12_s16.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/MaxNegInput12_s16.txt
@ -0,0 +1,36 @@
+H
+17
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
+// -1
+0xFFFF
+// -32767
+0x8001
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/MaxPosInput12_s16.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/MaxPosInput12_s16.txt
@ -0,0 +1,36 @@
+H
+17
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
+// 1
+0x0001
+// 32766
+0x7FFE
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat13_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat13_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat15_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat15_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
+// -0.000061
+0xFFFE
+// -1.999908
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat17_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat17_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat19_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat19_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
+// -0.900031
+0x8CCC
+// -1.899969
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat20_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/NegSat20_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
+// -0.900061
+0x8CCB
+// -1.900000
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat12_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat12_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat14_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat14_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
+// 0.000061
+0x0002
+// 1.999908
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat16_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat16_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
+// 0.000000
+0x0000
+// 1.000000
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat17_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat17_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat18_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat18_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat19_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat19_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
+// 0.900031
+0x7334
+// 1.899939
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat20_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/PosSat20_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
+// 0.000061
+0x0002
+// 1.000000
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference10_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference10_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference11_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference11_q15.txt
@ -0,0 +1,4 @@
+H
+1
+// 13.425417
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference1_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference1_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference2_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference2_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference3_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference3_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference4_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference4_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference5_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference5_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference6_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference6_q15.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference7_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference7_q15.txt
@ -1,4 +1,4 @@
 H
 1
-// 0.049476
-0x0655
+// 0.000003
+0x0000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference7_q63.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference7_q63.txt
@ -0,0 +1,4 @@
+D
+1
+// 0.000003
+0x0000172BEFBB2F71
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference8_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference8_q15.txt
@ -1,4 +1,4 @@
 H
 1
-// 0.273481
-0x2301
+// 0.000008
+0x0000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference8_q63.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference8_q63.txt
@ -0,0 +1,4 @@
+D
+1
+// 0.000006
+0x00003027A897E616
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference9_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference9_q15.txt
@ -1,4 +1,4 @@
 H
 1
-// 0.308351
-0x2778
+// 0.000016
+0x0001
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference9_q63.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Reference9_q63.txt
@ -0,0 +1,4 @@
+D
+1
+// 0.000008
+0x00004122B89F72C9
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Shift21_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Shift21_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -0.254051
+0xDF7B
+// 0.271125
+0x22B4
+// -0.111913
+0xF1AD
+// 0.010024
+0x0148
+// 0.099078
+0x0CAF
+// 0.286423
+0x24AA
+// 0.082909
+0x0A9D
+// -0.109049
+0xF20B
+// 0.032135
+0x041D
+// -0.240805
+0xE12D
+// 0.195877
+0x1913
+// -0.221380
+0xE3AA
+// 0.666667
+0x5555
+// 0.009299
+0x0131
+// 0.180140
+0x170F
+// 0.055180
+0x0710
+// 0.117224
+0x0F01
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Shift22_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Shift22_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
+// 0.000061
+0x0002
+// 1.999878
+0x7FFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Shift23_q15.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ15/Shift23_q15.txt
@ -0,0 +1,36 @@
+H
+17
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
+// -0.000061
+0xFFFE
+// -1.999939
+0x8000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/Input12_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/Input12_q31.txt
@ -0,0 +1,20 @@
+W
+9
+// -0.085318
+0xF5144ACE
+// -0.193783
+0xE7321E29
+// -0.014971
+0xFE156F8A
+// 0.209494
+0x1AD0AF9C
+// -0.112886
+0xF18CF0A8
+// 0.333333
+0x2AAAAAAB
+// 0.221288
+0x1C5327BE
+// 0.021019
+0x02B0BCF5
+// 0.158600
+0x144D0407
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/Input1_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/Input1_q31.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/Input2_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/Input2_q31.txt
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/MaxNeg2Input12_s32.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/MaxNeg2Input12_s32.txt
@ -0,0 +1,20 @@
+W
+9
+// -2147483648
+0x80000000
+// -2
+0xFFFFFFFE
+// -2147483648
+0x80000000
+// -2
+0xFFFFFFFE
+// -2147483648
+0x80000000
+// -2
+0xFFFFFFFE
+// -2147483648
+0x80000000
+// -2
+0xFFFFFFFE
+// -2147483648
+0x80000000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/MaxNegInput12_s32.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/MaxNegInput12_s32.txt
@ -0,0 +1,20 @@
+W
+9
+// -2147483647
+0x80000001
+// -1
+0xFFFFFFFF
+// -2147483647
+0x80000001
+// -1
+0xFFFFFFFF
+// -2147483647
+0x80000001
+// -1
+0xFFFFFFFF
+// -2147483647
+0x80000001
+// -1
+0xFFFFFFFF
+// -2147483647
+0x80000001
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/MaxPosInput12_s32.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/MaxPosInput12_s32.txt
@ -0,0 +1,20 @@
+W
+9
+// 2147483646
+0x7FFFFFFE
+// 1
+0x00000001
+// 2147483646
+0x7FFFFFFE
+// 1
+0x00000001
+// 2147483646
+0x7FFFFFFE
+// 1
+0x00000001
+// 2147483646
+0x7FFFFFFE
+// 1
+0x00000001
+// 2147483646
+0x7FFFFFFE
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/NegSat13_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/NegSat13_q31.txt
@ -0,0 +1,20 @@
+W
+9
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/NegSat15_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/NegSat15_q31.txt
@ -0,0 +1,20 @@
+W
+9
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
+// -0.000000
+0xFFFFFFFE
+// -2.000000
+0x80000000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/NegSat19_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/NegSat19_q31.txt
@ -0,0 +1,20 @@
+W
+9
+// -1.900000
+0x80000000
+// -0.900000
+0x8CCCCCCC
+// -1.900000
+0x80000000
+// -0.900000
+0x8CCCCCCC
+// -1.900000
+0x80000000
+// -0.900000
+0x8CCCCCCC
+// -1.900000
+0x80000000
+// -0.900000
+0x8CCCCCCC
+// -1.900000
+0x80000000
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/PosSat12_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/PosSat12_q31.txt
@ -0,0 +1,20 @@
+W
+9
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
--- a/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/PosSat14_q31.txt
+++ b/Testing/Patterns/DSP/BasicMaths/BasicMathsQ31/PosSat14_q31.txt
@ -0,0 +1,20 @@
+W
+9
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
+// 0.000000
+0x00000002
+// 2.000000
+0x7FFFFFFF
--- a/Show More
+++ b/Show More