From 6f229b4f6528962a7114f71309f402bd404855a3 Mon Sep 17 00:00:00 2001
From: Christophe Favergeon <Christophe.Favergeon@arm.com>
Date: Wed, 4 Nov 2020 10:46:38 +0100
Subject: [PATCH] CMSIS-DSP: New MVE implementation of the FIR f32

---
 .gitignore                                    |   2 +-
 .../ARM/arm_fir_example/arm_fir_example_f32.c |  25 +-
 Source/FilteringFunctions/arm_fir_f32.c       | 744 +++++++++---------
 Source/FilteringFunctions/arm_fir_init_f32.c  |  16 +-
 Testing/.gitignore                            |   2 +
 Testing/CMakeLists.txt                        |  23 +-
 Testing/Include/Tests/FIRF32.h                |   2 +
 Testing/Source/Benchmarks/FIRF32.cpp          |  22 +-
 Testing/Source/Tests/FIRF16.cpp               |   2 +-
 Testing/Source/Tests/FIRF32.cpp               |  21 +-
 Testing/Source/Tests/FIRQ15.cpp               |   2 +-
 Testing/Source/Tests/FIRQ31.cpp               |   2 +-
 Testing/Source/Tests/FIRQ7.cpp                |   2 +-
 Testing/TestScripts/CodeGen.py                |  22 +-
 Testing/createDefaultFolder.sh                |   2 +
 Testing/processTests.py                       |   6 +-
 16 files changed, 480 insertions(+), 415 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8b95ada7..f6edf40f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@ DSP_Lib_TestSuite/build/
 PythonWrapper/build/
 PythonWrapper/cmsisdsp.cp36-win_amd64.pyd
 PythonWrapper/rec_2.dat
-Output.pickle
+*.pickle
 build_*/
 Examples/ARM/arm_fft_bin_example/RTE/
 Examples/ARM/arm_fft_bin_example/RTE/
diff --git a/Examples/ARM/arm_fir_example/arm_fir_example_f32.c b/Examples/ARM/arm_fir_example/arm_fir_example_f32.c
index 768c030c..f28359fb 100644
--- a/Examples/ARM/arm_fir_example/arm_fir_example_f32.c
+++ b/Examples/ARM/arm_fir_example/arm_fir_example_f32.c
@@ -143,6 +143,14 @@ this example is not giving better SNR ...
 */
 #define SNR_THRESHOLD_F32    75.0f
 #define BLOCK_SIZE            32
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Must be a multiple of 16 */
+#define NUM_TAPS_ARRAY_SIZE              32
+#else
+#define NUM_TAPS_ARRAY_SIZE              29
+#endif
+
 #define NUM_TAPS              29
 
 /* -------------------------------------------------------------------
@@ -162,20 +170,31 @@ static float32_t testOutput[TEST_LENGTH_SAMPLES];
 /* -------------------------------------------------------------------
  * Declare State buffer of size (numTaps + blockSize - 1)
  * ------------------------------------------------------------------- */
-
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+static float32_t firStateF32[2 * BLOCK_SIZE + NUM_TAPS - 1];
+#else
 static float32_t firStateF32[BLOCK_SIZE + NUM_TAPS - 1];
+#endif 
 
 /* ----------------------------------------------------------------------
 ** FIR Coefficients buffer generated using fir1() MATLAB function.
 ** fir1(28, 6/24)
 ** ------------------------------------------------------------------- */
-
-const float32_t firCoeffs32[NUM_TAPS] = {
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+const float32_t firCoeffs32[NUM_TAPS_ARRAY_SIZE] = {
+  -0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f,
+  -0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f,
+  +0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f,
+  +0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f, 0.0f,0.0f,0.0f
+};
+#else
+const float32_t firCoeffs32[NUM_TAPS_ARRAY_SIZE] = {
   -0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f,
   -0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f,
   +0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f,
   +0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f
 };
+#endif
 
 /* ------------------------------------------------------------------
  * Global variables for FIR LPF Example
diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c
index 5a728e12..bcfe40f5 100644
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@@ -104,10 +104,18 @@
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first blockSize samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
+
   @par           Fixed-Point Behavior
                    Care must be taken when using the fixed-point versions of the FIR filter functions.
                    In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
                    Refer to the function specific documentation below for usage guidelines.
+
  */
 
 /**
@@ -126,578 +134,542 @@
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
-static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
+#define FIR_F32_MAX_COEF_BLK        8
+
+#define FIR_F32_CORE(pSamples, c, NB_TAPS)                                 \
+        vecAcc0 = vdupq_n_f32(0.0f);                                       \
+        for (int i = 0; i < NB_TAPS; i++) {                                \
+            vecIn0 = vld1q(&pSamples[i]);                                  \
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);                        \
+        }
+
+
+static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
+  const float32_t * __restrict pSrc, 
+  float32_t * __restrict pDst, uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
-    const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
-    const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
-    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
-    const float32_t *pTempSrc;          /* Temporary pointer to the source data */
-    float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    f32x4_t vecIn0;
-    f32x4_t vecAcc0;
-    float32_t c0, c1, c2, c3;
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t      *pState = pRefStatePtr; /* State pointer */
+    const float32_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
+    float32_t      *pStateCur;  /* Points to the current sample of the state */
+    const float32_t *pSamples;  /* Temporary pointer to the sample buffer */
+    float32_t      *pOutput;    /* Temporary pointer to the output buffer */
+    const float32_t *pTempSrc;  /* Temporary pointer to the source data */
+    float32_t      *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    int32_t         blkCnt;
+    float32x4_t         vecIn0;
+    float32x4_t         vecAcc0;
+    const int       NB_TAPS=4;
+    float32_t       c[NB_TAPS];
+    const float32_t *pCoeffsCur = pCoeffs;
 
     /*
      * pState points to state array which contains previous frame (numTaps - 1) samples
      * pStateCur points to the location where the new input data should be written
      */
     pStateCur = &(pState[(numTaps - 1u)]);
-    pSamples  = pState;
-    pTempSrc  = pSrc;
-    pOutput   = pDst;
-
-    if (((numTaps - 1) / 4) == 0)
-    {
-        const float32_t *pCoeffsCur = pCoeffs;
-
-        c0 = *pCoeffsCur++;
-        c1 = *pCoeffsCur++;
-        c2 = *pCoeffsCur++;
-        c3 = *pCoeffsCur++;
-
-        blkCnt = blockSize >> 2;
-        while (blkCnt > 0U)
-        {
-            /*
-             * Save 4 input samples in the history buffer
-             */
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
-
-            vecIn0 = vld1q(pSamples);
-            vecAcc0 = vmulq(vecIn0, c0);
-
-            vecIn0 = vld1q(&pSamples[1]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
-            vecIn0 = vld1q(&pSamples[2]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+    pTempSrc = pSrc;
 
-            vecIn0 = vld1q(&pSamples[3]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+    pSamples = pState;
+    pOutput = pDst;
 
-            vst1q(pOutput, vecAcc0);
+    for (int i = 0; i < NB_TAPS; i++)
+        c[i] = *pCoeffsCur++;
 
-            pOutput += 4;
-            pSamples += 4;
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        /*
+         * Save 4 input samples in the history buffer
+         */
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
 
-            blkCnt--;
-        }
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
 
-        blkCnt = blockSize & 3;
-        if (blkCnt > 0U)
-        {
-            mve_pred16_t p0 = vctp32q(blkCnt);
+        vst1q(pOutput, vecAcc0);
 
-            vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
-            pStateCur += blkCnt;
-            pTempSrc += blkCnt;
+        pOutput += 4;
+        pSamples += 4;
 
-            vecIn0 = vld1q(pSamples);
-            vecAcc0 = vmulq(vecIn0, c0);
+        blkCnt--;
+    }
 
-            vecIn0 = vld1q(&pSamples[1]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+    blkCnt = blockSize & 3;
+    {
+        mve_pred16_t    p0 = vctp32q(blkCnt);
 
-            vecIn0 = vld1q(&pSamples[2]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
 
-            vecIn0 = vld1q(&pSamples[3]);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
 
-            vstrwq_p_f32(pOutput, vecAcc0, p0);
-        }
+        vstrwq_p_f32(pOutput, vecAcc0, p0);
     }
 
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
+    blkCnt = numTaps - 1;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+
+        vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
         pTempSrc += 4;
         pTempDest += 4;
-        blkCnt--;
-    }
-
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        blkCnt -= 4;
     }
+    while (blkCnt > 0);
 }
 
 
-static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
+
+static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
+  const float32_t * __restrict pSrc, 
+  float32_t * __restrict pDst, uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr;      /* State pointer */
     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
-    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    f32x4_t vecIn0;
-    f32x4_t vecAcc0;
+    int32_t  blkCnt;
     float32_t c0, c1, c2, c3;
     float32_t c4, c5, c6, c7;
-    const float32_t *pCoeffsCur = pCoeffs;
 
-    /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
-     */
-    pStateCur = &(pState[(numTaps - 1u)]);
+
     pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int cnt = blockSize;
+    do {
+        mve_pred16_t p0 = vctp32q(cnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 4;
+        pTempSrc += 4;
+        cnt -= 4;
+    } while(cnt > 0);
 
-    pSamples = pState;
-    pOutput = pDst;
 
-    c0 = *pCoeffsCur++;
-    c1 = *pCoeffsCur++;
-    c2 = *pCoeffsCur++;
-    c3 = *pCoeffsCur++;
-    c4 = *pCoeffsCur++;
-    c5 = *pCoeffsCur++;
-    c6 = *pCoeffsCur++;
-    c7 = *pCoeffsCur++;
 
-    blkCnt = blockSize >> 2;
-    while (blkCnt > 0U)
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 2;
+    while(cnt > 0) 
     {
-        /*
-         * Save 4 input samples in the history buffer
-         */
-        vst1q(pStateCur, vld1q(pTempSrc));
-        pStateCur += 4;
-        pTempSrc += 4;
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
 
         vecIn0 = vld1q(pSamples);
         vecAcc0 = vmulq(vecIn0, c0);
-
         vecIn0 = vld1q(&pSamples[1]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
         vecIn0 = vld1q(&pSamples[2]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
         vecIn0 = vld1q(&pSamples[3]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
         vecIn0 = vld1q(&pSamples[4]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
         vecIn0 = vld1q(&pSamples[5]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
         vecIn0 = vld1q(&pSamples[6]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
         vecIn0 = vld1q(&pSamples[7]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-        vst1q(pOutput, vecAcc0);
-
-        pOutput += 4;
         pSamples += 4;
-
-        blkCnt--;
+        vst1q(pDst, vecAcc0);
+        cnt--;
+        pDst += 4;
     }
 
-    blkCnt = blockSize & 3;
-    if (blkCnt > 0U)
+    cnt = blockSize & 3;
+    if (cnt > 0) 
     {
-        mve_pred16_t p0 = vctp32q(blkCnt);
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
 
-        vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
-        pStateCur += blkCnt;
-        pTempSrc += blkCnt;
+        mve_pred16_t p0 = vctp32q(cnt);
 
         vecIn0 = vld1q(pSamples);
         vecAcc0 = vmulq(vecIn0, c0);
-
         vecIn0 = vld1q(&pSamples[1]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
         vecIn0 = vld1q(&pSamples[2]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
         vecIn0 = vld1q(&pSamples[3]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
         vecIn0 = vld1q(&pSamples[4]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
         vecIn0 = vld1q(&pSamples[5]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
         vecIn0 = vld1q(&pSamples[6]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
         vecIn0 = vld1q(&pSamples[7]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-        vstrwq_p_f32(pOutput, vecAcc0, p0);
+        vstrwq_p_f32(pDst, vecAcc0,p0);
     }
 
+
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+    blkCnt = numTaps;
+    while (blkCnt > 0)
     {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 4;
-        pTempDest += 4;
+        *pTempDest++ = *pTempSrc++;
         blkCnt--;
     }
-
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
-    }
 }
 
 
+
 void arm_fir_f32(
 const arm_fir_instance_f32 * S,
 const float32_t * pSrc,
 float32_t * pDst,
 uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    /* 
+       S->pState is the arm_fir_partial_accu
+       S->pState + blockSize is the FIR state
+    */
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr ;      /* State pointer */
     const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
     const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
     float32_t *pOutput;                 /* Temporary pointer to the output buffer */
     const float32_t *pTempSrc;          /* Temporary pointer to the source data */
     float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
     uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
     uint32_t  blkCnt;
-    int32_t numCnt;
-    f32x4_t vecIn0;
-    f32x4_t vecAcc0;
     float32_t c0, c1, c2, c3;
     float32_t c4, c5, c6, c7;
 
     /*
      * [1 to 8 taps] specialized routines
      */
-    if (blockSize >= 8)
+    if (numTaps <= 4)
     {
-       if (numTaps <= 4)
-       {
-           arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
-           return;
-       }
+        arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
+        return;
     }
-    if (blockSize >= 8)
+    else if (numTaps <= 8)
     {
-       if (numTaps <= 8)
-       {
-           arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
-           return;
-       }
+        arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
+        return;
     }
 
-    if (blockSize >= 8)
+    pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int cnt = blockSize;
+    do {
+        mve_pred16_t p0 = vctp32q(cnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 4;
+        pTempSrc += 4;
+        cnt -= 4;
+    } while(cnt > 0);
+
+    float32_t *partial_accu_ptr = S->pState;
+
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 2;
+    while(cnt > 0) {
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 4;
+        vst1q(partial_accu_ptr, vecAcc0);
+        cnt--;
+        partial_accu_ptr += 4;
+    }
+
+    cnt = blockSize & 3;
+    if (cnt > 0) 
     {
-        /*
-         * pState points to state array which contains previous frame (numTaps - 1) samples
-         * pStateCur points to the location where the new input data should be written
-         */
-        pStateCur = &(pState[(numTaps - 1u)]);
-        pTempSrc = pSrc;
-        pSamples = pState;
-        pOutput = pDst;
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
 
-        blkCnt = blockSize >> 2;
-        while (blkCnt > 0U)
-        {
-            int32_t       i;
-            const float32_t *pCoeffsCur = pCoeffs;
-
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
-
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
+        mve_pred16_t p0 = vctp32q(cnt);
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += cnt;
+        vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
+        partial_accu_ptr += cnt;
+    }
+
+    int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+    int sample_offset = FIR_F32_MAX_COEF_BLK;
+    while (localTaps > FIR_F32_MAX_COEF_BLK) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+
+        partial_accu_ptr = S->pState;
+        pSamples = pState + sample_offset;
+        int cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
             vecIn0 = vld1q(pSamples);
             vecAcc0 = vmulq(vecIn0, c0);
-
             vecIn0 = vld1q(&pSamples[1]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
             vecIn0 = vld1q(&pSamples[2]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
             vecIn0 = vld1q(&pSamples[3]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
             vecIn0 = vld1q(&pSamples[4]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
             vecIn0 = vld1q(&pSamples[5]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
             vecIn0 = vld1q(&pSamples[6]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
             vecIn0 = vld1q(&pSamples[7]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 4;
+            vecAcc0 += vld1q_f32(partial_accu_ptr);
+            vst1q(partial_accu_ptr, vecAcc0);
+            cnt--;
+            partial_accu_ptr += 4;
+        }
 
-            pSamples += 8;
-
-            numCnt = ((int32_t)numTaps - 8) / 8;
-
-            for (i = 0; i < numCnt; i++)
-            {
-                c0 = *pCoeffsCur++;
-                c1 = *pCoeffsCur++;
-                c2 = *pCoeffsCur++;
-                c3 = *pCoeffsCur++;
-                c4 = *pCoeffsCur++;
-                c5 = *pCoeffsCur++;
-                c6 = *pCoeffsCur++;
-                c7 = *pCoeffsCur++;
-
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
-                vecIn0 = vld1q(&pSamples[1]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
-                vecIn0 = vld1q(&pSamples[2]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
-                vecIn0 = vld1q(&pSamples[3]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
-                vecIn0 = vld1q(&pSamples[4]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
-                vecIn0 = vld1q(&pSamples[5]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
-                vecIn0 = vld1q(&pSamples[6]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
-                vecIn0 = vld1q(&pSamples[7]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            mve_pred16_t p0 = vctp32q(cnt);
 
-                pSamples += 8;
-            }
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += cnt;
+            vecAcc0 += vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
+            partial_accu_ptr += cnt;
+        }
 
-            numCnt = ((int32_t)numTaps - 8) & 7;
+        localTaps -= FIR_F32_MAX_COEF_BLK;
+        sample_offset += FIR_F32_MAX_COEF_BLK;
+    }
 
-            while (numCnt > 0)
-            {
-                c0 = *pCoeffsCur++;
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-                pSamples ++;
+    pSamples = pState + sample_offset;
+
+    if (localTaps > 4) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+        pOutput = pDst;
 
-                numCnt --;
-            }
+        partial_accu_ptr = S->pState;
+        cnt = blockSize  >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
-            vst1q(pOutput, vecAcc0);
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 4;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0+pap);
+            cnt--;
+            partial_accu_ptr += 4;
             pOutput += 4;
-            pSamples = pSamples - numTaps + 4;
-
-            blkCnt--;
         }
 
-        blkCnt = blockSize & 3;
-        if (blkCnt > 0U)
-        {
-            mve_pred16_t p0 = vctp32q(blkCnt);
-            int32_t       i;
-            const float32_t *pCoeffsCur = pCoeffs;
-
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
-
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
+        cnt = blockSize  & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            mve_pred16_t p0 = vctp32q(cnt);
 
             vecIn0 = vld1q(pSamples);
             vecAcc0 = vmulq(vecIn0, c0);
-
             vecIn0 = vld1q(&pSamples[1]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
             vecIn0 = vld1q(&pSamples[2]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
             vecIn0 = vld1q(&pSamples[3]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
             vecIn0 = vld1q(&pSamples[4]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
             vecIn0 = vld1q(&pSamples[5]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
             vecIn0 = vld1q(&pSamples[6]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
             vecIn0 = vld1q(&pSamples[7]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-            pSamples += 8;
-
-            numCnt = ((int32_t)numTaps - 8) / 8;
-
-            for (i = 0; i < numCnt; i++)
-            {
-                c0 = *pCoeffsCur++;
-                c1 = *pCoeffsCur++;
-                c2 = *pCoeffsCur++;
-                c3 = *pCoeffsCur++;
-                c4 = *pCoeffsCur++;
-                c5 = *pCoeffsCur++;
-                c6 = *pCoeffsCur++;
-                c7 = *pCoeffsCur++;
-
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
-                vecIn0 = vld1q(&pSamples[1]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
-                vecIn0 = vld1q(&pSamples[2]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
-                vecIn0 = vld1q(&pSamples[3]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
-                vecIn0 = vld1q(&pSamples[4]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
-                vecIn0 = vld1q(&pSamples[5]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
-                vecIn0 = vld1q(&pSamples[6]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
-                vecIn0 = vld1q(&pSamples[7]);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
-                pSamples += 8;
-            }
-
-            numCnt = ((int32_t)numTaps - 8) & 7;
-
-            while (numCnt > 0)
-            {
-                c0 = *pCoeffsCur++;
-                vecIn0 = vld1q(pSamples);
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-                pSamples ++;
-
-                numCnt --;
-            }
-
-            vstrwq_p_f32(pOutput, vecAcc0, p0);
+            pSamples += cnt;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
+            partial_accu_ptr += cnt;
+            pOutput += cnt;
         }
     }
-    else
-    {
-        float32_t *pStateCurnt;                        /* Points to the current sample of the state */
-        float32_t *px;                                 /* Temporary pointer for state buffer */
-  const float32_t *pb;                                 /* Temporary pointer for coefficient buffer */
-        float32_t acc0;                                /* Accumulator */
-        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
-        uint32_t i, blkCnt;                    /* Loop counters */
-        pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
-        blkCnt = blockSize;
-        while (blkCnt > 0U)
-        {
-          /* Copy one sample at a time into state buffer */
-          *pStateCurnt++ = *pSrc++;
-
-          /* Set the accumulator to zero */
-          acc0 = 0.0f;
-
-          /* Initialize state pointer */
-          px = pState;
-
-          /* Initialize Coefficient pointer */
-          pb = pCoeffs;
-
-          i = numTaps;
-
-          /* Perform the multiply-accumulates */
-          while (i > 0U)
-          {
-            /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-            acc0 += *px++ * *pb++;
+    else {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        pOutput = pDst;
 
-            i--;
-          }
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            pSamples += 4;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0+pap);
+            cnt--;
+            partial_accu_ptr += 4;
+            pOutput += 4;
+        }
 
-          /* Store result in destination buffer. */
-          *pDst++ = acc0;
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
 
-          /* Advance state pointer by 1 for the next sample */
-          pState = pState + 1U;
+            mve_pred16_t p0 = vctp32q(cnt);
 
-          /* Decrement loop counter */
-          blkCnt--;
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            pSamples += cnt;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
+            partial_accu_ptr += cnt;
+            pOutput += cnt;
         }
     }
 
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pRefStatePtr[blockSize];
+    pTempDest = pRefStatePtr;
 
     blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    while (blkCnt > 0)
     {
         vst1q(pTempDest, vld1q(pTempSrc));
         pTempSrc += 4;
         pTempDest += 4;
         blkCnt--;
     }
-
     blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
+    if (blkCnt > 0)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
diff --git a/Source/FilteringFunctions/arm_fir_init_f32.c b/Source/FilteringFunctions/arm_fir_init_f32.c
index bdde8e21..2ffa8273 100644
--- a/Source/FilteringFunctions/arm_fir_init_f32.c
+++ b/Source/FilteringFunctions/arm_fir_init_f32.c
@@ -52,8 +52,8 @@
       {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
   </pre>
   @par
-                   <code>pState</code> points to the array of state variables.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
+                   <code>pState</code> points to the array of state variables and some working memory for the Helium version.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
   @par          Initialization of Helium version
                  For Helium version the array of coefficients must be a multiple of 16 even if less
                  then 16 coefficients are used. The additional coefficients must be set to 0.
@@ -62,6 +62,13 @@
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first blockSize samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
+
  */
 
 void arm_fir_init_f32(
@@ -78,8 +85,11 @@ void arm_fir_init_f32(
   S->pCoeffs = pCoeffs;
 
   /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + blockSize) * sizeof(float32_t));
+#else
   memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
-
+#endif
   /* Assign state pointer */
   S->pState = pState;
 }
diff --git a/Testing/.gitignore b/Testing/.gitignore
index 3f9e95df..89decdd5 100644
--- a/Testing/.gitignore
+++ b/Testing/.gitignore
@@ -9,6 +9,8 @@ FullBenchmark/
 Output/
 GeneratedInclude/
 GeneratedSource/
+GeneratedIncludeBench/
+GeneratedSourceBench/
 *.db
 TestDesc.txt
 currentConfig.csv
diff --git a/Testing/CMakeLists.txt b/Testing/CMakeLists.txt
index 5d04ce8a..28cc8db4 100644
--- a/Testing/CMakeLists.txt
+++ b/Testing/CMakeLists.txt
@@ -85,6 +85,8 @@ option(FLOAT16TESTS "Float16 tests" OFF)
 option(MICROBENCH "Micro benchmarks" OFF)
 option(EXTERNAL "External benchmarks or tests" OFF)
 
+option(DISTINCT "Different generated folder for benchmarking and tests" OFF)
+
 project(Testing)
 
 # Needed to find the config modules
@@ -422,7 +424,12 @@ endif()
 
 
 target_sources(TestingLib PRIVATE testmain.cpp)
+
+if ((DISTINCT) AND (BENCHMARK))
+target_sources(TestingLib PRIVATE GeneratedSourceBench/TestDesc.cpp)
+else()
 target_sources(TestingLib PRIVATE GeneratedSource/TestDesc.cpp)
+endif()
 
 if (EMBEDDED)
     target_compile_definitions(TestingLib PUBLIC EMBEDDED)
@@ -468,7 +475,12 @@ if(NN)
 target_link_libraries(TestingLib PRIVATE CMSISNN)
 endif()
 target_include_directories(TestingLib PRIVATE FrameworkInclude)
+
+if ((DISTINCT) AND (BENCHMARK))
+target_include_directories(TestingLib PRIVATE GeneratedIncludeBench)
+else()
 target_include_directories(TestingLib PRIVATE GeneratedInclude)
+endif()
 
 configLib(TestingLib ${ROOT})
 #configDsp(TestingLib ${ROOT})
@@ -490,7 +502,12 @@ core_includes(FrameworkLib)
 add_executable(Testing main.cpp)
 # To see the file in the scatter load, it must not because
 # linked in a .a archive
+if ((DISTINCT) AND (BENCHMARK))
+target_include_directories(Testing PRIVATE GeneratedIncludeBench)
+else()
 target_include_directories(Testing PRIVATE GeneratedInclude)
+endif()
+
 target_sources(Testing PRIVATE patterndata.c)
 
 # With -O2, generated code is crashing on some cycle accurate models.
@@ -504,7 +521,11 @@ target_link_libraries(Testing PRIVATE FrameworkLib)
 
 if (EXTERNAL)
   target_include_directories(${EXTERNALPROJECT} PRIVATE FrameworkInclude)
-  target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedInclude)
+  if ((DISTINCT) AND (BENCHMARK))
+    target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedIncludeBench)
+  else()
+    target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedInclude)
+  endif()
   target_link_libraries(TestingLib PRIVATE ${EXTERNALPROJECT})
 endif()
 
diff --git a/Testing/Include/Tests/FIRF32.h b/Testing/Include/Tests/FIRF32.h
index bb7ed4a0..565d25fa 100644
--- a/Testing/Include/Tests/FIRF32.h
+++ b/Testing/Include/Tests/FIRF32.h
@@ -15,8 +15,10 @@ class FIRF32:public Client::Suite
             Client::Pattern<float32_t> coefs;
             Client::Pattern<float32_t> inputs;
             Client::RefPattern<int16_t> configs;
+
             Client::LocalPattern<float32_t> output;
             Client::LocalPattern<float32_t> state;
+            Client::LocalPattern<float32_t> tmp;
             // Reference patterns are not loaded when we are in dump mode
             Client::RefPattern<float32_t> ref;
 
diff --git a/Testing/Source/Benchmarks/FIRF32.cpp b/Testing/Source/Benchmarks/FIRF32.cpp
index 449b8b4f..41a32018 100755
--- a/Testing/Source/Benchmarks/FIRF32.cpp
+++ b/Testing/Source/Benchmarks/FIRF32.cpp
@@ -1,7 +1,10 @@
 #include "FIRF32.h"
 #include "Error.h"
 
-   
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) float32_t coeffArray[64];
+#endif 
+
     void FIRF32::test_fir_f32()
     {
        arm_fir_f32(&instFir, this->pSrc, this->pDst, this->nbSamples);
@@ -30,7 +33,7 @@
        samples.reload(FIRF32::SAMPLES1_F32_ID,mgr,this->nbSamples);
        coefs.reload(FIRF32::COEFS1_F32_ID,mgr,this->nbTaps);
 
-       state.create(this->nbSamples + this->nbTaps - 1,FIRF32::STATE_F32_ID,mgr);
+       state.create(this->nbSamples + this->nbSamples + this->nbTaps - 1,FIRF32::STATE_F32_ID,mgr);
        output.create(this->nbSamples,FIRF32::OUT_SAMPLES_F32_ID,mgr);
 
        switch(id)
@@ -38,8 +41,21 @@
            case TEST_FIR_F32_1:
               arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
 
-              this->pSrc=samples.ptr();
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+              /* Copy coefficients and pad to zero 
+              */
+              memset(coeffArray,0,32*sizeof(float32_t));
+              float32_t *ptr;
+
+              ptr=coefs.ptr();
+              memcpy(coeffArray,ptr,this->nbTaps*sizeof(float32_t));
+              this->pCoefs = coeffArray;
+#else
               this->pCoefs=coefs.ptr();
+#endif
+
+              this->pSrc=samples.ptr();
+              
               this->pDst=output.ptr();
            break;
 
diff --git a/Testing/Source/Tests/FIRF16.cpp b/Testing/Source/Tests/FIRF16.cpp
index aaea99ad..0573caa3 100755
--- a/Testing/Source/Tests/FIRF16.cpp
+++ b/Testing/Source/Tests/FIRF16.cpp
@@ -61,7 +61,7 @@ void checkInnerTail(float16_t *b)
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(float16_t));
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRF32.cpp b/Testing/Source/Tests/FIRF32.cpp
index 2cfdd56a..42960203 100644
--- a/Testing/Source/Tests/FIRF32.cpp
+++ b/Testing/Source/Tests/FIRF32.cpp
@@ -24,6 +24,7 @@ void checkInnerTail(float32_t *b)
     ASSERT_TRUE(b[3] == 0.0f);
 }
 
+
     void FIRF32::test_fir_f32()
     {
         
@@ -34,6 +35,7 @@ void checkInnerTail(float32_t *b)
         
         const float32_t *coefsp;
         const float32_t *inputp = inputs.ptr();
+
         float32_t *outp = output.ptr();
 
         unsigned long i;
@@ -42,6 +44,7 @@ void checkInnerTail(float32_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int nb=0;
 
         
 
@@ -58,10 +61,12 @@ void checkInnerTail(float32_t *b)
            blockSize = configp[0];
            numTaps = configp[1];
 
+           nb += 2*blockSize;
+
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(float32_t));
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
@@ -86,6 +91,7 @@ void checkInnerTail(float32_t *b)
            */
            inputp = inputs.ptr();
 
+          
            /*
            
            Python script is filtering a 2*blockSize number of samples.
@@ -94,7 +100,9 @@ void checkInnerTail(float32_t *b)
 
            */
 
+           
            arm_fir_f32(&this->S,inputp,outp,blockSize);
+           
            outp += blockSize;
            checkInnerTail(outp);
            
@@ -137,8 +145,15 @@ void checkInnerTail(float32_t *b)
        ref.reload(FIRF32::FIRREFS_F32_ID,mgr);
 
        output.create(ref.nbSamples(),FIRF32::OUT_F32_ID,mgr);
-       /* Max blockSize + numTaps - 1 as generated by Python script */
-       state.create(47,FIRF32::OUT_F32_ID,mgr);
+       /* Max 2*blockSize + numTaps - 1 as generated by Python script 
+          A temp buffer blockSize is used by Helium implementation.
+          It is at beginning of state buffer and is NOT the state
+          of the FIR which is in the following part.
+       */
+       state.create(47+47,FIRF32::OUT_F32_ID,mgr);
+
+
+
     }
 
     void FIRF32::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
diff --git a/Testing/Source/Tests/FIRQ15.cpp b/Testing/Source/Tests/FIRQ15.cpp
index bf786ad7..05dfa2cd 100644
--- a/Testing/Source/Tests/FIRQ15.cpp
+++ b/Testing/Source/Tests/FIRQ15.cpp
@@ -53,7 +53,7 @@ void checkInnerTail(q15_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(q15_t));
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp
index 979709da..dfee9e1e 100644
--- a/Testing/Source/Tests/FIRQ31.cpp
+++ b/Testing/Source/Tests/FIRQ31.cpp
@@ -54,7 +54,7 @@ void checkInnerTail(q31_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(q31_t));
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRQ7.cpp b/Testing/Source/Tests/FIRQ7.cpp
index 8516457b..4b76b630 100644
--- a/Testing/Source/Tests/FIRQ7.cpp
+++ b/Testing/Source/Tests/FIRQ7.cpp
@@ -53,7 +53,7 @@ void checkInnerTail(q7_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(q7_t));
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/TestScripts/CodeGen.py b/Testing/TestScripts/CodeGen.py
index 487dc12e..0e07d03c 100644
--- a/Testing/TestScripts/CodeGen.py
+++ b/Testing/TestScripts/CodeGen.py
@@ -832,7 +832,7 @@ class CodeGen:
         self._currentPaths = oldPath.copy()
         self._currentParamPaths = oldParamPath.copy()
 
-    def genCodeForTree(self,root):
+    def genCodeForTree(self,root,benchMode):
       """ Generate all files from the trees of tests
       
       Args:
@@ -845,17 +845,21 @@ class CodeGen:
       # Get a list of all suites contained in the tree
       suites = self.getSuites(root,[])
 
-      
+      src = "GeneratedSource"
+      header = "GeneratedInclude"
+      if benchMode:
+        src += "Bench"
+        header += "Bench"
       # Generate .cpp and .h files neded to run the tests
-      with open("GeneratedSource/TestDesc.cpp","w") as sourceFile:
-       with open("GeneratedInclude/TestDesc.h","w") as headerFile:
+      with open("%s/TestDesc.cpp" % src,"w") as sourceFile:
+       with open("%s/TestDesc.h" % header,"w") as headerFile:
          headerFile.write("#include \"Test.h\"\n")
          headerFile.write("#include \"Pattern.h\"\n")
 
          sourceFile.write("#include \"Test.h\"\n")
          for s in suites:
           headerFile.write("#include \"%s.h\"\n" % s)
-         self._genCode(root,"GeneratedInclude",sourceFile,headerFile)
+         self._genCode(root,"%s" % header,sourceFile,headerFile)
        
       # Generate a driver file for semihosting
       # (always generated for debug purpose since it is the reference format)
@@ -868,17 +872,17 @@ class CodeGen:
       # Driver file is similar in this case but different from semihosting
       # one.
       if not self._fpga:
-         with open("GeneratedInclude/TestDrive.h","w") as driverFile:
+         with open("%s/TestDrive.h" % header,"w") as driverFile:
             driverFile.write("// Empty driver include in semihosting mode")
-         with open("GeneratedInclude/Patterns.h","w") as includeFile:
+         with open("%s/Patterns.h" % header,"w") as includeFile:
             includeFile.write("// Empty pattern include in semihosting mode")
       else:
-        with open("GeneratedInclude/TestDrive.h","w") as driverFile:
+        with open("%s/TestDrive.h" % header,"w") as driverFile:
           driverFile.write("#ifndef _DRIVER_H_\n")
           driverFile.write("#define _DRIVER_H_\n")
           driverFile.write("__ALIGNED(8) const char testDesc[]={\n")
           self._offset=0
-          with open("GeneratedInclude/Patterns.h","w") as includeFile:
+          with open("%s/Patterns.h" % header,"w") as includeFile:
             includeFile.write("#ifndef _PATTERNS_H_\n")
             includeFile.write("#define _PATTERNS_H_\n")
             includeFile.write("__ALIGNED(8) const char patterns[]={\n")
diff --git a/Testing/createDefaultFolder.sh b/Testing/createDefaultFolder.sh
index 1cd3e3ba..ae260fe6 100755
--- a/Testing/createDefaultFolder.sh
+++ b/Testing/createDefaultFolder.sh
@@ -2,3 +2,5 @@ mkdir FullBenchmark
 mkdir Output
 mkdir GeneratedInclude
 mkdir GeneratedSource
+mkdir GeneratedIncludeBench
+mkdir GeneratedSourceBench
diff --git a/Testing/processTests.py b/Testing/processTests.py
index 14b29c58..20376984 100644
--- a/Testing/processTests.py
+++ b/Testing/processTests.py
@@ -5,7 +5,7 @@ import TestScripts.Deprecate as d
 
 
 parser = argparse.ArgumentParser(description='Parse test description')
-parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="File path")
+parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="Pickle path")
 
 parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path")
 parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path")
@@ -16,6 +16,8 @@ parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Para
 # So the .h for include files need to be generated.
 parser.add_argument('-e', action='store_true', help="Embedded test")
 
+parser.add_argument('-b', action='store_true', help="Benchmark mode to use different generated folders")
+
 parser.add_argument('others', nargs=argparse.REMAINDER)
 
 args = parser.parse_args()
@@ -32,6 +34,6 @@ if args.f is not None:
     d.deprecate(root,args.others)
     #print(root)
     # Generate code with the tree of tests
-    c.genCodeForTree(root)
+    c.genCodeForTree(root,args.b)
 else:
     parser.print_help()
\ No newline at end of file