CMSIS-DSP: New MVE implementation of the FIR f32

5 years ago · 6f229b4f65
parent fa73bb9825
commit 6f229b4f65
16 changed files with 480 additions and 415 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,7 +2,7 @@ DSP_Lib_TestSuite/build/
 PythonWrapper/build/
 PythonWrapper/cmsisdsp.cp36-win_amd64.pyd
 PythonWrapper/rec_2.dat
-Output.pickle
+*.pickle
 build_*/
 Examples/ARM/arm_fft_bin_example/RTE/
 Examples/ARM/arm_fft_bin_example/RTE/
--- a/Examples/ARM/arm_fir_example/arm_fir_example_f32.c
+++ b/Examples/ARM/arm_fir_example/arm_fir_example_f32.c
@ -143,6 +143,14 @@ this example is not giving better SNR ...
 */
 #define SNR_THRESHOLD_F32    75.0f
 #define BLOCK_SIZE            32
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 /* Must be a multiple of 16 */
 #define NUM_TAPS_ARRAY_SIZE              32
 #else
 #define NUM_TAPS_ARRAY_SIZE              29
 #endif
 #define NUM_TAPS              29
 /* -------------------------------------------------------------------
@ -162,20 +170,31 @@ static float32_t testOutput[TEST_LENGTH_SAMPLES];
 /* -------------------------------------------------------------------
 * Declare State buffer of size (numTaps + blockSize - 1)
 * ------------------------------------------------------------------- */
-
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 static float32_t firStateF32[2 * BLOCK_SIZE + NUM_TAPS - 1];
 #else
 static float32_t firStateF32[BLOCK_SIZE + NUM_TAPS - 1];
 #endif 
 /* ----------------------------------------------------------------------
 ** FIR Coefficients buffer generated using fir1() MATLAB function.
 ** fir1(28, 6/24)
 ** ------------------------------------------------------------------- */
-
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-const float32_t firCoeffs32[NUM_TAPS] = {
+const float32_t firCoeffs32[NUM_TAPS_ARRAY_SIZE] = {
  -0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f,
  -0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f,
  +0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f,
  +0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f, 0.0f,0.0f,0.0f
 };
 #else
 const float32_t firCoeffs32[NUM_TAPS_ARRAY_SIZE] = {
  -0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f,
  -0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f,
  +0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f,
  +0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f
 };
 #endif
 /* ------------------------------------------------------------------
 * Global variables for FIR LPF Example
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@ -104,10 +104,18 @@
                 the implementation may require to read more coefficients due to the vectorization and
                 to avoid having to manage too many different cases in the code.
  @par          Helium state buffer
                 The state buffer must contain some additional temporary data
                 used during the computation but which is not the state of the FIR.
                 The first blockSize samples are temporary data.
                 The remaining samples are the state of the FIR filter.
                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
  @par           Fixed-Point Behavior
                   Care must be taken when using the fixed-point versions of the FIR filter functions.
                   In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
                   Refer to the function specific documentation below for usage guidelines.
 */
 /**
@ -126,578 +134,542 @@
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
+#define FIR_F32_MAX_COEF_BLK        8
 #define FIR_F32_CORE(pSamples, c, NB_TAPS)                                 \
        vecAcc0 = vdupq_n_f32(0.0f);                                       \
        for (int i = 0; i < NB_TAPS; i++) {                                \
            vecIn0 = vld1q(&pSamples[i]);                                  \
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);                        \
        }
 static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
  const float32_t * __restrict pSrc, 
  float32_t * __restrict pDst, uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    float32_t *pRefStatePtr = S->pState + blockSize;
-    const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
+    float32_t      *pState = pRefStatePtr; /* State pointer */
-    float32_t *pStateCur;               /* Points to the current sample of the state */
+    const float32_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
-    const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
+    float32_t      *pStateCur;  /* Points to the current sample of the state */
-    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
+    const float32_t *pSamples;  /* Temporary pointer to the sample buffer */
-    const float32_t *pTempSrc;          /* Temporary pointer to the source data */
+    float32_t      *pOutput;    /* Temporary pointer to the output buffer */
-    float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
+    const float32_t *pTempSrc;  /* Temporary pointer to the source data */
-    uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
+    float32_t      *pTempDest;  /* Temporary pointer to the destination buffer */
-    uint32_t  blkCnt;
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
-    f32x4_t vecIn0;
+    int32_t         blkCnt;
-    f32x4_t vecAcc0;
+    float32x4_t         vecIn0;
-    float32_t c0, c1, c2, c3;
+    float32x4_t         vecAcc0;
    const int       NB_TAPS=4;
    float32_t       c[NB_TAPS];
    const float32_t *pCoeffsCur = pCoeffs;
    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
-    pSamples  = pState;
+    pTempSrc = pSrc;
    pTempSrc  = pSrc;
    pOutput   = pDst;
    if (((numTaps - 1) / 4) == 0)
    {
        const float32_t *pCoeffsCur = pCoeffs;
        c0 = *pCoeffsCur++;
        c1 = *pCoeffsCur++;
        c2 = *pCoeffsCur++;
        c3 = *pCoeffsCur++;
        blkCnt = blockSize >> 2;
        while (blkCnt > 0U)
        {
            /*
             * Save 4 input samples in the history buffer
             */
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 4;
            pTempSrc += 4;
            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-            vecIn0 = vld1q(&pSamples[3]);
+    pSamples = pState;
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+    pOutput = pDst;
-            vst1q(pOutput, vecAcc0);
+    for (int i = 0; i < NB_TAPS; i++)
        c[i] = *pCoeffsCur++;
-            pOutput += 4;
+    blkCnt = blockSize >> 2;
-            pSamples += 4;
+    while (blkCnt > 0) {
        /*
         * Save 4 input samples in the history buffer
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;
-            blkCnt--;
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
        }
-        blkCnt = blockSize & 3;
+        vst1q(pOutput, vecAcc0);
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp32q(blkCnt);
-            vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
+        pOutput += 4;
-            pStateCur += blkCnt;
+        pSamples += 4;
            pTempSrc += blkCnt;
-            vecIn0 = vld1q(pSamples);
+        blkCnt--;
-            vecAcc0 = vmulq(vecIn0, c0);
+    }
-            vecIn0 = vld1q(&pSamples[1]);
+    blkCnt = blockSize & 3;
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+    {
        mve_pred16_t    p0 = vctp32q(blkCnt);
-            vecIn0 = vld1q(&pSamples[2]);
+        vst1q(pStateCur, vld1q(pTempSrc));
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        pStateCur += 4;
        pTempSrc += 4;
-            vecIn0 = vld1q(&pSamples[3]);
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-            vstrwq_p_f32(pOutput, vecAcc0, p0);
+        vstrwq_p_f32(pOutput, vecAcc0, p0);
        }
    }
    /*
     * Copy the samples back into the history buffer start
     */
-    pTempSrc = &S->pState[blockSize];
+    pTempSrc = &pState[blockSize];
-    pTempDest = S->pState;
+    pTempDest = pState;
-    blkCnt = numTaps >> 2;
+    blkCnt = numTaps - 1;
-    while (blkCnt > 0U)
+    do {
-    {
+        mve_pred16_t    p = vctp32q(blkCnt);
-        vst1q(pTempDest, vld1q(pTempSrc));
+
        vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
        pTempSrc += 4;
        pTempDest += 4;
-        blkCnt--;
+        blkCnt -= 4;
    }
    blkCnt = numTaps & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
    }
    while (blkCnt > 0);
 }
-static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
+
 static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
  const float32_t * __restrict pSrc, 
  float32_t * __restrict pDst, uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    float32_t *pRefStatePtr = S->pState + blockSize;
    float32_t *pState = pRefStatePtr;      /* State pointer */
    const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
    float32_t *pStateCur;               /* Points to the current sample of the state */
    const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
    const float32_t *pTempSrc;          /* Temporary pointer to the source data */
    float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
    uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
+    int32_t  blkCnt;
    f32x4_t vecIn0;
    f32x4_t vecAcc0;
    float32_t c0, c1, c2, c3;
    float32_t c4, c5, c6, c7;
    const float32_t *pCoeffsCur = pCoeffs;
-    /*
+
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pTempSrc = pSrc;
    pTempDest = &(pState[(numTaps - 1u)]);
    int cnt = blockSize;
    do {
        mve_pred16_t p0 = vctp32q(cnt);
        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
        pTempDest += 4;
        pTempSrc += 4;
        cnt -= 4;
    } while(cnt > 0);
    pSamples = pState;
    pOutput = pDst;
    c0 = *pCoeffsCur++;
    c1 = *pCoeffsCur++;
    c2 = *pCoeffsCur++;
    c3 = *pCoeffsCur++;
    c4 = *pCoeffsCur++;
    c5 = *pCoeffsCur++;
    c6 = *pCoeffsCur++;
    c7 = *pCoeffsCur++;
-    blkCnt = blockSize >> 2;
+    pSamples = pState;
-    while (blkCnt > 0U)
+    c0 = *pCoeffs++;
    c1 = *pCoeffs++;
    c2 = *pCoeffs++;
    c3 = *pCoeffs++;
    c4 = *pCoeffs++;
    c5 = *pCoeffs++;
    c6 = *pCoeffs++;
    c7 = *pCoeffs++;
    cnt = blockSize >> 2;
    while(cnt > 0) 
    {
-        /*
+        float32x4_t vecAcc0;
-         * Save 4 input samples in the history buffer
+        float32x4_t vecIn0;
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;
        vecIn0 = vld1q(pSamples);
        vecAcc0 = vmulq(vecIn0, c0);
        vecIn0 = vld1q(&pSamples[1]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
        vecIn0 = vld1q(&pSamples[2]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
        vecIn0 = vld1q(&pSamples[3]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
        vecIn0 = vld1q(&pSamples[4]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
        vecIn0 = vld1q(&pSamples[5]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
        vecIn0 = vld1q(&pSamples[6]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
        vecIn0 = vld1q(&pSamples[7]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
        vst1q(pOutput, vecAcc0);
        pOutput += 4;
        pSamples += 4;
-
+        vst1q(pDst, vecAcc0);
-        blkCnt--;
+        cnt--;
        pDst += 4;
    }
-    blkCnt = blockSize & 3;
+    cnt = blockSize & 3;
-    if (blkCnt > 0U)
+    if (cnt > 0) 
    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
+        float32x4_t vecAcc0;
        float32x4_t vecIn0;
-        vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
+        mve_pred16_t p0 = vctp32q(cnt);
        pStateCur += blkCnt;
        pTempSrc += blkCnt;
        vecIn0 = vld1q(pSamples);
        vecAcc0 = vmulq(vecIn0, c0);
        vecIn0 = vld1q(&pSamples[1]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
        vecIn0 = vld1q(&pSamples[2]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
        vecIn0 = vld1q(&pSamples[3]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
        vecIn0 = vld1q(&pSamples[4]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
        vecIn0 = vld1q(&pSamples[5]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
        vecIn0 = vld1q(&pSamples[6]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
        vecIn0 = vld1q(&pSamples[7]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
+        vstrwq_p_f32(pDst, vecAcc0,p0);
        vstrwq_p_f32(pOutput, vecAcc0, p0);
    }
    /*
     * Copy the samples back into the history buffer start
     */
-    pTempSrc = &S->pState[blockSize];
+    pTempSrc = &pState[blockSize];
-    pTempDest = S->pState;
+    pTempDest = pState;
-
+    blkCnt = numTaps;
-    blkCnt = numTaps >> 2;
+    while (blkCnt > 0)
    while (blkCnt > 0U)
    {
-        vst1q(pTempDest, vld1q(pTempSrc));
+        *pTempDest++ = *pTempSrc++;
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt--;
    }
    blkCnt = numTaps & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
    }
 }
 void arm_fir_f32(
 const arm_fir_instance_f32 * S,
 const float32_t * pSrc,
 float32_t * pDst,
 uint32_t blockSize)
 {
-    float32_t *pState = S->pState;      /* State pointer */
+    /* 
       S->pState is the arm_fir_partial_accu
       S->pState + blockSize is the FIR state
    */
    float32_t *pRefStatePtr = S->pState + blockSize;
    float32_t *pState = pRefStatePtr ;      /* State pointer */
    const float32_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
    float32_t *pStateCur;               /* Points to the current sample of the state */
    const float32_t *pSamples;          /* Temporary pointer to the sample buffer */
    float32_t *pOutput;                 /* Temporary pointer to the output buffer */
    const float32_t *pTempSrc;          /* Temporary pointer to the source data */
    float32_t *pTempDest;               /* Temporary pointer to the destination buffer */
    uint32_t  numTaps = S->numTaps;     /* Number of filter coefficients in the filter */
    uint32_t  blkCnt;
    int32_t numCnt;
    f32x4_t vecIn0;
    f32x4_t vecAcc0;
    float32_t c0, c1, c2, c3;
    float32_t c4, c5, c6, c7;
    /*
     * [1 to 8 taps] specialized routines
     */
-    if (blockSize >= 8)
+    if (numTaps <= 4)
    {
-       if (numTaps <= 4)
+        arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
-       {
+        return;
           arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
           return;
       }
    }
-    if (blockSize >= 8)
+    else if (numTaps <= 8)
    {
-       if (numTaps <= 8)
+        arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
-       {
+        return;
           arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
           return;
       }
    }
-    if (blockSize >= 8)
+    pTempSrc = pSrc;
    pTempDest = &(pState[(numTaps - 1u)]);
    int cnt = blockSize;
    do {
        mve_pred16_t p0 = vctp32q(cnt);
        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
        pTempDest += 4;
        pTempSrc += 4;
        cnt -= 4;
    } while(cnt > 0);
    float32_t *partial_accu_ptr = S->pState;
    pSamples = pState;
    c0 = *pCoeffs++;
    c1 = *pCoeffs++;
    c2 = *pCoeffs++;
    c3 = *pCoeffs++;
    c4 = *pCoeffs++;
    c5 = *pCoeffs++;
    c6 = *pCoeffs++;
    c7 = *pCoeffs++;
    cnt = blockSize >> 2;
    while(cnt > 0) {
        float32x4_t vecAcc0;
        float32x4_t vecIn0;
        vecIn0 = vld1q(pSamples);
        vecAcc0 = vmulq(vecIn0, c0);
        vecIn0 = vld1q(&pSamples[1]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
        vecIn0 = vld1q(&pSamples[2]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
        vecIn0 = vld1q(&pSamples[3]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
        vecIn0 = vld1q(&pSamples[4]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
        vecIn0 = vld1q(&pSamples[5]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
        vecIn0 = vld1q(&pSamples[6]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
        vecIn0 = vld1q(&pSamples[7]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
        pSamples += 4;
        vst1q(partial_accu_ptr, vecAcc0);
        cnt--;
        partial_accu_ptr += 4;
    }
    cnt = blockSize & 3;
    if (cnt > 0) 
    {
-        /*
+        float32x4_t vecAcc0;
-         * pState points to state array which contains previous frame (numTaps - 1) samples
+        float32x4_t vecIn0;
         * pStateCur points to the location where the new input data should be written
         */
        pStateCur = &(pState[(numTaps - 1u)]);
        pTempSrc = pSrc;
        pSamples = pState;
        pOutput = pDst;
-        blkCnt = blockSize >> 2;
+        mve_pred16_t p0 = vctp32q(cnt);
-        while (blkCnt > 0U)
+
-        {
+        vecIn0 = vld1q(pSamples);
-            int32_t       i;
+        vecAcc0 = vmulq(vecIn0, c0);
-            const float32_t *pCoeffsCur = pCoeffs;
+        vecIn0 = vld1q(&pSamples[1]);
-
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-            c0 = *pCoeffsCur++;
+        vecIn0 = vld1q(&pSamples[2]);
-            c1 = *pCoeffsCur++;
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-            c2 = *pCoeffsCur++;
+        vecIn0 = vld1q(&pSamples[3]);
-            c3 = *pCoeffsCur++;
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-            c4 = *pCoeffsCur++;
+        vecIn0 = vld1q(&pSamples[4]);
-            c5 = *pCoeffsCur++;
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-            c6 = *pCoeffsCur++;
+        vecIn0 = vld1q(&pSamples[5]);
-            c7 = *pCoeffsCur++;
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
+        vecIn0 = vld1q(&pSamples[6]);
-            vst1q(pStateCur, vld1q(pTempSrc));
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-            pStateCur += 4;
+        vecIn0 = vld1q(&pSamples[7]);
-            pTempSrc += 4;
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
        pSamples += cnt;
        vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
        partial_accu_ptr += cnt;
    }
    int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
    int sample_offset = FIR_F32_MAX_COEF_BLK;
    while (localTaps > FIR_F32_MAX_COEF_BLK) {
        c0 = *pCoeffs++;
        c1 = *pCoeffs++;
        c2 = *pCoeffs++;
        c3 = *pCoeffs++;
        c4 = *pCoeffs++;
        c5 = *pCoeffs++;
        c6 = *pCoeffs++;
        c7 = *pCoeffs++;
        partial_accu_ptr = S->pState;
        pSamples = pState + sample_offset;
        int cnt = blockSize >> 2;
        while(cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;
            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            pSamples += 4;
            vecAcc0 += vld1q_f32(partial_accu_ptr);
            vst1q(partial_accu_ptr, vecAcc0);
            cnt--;
            partial_accu_ptr += 4;
        }
-            pSamples += 8;
+        cnt = blockSize & 3;
-
+        if (cnt > 0) {
-            numCnt = ((int32_t)numTaps - 8) / 8;
+            float32x4_t vecAcc0;
-
+            float32x4_t vecIn0;
            for (i = 0; i < numCnt; i++)
            {
                c0 = *pCoeffsCur++;
                c1 = *pCoeffsCur++;
                c2 = *pCoeffsCur++;
                c3 = *pCoeffsCur++;
                c4 = *pCoeffsCur++;
                c5 = *pCoeffsCur++;
                c6 = *pCoeffsCur++;
                c7 = *pCoeffsCur++;
                vecIn0 = vld1q(pSamples);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
                vecIn0 = vld1q(&pSamples[1]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
                vecIn0 = vld1q(&pSamples[2]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
                vecIn0 = vld1q(&pSamples[3]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
                vecIn0 = vld1q(&pSamples[4]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
                vecIn0 = vld1q(&pSamples[5]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
                vecIn0 = vld1q(&pSamples[6]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-                vecIn0 = vld1q(&pSamples[7]);
+            mve_pred16_t p0 = vctp32q(cnt);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-                pSamples += 8;
+            vecIn0 = vld1q(pSamples);
-            }
+            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            pSamples += cnt;
            vecAcc0 += vld1q_f32(partial_accu_ptr);
            vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
            partial_accu_ptr += cnt;
        }
-            numCnt = ((int32_t)numTaps - 8) & 7;
+        localTaps -= FIR_F32_MAX_COEF_BLK;
        sample_offset += FIR_F32_MAX_COEF_BLK;
    }
-            while (numCnt > 0)
+    pSamples = pState + sample_offset;
-            {
+
-                c0 = *pCoeffsCur++;
+    if (localTaps > 4) {
-                vecIn0 = vld1q(pSamples);
+        c0 = *pCoeffs++;
-                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
+        c1 = *pCoeffs++;
-                pSamples ++;
+        c2 = *pCoeffs++;
        c3 = *pCoeffs++;
        c4 = *pCoeffs++;
        c5 = *pCoeffs++;
        c6 = *pCoeffs++;
        c7 = *pCoeffs++;
        pOutput = pDst;
-                numCnt --;
+        partial_accu_ptr = S->pState;
-            }
+        cnt = blockSize  >> 2;
        while(cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;
-            vst1q(pOutput, vecAcc0);
+            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
            pSamples += 4;
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vst1q(pOutput, vecAcc0+pap);
            cnt--;
            partial_accu_ptr += 4;
            pOutput += 4;
            pSamples = pSamples - numTaps + 4;
            blkCnt--;
        }
-        blkCnt = blockSize & 3;
+        cnt = blockSize  & 3;
-        if (blkCnt > 0U)
+        if (cnt > 0) {
-        {
+            float32x4_t vecAcc0;
-            mve_pred16_t p0 = vctp32q(blkCnt);
+            float32x4_t vecIn0;
-            int32_t       i;
+
-            const float32_t *pCoeffsCur = pCoeffs;
+            mve_pred16_t p0 = vctp32q(cnt);
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 4;
            pTempSrc += 4;
            c0 = *pCoeffsCur++;
            c1 = *pCoeffsCur++;
            c2 = *pCoeffsCur++;
            c3 = *pCoeffsCur++;
            c4 = *pCoeffsCur++;
            c5 = *pCoeffsCur++;
            c6 = *pCoeffsCur++;
            c7 = *pCoeffsCur++;
            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
+            pSamples += cnt;
-            pSamples += 8;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
-
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
-            numCnt = ((int32_t)numTaps - 8) / 8;
+            partial_accu_ptr += cnt;
-
+            pOutput += cnt;
            for (i = 0; i < numCnt; i++)
            {
                c0 = *pCoeffsCur++;
                c1 = *pCoeffsCur++;
                c2 = *pCoeffsCur++;
                c3 = *pCoeffsCur++;
                c4 = *pCoeffsCur++;
                c5 = *pCoeffsCur++;
                c6 = *pCoeffsCur++;
                c7 = *pCoeffsCur++;
                vecIn0 = vld1q(pSamples);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
                vecIn0 = vld1q(&pSamples[1]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
                vecIn0 = vld1q(&pSamples[2]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
                vecIn0 = vld1q(&pSamples[3]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
                vecIn0 = vld1q(&pSamples[4]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
                vecIn0 = vld1q(&pSamples[5]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
                vecIn0 = vld1q(&pSamples[6]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
                vecIn0 = vld1q(&pSamples[7]);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
                pSamples += 8;
            }
            numCnt = ((int32_t)numTaps - 8) & 7;
            while (numCnt > 0)
            {
                c0 = *pCoeffsCur++;
                vecIn0 = vld1q(pSamples);
                vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
                pSamples ++;
                numCnt --;
            }
            vstrwq_p_f32(pOutput, vecAcc0, p0);
        }
    }
-    else
+    else {
-    {
+        c0 = *pCoeffs++;
-        float32_t *pStateCurnt;                        /* Points to the current sample of the state */
+        c1 = *pCoeffs++;
-        float32_t *px;                                 /* Temporary pointer for state buffer */
+        c2 = *pCoeffs++;
-  const float32_t *pb;                                 /* Temporary pointer for coefficient buffer */
+        c3 = *pCoeffs++;
-        float32_t acc0;                                /* Accumulator */
+        pOutput = pDst;
        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
        uint32_t i, blkCnt;                    /* Loop counters */
        pStateCurnt = &(S->pState[(numTaps - 1U)]);
        blkCnt = blockSize;
        while (blkCnt > 0U)
        {
          /* Copy one sample at a time into state buffer */
          *pStateCurnt++ = *pSrc++;
          /* Set the accumulator to zero */
          acc0 = 0.0f;
          /* Initialize state pointer */
          px = pState;
          /* Initialize Coefficient pointer */
          pb = pCoeffs;
          i = numTaps;
          /* Perform the multiply-accumulates */
          while (i > 0U)
          {
            /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
            acc0 += *px++ * *pb++;
-            i--;
+        partial_accu_ptr = S->pState;
-          }
+        cnt = blockSize >> 2;
        while(cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;
            vecIn0 = vld1q(pSamples);
            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            pSamples += 4;
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vst1q(pOutput, vecAcc0+pap);
            cnt--;
            partial_accu_ptr += 4;
            pOutput += 4;
        }
-          /* Store result in destination buffer. */
+        cnt = blockSize & 3;
-          *pDst++ = acc0;
+        if (cnt > 0) {
            float32x4_t vecAcc0;
            float32x4_t vecIn0;
-          /* Advance state pointer by 1 for the next sample */
+            mve_pred16_t p0 = vctp32q(cnt);
          pState = pState + 1U;
-          /* Decrement loop counter */
+            vecIn0 = vld1q(pSamples);
-          blkCnt--;
+            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            pSamples += cnt;
            float32x4_t pap = vld1q_f32(partial_accu_ptr);
            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
            partial_accu_ptr += cnt;
            pOutput += cnt;
        }
    }
    /*
     * Copy the samples back into the history buffer start
     */
-    pTempSrc = &S->pState[blockSize];
+    pTempSrc = &pRefStatePtr[blockSize];
-    pTempDest = S->pState;
+    pTempDest = pRefStatePtr;
    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    while (blkCnt > 0)
    {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt--;
    }
    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
+    if (blkCnt > 0)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
--- a/Source/FilteringFunctions/arm_fir_init_f32.c
+++ b/Source/FilteringFunctions/arm_fir_init_f32.c
@ -52,8 +52,8 @@
      {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
  </pre>
  @par
-                   <code>pState</code> points to the array of state variables.
+                   <code>pState</code> points to the array of state variables and some working memory for the Helium version.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
  @par          Initialization of Helium version
                 For Helium version the array of coefficients must be a multiple of 16 even if less
                 then 16 coefficients are used. The additional coefficients must be set to 0.
@ -62,6 +62,13 @@
                 the implementation may require to read more coefficients due to the vectorization and
                 to avoid having to manage too many different cases in the code.
  @par          Helium state buffer
                 The state buffer must contain some additional temporary data
                 used during the computation but which is not the state of the FIR.
                 The first blockSize samples are temporary data.
                 The remaining samples are the state of the FIR filter.
                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
 */
 void arm_fir_init_f32(
@ -78,8 +85,11 @@ void arm_fir_init_f32(
  S->pCoeffs = pCoeffs;
  /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  memset(pState, 0, (numTaps + (blockSize - 1U) + blockSize) * sizeof(float32_t));
 #else
  memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
-
+#endif
  /* Assign state pointer */
  S->pState = pState;
 }
--- a/Testing/.gitignore
+++ b/Testing/.gitignore
@ -9,6 +9,8 @@ FullBenchmark/
 Output/
 GeneratedInclude/
 GeneratedSource/
 GeneratedIncludeBench/
 GeneratedSourceBench/
 *.db
 TestDesc.txt
 currentConfig.csv
--- a/Testing/CMakeLists.txt
+++ b/Testing/CMakeLists.txt
@ -85,6 +85,8 @@ option(FLOAT16TESTS "Float16 tests" OFF)
 option(MICROBENCH "Micro benchmarks" OFF)
 option(EXTERNAL "External benchmarks or tests" OFF)
 option(DISTINCT "Different generated folder for benchmarking and tests" OFF)
 project(Testing)
 # Needed to find the config modules
@ -422,7 +424,12 @@ endif()
 target_sources(TestingLib PRIVATE testmain.cpp)
 if ((DISTINCT) AND (BENCHMARK))
 target_sources(TestingLib PRIVATE GeneratedSourceBench/TestDesc.cpp)
 else()
 target_sources(TestingLib PRIVATE GeneratedSource/TestDesc.cpp)
 endif()
 if (EMBEDDED)
    target_compile_definitions(TestingLib PUBLIC EMBEDDED)
@ -468,7 +475,12 @@ if(NN)
 target_link_libraries(TestingLib PRIVATE CMSISNN)
 endif()
 target_include_directories(TestingLib PRIVATE FrameworkInclude)
 if ((DISTINCT) AND (BENCHMARK))
 target_include_directories(TestingLib PRIVATE GeneratedIncludeBench)
 else()
 target_include_directories(TestingLib PRIVATE GeneratedInclude)
 endif()
 configLib(TestingLib ${ROOT})
 #configDsp(TestingLib ${ROOT})
@ -490,7 +502,12 @@ core_includes(FrameworkLib)
 add_executable(Testing main.cpp)
 # To see the file in the scatter load, it must not because
 # linked in a .a archive
 if ((DISTINCT) AND (BENCHMARK))
 target_include_directories(Testing PRIVATE GeneratedIncludeBench)
 else()
 target_include_directories(Testing PRIVATE GeneratedInclude)
 endif()
 target_sources(Testing PRIVATE patterndata.c)
 # With -O2, generated code is crashing on some cycle accurate models.
@ -504,7 +521,11 @@ target_link_libraries(Testing PRIVATE FrameworkLib)
 if (EXTERNAL)
  target_include_directories(${EXTERNALPROJECT} PRIVATE FrameworkInclude)
-  target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedInclude)
+  if ((DISTINCT) AND (BENCHMARK))
    target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedIncludeBench)
  else()
    target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedInclude)
  endif()
  target_link_libraries(TestingLib PRIVATE ${EXTERNALPROJECT})
 endif()
--- a/Testing/Include/Tests/FIRF32.h
+++ b/Testing/Include/Tests/FIRF32.h
@ -15,8 +15,10 @@ class FIRF32:public Client::Suite
            Client::Pattern<float32_t> coefs;
            Client::Pattern<float32_t> inputs;
            Client::RefPattern<int16_t> configs;
            Client::LocalPattern<float32_t> output;
            Client::LocalPattern<float32_t> state;
            Client::LocalPattern<float32_t> tmp;
            // Reference patterns are not loaded when we are in dump mode
            Client::RefPattern<float32_t> ref;
--- a/Testing/Source/Benchmarks/FIRF32.cpp
+++ b/Testing/Source/Benchmarks/FIRF32.cpp
@ -1,7 +1,10 @@
 #include "FIRF32.h"
 #include "Error.h"
-   
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 static __ALIGNED(8) float32_t coeffArray[64];
 #endif 
    void FIRF32::test_fir_f32()
    {
       arm_fir_f32(&instFir, this->pSrc, this->pDst, this->nbSamples);
@ -30,7 +33,7 @@
       samples.reload(FIRF32::SAMPLES1_F32_ID,mgr,this->nbSamples);
       coefs.reload(FIRF32::COEFS1_F32_ID,mgr,this->nbTaps);
-       state.create(this->nbSamples + this->nbTaps - 1,FIRF32::STATE_F32_ID,mgr);
+       state.create(this->nbSamples + this->nbSamples + this->nbTaps - 1,FIRF32::STATE_F32_ID,mgr);
       output.create(this->nbSamples,FIRF32::OUT_SAMPLES_F32_ID,mgr);
       switch(id)
@ -38,8 +41,21 @@
           case TEST_FIR_F32_1:
              arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
-              this->pSrc=samples.ptr();
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
              /* Copy coefficients and pad to zero 
              */
              memset(coeffArray,0,32*sizeof(float32_t));
              float32_t *ptr;
              ptr=coefs.ptr();
              memcpy(coeffArray,ptr,this->nbTaps*sizeof(float32_t));
              this->pCoefs = coeffArray;
 #else
              this->pCoefs=coefs.ptr();
 #endif
              this->pSrc=samples.ptr();
              this->pDst=output.ptr();
           break;
--- a/Testing/Source/Tests/FIRF16.cpp
+++ b/Testing/Source/Tests/FIRF16.cpp
@ -61,7 +61,7 @@ void checkInnerTail(float16_t *b)
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
           /* Copy coefficients and pad to zero 
           */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(float16_t));
           for(j=0;j < numTaps; j++)
           {
              coeffArray[j] = orgcoefsp[j];
--- a/Testing/Source/Tests/FIRF32.cpp
+++ b/Testing/Source/Tests/FIRF32.cpp
@ -24,6 +24,7 @@ void checkInnerTail(float32_t *b)
    ASSERT_TRUE(b[3] == 0.0f);
 }
    void FIRF32::test_fir_f32()
    {
@ -34,6 +35,7 @@ void checkInnerTail(float32_t *b)
        const float32_t *coefsp;
        const float32_t *inputp = inputs.ptr();
        float32_t *outp = output.ptr();
        unsigned long i;
@ -42,6 +44,7 @@ void checkInnerTail(float32_t *b)
 #endif
        int blockSize;
        int numTaps;
        int nb=0;
@ -58,10 +61,12 @@ void checkInnerTail(float32_t *b)
           blockSize = configp[0];
           numTaps = configp[1];
           nb += 2*blockSize;
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
           /* Copy coefficients and pad to zero 
           */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(float32_t));
           for(j=0;j < numTaps; j++)
           {
              coeffArray[j] = orgcoefsp[j];
@ -86,6 +91,7 @@ void checkInnerTail(float32_t *b)
           */
           inputp = inputs.ptr();
           /*
           Python script is filtering a 2*blockSize number of samples.
@ -94,7 +100,9 @@ void checkInnerTail(float32_t *b)
           */
           arm_fir_f32(&this->S,inputp,outp,blockSize);
           outp += blockSize;
           checkInnerTail(outp);
@ -137,8 +145,15 @@ void checkInnerTail(float32_t *b)
       ref.reload(FIRF32::FIRREFS_F32_ID,mgr);
       output.create(ref.nbSamples(),FIRF32::OUT_F32_ID,mgr);
-       /* Max blockSize + numTaps - 1 as generated by Python script */
+       /* Max 2*blockSize + numTaps - 1 as generated by Python script 
-       state.create(47,FIRF32::OUT_F32_ID,mgr);
+          A temp buffer blockSize is used by Helium implementation.
          It is at beginning of state buffer and is NOT the state
          of the FIR which is in the following part.
       */
       state.create(47+47,FIRF32::OUT_F32_ID,mgr);
    }
    void FIRF32::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
--- a/Testing/Source/Tests/FIRQ15.cpp
+++ b/Testing/Source/Tests/FIRQ15.cpp
@ -53,7 +53,7 @@ void checkInnerTail(q15_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
           /* Copy coefficients and pad to zero 
           */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(q15_t));
           for(j=0;j < numTaps; j++)
           {
              coeffArray[j] = orgcoefsp[j];
--- a/Testing/Source/Tests/FIRQ31.cpp
+++ b/Testing/Source/Tests/FIRQ31.cpp
@ -54,7 +54,7 @@ void checkInnerTail(q31_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
           /* Copy coefficients and pad to zero 
           */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(q31_t));
           for(j=0;j < numTaps; j++)
           {
              coeffArray[j] = orgcoefsp[j];
--- a/Testing/Source/Tests/FIRQ7.cpp
+++ b/Testing/Source/Tests/FIRQ7.cpp
@ -53,7 +53,7 @@ void checkInnerTail(q7_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
           /* Copy coefficients and pad to zero 
           */
-           memset(coeffArray,0,32);
+           memset(coeffArray,0,32*sizeof(q7_t));
           for(j=0;j < numTaps; j++)
           {
              coeffArray[j] = orgcoefsp[j];
--- a/Testing/TestScripts/CodeGen.py
+++ b/Testing/TestScripts/CodeGen.py
@ -832,7 +832,7 @@ class CodeGen:
        self._currentPaths = oldPath.copy()
        self._currentParamPaths = oldParamPath.copy()
-    def genCodeForTree(self,root):
+    def genCodeForTree(self,root,benchMode):
      """ Generate all files from the trees of tests
      Args:
@ -845,17 +845,21 @@ class CodeGen:
      # Get a list of all suites contained in the tree
      suites = self.getSuites(root,[])
-      
+      src = "GeneratedSource"
      header = "GeneratedInclude"
      if benchMode:
        src += "Bench"
        header += "Bench"
      # Generate .cpp and .h files neded to run the tests
-      with open("GeneratedSource/TestDesc.cpp","w") as sourceFile:
+      with open("%s/TestDesc.cpp" % src,"w") as sourceFile:
-       with open("GeneratedInclude/TestDesc.h","w") as headerFile:
+       with open("%s/TestDesc.h" % header,"w") as headerFile:
         headerFile.write("#include \"Test.h\"\n")
         headerFile.write("#include \"Pattern.h\"\n")
         sourceFile.write("#include \"Test.h\"\n")
         for s in suites:
          headerFile.write("#include \"%s.h\"\n" % s)
-         self._genCode(root,"GeneratedInclude",sourceFile,headerFile)
+         self._genCode(root,"%s" % header,sourceFile,headerFile)
      # Generate a driver file for semihosting
      # (always generated for debug purpose since it is the reference format)
@ -868,17 +872,17 @@ class CodeGen:
      # Driver file is similar in this case but different from semihosting
      # one.
      if not self._fpga:
-         with open("GeneratedInclude/TestDrive.h","w") as driverFile:
+         with open("%s/TestDrive.h" % header,"w") as driverFile:
            driverFile.write("// Empty driver include in semihosting mode")
-         with open("GeneratedInclude/Patterns.h","w") as includeFile:
+         with open("%s/Patterns.h" % header,"w") as includeFile:
            includeFile.write("// Empty pattern include in semihosting mode")
      else:
-        with open("GeneratedInclude/TestDrive.h","w") as driverFile:
+        with open("%s/TestDrive.h" % header,"w") as driverFile:
          driverFile.write("#ifndef _DRIVER_H_\n")
          driverFile.write("#define _DRIVER_H_\n")
          driverFile.write("__ALIGNED(8) const char testDesc[]={\n")
          self._offset=0
-          with open("GeneratedInclude/Patterns.h","w") as includeFile:
+          with open("%s/Patterns.h" % header,"w") as includeFile:
            includeFile.write("#ifndef _PATTERNS_H_\n")
            includeFile.write("#define _PATTERNS_H_\n")
            includeFile.write("__ALIGNED(8) const char patterns[]={\n")
--- a/Testing/createDefaultFolder.sh
+++ b/Testing/createDefaultFolder.sh
@ -2,3 +2,5 @@ mkdir FullBenchmark
 mkdir Output
 mkdir GeneratedInclude
 mkdir GeneratedSource
 mkdir GeneratedIncludeBench
 mkdir GeneratedSourceBench
--- a/Testing/processTests.py
+++ b/Testing/processTests.py
@ -5,7 +5,7 @@ import TestScripts.Deprecate as d
 parser = argparse.ArgumentParser(description='Parse test description')
-parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="File path")
+parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="Pickle path")
 parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path")
 parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path")
@ -16,6 +16,8 @@ parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Para
 # So the .h for include files need to be generated.
 parser.add_argument('-e', action='store_true', help="Embedded test")
 parser.add_argument('-b', action='store_true', help="Benchmark mode to use different generated folders")
 parser.add_argument('others', nargs=argparse.REMAINDER)
 args = parser.parse_args()
@ -32,6 +34,6 @@ if args.f is not None:
    d.deprecate(root,args.others)
    #print(root)
    # Generate code with the tree of tests
-    c.genCodeForTree(root)
+    c.genCodeForTree(root,args.b)
 else:
    parser.print_help()