From 68b219bb1f7ba7d5c037fbdff8ddc31204b3c47c Mon Sep 17 00:00:00 2001
From: Christophe Favergeon <Christophe.Favergeon@arm.com>
Date: Thu, 5 Nov 2020 10:56:11 +0100
Subject: [PATCH] CMSIS-DSP: New MVE implementation of the FIR F16

---
 Include/dsp/filtering_functions.h            |   2 +
 Include/dsp/filtering_functions_f16.h        |   2 +
 Source/FilteringFunctions/arm_fir_f16.c      | 382 +++++++++++--------
 Source/FilteringFunctions/arm_fir_f32.c      |  11 +-
 Source/FilteringFunctions/arm_fir_init_f16.c |  13 +-
 Testing/Source/Benchmarks/FIRF16.cpp         |  19 +-
 Testing/Source/Benchmarks/FIRF32.cpp         |   3 +-
 Testing/Source/Tests/FIRF16.cpp              |   4 +-
 8 files changed, 276 insertions(+), 160 deletions(-)

diff --git a/Include/dsp/filtering_functions.h b/Include/dsp/filtering_functions.h
index 0cbd7cf4..1f9e0fac 100755
--- a/Include/dsp/filtering_functions.h
+++ b/Include/dsp/filtering_functions.h
@@ -39,6 +39,8 @@ extern "C"
 {
 #endif
 
+
+
 #define DELTA_Q31          ((q31_t)(0x100))
 #define DELTA_Q15          ((q15_t)0x5)
 
diff --git a/Include/dsp/filtering_functions_f16.h b/Include/dsp/filtering_functions_f16.h
index 4a99e831..0265f04e 100755
--- a/Include/dsp/filtering_functions_f16.h
+++ b/Include/dsp/filtering_functions_f16.h
@@ -40,6 +40,8 @@ extern "C"
 
 #if defined(ARM_FLOAT16_SUPPORTED)
 
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
+
  /**
    * @brief Instance structure for the floating-point FIR filter.
    */
diff --git a/Source/FilteringFunctions/arm_fir_f16.c b/Source/FilteringFunctions/arm_fir_f16.c
index d8713cf3..0cc55843 100755
--- a/Source/FilteringFunctions/arm_fir_f16.c
+++ b/Source/FilteringFunctions/arm_fir_f16.c
@@ -47,6 +47,8 @@
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
+#define FIR_F32_MAX_COEF_BLK        8
+
 #define FIR_F16_CORE(pSamples, c, NB_TAPS)                                 \
         vecAcc0 = vdupq_n_f16(0.0f16);                                     \
         for (int i = 0; i < NB_TAPS; i++) {                                \
@@ -54,7 +56,9 @@
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);                        \
         }
 
-static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, 
+    const float16_t * __restrict pSrc, 
+    float16_t * __restrict pDst, uint32_t blockSize)
 {
     float16_t      *pState = S->pState;     /* State pointer */
     const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
@@ -65,8 +69,8 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
     float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
     uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
     int32_t         blkCnt;
-    f16x8_t         vecIn0;
-    f16x8_t         vecAcc0;
+    float16x8_t         vecIn0;
+    float16x8_t         vecAcc0;
     const int       NB_TAPS=4;
     float16_t       c[NB_TAPS];
 
@@ -107,6 +111,7 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
     }
 
     blkCnt = blockSize & 7;
+    if (blkCnt)
     {
         mve_pred16_t    p0 = vctp16q(blkCnt);
 
@@ -141,7 +146,9 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
 }
 
 
-static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, 
+    const float16_t * __restrict pSrc, 
+    float16_t * __restrict pDst, uint32_t blockSize)
 {
     float16_t      *pState = S->pState;     /* State pointer */
     const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
@@ -152,8 +159,8 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
     float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
     uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
     int32_t         blkCnt;
-    f16x8_t         vecIn0;
-    f16x8_t         vecAcc0;
+    float16x8_t         vecIn0;
+    float16x8_t         vecAcc0;
     const int       NB_TAPS=8;
     float16_t       c[NB_TAPS];
 
@@ -194,6 +201,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
     }
 
     blkCnt = blockSize & 7;
+    if (blkCnt)
     {
         mve_pred16_t    p0 = vctp16q(blkCnt);
 
@@ -224,6 +232,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
         mve_pred16_t    p0 = vctp16q(blkCnt);
         vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
     }
+
 }
 
 
@@ -232,224 +241,299 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
   float16_t * pDst, 
   uint32_t blockSize)
 {
-    float16_t *pState = S->pState;  /* State pointer */
-    const float16_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float16_t *pStateCur;       /* Points to the current sample of the state */
-    const float16_t *pSamples;        /* Temporary pointer to the sample buffer */
-    float16_t *pOutput;         /* Temporary pointer to the output buffer */
-    const float16_t *pTempSrc;        /* Temporary pointer to the source data */
-    float16_t *pTempDest;       /* Temporary pointer to the destination buffer */
-    int32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    f16x8_t vecIn0;
-    f16x8_t vecAcc0;
-    float16_t c0, c1, c2, c3;
-    float16_t c4, c5, c6, c7;
+    float16_t *pRefStatePtr = S->pState + ROUND_UP(blockSize, 8);
+    float16_t *pState = pRefStatePtr ;      /* State pointer */
+    const float16_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
+    const float16_t *pSamples;  /* Temporary pointer to the sample buffer */
+    float16_t      *pOutput;    /* Temporary pointer to the output buffer */
+    const float16_t *pTempSrc;  /* Temporary pointer to the source data */
+    float16_t      *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    uint32_t        blkCnt;
+    float16_t       c0, c1, c2, c3;
+    float16_t       c4, c5, c6, c7;
 
     /*
      * [1 to 8 taps] specialized routines
      */
-    if (numTaps <= 4)
-    {
+    if (numTaps <= 4) {
         arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
         return;
-    }
-    else if (numTaps <= 8)
-    {
+    } else if (numTaps <= 8) {
         arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
         return;
     }
 
-    /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
-     */
-    pStateCur = &(pState[(numTaps - 1u)]);
-    /*
-     * Copy new data into state so that we obtain a continuous sample buffer
-     * containing both the tail end of the old data and the new data.
-     */
-    pSamples = pState;
     pTempSrc = pSrc;
-    pOutput = pDst;
-
-    blkCnt = blockSize >> 3;
-    while (blkCnt > 0U)
-    {
-        int       i;
-        const float16_t *pCoeffsCur = pCoeffs;
-
-        /*
-         * Save 8 input samples in the history buffer
-         */
-        vst1q(pStateCur, vld1q(pTempSrc));
-        pStateCur += 8;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int             cnt = blockSize;
+    do {
+        mve_pred16_t    p0 = vctp16q(cnt);
+        vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 8;
         pTempSrc += 8;
+        cnt -= 8;
+    } while (cnt > 0);
+
+    float16_t      *partial_accu_ptr = S->pState;
 
-        c0 = *pCoeffsCur++;
-        c1 = *pCoeffsCur++;
-        c2 = *pCoeffsCur++;
-        c3 = *pCoeffsCur++;
-        c4 = *pCoeffsCur++;
-        c5 = *pCoeffsCur++;
-        c6 = *pCoeffsCur++;
-        c7 = *pCoeffsCur++;
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 3;
+    while (cnt > 0) {
+        float16x8_t     vecAcc0;
+        float16x8_t     vecIn0;
 
         vecIn0 = vld1q(pSamples);
         vecAcc0 = vmulq(vecIn0, c0);
-
         vecIn0 = vld1q(&pSamples[1]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
         vecIn0 = vld1q(&pSamples[2]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
         vecIn0 = vld1q(&pSamples[3]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
         vecIn0 = vld1q(&pSamples[4]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
         vecIn0 = vld1q(&pSamples[5]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
         vecIn0 = vld1q(&pSamples[6]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 8;
+        vst1q(partial_accu_ptr, vecAcc0);
+        cnt--;
+        partial_accu_ptr += 8;
+    }
+
+    cnt = blockSize & 7;
+    if (cnt > 0) {
+        float16x8_t     vecAcc0;
+        float16x8_t     vecIn0;
+
+        mve_pred16_t p0 = vctp16q(cnt);
 
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
         vecIn0 = vld1q(&pSamples[7]);
         vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+    }
 
-        pSamples += 8;
+    int             localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+    int             sample_offset = FIR_F32_MAX_COEF_BLK;
+    while (localTaps > FIR_F32_MAX_COEF_BLK) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+
+        partial_accu_ptr = S->pState;
+        pSamples = pState + sample_offset;
+        int  cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
 
-        for (i = 0; i <= ((numTaps - 9) / 8); i++)
-        {
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
 
             vecIn0 = vld1q(pSamples);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
+            vecAcc0 = vmulq(vecIn0, c0);
             vecIn0 = vld1q(&pSamples[1]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
             vecIn0 = vld1q(&pSamples[2]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
             vecIn0 = vld1q(&pSamples[3]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
             vecIn0 = vld1q(&pSamples[4]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
             vecIn0 = vld1q(&pSamples[5]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
             vecIn0 = vld1q(&pSamples[6]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
             vecIn0 = vld1q(&pSamples[7]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
             pSamples += 8;
+            vecAcc0 += vld1q_f16(partial_accu_ptr);
+            vst1q(partial_accu_ptr, vecAcc0);
+            cnt--;
+            partial_accu_ptr += 8;
         }
 
-        vst1q(pOutput, vecAcc0);
-        pOutput += 8;
-        pSamples = pSamples - (i + 1) * 8 + 8;
-
-        blkCnt--;
-    }
-
-    blkCnt = blockSize & 7;
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-        int       i;
-        const float16_t *pCoeffsCur = pCoeffs;
-
-        vst1q(pStateCur, vld1q(pTempSrc));
-        pStateCur += 8;
-        pTempSrc += 8;
-
-        c0 = *pCoeffsCur++;
-        c1 = *pCoeffsCur++;
-        c2 = *pCoeffsCur++;
-        c3 = *pCoeffsCur++;
-        c4 = *pCoeffsCur++;
-        c5 = *pCoeffsCur++;
-        c6 = *pCoeffsCur++;
-        c7 = *pCoeffsCur++;
-
-        vecIn0 = vld1q(pSamples);
-        vecAcc0 = vmulq(vecIn0, c0);
-
-        vecIn0 = vld1q(&pSamples[1]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
 
-        vecIn0 = vld1q(&pSamples[2]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            mve_pred16_t p0 = vctp16q(cnt);
 
-        vecIn0 = vld1q(&pSamples[3]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
-        vecIn0 = vld1q(&pSamples[4]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecAcc0 += vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+        }
 
-        vecIn0 = vld1q(&pSamples[5]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        localTaps -= FIR_F32_MAX_COEF_BLK;
+        sample_offset += FIR_F32_MAX_COEF_BLK;
+    }
 
-        vecIn0 = vld1q(&pSamples[6]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+    pSamples = pState + sample_offset;
+
+    if (localTaps > 4) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+        pOutput = pDst;
+
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
 
-        vecIn0 = vld1q(&pSamples[7]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 8;
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0 + pap);
+            cnt--;
+            partial_accu_ptr += 8;
+            pOutput += 8;
+        }
 
-        pSamples += 8;
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
 
-        for (i = 0; i <= ((numTaps - 9) / 8); i++)
-        {
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
+            mve_pred16_t p0 = vctp16q(cnt);
 
             vecIn0 = vld1q(pSamples);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
+            vecAcc0 = vmulq(vecIn0, c0);
             vecIn0 = vld1q(&pSamples[1]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
             vecIn0 = vld1q(&pSamples[2]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
             vecIn0 = vld1q(&pSamples[3]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
             vecIn0 = vld1q(&pSamples[4]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
             vecIn0 = vld1q(&pSamples[5]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
             vecIn0 = vld1q(&pSamples[6]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
             vecIn0 = vld1q(&pSamples[7]);
             vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+            pOutput += cnt;
+        }
+
+    } else {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        pOutput = pDst;
 
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
             pSamples += 8;
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0 + pap);
+            cnt--;
+            partial_accu_ptr += 8;
+            pOutput += 8;
         }
 
-        vstrhq_p_f16(pOutput, vecAcc0, p0);
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            mve_pred16_t p0 = vctp16q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+            pOutput += cnt;
+        }
     }
 
     /*
@@ -459,17 +543,15 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
     pTempDest = pState;
 
     blkCnt = numTaps >> 3;
-    while (blkCnt > 0U)
-    {
+    while (blkCnt > 0U) {
         vst1q(pTempDest, vld1q(pTempSrc));
         pTempSrc += 8;
         pTempDest += 8;
         blkCnt--;
     }
     blkCnt = numTaps & 7;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
         vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
     }
 }
diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c
index 3731e9cc..6fa87565 100644
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@@ -107,9 +107,12 @@
   @par          Helium state buffer
                  The state buffer must contain some additional temporary data
                  used during the computation but which is not the state of the FIR.
-                 The first blockSize samples are temporary data.
+                 The first A samples are temporary data.
                  The remaining samples are the state of the FIR filter.
-                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
+  @par                 
+                 So the state buffer has size <code> numTaps + A * blockSize - 1 </code> :
+                 - A is blockSize for f32
+                 - A is 8*ceil(blockSize/8) for f16
 
   @par           Fixed-Point Behavior
                    Care must be taken when using the fixed-point versions of the FIR filter functions.
@@ -144,7 +147,7 @@
         }
 
 
-static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
+__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
   const float32_t * __restrict pSrc, 
   float32_t * __restrict pDst, uint32_t blockSize)
 {
@@ -229,7 +232,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
 
 
 
-static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
+__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
   const float32_t * __restrict pSrc, 
   float32_t * __restrict pDst, uint32_t blockSize)
 {
diff --git a/Source/FilteringFunctions/arm_fir_init_f16.c b/Source/FilteringFunctions/arm_fir_init_f16.c
index 4e300d7c..8b63796b 100755
--- a/Source/FilteringFunctions/arm_fir_init_f16.c
+++ b/Source/FilteringFunctions/arm_fir_init_f16.c
@@ -52,7 +52,7 @@
   </pre>
   @par
                    <code>pState</code> points to the array of state variables.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
   @par          Initialization of Helium version
                  For Helium version the array of coefficients must be a multiple of 16 even if less
                  then 16 coefficients are used. The additional coefficients must be set to 0.
@@ -61,6 +61,13 @@
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first 8*ceil(blockSize/8) samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+                 So the state buffer has size <code> numTaps + 8*ceil(blockSize/8) + blockSize - 1 </code>
+
  */
 
 void arm_fir_init_f16(
@@ -77,7 +84,11 @@ void arm_fir_init_f16(
   S->pCoeffs = pCoeffs;
 
   /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t));
+#else
   memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t));
+#endif 
 
   /* Assign state pointer */
   S->pState = pState;
diff --git a/Testing/Source/Benchmarks/FIRF16.cpp b/Testing/Source/Benchmarks/FIRF16.cpp
index d3605e1b..86b54bca 100755
--- a/Testing/Source/Benchmarks/FIRF16.cpp
+++ b/Testing/Source/Benchmarks/FIRF16.cpp
@@ -1,6 +1,9 @@
 #include "FIRF16.h"
 #include "Error.h"
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) float16_t coeffArray[64];
+#endif 
    
     void FIRF16::test_fir_f16()
     {
@@ -21,16 +24,28 @@
        samples.reload(FIRF16::SAMPLES1_F16_ID,mgr,this->nbSamples);
        coefs.reload(FIRF16::COEFS1_F16_ID,mgr,this->nbTaps);
 
-       state.create(this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
+       state.create(ROUND_UP(this->nbSamples,8) + this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
        output.create(this->nbSamples,FIRF16::OUT_SAMPLES_F16_ID,mgr);
 
        switch(id)
        {
            case TEST_FIR_F16_1:
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+              /* Copy coefficients and pad to zero 
+              */
+              memset(coeffArray,0,32*sizeof(float16_t));
+              float16_t *ptr;
+
+              ptr=coefs.ptr();
+              memcpy(coeffArray,ptr,this->nbTaps*sizeof(float16_t));
+              this->pCoefs = coeffArray;
+#else
+              this->pCoefs=coefs.ptr();
+#endif
+
               arm_fir_init_f16(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
 
               this->pSrc=samples.ptr();
-              this->pCoefs=coefs.ptr();
               this->pDst=output.ptr();
            break;
 
diff --git a/Testing/Source/Benchmarks/FIRF32.cpp b/Testing/Source/Benchmarks/FIRF32.cpp
index 41a32018..bbe9ed55 100755
--- a/Testing/Source/Benchmarks/FIRF32.cpp
+++ b/Testing/Source/Benchmarks/FIRF32.cpp
@@ -39,7 +39,6 @@ static __ALIGNED(8) float32_t coeffArray[64];
        switch(id)
        {
            case TEST_FIR_F32_1:
-              arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
               /* Copy coefficients and pad to zero 
@@ -57,6 +56,8 @@ static __ALIGNED(8) float32_t coeffArray[64];
               this->pSrc=samples.ptr();
               
               this->pDst=output.ptr();
+
+              arm_fir_init_f32(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
            break;
 
            case TEST_LMS_F32_2:
diff --git a/Testing/Source/Tests/FIRF16.cpp b/Testing/Source/Tests/FIRF16.cpp
index 0573caa3..0933c743 100755
--- a/Testing/Source/Tests/FIRF16.cpp
+++ b/Testing/Source/Tests/FIRF16.cpp
@@ -137,8 +137,8 @@ void checkInnerTail(float16_t *b)
        ref.reload(FIRF16::FIRREFS_F16_ID,mgr);
 
        output.create(ref.nbSamples(),FIRF16::OUT_F16_ID,mgr);
-       /* Max blockSize + numTaps - 1 as generated by Python script */
-       state.create(47,FIRF16::OUT_F16_ID,mgr);
+       /* > Max  8*ceil(blockSize,8) +  blockSize + numTaps - 1 as generated by Python script */
+       state.create(47+47,FIRF16::OUT_F16_ID,mgr);
     }
 
     void FIRF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)