CMSIS-DSP: New MVE implementation of the FIR F16

5 years ago · 68b219bb1f
parent 364fdb9a28
commit 68b219bb1f
8 changed files with 276 additions and 160 deletions
--- a/Include/dsp/filtering_functions.h
+++ b/Include/dsp/filtering_functions.h
@ -39,6 +39,8 @@ extern "C"
 {
 #endif

+
+
 #define DELTA_Q31          ((q31_t)(0x100))
 #define DELTA_Q15          ((q15_t)0x5)

--- a/Include/dsp/filtering_functions_f16.h
+++ b/Include/dsp/filtering_functions_f16.h
@ -40,6 +40,8 @@ extern "C"

 #if defined(ARM_FLOAT16_SUPPORTED)

+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
+
 /**
   * @brief Instance structure for the floating-point FIR filter.
   */
--- a/Source/FilteringFunctions/arm_fir_f16.c
+++ b/Source/FilteringFunctions/arm_fir_f16.c
@ -47,6 +47,8 @@

 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

+#define FIR_F32_MAX_COEF_BLK        8
+
 #define FIR_F16_CORE(pSamples, c, NB_TAPS)                                 \
        vecAcc0 = vdupq_n_f16(0.0f16);                                     \
        for (int i = 0; i < NB_TAPS; i++) {                                \
@ -54,7 +56,9 @@
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);                        \
        }

-static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, 
+    const float16_t * __restrict pSrc, 
+    float16_t * __restrict pDst, uint32_t blockSize)
 {
    float16_t      *pState = S->pState;     /* State pointer */
    const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
@ -65,8 +69,8 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
    float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
    int32_t         blkCnt;
-    f16x8_t         vecIn0;
-    f16x8_t         vecAcc0;
+    float16x8_t         vecIn0;
+    float16x8_t         vecAcc0;
    const int       NB_TAPS=4;
    float16_t       c[NB_TAPS];

@ -107,6 +111,7 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
    }

    blkCnt = blockSize & 7;
+    if (blkCnt)
    {
        mve_pred16_t    p0 = vctp16q(blkCnt);

@ -141,7 +146,9 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
 }


-static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, 
+    const float16_t * __restrict pSrc, 
+    float16_t * __restrict pDst, uint32_t blockSize)
 {
    float16_t      *pState = S->pState;     /* State pointer */
    const float16_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
@ -152,8 +159,8 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
    float16_t      *pTempDest;              /* Temporary pointer to the destination buffer */
    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */
    int32_t         blkCnt;
-    f16x8_t         vecIn0;
-    f16x8_t         vecAcc0;
+    float16x8_t         vecIn0;
+    float16x8_t         vecAcc0;
    const int       NB_TAPS=8;
    float16_t       c[NB_TAPS];

@ -194,6 +201,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
    }

    blkCnt = blockSize & 7;
+    if (blkCnt)
    {
        mve_pred16_t    p0 = vctp16q(blkCnt);

@ -224,6 +232,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
        mve_pred16_t    p0 = vctp16q(blkCnt);
        vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
    }
+
 }


@ -232,224 +241,299 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
  float16_t * pDst, 
  uint32_t blockSize)
 {
-    float16_t *pState = S->pState;  /* State pointer */
-    const float16_t *pCoeffs = S->pCoeffs;    /* Coefficient pointer */
-    float16_t *pStateCur;       /* Points to the current sample of the state */
-    const float16_t *pSamples;        /* Temporary pointer to the sample buffer */
-    float16_t *pOutput;         /* Temporary pointer to the output buffer */
-    const float16_t *pTempSrc;        /* Temporary pointer to the source data */
-    float16_t *pTempDest;       /* Temporary pointer to the destination buffer */
-    int32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    f16x8_t vecIn0;
-    f16x8_t vecAcc0;
-    float16_t c0, c1, c2, c3;
-    float16_t c4, c5, c6, c7;
+    float16_t *pRefStatePtr = S->pState + ROUND_UP(blockSize, 8);
+    float16_t *pState = pRefStatePtr ;      /* State pointer */
+    const float16_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
+    const float16_t *pSamples;  /* Temporary pointer to the sample buffer */
+    float16_t      *pOutput;    /* Temporary pointer to the output buffer */
+    const float16_t *pTempSrc;  /* Temporary pointer to the source data */
+    float16_t      *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    uint32_t        blkCnt;
+    float16_t       c0, c1, c2, c3;
+    float16_t       c4, c5, c6, c7;

    /*
     * [1 to 8 taps] specialized routines
     */
-    if (numTaps <= 4)
-    {
+    if (numTaps <= 4) {
        arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
        return;
-    }
-    else if (numTaps <= 8)
-    {
+    } else if (numTaps <= 8) {
        arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
        return;
    }

-    /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
-     */
-    pStateCur = &(pState[(numTaps - 1u)]);
-    /*
-     * Copy new data into state so that we obtain a continuous sample buffer
-     * containing both the tail end of the old data and the new data.
-     */
-    pSamples = pState;
    pTempSrc = pSrc;
-    pOutput = pDst;
-
-    blkCnt = blockSize >> 3;
-    while (blkCnt > 0U)
-    {
-        int       i;
-        const float16_t *pCoeffsCur = pCoeffs;
-
-        /*
-         * Save 8 input samples in the history buffer
-         */
-        vst1q(pStateCur, vld1q(pTempSrc));
-        pStateCur += 8;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int             cnt = blockSize;
+    do {
+        mve_pred16_t    p0 = vctp16q(cnt);
+        vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 8;
        pTempSrc += 8;
+        cnt -= 8;
+    } while (cnt > 0);
+
+    float16_t      *partial_accu_ptr = S->pState;

-        c0 = *pCoeffsCur++;
-        c1 = *pCoeffsCur++;
-        c2 = *pCoeffsCur++;
-        c3 = *pCoeffsCur++;
-        c4 = *pCoeffsCur++;
-        c5 = *pCoeffsCur++;
-        c6 = *pCoeffsCur++;
-        c7 = *pCoeffsCur++;
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 3;
+    while (cnt > 0) {
+        float16x8_t     vecAcc0;
+        float16x8_t     vecIn0;

        vecIn0 = vld1q(pSamples);
        vecAcc0 = vmulq(vecIn0, c0);
-
        vecIn0 = vld1q(&pSamples[1]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
        vecIn0 = vld1q(&pSamples[2]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
        vecIn0 = vld1q(&pSamples[3]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
        vecIn0 = vld1q(&pSamples[4]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
        vecIn0 = vld1q(&pSamples[5]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
        vecIn0 = vld1q(&pSamples[6]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 8;
+        vst1q(partial_accu_ptr, vecAcc0);
+        cnt--;
+        partial_accu_ptr += 8;
+    }
+
+    cnt = blockSize & 7;
+    if (cnt > 0) {
+        float16x8_t     vecAcc0;
+        float16x8_t     vecIn0;
+
+        mve_pred16_t p0 = vctp16q(cnt);

+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
        vecIn0 = vld1q(&pSamples[7]);
        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+    }

-        pSamples += 8;
+    int             localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+    int             sample_offset = FIR_F32_MAX_COEF_BLK;
+    while (localTaps > FIR_F32_MAX_COEF_BLK) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+
+        partial_accu_ptr = S->pState;
+        pSamples = pState + sample_offset;
+        int  cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;

-        for (i = 0; i <= ((numTaps - 9) / 8); i++)
-        {
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;

            vecIn0 = vld1q(pSamples);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
+            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
            pSamples += 8;
+            vecAcc0 += vld1q_f16(partial_accu_ptr);
+            vst1q(partial_accu_ptr, vecAcc0);
+            cnt--;
+            partial_accu_ptr += 8;
        }

-        vst1q(pOutput, vecAcc0);
-        pOutput += 8;
-        pSamples = pSamples - (i + 1) * 8 + 8;
-
-        blkCnt--;
-    }
-
-    blkCnt = blockSize & 7;
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-        int       i;
-        const float16_t *pCoeffsCur = pCoeffs;
-
-        vst1q(pStateCur, vld1q(pTempSrc));
-        pStateCur += 8;
-        pTempSrc += 8;
-
-        c0 = *pCoeffsCur++;
-        c1 = *pCoeffsCur++;
-        c2 = *pCoeffsCur++;
-        c3 = *pCoeffsCur++;
-        c4 = *pCoeffsCur++;
-        c5 = *pCoeffsCur++;
-        c6 = *pCoeffsCur++;
-        c7 = *pCoeffsCur++;
-
-        vecIn0 = vld1q(pSamples);
-        vecAcc0 = vmulq(vecIn0, c0);
-
-        vecIn0 = vld1q(&pSamples[1]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;

-        vecIn0 = vld1q(&pSamples[2]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            mve_pred16_t p0 = vctp16q(cnt);

-        vecIn0 = vld1q(&pSamples[3]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
-        vecIn0 = vld1q(&pSamples[4]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecAcc0 += vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+        }

-        vecIn0 = vld1q(&pSamples[5]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        localTaps -= FIR_F32_MAX_COEF_BLK;
+        sample_offset += FIR_F32_MAX_COEF_BLK;
+    }

-        vecIn0 = vld1q(&pSamples[6]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+    pSamples = pState + sample_offset;
+
+    if (localTaps > 4) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+        pOutput = pDst;
+
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;

-        vecIn0 = vld1q(&pSamples[7]);
-        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 8;
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0 + pap);
+            cnt--;
+            partial_accu_ptr += 8;
+            pOutput += 8;
+        }

-        pSamples += 8;
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;

-        for (i = 0; i <= ((numTaps - 9) / 8); i++)
-        {
-            c0 = *pCoeffsCur++;
-            c1 = *pCoeffsCur++;
-            c2 = *pCoeffsCur++;
-            c3 = *pCoeffsCur++;
-            c4 = *pCoeffsCur++;
-            c5 = *pCoeffsCur++;
-            c6 = *pCoeffsCur++;
-            c7 = *pCoeffsCur++;
+            mve_pred16_t p0 = vctp16q(cnt);

            vecIn0 = vld1q(pSamples);
-            vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
+            vecAcc0 = vmulq(vecIn0, c0);
            vecIn0 = vld1q(&pSamples[1]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
            vecIn0 = vld1q(&pSamples[2]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
            vecIn0 = vld1q(&pSamples[3]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
            vecIn0 = vld1q(&pSamples[4]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
            vecIn0 = vld1q(&pSamples[5]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
            vecIn0 = vld1q(&pSamples[6]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
            vecIn0 = vld1q(&pSamples[7]);
            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+            pOutput += cnt;
+        }
+
+    } else {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        pOutput = pDst;

+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 3;
+        while (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
            pSamples += 8;
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0 + pap);
+            cnt--;
+            partial_accu_ptr += 8;
+            pOutput += 8;
        }

-        vstrhq_p_f16(pOutput, vecAcc0, p0);
+        cnt = blockSize & 7;
+        if (cnt > 0) {
+            float16x8_t     vecAcc0;
+            float16x8_t     vecIn0;
+
+            mve_pred16_t p0 = vctp16q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            float16x8_t     pap = vld1q_f16(partial_accu_ptr);
+            vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+            pOutput += cnt;
+        }
    }

    /*
@ -459,17 +543,15 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
    pTempDest = pState;

    blkCnt = numTaps >> 3;
-    while (blkCnt > 0U)
-    {
+    while (blkCnt > 0U) {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 8;
        pTempDest += 8;
        blkCnt--;
    }
    blkCnt = numTaps & 7;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
+    if (blkCnt > 0U) {
+        mve_pred16_t    p0 = vctp16q(blkCnt);
        vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
    }
 }
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@ -107,9 +107,12 @@
  @par          Helium state buffer
                 The state buffer must contain some additional temporary data
                 used during the computation but which is not the state of the FIR.
-                 The first blockSize samples are temporary data.
+                 The first A samples are temporary data.
                 The remaining samples are the state of the FIR filter.
-                 So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
+  @par                 
+                 So the state buffer has size <code> numTaps + A * blockSize - 1 </code> :
+                 - A is blockSize for f32
+                 - A is 8*ceil(blockSize/8) for f16

  @par           Fixed-Point Behavior
                   Care must be taken when using the fixed-point versions of the FIR filter functions.
@ -144,7 +147,7 @@
        }


-static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
+__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, 
  const float32_t * __restrict pSrc, 
  float32_t * __restrict pDst, uint32_t blockSize)
 {
@ -229,7 +232,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,



-static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
+__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, 
  const float32_t * __restrict pSrc, 
  float32_t * __restrict pDst, uint32_t blockSize)
 {
--- a/Source/FilteringFunctions/arm_fir_init_f16.c
+++ b/Source/FilteringFunctions/arm_fir_init_f16.c
@ -52,7 +52,7 @@
  </pre>
  @par
                   <code>pState</code> points to the array of state variables.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
  @par          Initialization of Helium version
                 For Helium version the array of coefficients must be a multiple of 16 even if less
                 then 16 coefficients are used. The additional coefficients must be set to 0.
@ -61,6 +61,13 @@
                 the implementation may require to read more coefficients due to the vectorization and
                 to avoid having to manage too many different cases in the code.

+  @par          Helium state buffer
+                 The state buffer must contain some additional temporary data
+                 used during the computation but which is not the state of the FIR.
+                 The first 8*ceil(blockSize/8) samples are temporary data.
+                 The remaining samples are the state of the FIR filter.
+                 So the state buffer has size <code> numTaps + 8*ceil(blockSize/8) + blockSize - 1 </code>
+
 */

 void arm_fir_init_f16(
@ -77,7 +84,11 @@ void arm_fir_init_f16(
  S->pCoeffs = pCoeffs;

  /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t));
+#else
  memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t));
+#endif 

  /* Assign state pointer */
  S->pState = pState;
--- a/Testing/Source/Benchmarks/FIRF16.cpp
+++ b/Testing/Source/Benchmarks/FIRF16.cpp
@ -1,6 +1,9 @@
 #include "FIRF16.h"
 #include "Error.h"

+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) float16_t coeffArray[64];
+#endif 
   
    void FIRF16::test_fir_f16()
    {
@ -21,16 +24,28 @@
       samples.reload(FIRF16::SAMPLES1_F16_ID,mgr,this->nbSamples);
       coefs.reload(FIRF16::COEFS1_F16_ID,mgr,this->nbTaps);

-       state.create(this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
+       state.create(ROUND_UP(this->nbSamples,8) + this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
       output.create(this->nbSamples,FIRF16::OUT_SAMPLES_F16_ID,mgr);

       switch(id)
       {
           case TEST_FIR_F16_1:
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+              /* Copy coefficients and pad to zero 
+              */
+              memset(coeffArray,0,32*sizeof(float16_t));
+              float16_t *ptr;
+
+              ptr=coefs.ptr();
+              memcpy(coeffArray,ptr,this->nbTaps*sizeof(float16_t));
+              this->pCoefs = coeffArray;
+#else
+              this->pCoefs=coefs.ptr();
+#endif
+
              arm_fir_init_f16(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);

              this->pSrc=samples.ptr();
-              this->pCoefs=coefs.ptr();
              this->pDst=output.ptr();
           break;

--- a/Testing/Source/Benchmarks/FIRF32.cpp
+++ b/Testing/Source/Benchmarks/FIRF32.cpp
@ -39,7 +39,6 @@ static __ALIGNED(8) float32_t coeffArray[64];
       switch(id)
       {
           case TEST_FIR_F32_1:
-              arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);

 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
              /* Copy coefficients and pad to zero 
@ -57,6 +56,8 @@ static __ALIGNED(8) float32_t coeffArray[64];
              this->pSrc=samples.ptr();
              
              this->pDst=output.ptr();
+
+              arm_fir_init_f32(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
           break;

           case TEST_LMS_F32_2:
--- a/Testing/Source/Tests/FIRF16.cpp
+++ b/Testing/Source/Tests/FIRF16.cpp
@ -137,8 +137,8 @@ void checkInnerTail(float16_t *b)
       ref.reload(FIRF16::FIRREFS_F16_ID,mgr);

       output.create(ref.nbSamples(),FIRF16::OUT_F16_ID,mgr);
-       /* Max blockSize + numTaps - 1 as generated by Python script */
-       state.create(47,FIRF16::OUT_F16_ID,mgr);
+       /* > Max  8*ceil(blockSize,8) +  blockSize + numTaps - 1 as generated by Python script */
+       state.create(47+47,FIRF16::OUT_F16_ID,mgr);
    }

    void FIRF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)