From a108d6763e0c7df369d0814b5f2388effcd497fe Mon Sep 17 00:00:00 2001
From: Christophe Favergeon <Christophe.Favergeon@arm.com>
Date: Thu, 5 Nov 2020 14:59:01 +0100
Subject: [PATCH] CMSIS-DSP: Added new MVE implementation of FIR Q31

---
 Include/dsp/filtering_functions_f16.h        |    2 -
 Include/dsp/utils.h                          |    1 +
 Source/FilteringFunctions/arm_fir_f32.c      |    5 +-
 Source/FilteringFunctions/arm_fir_init_q31.c |   22 +-
 Source/FilteringFunctions/arm_fir_q31.c      | 1024 ++++++++++--------
 Testing/Source/Benchmarks/FIRQ31.cpp         |   19 +-
 Testing/Source/Tests/FIRQ31.cpp              |    7 +-
 7 files changed, 611 insertions(+), 469 deletions(-)
diff --git a/Include/dsp/filtering_functions_f16.h b/Include/dsp/filtering_functions_f16.h
index 0265f04e..4a99e831 100755
--- a/Include/dsp/filtering_functions_f16.h
+++ b/Include/dsp/filtering_functions_f16.h
@@ -40,8 +40,6 @@ extern "C"
 
 #if defined(ARM_FLOAT16_SUPPORTED)
 
-#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
-
  /**
    * @brief Instance structure for the floating-point FIR filter.
    */
diff --git a/Include/dsp/utils.h b/Include/dsp/utils.h
index 794023c5..7f5acb37 100755
--- a/Include/dsp/utils.h
+++ b/Include/dsp/utils.h
@@ -42,6 +42,7 @@ extern "C"
 
 #define SQ(x) ((x) * (x))
 
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) /* N rounded up to a multiple of S (integer division; assumes N >= 0 and S > 0) */
 
 
   /**
diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c
index 6fa87565..7f3da5e0 100644
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@@ -110,9 +110,11 @@
                  The first A samples are temporary data.
                  The remaining samples are the state of the FIR filter.
   @par                 
-                 So the state buffer has size <code> numTaps + A * blockSize - 1 </code> :
+                 So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
                  - A is blockSize for f32
                  - A is 8*ceil(blockSize/8) for f16
+                 - A is 8*ceil(blockSize/4) for q31
+
 
   @par           Fixed-Point Behavior
                    Care must be taken when using the fixed-point versions of the FIR filter functions.
@@ -200,6 +202,7 @@ __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
     }
 
     blkCnt = blockSize & 3;
+    if (blkCnt)
     {
         mve_pred16_t    p0 = vctp32q(blkCnt);
 
diff --git a/Source/FilteringFunctions/arm_fir_init_q31.c b/Source/FilteringFunctions/arm_fir_init_q31.c
index 2a9600c1..e491437e 100644
--- a/Source/FilteringFunctions/arm_fir_init_q31.c
+++ b/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -52,7 +52,23 @@
       {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
   </pre>
                    <code>pState</code> points to the array of state variables.
-                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
+                   <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
+
+   @par          Initialization of Helium version
+                   For Helium version the length of the array of coefficients must be a multiple of 16 even if less
+                   than 16 coefficients are used. The additional coefficients must be set to 0.
+                   It does not mean that all the coefficients will be used in the filter (numTaps
+                   is still set to its right value in the init function.) It just means that
+                   the implementation may require to read more coefficients due to the vectorization and
+                   to avoid having to manage too many different cases in the code.
+  
+    @par          Helium state buffer
+                   The state buffer must contain some additional temporary data
+                   used during the computation but which is not the state of the FIR.
+                   The first 2*4*ceil(blockSize/4) samples are temporary data.
+                   The remaining samples are the state of the FIR filter.
+                   So the state buffer has size <code> numTaps + 8*ceil(blockSize/4) + blockSize - 1 </code>
+  
  */
 
 void arm_fir_init_q31(
@@ -69,7 +85,11 @@ void arm_fir_init_q31(
   S->pCoeffs = pCoeffs;
 
   /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+  memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t));
+  #else
   memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+  #endif
 
   /* Assign state pointer */
   S->pState = pState;
diff --git a/Source/FilteringFunctions/arm_fir_q31.c b/Source/FilteringFunctions/arm_fir_q31.c
index bf406350..eda1d4f0 100644
--- a/Source/FilteringFunctions/arm_fir_q31.c
+++ b/Source/FilteringFunctions/arm_fir_q31.c
@@ -28,6 +28,7 @@
 
 #include "dsp/filtering_functions.h"
 
+
 /**
   @ingroup groupFilters
  */
@@ -58,12 +59,160 @@
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 #include "arm_helium_utils.h"
-                                        
 
 
-static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
+/* Writes nbAcc outputs through pOutput (taken from the expansion scope); each output sums nbVecTaps vectors of 4 taps. */
+#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSamples, vecCoeffs)                \
+        for (int j = 0; j < nbAcc; j++) {                                  \
+            const q31_t    *pSmp = &(pSamples)[j];                         \
+            q31x4_t         vecIn0;                                        \
+            q63_t           acc = 0; /* one accumulator per output */      \
+                                                                           \
+            for (int i = 0; i < nbVecTaps; i++) {                          \
+                vecIn0 = vld1q(pSmp + 4 * i);                              \
+                acc = vrmlaldavhaq(acc, vecIn0, (vecCoeffs)[i]);           \
+            }                                                              \
+            *pOutput++ = (q31_t)asrl(acc, 23);                             \
+        }
+
+
+#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSamples, vecCoeffs)    \
+        for (int j = 0; j < nbAcc; j++) {                                  \
+            const q31_t    *pSmp = &(pSamples)[j];                         \
+            q31x4_t         vecIn0;                                        \
+            /* acc[] and arm_fir_partial_accu_ptr: expansion-scope names */\
+            acc[j] = 0;                                                    \
+            for (int i = 0; i < nbVecTaps; i++) {                          \
+                vecIn0 = vld1q(pSmp + 4 * i);                              \
+                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
+            }                                                              \
+            *arm_fir_partial_accu_ptr++ = acc[j]; /* keep partial sum */   \
+        }
+
+
+#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSamples, vecCoeffs)     \
+        for (int j = 0; j < nbAcc; j++) {                                  \
+            const q31_t    *pSmp = &(pSamples)[j];                         \
+            q31x4_t         vecIn0;                                        \
+            /* resume from a stored partial sum (expansion-scope names) */ \
+            acc[j] = *arm_fir_partial_accu_ptr++;                          \
+                                                                           \
+            for (int i = 0; i < nbVecTaps; i++) {                          \
+                vecIn0 = vld1q(pSmp + 4 * i);                              \
+                acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);     \
+            }                                                              \
+            *pOutput++ = (q31_t)asrl(acc[j], 23);                          \
+        }
+
+/* Shared kernel body: expects S, pSrc, pDst, blockSize and the NBTAPS macro to be visible where it is expanded. */
+#define FIR_Q31_MAIN_CORE()                                                              \
+{                                                                                        \
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);                          \
+    q31_t      *pState = pRefStatePtr; /* State pointer */                               \
+    const q31_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */                        \
+    q31_t       *pStateCur;             /* Points to the current sample of the state */  \
+    const q31_t *pSamples;              /* Temporary pointer to the sample buffer */     \
+    q31_t       *pOutput;               /* Temporary pointer to the output buffer */     \
+    const q31_t *pTempSrc;              /* Temporary pointer to the source data */       \
+    q31_t       *pTempDest;             /* Temporary pointer to the destination buffer */\
+    uint32_t     numTaps = S->numTaps;  /* Number of filter coefficients in the filter */\
+    int32_t      blkCnt;                                                                 \
+    const int32_t   nbVecTaps = (NBTAPS / 4);                                            \
+                                                                                         \
+    /*                                                                                   \
+     * load coefs: NBTAPS/4 vectors, i.e. NBTAPS values are read from pCoeffs            \
+     */                                                                                  \
+    q31x4_t         vecCoeffs[nbVecTaps];                                                \
+                                                                                         \
+    for (int i = 0; i < nbVecTaps; i++)                                                  \
+        vecCoeffs[i] = vld1q(pCoeffs + 4 * i);                                           \
+                                                                                         \
+    /*                                                                                   \
+     * pState points to state array which contains previous frame (numTaps - 1) samples  \
+     * pStateCur points to the location where the new input data should be written       \
+     */                                                                                  \
+    pStateCur = &(pState[(numTaps - 1u)]);                                               \
+    pTempSrc = pSrc;                                                                     \
+    pSamples = pState;                                                                   \
+    pOutput = pDst;                                                                      \
+                                                                                         \
+    blkCnt = blockSize >> 2;                                                             \
+    while (blkCnt > 0) {                                                                 \
+        /*                                                                               \
+         * Save 4 input samples in the history buffer                                    \
+         */                                                                              \
+        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));                                     \
+        pStateCur += 4;                                                                  \
+        pTempSrc += 4;                                                                   \
+                                                                                         \
+        FIR_Q31_CORE(4, nbVecTaps, pSamples, vecCoeffs);                                 \
+                                                                                         \
+        pSamples += 4;                                                                   \
+        /*                                                                               \
+         * Decrement the sample block loop counter                                       \
+         */                                                                              \
+        blkCnt--;                                                                        \
+    }                                                                                    \
+                                                                                         \
+    /* tail: 1 to 3 samples left when blockSize is not a multiple of 4 */                \
+    int32_t        residual = blockSize & 3;                                             \
+    switch (residual) {                                                                  \
+      case 3:                                                                            \
+          {                                                                              \
+              for (int i = 0; i < residual; i++)                                         \
+                  *pStateCur++ = *pTempSrc++;                                            \
+                                                                                         \
+              FIR_Q31_CORE(3, nbVecTaps, pSamples, vecCoeffs);                           \
+          }                                                                              \
+          break;                                                                         \
+                                                                                         \
+      case 2:                                                                            \
+          {                                                                              \
+              for (int i = 0; i < residual; i++)                                         \
+                  *pStateCur++ = *pTempSrc++;                                            \
+                                                                                         \
+               FIR_Q31_CORE(2, nbVecTaps, pSamples, vecCoeffs);                          \
+          }                                                                              \
+          break;                                                                         \
+                                                                                         \
+      case 1:                                                                            \
+          {                                                                              \
+              for (int i = 0; i < residual; i++)                                         \
+                  *pStateCur++ = *pTempSrc++;                                            \
+                                                                                         \
+              FIR_Q31_CORE(1, nbVecTaps, pSamples, vecCoeffs);                           \
+          }                                                                              \
+          break;                                                                         \
+    }                                                                                    \
+                                                                                         \
+    /*                                                                                   \
+     * Copy the samples back into the history buffer start                               \
+     */                                                                                  \
+    pTempSrc = &pState[blockSize];                                                       \
+    pTempDest = pState;                                                                  \
+                                                                                         \
+    blkCnt =(numTaps - 1) >> 2;                                                          \
+    while (blkCnt > 0)                                                                   \
+    {                                                                                    \
+        vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc));                                     \
+        pTempSrc += 4;                                                                   \
+        pTempDest += 4;                                                                  \
+        blkCnt--;                                                                        \
+    }                                                                                    \
+    blkCnt = (numTaps - 1) & 3;                                                          \
+    if (blkCnt > 0)                                                                      \
+    {                                                                                    \
+        mve_pred16_t p0 = vctp32q(blkCnt);                                               \
+        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0);                         \
+    }                                                                                    \
+}
+
+static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
 {
-    q31_t    *pState = S->pState;   /* State pointer */
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+    q31_t      *pState = pRefStatePtr; /* State pointer */
     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
     q31_t    *pStateCur;        /* Points to the current sample of the state */
     const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
@@ -74,6 +223,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     uint32_t  blkCnt;
     q31x4_t vecIn0;
 
+
     /*
      * pState points to state array which contains previous frame (numTaps - 1) samples
      * pStateCur points to the location where the new input data should be written
@@ -83,7 +233,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     pSamples = pState;
     pOutput = pDst;
 
-    q63_t     acc0, acc1, acc2, acc3;
+    q63_t     acc0=0, acc1=0, acc2=0, acc3=0;
     /*
      * load 4 coefs
      */
@@ -131,7 +281,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     }
 
     uint32_t  residual = blockSize & 3;
-    
     switch (residual)
     {
     case 3:
@@ -139,7 +288,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
             /*
              * Save 4 input samples in the history buffer
              */
-
             *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
             pStateCur += 4;
             pTempSrc += 4;
@@ -205,14 +353,13 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         break;
     }
 
-
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
-    blkCnt = numTaps >> 2;
+    blkCnt = (numTaps-1) >> 2;
     while (blkCnt > 0U)
     {
         vst1q(pTempDest, vld1q(pTempSrc));
@@ -220,7 +367,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         pTempDest += 4;
         blkCnt--;
     }
-    blkCnt = numTaps & 3;
+    blkCnt = (numTaps-1) & 3;
     if (blkCnt > 0U)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
@@ -228,9 +375,274 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     }
 }
 
-static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
+
+/* FIR kernel for 5 to 8 taps: expands FIR_Q31_MAIN_CORE with NBTAPS = 8 (2 coefficient vectors of 4). */
+static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
 {
-    q31_t    *pState = S->pState;   /* State pointer */
+    #define NBTAPS 8
+    FIR_Q31_MAIN_CORE();
+    #undef NBTAPS
+}
+
+/* FIR kernel for 9 to 12 taps: expands FIR_Q31_MAIN_CORE with NBTAPS = 12 (3 coefficient vectors of 4). */
+static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 12
+    FIR_Q31_MAIN_CORE();
+    #undef NBTAPS
+}
+
+/* FIR kernel for 13 to 16 taps: expands FIR_Q31_MAIN_CORE with NBTAPS = 16 (4 coefficient vectors of 4). */
+static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 16
+    FIR_Q31_MAIN_CORE();
+    #undef NBTAPS
+}
+
+/* FIR kernel for 17 to 20 taps: expands FIR_Q31_MAIN_CORE with NBTAPS = 20 (5 coefficient vectors of 4). */
+static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 20
+    FIR_Q31_MAIN_CORE();
+    #undef NBTAPS
+}
+
+/* FIR kernel for 21 to 24 taps: expands FIR_Q31_MAIN_CORE with NBTAPS = 24 (6 coefficient vectors of 4). */
+static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 24
+    FIR_Q31_MAIN_CORE();
+    #undef NBTAPS
+}
+
+/* FIR kernel for 25 to 28 taps: expands FIR_Q31_MAIN_CORE with NBTAPS = 28 (7 coefficient vectors of 4). */
+static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst, uint32_t blockSize)
+{
+    #define NBTAPS 28
+    FIR_Q31_MAIN_CORE();
+    #undef NBTAPS
+}
+/* FIR kernel for 29 to 32 taps: pass 1 stores 28-tap partial sums at the start of S->pState, pass 2 adds the last 4 taps. */
+static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S, 
+    const q31_t * __restrict pSrc, 
+    q31_t * __restrict pDst,
+                               uint32_t blockSize)
+{
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+    q31_t      *pState = pRefStatePtr; /* State pointer */
+    const q31_t    *pCoeffs = S->pCoeffs;       /* Coefficient pointer */
+    q31_t          *pStateCur;  /* Points to the current sample of the state */
+    const q31_t    *pSamples;   /* Temporary pointer to the sample buffer */
+    q31_t          *pOutput;    /* Temporary pointer to the output buffer */
+    const q31_t    *pTempSrc;   /* Temporary pointer to the source data */
+    q31_t          *pTempDest;  /* Temporary pointer to the destination buffer */
+    uint32_t        numTaps = S->numTaps;       /* Number of filter coefficients in the filter */
+    int32_t         blkCnt;
+    q63_t           acc0, acc1, acc2, acc3;
+
+#define MAX_VECT_BATCH 7
+
+    /*
+     * pre-load the first 28 coefs (7 vectors of 4)
+     */
+    q31x4_t         vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
+    q31x4_t         vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
+    q31x4_t         vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
+    q31x4_t         vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
+    q31x4_t         vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
+    q31x4_t         vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
+    q31x4_t         vecCoeffs6 = vld1q(pCoeffs + 4 * 6);
+
+    /*
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
+     */
+    pStateCur = &(pState[(numTaps - 1u)]);
+    pTempSrc = pSrc;
+    pSamples = pState;
+
+    q63_t          *arm_fir_partial_accu_ptr = (q63_t*)S->pState; /* partial sums go to the start of S->pState, ahead of pState; assumes suitable alignment for q63_t - TODO confirm */
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        /*
+         * Save 4 input samples in the history buffer
+         */
+        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
+
+        const q31_t    *pSmp;
+        q31x4_t         vecIn0;
+
+        pSmp = &pSamples[0];
+
+        vecIn0 = vld1q(pSmp);
+        acc0 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6);
+
+        *arm_fir_partial_accu_ptr++ = acc0;
+
+        pSmp = &pSamples[1];
+
+        vecIn0 = vld1q(pSmp);
+        acc1 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6);
+
+        *arm_fir_partial_accu_ptr++ = acc1;
+
+        pSmp = &pSamples[2];
+
+        vecIn0 = vld1q(pSmp);
+        acc2 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6);
+        *arm_fir_partial_accu_ptr++ = acc2;
+
+        pSmp = &pSamples[3];
+
+        vecIn0 = vld1q(pSmp);
+        acc3 = vrmlaldavhq(vecIn0, vecCoeffs0);
+        vecIn0 = vld1q(pSmp + 4 * 1);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1);
+        vecIn0 = vld1q(pSmp + 4 * 2);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2);
+        vecIn0 = vld1q(pSmp + 4 * 3);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3);
+        vecIn0 = vld1q(pSmp + 4 * 4);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4);
+        vecIn0 = vld1q(pSmp + 4 * 5);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5);
+        vecIn0 = vld1q(pSmp + 4 * 6);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6);
+
+        *arm_fir_partial_accu_ptr++ = acc3;
+
+        pSamples += 4;
+        /*
+         * Decrement the sample block loop counter
+         */
+        blkCnt--;
+    }
+
+
+    /* remainder: add the last 4 taps to the stored partial sums and write the outputs */
+
+    /* load last 4 coef */
+    vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
+    arm_fir_partial_accu_ptr = (q63_t*)S->pState;
+    pOutput = pDst;
+    pSamples = pState + (MAX_VECT_BATCH * 4);
+
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        q31x4_t         vecIn0;
+
+        /* reload intermediate MAC */
+        acc0 = *arm_fir_partial_accu_ptr++;
+        acc1 = *arm_fir_partial_accu_ptr++;
+        acc2 = *arm_fir_partial_accu_ptr++;
+        acc3 = *arm_fir_partial_accu_ptr++;
+
+
+        vecIn0 = vld1q(&pSamples[0]);
+        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);
+
+        vecIn0 = vld1q(&pSamples[1]);
+        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);
+
+        vecIn0 = vld1q(&pSamples[2]);
+        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);
+
+        vecIn0 = vld1q(&pSamples[3]);
+        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);
+
+        *pOutput++ = asrl(acc0, 23);
+        *pOutput++ = asrl(acc1, 23);
+        *pOutput++ = asrl(acc2, 23);
+        *pOutput++ = asrl(acc3, 23);
+
+        pSamples += 4;
+        /*
+         * Decrement the sample block loop counter
+         */
+        blkCnt--;
+    }
+    /* NOTE(review): both loops run blockSize >> 2 times and there is no tail code, so blockSize & 3 samples are neither stored nor output. */
+    /*
+     * Copy the samples back into the history buffer start
+     */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    blkCnt = numTaps - 1;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+
+        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
+        pTempSrc += 4;
+        pTempDest += 4;
+        blkCnt -= 4;
+    }
+    while (blkCnt > 0);
+}
+
+
+
+void arm_fir_q31(
+  const arm_fir_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+    q31_t      *pState = pRefStatePtr; /* State pointer */
     const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
     q31_t    *pStateCur;        /* Points to the current sample of the state */
     const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
@@ -240,60 +652,110 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
     uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
     uint32_t  blkCnt;
     q31x4_t vecIn0;
+    uint32_t  tapsBlkCnt = (numTaps + 3) / 4;
     q63_t     acc0, acc1, acc2, acc3;
-    q31x4_t vecCoeffs1_4, vecCoeffs5_8;
+    q31x4_t vecCoeffs;
+
 
     /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
+     * [1 to 32 taps] specialized routines
      */
-    pStateCur = &(pState[(numTaps - 1u)]);
-    pTempSrc = pSrc;
-    pSamples = pState;
-    pOutput = pDst;
-
+    if (numTaps <= 4)
+    {
+        arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 8)
+    {
+        arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 12)
+    {
+        arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 16)
+    {
+        arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 20)
+    {
+        arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 24)
+    {
+        arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 28)
+    {
+        arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if ((numTaps <= 32)  && (blockSize >= 32))
+    {
+        arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
 
     /*
-     * load 8 coefs
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
      */
-    vecCoeffs1_4 = *(q31x4_t *) pCoeffs;
-    vecCoeffs5_8 = *(q31x4_t *) (pCoeffs + 4);
-
-    blkCnt = blockSize >> 2;
-    while (blkCnt > 0U)
+    pStateCur   = &(pState[(numTaps - 1u)]);
+    pSamples    = pState;
+    pTempSrc    = pSrc;
+    pOutput     = pDst;
+    blkCnt      = blockSize >> 2;
+    while (blkCnt > 0)
     {
+        const q31_t    *pCoeffsTmp = pCoeffs;
         const q31_t    *pSamplesTmp = pSamples;
 
+        acc0 = 0LL;
+        acc1 = 0LL;
+        acc2 = 0LL;
+        acc3 = 0LL;
+
         /*
          * Save 4 input samples in the history buffer
          */
         vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
 
-        vecIn0 = vld1q(pSamplesTmp);
-        acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-        vecIn0 = vld1q(&pSamplesTmp[1]);
-        acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-        vecIn0 = vld1q(&pSamplesTmp[2]);
-        acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-        vecIn0 = vld1q(&pSamplesTmp[3]);
-        acc3 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+        int       i = tapsBlkCnt;
+        while (i > 0)
+        {
+            /*
+             * load 4 coefs
+             */
+            vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 
-        vecIn0 = vld1q(&pSamplesTmp[4]);
-        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(pSamplesTmp);
+            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[5]);
-        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(&pSamplesTmp[1]);
+            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[6]);
-        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(&pSamplesTmp[2]);
+            acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
 
-        vecIn0 = vld1q(&pSamplesTmp[7]);
-        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5_8);
+            vecIn0 = vld1q(&pSamplesTmp[3]);
+            acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
 
+            pSamplesTmp += 4;
+            pCoeffsTmp += 4;
+            /*
+             * Decrement the taps block loop counter
+             */
+            i--;
+        }
 
+        /* .54-> .31 conversion and store accumulators */
         acc0 = asrl(acc0, 23);
         acc1 = asrl(acc1, 23);
         acc2 = asrl(acc2, 23);
@@ -305,8 +767,6 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         *pOutput++ = (q31_t) acc3;
 
         pSamples += 4;
-        pStateCur += 4;
-        pTempSrc += 4;
 
         /*
          * Decrement the sample block loop counter
@@ -314,11 +774,18 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         blkCnt--;
     }
 
-    uint32_t  residual = blockSize & 3;
+    int32_t  residual = blockSize & 3;
     switch (residual)
     {
     case 3:
         {
+            const q31_t    *pCoeffsTmp = pCoeffs;
+            const q31_t    *pSamplesTmp = pSamples;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+            acc2 = 0LL;
+
             /*
              * Save 4 input samples in the history buffer
              */
@@ -326,23 +793,24 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
             pStateCur += 4;
             pTempSrc += 4;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 
-            vecIn0 = vld1q(&pSamples[2]);
-            acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[4]);
-            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[5]);
-            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+                vecIn0 = vld1q(&pSamplesTmp[2]);
+                acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[6]);
-            acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
+                pSamplesTmp += 4;
+                pCoeffsTmp += 4;
+                i--;
+            }
 
             acc0 = asrl(acc0, 23);
             acc1 = asrl(acc1, 23);
@@ -356,6 +824,12 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
 
     case 2:
         {
+            const q31_t    *pCoeffsTmp = pCoeffs;
+            const q31_t    *pSamplesTmp = pSamples;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+
             /*
              * Save 4 input samples in the history buffer
              */
@@ -363,17 +837,21 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
             pStateCur += 4;
             pTempSrc += 4;
 
-            vecIn0 = vld1q(pSamples);
-            acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q31x4_t *) pCoeffsTmp;
 
-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[4]);
-            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+                vecIn0 = vld1q(&pSamplesTmp[1]);
+                acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
 
-            vecIn0 = vld1q(&pSamples[5]);
-            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+                pSamplesTmp += 4;
+                pCoeffsTmp += 4;
+                i--;
+            }
 
             acc0 = asrl(acc0, 23);
             acc1 = asrl(acc1, 23);
@@ -384,431 +862,55 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
         break;
 
     case 1:
-        {
-            /*
-             * Save 4 input samples in the history buffer
-             */
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 4;
-            pTempSrc += 4;
-
-            vecIn0 = vld1q(pSamples);
-            acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
-            vecIn0 = vld1q(&pSamples[4]);
-            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
-
-            acc0 = asrl(acc0, 23);
-
-            *pOutput++ = (q31_t) acc0;
-        }
-        break;
-    }
-
-    /*
-     * Copy the samples back into the history buffer start
-     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 4;
-        pTempDest += 4;
-        blkCnt--;
-    }
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp32q(blkCnt);
-        vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
-    }
-}
-
-void arm_fir_q31(
-  const arm_fir_instance_q31 * S,
-  const q31_t * pSrc,
-        q31_t * pDst,
-        uint32_t blockSize)
-{
-    q31_t    *pState = S->pState;   /* State pointer */
-    const q31_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
-    q31_t    *pStateCur;        /* Points to the current sample of the state */
-    const q31_t    *pSamples;         /* Temporary pointer to the sample buffer */
-    q31_t    *pOutput;          /* Temporary pointer to the output buffer */
-    const q31_t    *pTempSrc;         /* Temporary pointer to the source data */
-    q31_t    *pTempDest;        /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    q31x4_t vecIn0;
-    uint32_t  tapsBlkCnt = (numTaps + 3) / 4;
-    q63_t     acc0, acc1, acc2, acc3;
-    q31x4_t vecCoeffs;
-
-    /*
-     * [1 to 8 taps] specialized routines
-     */
-    
-    if (blockSize >= 8)
-    {
-        if (numTaps <= 4)
-        {
-            arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
-            return;
-        }
-        else if (numTaps <= 8)
-        {
-            arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
-            return;
-        }
-    }
-
-
-    /*
-     * pState points to state array which contains previous frame (numTaps - 1) samples
-     * pStateCur points to the location where the new input data should be written
-     */
-    if (blockSize >= 8)
-    {
-        pStateCur   = &(pState[(numTaps - 1u)]);
-        pSamples    = pState;
-        pTempSrc    = pSrc;
-        pOutput     = pDst;
-        blkCnt      = blockSize >> 2;
-        while (blkCnt > 0U)
         {
             const q31_t    *pCoeffsTmp = pCoeffs;
             const q31_t    *pSamplesTmp = pSamples;
-    
+
             acc0 = 0LL;
-            acc1 = 0LL;
-            acc2 = 0LL;
-            acc3 = 0LL;
-    
+
             /*
              * Save 4 input samples in the history buffer
              */
             vst1q(pStateCur, vld1q(pTempSrc));
             pStateCur += 4;
             pTempSrc += 4;
-    
-            tapsBlkCnt = (numTaps ) / 4;
-            uint32_t       i = tapsBlkCnt ;
-            while (i > 0U)
+
+            int       i = tapsBlkCnt;
+            while (i > 0)
             {
-                /*
-                 * load 4 coefs
-                 */
                 vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-    
+
                 vecIn0 = vld1q(pSamplesTmp);
                 acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[1]);
-                acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[2]);
-                acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[3]);
-                acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
-    
+
                 pSamplesTmp += 4;
                 pCoeffsTmp += 4;
-                /*
-                 * Decrement the taps block loop counter
-                 */
                 i--;
             }
 
-            tapsBlkCnt = (numTaps ) & 3;
-            i = tapsBlkCnt ;
-            while (i > 0U)
-            {
-                /*
-                 * load 4 coefs
-                 */
-
-                /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-                acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-                acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
-                acc3 += ((q63_t) pSamplesTmp[3] * *pCoeffsTmp) >> 8;
-
-    
-                pSamplesTmp += 1;
-                pCoeffsTmp += 1;
-                /*
-                 * Decrement the taps block loop counter
-                 */
-                i--;
-            }
-    
-            /* .54-> .31 conversion and store accumulators */
             acc0 = asrl(acc0, 23);
-            acc1 = asrl(acc1, 23);
-            acc2 = asrl(acc2, 23);
-            acc3 = asrl(acc3, 23);
-    
-            *pOutput++ = (q31_t) acc0;
-            *pOutput++ = (q31_t) acc1;
-            *pOutput++ = (q31_t) acc2;
-            *pOutput++ = (q31_t) acc3;
-    
-            pSamples += 4;
 
-            
-            /*
-             * Decrement the sample block loop counter
-             */
-            blkCnt--;
-        }
-    
-        uint32_t  residual = blockSize & 3;
-        switch (residual)
-        {
-        case 3:
-            {
-                const q31_t    *pCoeffsTmp = pCoeffs;
-                const q31_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-                acc2 = 0LL;
-    
-                /*
-                 * Save 4 input samples in the history buffer
-                 */
-              
-                *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
-                pStateCur += 4;
-                pTempSrc += 4;
-    
-                tapsBlkCnt = numTaps  / 4;
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[2]);
-                    acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 4;
-                    pCoeffsTmp += 4;
-                    i--;
-                }
-
-                tapsBlkCnt = (numTaps ) & 3;
-                
-                i = tapsBlkCnt ;
-                while (i > 0U)
-                {
-                   
-                    /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                    acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-                    acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-                    acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
-    
-                    pSamplesTmp += 1;
-                    pCoeffsTmp += 1;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-    
-                acc0 = asrl(acc0, 23);
-                acc1 = asrl(acc1, 23);
-                acc2 = asrl(acc2, 23);
-    
-                *pOutput++ = (q31_t) acc0;
-                *pOutput++ = (q31_t) acc1;
-                *pOutput++ = (q31_t) acc2;
-            }
-            break;
-    
-        case 2:
-            {
-                const q31_t    *pCoeffsTmp = pCoeffs;
-                const q31_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-    
-                /*
-                 * Save 4 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 4;
-                pTempSrc += 4;
-    
-                tapsBlkCnt = (numTaps ) / 4;
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 4;
-                    pCoeffsTmp += 4;
-                    i--;
-                }
-
-                tapsBlkCnt = (numTaps ) & 3;
-                i = tapsBlkCnt ;
-                while (i > 0U)
-                {
-                   
-
-                    /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                    acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-                    acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-    
-                    pSamplesTmp += 1;
-                    pCoeffsTmp += 1;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                acc0 = asrl(acc0, 23);
-                acc1 = asrl(acc1, 23);
-    
-                *pOutput++ = (q31_t) acc0;
-                *pOutput++ = (q31_t) acc1;
-            }
-            break;
-    
-        case 1:
-            {
-                const q31_t    *pCoeffsTmp = pCoeffs;
-                const q31_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-    
-                /*
-                 * Save 4 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 4;
-                pTempSrc += 4;
-    
-                tapsBlkCnt = (numTaps ) / 4;
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 4;
-                    pCoeffsTmp += 4;
-                    i--;
-                }
-
-                tapsBlkCnt = (numTaps ) & 3;
-                i = tapsBlkCnt ;
-                while (i > 0U)
-                {
-                   
-
-                    /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-                    acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-    
-                    pSamplesTmp += 1;
-                    pCoeffsTmp += 1;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                acc0 = asrl(acc0, 23);
-    
-                *pOutput++ = (q31_t) acc0;
-            }
-            break;
-        }
-    }
-    else
-    {
-         
-                q31_t *pStateCurnt;                            /* Points to the current sample of the state */
-                q31_t *px;                                     /* Temporary pointer for state buffer */
-          const q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
-                q63_t acc0;                                    /* Accumulator */
-                uint32_t i, blkCnt;                    /* Loop counters */
-          pStateCurnt = &(S->pState[(numTaps - 1U)]);
-          blkCnt = blockSize;
-        
-          while (blkCnt > 0U)
-          {
-            /* Copy one sample at a time into state buffer */
-            *pStateCurnt++ = *pSrc++;
-        
-            /* Set the accumulator to zero */
-            acc0 = 0;
-        
-            /* Initialize state pointer */
-            px = pState;
-        
-            /* Initialize Coefficient pointer */
-            pb = pCoeffs;
-        
-            i = numTaps;
-        
-            /* Perform the multiply-accumulates */
-            do
-            {
-              /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
-              acc0 += (q63_t) *px++ * *pb++;
-        
-              i--;
-            } while (i > 0U);
-        
-            /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
-            *pDst++ = (q31_t) (acc0 >> 31U);
-        
-            /* Advance state pointer by 1 for the next sample */
-            pState = pState + 1U;
-        
-            /* Decrement loop counter */
-            blkCnt--;
+            *pOutput++ = (q31_t) acc0;
         }
+        break;
     }
 
     /*
      * Copy the samples back into the history buffer start
      */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
 
-    blkCnt = numTaps >> 2;
-    while (blkCnt > 0U)
+    blkCnt = (numTaps - 1U) >> 2;
+    while (blkCnt > 0)
     {
         vst1q(pTempDest, vld1q(pTempSrc));
         pTempSrc += 4;
         pTempDest += 4;
         blkCnt--;
     }
-    blkCnt = numTaps & 3;
-    if (blkCnt > 0U)
+    blkCnt = (numTaps - 1U) & 3;
+    if (blkCnt > 0)
     {
         mve_pred16_t p0 = vctp32q(blkCnt);
         vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
diff --git a/Testing/Source/Benchmarks/FIRQ31.cpp b/Testing/Source/Benchmarks/FIRQ31.cpp
index 3626a134..25cc206e 100755
--- a/Testing/Source/Benchmarks/FIRQ31.cpp
+++ b/Testing/Source/Benchmarks/FIRQ31.cpp
@@ -1,6 +1,9 @@
 #include "FIRQ31.h"
 #include "Error.h"
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) q31_t coeffArray[64];
+#endif 
    
     void FIRQ31::test_fir_q31()
     {
@@ -30,16 +33,28 @@
        samples.reload(FIRQ31::SAMPLES1_Q31_ID,mgr,this->nbSamples);
        coefs.reload(FIRQ31::COEFS1_Q31_ID,mgr,this->nbTaps);
 
-       state.create(this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
+       state.create(2*ROUND_UP(this->nbSamples,4) + this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
        output.create(this->nbSamples,FIRQ31::OUT_SAMPLES_Q31_ID,mgr);
 
        switch(id)
        {
            case TEST_FIR_Q31_1:
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+              /* Clear the whole buffer, then copy the taps in: the tail stays zero so the
+                 kernel can load coefficients in groups of four. The filter uses this copy. */
+              memset(coeffArray,0,sizeof(coeffArray));
+              q31_t *ptr;
+
+              ptr=coefs.ptr();
+              memcpy(coeffArray,ptr,this->nbTaps*sizeof(q31_t));
+              this->pCoefs = coeffArray;
+#else
+              this->pCoefs=coefs.ptr();
+#endif
+
-              arm_fir_init_q31(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
+              arm_fir_init_q31(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
 
               this->pSrc=samples.ptr();
-              this->pCoefs=coefs.ptr();
               this->pDst=output.ptr();
            break;
 
diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp
index dfee9e1e..80f8195e 100644
--- a/Testing/Source/Tests/FIRQ31.cpp
+++ b/Testing/Source/Tests/FIRQ31.cpp
@@ -37,6 +37,7 @@ void checkInnerTail(q31_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int nb=1;
 
         /*
 
@@ -98,6 +99,8 @@ void checkInnerTail(q31_t *b)
            configp += 2;
            orgcoefsp += numTaps;
 
+           nb += blockSize + blockSize;
+
 
         }
 
@@ -129,8 +132,8 @@ void checkInnerTail(q31_t *b)
        ref.reload(FIRQ31::FIRREFS_Q31_ID,mgr);
 
        output.create(ref.nbSamples(),FIRQ31::OUT_Q31_ID,mgr);
-       /* Max blockSize + numTaps - 1 as generated by Python script */
-       state.create(47,FIRQ31::OUT_Q31_ID,mgr);
+       /* Deliberately larger than the max blockSize + numTaps - 1 (47) generated by the Python script */
+       state.create(47 + 47+47,FIRQ31::OUT_Q31_ID,mgr);
     }
 
     void FIRQ31::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)