CMSIS-DSP: Added new MVE implementation for FIR Q15 and Q7.

6 years ago · c4283d209f
parent a108d6763e
commit c4283d209f
10 changed files with 621 additions and 635 deletions
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@ -114,6 +114,7 @@
                 - A is blockSize for f32
                 - A is 8*ceil(blockSize/8) for f16
                 - A is 8*ceil(blockSize/4) for q31
+                 - A is 0 for other datatypes (q15 and q7)


  @par           Fixed-Point Behavior
--- a/Source/FilteringFunctions/arm_fir_q15.c
+++ b/Source/FilteringFunctions/arm_fir_q15.c
@ -60,7 +60,140 @@

 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)

-static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pSrc, q15_t * pDst, uint32_t blockSize)
+
+#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)        \
+        /* Emits nbAcc Q15 outputs through pOutput (which is advanced); output k filters the window at pSample[k]. Uses the enclosing scope's vecIn0 as scratch. */ \
+        for (int outIdx = 0; outIdx < nbAcc; outIdx++) {                   \
+            const q15_t    *pWindow = &pSample[outIdx];                    \
+            q63_t           sum = 0;    /* 64-bit running dot product */   \
+            /* Walk the taps eight lanes at a time */                      \
+            for (int blk = 0; blk < nbVecTaps; blk++) {                    \
+                vecIn0 = vld1q(pWindow + 8 * blk);                         \
+                sum = vmlaldavaq(sum, vecIn0, vecCoeffs[blk]);             \
+            }                                                              \
+            *pOutput++ = (q15_t) MVE_ASRL_SAT16(sum, 15);                  \
+        }
+
+#define FIR_Q15_MAIN_CORE() /* Shared body of the fixed-size Q15 FIR kernels; needs S, pSrc, pDst, blockSize and NBTAPS in scope */ \
+{                                                                                            \
+    q15_t          *pState = S->pState;     /* Start of the state (history) buffer */        \
+    const q15_t    *pCoeffs = S->pCoeffs;   /* Filter coefficients */                        \
+    q15_t          *pStateCur;              /* Where the next input samples are appended */  \
+    const q15_t    *pSamples;               /* Start of the window for the next output */    \
+    q15_t          *pOutput;                /* Write cursor into pDst */                     \
+    const q15_t    *pTempSrc;               /* Read cursor (input, then state copy-back) */  \
+    q15_t          *pTempDest;              /* Write cursor for the state copy-back */       \
+    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients in the filter */\
+    int32_t         blkCnt;                 /* Loop counter, reused by every loop below */   \
+    q15x8_t         vecIn0;                 /* Sample vector scratch used by FIR_Q15_CORE */ \
+    const int32_t   nbVecTaps = (NBTAPS / 8); /* Coefficient vectors of 8 lanes each */      \
+                                                                                             \
+    /* Preload all NBTAPS coefficients as nbVecTaps vectors, independently of numTaps.       \
+     * NOTE(review): presumably pCoeffs must be zero-padded up to NBTAPS entries - confirm.  \
+     */                                                                                      \
+    q15x8_t         vecCoeffs[nbVecTaps];                                                    \
+                                                                                             \
+    for (int i = 0; i < nbVecTaps; i++)                                                      \
+        vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i);                                          \
+                                                                                             \
+    /*                                                                                       \
+     * pState holds the (numTaps - 1) samples kept from the previous frame;                  \
+     * pStateCur is the position right after them, where the new input is appended           \
+     */                                                                                      \
+    pStateCur = &(pState[(numTaps - 1u)]);                                                   \
+    pTempSrc = pSrc;                                                                         \
+    pSamples = pState;                                                                       \
+    pOutput = pDst;                                                                          \
+                                                                                             \
+    blkCnt = blockSize >> 2;                /* Number of full groups of 4 outputs */         \
+    while (blkCnt > 0) {                                                                     \
+        /*                                                                                   \
+         * Append 4 new input samples to the state buffer (both pointers advance by 4)       \
+         */                                                                                  \
+        vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc));                                         \
+        pStateCur += 4;                                                                      \
+        pTempSrc += 4;                                                                       \
+                                                                                             \
+        FIR_Q15_CORE(pOutput, 4, nbVecTaps, pSamples, vecCoeffs);  /* 4 outputs */           \
+        pSamples += 4;                                                                       \
+                                                                                             \
+        blkCnt--;                                                                            \
+    }                                                                                        \
+                                                                                             \
+    /* tail: the blockSize % 4 leftover samples, copied one at a time and then filtered */   \
+    int32_t        residual = blockSize & 3;                                                \
+                                                                                             \
+    for (int i = 0; i < residual; i++)                                                       \
+        *pStateCur++ = *pTempSrc++;                                                          \
+                                                                                             \
+    FIR_Q15_CORE(pOutput, residual, nbVecTaps, pSamples, vecCoeffs);                         \
+                                                                                             \
+    /*                                                                                       \
+     * Move the (numTaps - 1) samples starting at pState[blockSize] back to the buffer start \
+     */                                                                                      \
+    pTempSrc = &pState[blockSize];                                                           \
+    pTempDest = pState;                                                                      \
+                                                                                             \
+    /* current compiler limitation - NOTE(review): presumably why the copy is split into full vectors plus a predicated tail; confirm */ \
+    blkCnt = (numTaps - 1) >> 3;                                                             \
+    while (blkCnt > 0)                                                                       \
+    {                                                                                        \
+        vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc));                                         \
+        pTempSrc += 8;                                                                       \
+        pTempDest += 8;                                                                      \
+        blkCnt--;                                                                            \
+    }                                                                                        \
+    blkCnt = (numTaps - 1) & 7;                                                              \
+    if (blkCnt > 0)                                                                          \
+    {                                                                                        \
+        mve_pred16_t p = vctp16q(blkCnt);   /* Tail predicate built from the leftover count */ \
+        vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p);                               \
+    }                                                                                        \
+}
+    
+/* Q15 FIR kernel used for 25 to 32 taps: expands the shared core with NBTAPS = 32 (four coefficient vectors). */
+static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc,
+                                  q15_t * __restrict pDst, uint32_t blockSize)
+{
+#define NBTAPS 32
+    FIR_Q15_MAIN_CORE();
+#undef NBTAPS
+}
+
+/* Q15 FIR kernel used for 17 to 24 taps: expands the shared core with NBTAPS = 24 (three coefficient vectors). */
+static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc,
+                                  q15_t * __restrict pDst, uint32_t blockSize)
+{
+#define NBTAPS 24
+    FIR_Q15_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Q15 FIR kernel used for 9 to 16 taps: expands the shared core with NBTAPS = 16 (two coefficient vectors). */
+static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc,
+                                 q15_t * __restrict pDst, uint32_t blockSize)
+{
+#define NBTAPS 16
+    FIR_Q15_MAIN_CORE();
+#undef NBTAPS
+}
+
+/* Q15 FIR kernel used for 1 to 8 taps: expands the shared core with NBTAPS = 8 (a single coefficient vector). */
+static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * __restrict pSrc,
+                                q15_t * __restrict pDst, uint32_t blockSize)
+{
+#define NBTAPS 8
+    FIR_Q15_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+void arm_fir_q15(
+  const arm_fir_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
 {
    q15_t    *pState = S->pState;   /* State pointer */
    const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@ -72,46 +205,81 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
    uint32_t  blkCnt;
    q15x8_t vecIn0;
-    /*
-     * load 8 coefs
-     */
-    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffs;
+    uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
+    q63_t     acc0, acc1, acc2, acc3;
+
+
+int32_t nbTaps = (numTaps + 7) >> 3;

+switch(nbTaps) {
+
+    case 1:
+        arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    case 2:
+        arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
+        return;
+    case 3:
+        arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
+        return;
+    case 4:
+        arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
-    pStateCur = &(pState[(numTaps - 1u)]);
-    pTempSrc = pSrc;
-    pSamples = pState;
-    pOutput = pDst;
-
-    q63_t     acc0, acc1, acc2, acc3;
-
-    blkCnt = blockSize >> 2;
+    pStateCur   = &(pState[(numTaps - 1u)]);
+    pTempSrc    = pSrc;
+    pSamples    = pState;
+    pOutput     = pDst;
+    blkCnt      = blockSize >> 2;

    while (blkCnt > 0U)
    {
+        const q15_t    *pCoeffsTmp = pCoeffs;
        const q15_t    *pSamplesTmp = pSamples;

+        acc0 = 0LL;
+        acc1 = 0LL;
+        acc2 = 0LL;
+        acc3 = 0LL;
+
        /*
-         * Save 4 input samples in the history buffer
+         * Save 8 input samples in the history buffer
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 8;
        pTempSrc += 8;

-        vecIn0 = vld1q(pSamplesTmp);
-        acc0 = vmlaldavq(vecIn0, vecCoeffs);
+        int       i = tapsBlkCnt;
+        while (i > 0)
+        {
+            /*
+             * load 8 coefs
+             */
+            q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
+
+            vecIn0 = vld1q(pSamplesTmp);
+            acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+            vecIn0 = vld1q(&pSamplesTmp[1]);
+            acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);

-        vecIn0 = vld1q(&pSamplesTmp[1]);
-        acc1 = vmlaldavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[2]);
+            acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);

-        vecIn0 = vld1q(&pSamplesTmp[2]);
-        acc2 = vmlaldavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[3]);
+            acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);

-        vecIn0 = vld1q(&pSamplesTmp[3]);
-        acc3 = vmlaldavq(vecIn0, vecCoeffs);
+            pSamplesTmp += 8;
+            pCoeffsTmp += 8;
+            /*
+             * Decrement the taps block loop counter
+             */
+            i--;
+        }

        *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
        *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@ -130,6 +298,7 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
    {
    case 3:
        {
+            const q15_t    *pCoeffsTmp = pCoeffs;
            const q15_t    *pSamplesTmp = pSamples;

            acc0 = 0LL;
@ -137,20 +306,40 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
            acc2 = 0LL;

            /*
-             * Save 4 input samples in the history buffer
+             * Save 8 input samples in the history buffer
             */
            *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
            pStateCur += 8;
            pTempSrc += 8;

-            vecIn0 = vld1q(pSamplesTmp);
-            acc0 = vmlaldavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                /*
+                 * load 8 coefs
+                 */
+                q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;

-            vecIn0 = vld1q(&pSamplesTmp[1]);
-            acc1 = vmlaldavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);

-            vecIn0 = vld1q(&pSamplesTmp[2]);
-            acc2 = vmlaldavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(&pSamplesTmp[2]);
+                acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+
+                vecIn0 = vld1q(&pSamplesTmp[4]);
+                acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 8;
+                pCoeffsTmp += 8;
+                /*
+                 * Decrement the taps block loop counter
+                 */
+                i--;
+            }
+
+            acc0 = asrl(acc0, 15);
+            acc1 = asrl(acc1, 15);
+            acc2 = asrl(acc2, 15);

            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@ -160,23 +349,39 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS

    case 2:
        {
+            const q15_t    *pCoeffsTmp = pCoeffs;
            const q15_t    *pSamplesTmp = pSamples;

            acc0 = 0LL;
            acc1 = 0LL;
-
            /*
-             * Save 4 input samples in the history buffer
+             * Save 8 input samples in the history buffer
             */
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 8;
            pTempSrc += 8;

-            vecIn0 = vld1q(pSamplesTmp);
-            acc0 = vmlaldavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                /*
+                 * load 8 coefs
+                 */
+                q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;

-            vecIn0 = vld1q(&pSamplesTmp[1]);
-            acc1 = vmlaldavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+                vecIn0 = vld1q(&pSamplesTmp[2]);
+                acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 8;
+                pCoeffsTmp += 8;
+                /*
+                 * Decrement the taps block loop counter
+                 */
+                i--;
+            }

            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@ -185,126 +390,29 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS

    case 1:
        {
+            const q15_t    *pCoeffsTmp = pCoeffs;
            const q15_t    *pSamplesTmp = pSamples;

            acc0 = 0LL;

-            /*
-             * Save 4 input samples in the history buffer
-             */
-            vst1q(pStateCur, vld1q(pTempSrc));
-            pStateCur += 8;
-            pTempSrc += 8;
-
-            vecIn0 = vld1q(pSamplesTmp);
-            acc0 = vmlaldavq(vecIn0, vecCoeffs);
-
-            pSamplesTmp += 4;
-
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-        }
-        break;
-    }
-
-    /*
-     * Copy the samples back into the history buffer start
-     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 3;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 8;
-        pTempDest += 8;
-        blkCnt--;
-    }
-    blkCnt = numTaps & 7;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-        vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
-    }
-}
-
-void arm_fir_q15(
-  const arm_fir_instance_q15 * S,
-  const q15_t * pSrc,
-        q15_t * pDst,
-        uint32_t blockSize)
-{
-    q15_t    *pState = S->pState;   /* State pointer */
-    const q15_t    *pCoeffs = S->pCoeffs; /* Coefficient pointer */
-    q15_t    *pStateCur;        /* Points to the current sample of the state */
-    const q15_t    *pSamples;         /* Temporary pointer to the sample buffer */
-    q15_t    *pOutput;          /* Temporary pointer to the output buffer */
-    const q15_t    *pTempSrc;         /* Temporary pointer to the source data */
-    q15_t    *pTempDest;        /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    q15x8_t vecIn0;
-    uint32_t  tapsBlkCnt = (numTaps + 7) / 8;
-    q63_t     acc0, acc1, acc2, acc3;
-
-    if (blockSize >= 12)
-    {
-       if(numTaps <= 8) {
-           /* [1 to 8 taps] specialized routine */
-           arm_fir_q15_1_8_mve(S,pSrc, pDst, blockSize);
-           return;
-       }
-    }
-
-    if (blockSize >= 12)
-    {
-        /*
-         * pState points to state array which contains previous frame (numTaps - 1) samples
-         * pStateCur points to the location where the new input data should be written
-         */
-        pStateCur   = &(pState[(numTaps - 1u)]);
-        pTempSrc    = pSrc;
-        pSamples    = pState;
-        pOutput     = pDst;
-        blkCnt      = blockSize >> 2;
-    
-        while (blkCnt > 0U)
-        {
-            const q15_t    *pCoeffsTmp = pCoeffs;
-            const q15_t    *pSamplesTmp = pSamples;
-    
-            acc0 = 0LL;
-            acc1 = 0LL;
-            acc2 = 0LL;
-            acc3 = 0LL;
-    
            /*
             * Save 8 input samples in the history buffer
             */
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 8;
            pTempSrc += 8;
-    
-            uint32_t       i = tapsBlkCnt;
-            while (i > 0U)
+
+            int       i = tapsBlkCnt;
+            while (i > 0)
            {
                /*
                 * load 8 coefs
                 */
                q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
+
                vecIn0 = vld1q(pSamplesTmp);
-                acc0 =  vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[1]);
-                acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[2]);
-                acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
-    
-                vecIn0 = vld1q(&pSamplesTmp[3]);
-                acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
-    
+                acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
                pSamplesTmp += 8;
                pCoeffsTmp += 8;
                /*
@ -312,197 +420,17 @@ void arm_fir_q15(
                 */
                i--;
            }
-    
+
            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
-            *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
-    
-            pSamples += 4;
-            /*
-             * Decrement the sample block loop counter
-             */
-            blkCnt--;
        }
-    
-        uint32_t  residual = blockSize & 3;
-        switch (residual)
-        {
-        case 3:
-            {
-                const q15_t    *pCoeffsTmp = pCoeffs;
-                const q15_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-                acc2 = 0LL;
-    
-                /*
-                 * Save 8 input samples in the history buffer
-                 */
-                *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
-                pStateCur += 8;
-                pTempSrc += 8;
-    
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    /*
-                     * load 8 coefs
-                     */
-                    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[2]);
-                    acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 8;
-                    pCoeffsTmp += 8;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-              
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
-            }
-            break;
-    
-        case 2:
-            {
-                const q15_t    *pCoeffsTmp = pCoeffs;
-                const q15_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-                acc1 = 0LL;
-                /*
-                 * Save 8 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 8;
-                pTempSrc += 8;
-    
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    /*
-                     * load 8 coefs
-                     */
-                    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                    vecIn0 = vld1q(&pSamplesTmp[1]);
-                    acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 8;
-                    pCoeffsTmp += 8;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
-            }
-            break;
-    
-        case 1:
-            {
-                const q15_t    *pCoeffsTmp = pCoeffs;
-                const q15_t    *pSamplesTmp = pSamples;
-    
-                acc0 = 0LL;
-    
-                /*
-                 * Save 8 input samples in the history buffer
-                 */
-                vst1q(pStateCur, vld1q(pTempSrc));
-                pStateCur += 8;
-                pTempSrc += 8;
-    
-                uint32_t       i = tapsBlkCnt;
-                while (i > 0U)
-                {
-                    /*
-                     * load 8 coefs
-                     */
-                    q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
-    
-                    vecIn0 = vld1q(pSamplesTmp);
-                    acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
-    
-                    pSamplesTmp += 8;
-                    pCoeffsTmp += 8;
-                    /*
-                     * Decrement the taps block loop counter
-                     */
-                    i--;
-                }
-    
-                *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
-            }
-            break;
-        }
-    }
-    else
-    {
-        q15_t *pStateCurnt;                            /* Points to the current sample of the state */
-            q15_t *px;                                     /* Temporary pointer for state buffer */
-      const q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
-            q63_t acc0;                                    /* Accumulator */
-            uint32_t  blkCnt,tapCnt;                    /* Loop counters */
-      pStateCurnt = &(S->pState[(numTaps - 1U)]);
-      blkCnt = blockSize;
-      while (blkCnt > 0U)
-      {
-        /* Copy two samples into state buffer */
-        *pStateCurnt++ = *pSrc++;
-    
-        /* Set the accumulator to zero */
-        acc0 = 0;
-    
-        /* Use SIMD to hold states and coefficients */
-        px = pState;
-        pb = pCoeffs;
-    
-        tapCnt = numTaps >> 1U;
-    
-        while (tapCnt > 0U)
-        {
-          acc0 += (q15_t) *px++ * *pb++;
-          acc0 += (q15_t) *px++ * *pb++;
-    
-          tapCnt--;
-        }
-        
-    
-        /* The result is in 2.30 format. Convert to 1.15 with saturation.
-           Then store the output in the destination buffer. */
-        *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
-    
-        /* Advance state pointer by 1 for the next sample */
-        pState = pState + 1U;
-    
-        /* Decrement loop counter */
-        blkCnt--;
-      }
+        break;
    }
+
    /*
     * Copy the samples back into the history buffer start
     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;

    blkCnt = numTaps >> 3;
    while (blkCnt > 0U)
--- a/Source/FilteringFunctions/arm_fir_q7.c
+++ b/Source/FilteringFunctions/arm_fir_q7.c
@ -56,7 +56,115 @@

 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

-void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t * pDst, uint32_t blockSize)
+#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)         \
+        /* Emits nbAcc Q7 outputs through pOutput (which is advanced); output k filters the window at pSample[k]. Uses the enclosing scope's vecIn0 as scratch. */ \
+        for (int outIdx = 0; outIdx < nbAcc; outIdx++) {                   \
+            const q7_t     *pWindow = &pSample[outIdx];                    \
+            q31_t           sum = 0;    /* 32-bit running dot product */   \
+            /* Walk the taps sixteen lanes at a time */                    \
+            for (int blk = 0; blk < nbVecTaps; blk++) {                    \
+                vecIn0 = vld1q(pWindow + 16 * blk);                        \
+                sum = vmladavaq(sum, vecIn0, vecCoeffs[blk]);              \
+            }                                                              \
+            *pOutput++ = (q7_t) __SSAT((sum >> 7U), 8);                    \
+        }
+
+#define FIR_Q7_MAIN_CORE() /* Shared body of the fixed-size Q7 FIR kernels; needs S, pSrc, pDst, blockSize and NBTAPS in scope */ \
+{                                                                                           \
+     q7_t          *pState = S->pState;     /* Start of the state (history) buffer */       \
+    const q7_t    *pCoeffs = S->pCoeffs;   /* Filter coefficients */                        \
+    q7_t          *pStateCur;              /* Where the next input samples are appended */  \
+    const q7_t    *pSamples;               /* Start of the window for the next output */    \
+    q7_t          *pOutput;                /* Write cursor into pDst */                     \
+    const q7_t    *pTempSrc;               /* Read cursor (input, then state copy-back) */  \
+    q7_t          *pTempDest;              /* Write cursor for the state copy-back */       \
+    uint32_t       numTaps = S->numTaps;   /* Number of filter coefficients in the filter */\
+    int32_t        blkCnt;                 /* Loop counter, reused by every loop below */   \
+    q7x16_t        vecIn0;                 /* Sample vector scratch used by FIR_Q7_CORE */  \
+    const int32_t  nbVecTaps = (NBTAPS / 16); /* Coefficient vectors of 16 lanes each */    \
+                                                                                            \
+    /* Preload all NBTAPS coefficients as nbVecTaps vectors, independently of numTaps.      \
+     * NOTE(review): presumably pCoeffs must be zero-padded up to NBTAPS entries - confirm. \
+     */                                                                                     \
+    q7x16_t         vecCoeffs[nbVecTaps];                                                   \
+                                                                                            \
+    for (int i = 0; i < nbVecTaps; i++)                                                     \
+        vecCoeffs[i] = vldrbq_s8(pCoeffs + 16 * i);                               \
+                                                                                            \
+    /*                                                                                      \
+     * pState holds the (numTaps - 1) samples kept from the previous frame;                 \
+     * pStateCur is the position right after them, where the new input is appended          \
+     */                                                                                     \
+    pStateCur = &(pState[(numTaps - 1u)]);                                                  \
+    pTempSrc = pSrc;                                                                        \
+    pSamples = pState;                                                                      \
+    pOutput = pDst;                                                                         \
+                                                                                            \
+    blkCnt = blockSize >> 2;               /* Number of full groups of 4 outputs */         \
+    while (blkCnt > 0) {                                                                   \
+        /*                                                                                  \
+         * Append 4 new input samples to the state buffer (both pointers advance by 4)      \
+         */                                                                                 \
+        vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc));                                        \
+        pStateCur += 4;                                                                     \
+        pTempSrc += 4;                                                                      \
+                                                                                            \
+        FIR_Q7_CORE(pOutput, 4, nbVecTaps, pSamples, vecCoeffs);  /* 4 outputs */           \
+        pSamples += 4;                                                                      \
+                                                                                            \
+        blkCnt--;                                                                           \
+    }                                                                                       \
+                                                                                            \
+    /* tail: the blockSize % 4 leftover samples, copied one at a time and then filtered */  \
+    int32_t        residual = blockSize & 3;                                               \
+                                                                                            \
+    for (int i = 0; i < residual; i++)                                                      \
+        *pStateCur++ = *pTempSrc++;                                                         \
+                                                                                            \
+    FIR_Q7_CORE(pOutput, residual, nbVecTaps, pSamples, vecCoeffs);                         \
+                                                                                            \
+                                                                                            \
+    /*                                                                                      \
+     * Move the (numTaps - 1) samples starting at pState[blockSize] back to the buffer start\
+     */                                                                                     \
+    pTempSrc = &pState[blockSize];                                                          \
+    pTempDest = pState;                                                                     \
+    blkCnt = numTaps - 1;                  /* Bytes still to copy back */                   \
+    do {                                                                                    \
+        mve_pred16_t    p = vctp8q(blkCnt); /* Tail predicate built from the remaining count */ \
+                                                                                            \
+        vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p);                                \
+        pTempSrc += 16;                                                           \
+        pTempDest += 16;                                                          \
+        blkCnt -= 16;                                                             \
+    }                                                                                       \
+    while (blkCnt > 0);                                                                     \
+}
+
+/* Q7 FIR kernel used for 17 to 32 taps: expands the shared core with NBTAPS = 32 (two coefficient vectors). */
+static void arm_fir_q7_17_32_mve(const arm_fir_instance_q7 * S, const q7_t * __restrict pSrc,
+                                 q7_t * __restrict pDst, uint32_t blockSize)
+{
+#define NBTAPS 32
+    FIR_Q7_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Q7 FIR kernel used for 1 to 16 taps: expands the shared core with NBTAPS = 16 (one coefficient vector). External linkage kept as is. */
+void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * __restrict pSrc,
+                         q7_t * __restrict pDst, uint32_t blockSize)
+{
+#define NBTAPS 16
+    FIR_Q7_MAIN_CORE();
+#undef NBTAPS
+}
+
+void arm_fir_q7(
+  const arm_fir_instance_q7 * S,
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
 {
    q7_t     *pState = S->pState;   /* State pointer */
    const q7_t     *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@ -68,9 +176,27 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
    uint32_t  blkCnt;
    q7x16_t  vecIn0;
+    uint32_t  tapsBlkCnt = (numTaps + 15) / 16;
    q31_t     acc0, acc1, acc2, acc3;
    q7x16_t  vecCoeffs;

+    if (numTaps <= 16)
+    {
+        /*
+         * [1 to 16 taps] specialized routine
+         */
+        arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 32)
+    {
+        /*
+         * [17 to 32 taps] specialized routine
+         */
+        arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+
    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
@ -82,12 +208,17 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
    blkCnt      = blockSize >> 2;

    /*
-     * load 16 coefs
+     * outer samples loop
     */
-    vecCoeffs = *(q7x16_t *) pCoeffs;
-
    while (blkCnt > 0U)
    {
+        const q7_t     *pCoeffsTmp = pCoeffs;
+        const q7_t     *pSamplesTmp = pSamples;
+
+        acc0 = 0;
+        acc1 = 0;
+        acc2 = 0;
+        acc3 = 0;
        /*
         * Save 16 input samples in the history buffer
         */
@ -95,18 +226,36 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
        pStateCur += 16;
        pTempSrc += 16;

-        vecIn0 = vld1q(pSamples);
-        acc0 = vmladavq(vecIn0, vecCoeffs);
+        /*
+         * inner coefficients loop
+         */
+        int       i = tapsBlkCnt;
+        while (i > 0)
+        {
+            /*
+             * load 16 coefs
+             */
+            vecCoeffs = *(q7x16_t *) pCoeffsTmp;

-        vecIn0 = vld1q(&pSamples[1]);;
-        acc1 = vmladavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(pSamplesTmp);
+            acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);

-        vecIn0 = vld1q(&pSamples[2]);;
-        acc2 = vmladavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[1]);
+            acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);

-        vecIn0 = vld1q(&pSamples[3]);
-        acc3 = vmladavq(vecIn0, vecCoeffs);
+            vecIn0 = vld1q(&pSamplesTmp[2]);
+            acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);

+            vecIn0 = vld1q(&pSamplesTmp[3]);
+            acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
+
+            pSamplesTmp += 16;
+            pCoeffsTmp += 16;
+            /*
+             * Decrement the taps block loop counter
+             */
+            i--;
+        }
        /*
         * Store the 1.7 format filter output in destination buffer
         */
@ -127,18 +276,37 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
    {
    case 3:
        {
+            const q7_t     *pCoeffsTmp = pCoeffs;
+            const q7_t     *pSamplesTmp = pSamples;
+
+            acc0 = 0;
+            acc1 = 0;
+            acc2 = 0;
+            /*
+             * Save 16 input samples in the history buffer
+             */
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 16;
            pTempSrc += 16;

-            vecIn0 = vld1q(pSamples);
-            acc0 = vmladavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;                      /* 16-coefficient blocks left */
+            while (i > 0)
+            {
+                vecCoeffs = *(q7x16_t *) pCoeffsTmp;
+
+                vecIn0 = vld1q(pSamplesTmp);                /* window of 1st residual output */
+                acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
+
+                vecIn0 = vld1q(&pSamplesTmp[1]);            /* 2nd output: window shifted by one sample */
+                acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);

-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vmladavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(&pSamplesTmp[2]);            /* 3rd output: window shifted by two samples */
+                acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);

-            vecIn0 = vld1q(&pSamples[2]);
-            acc2 = vmladavq(vecIn0, vecCoeffs);
+                pSamplesTmp += 16;                          /* next block of 16 samples / coefficients */
+                pCoeffsTmp += 16;
+                i--;
+            }

            *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
            *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
@ -148,15 +316,33 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t

    case 2:
        {
+            const q7_t     *pCoeffsTmp = pCoeffs;
+            const q7_t     *pSamplesTmp = pSamples;
+
+            acc0 = 0;
+            acc1 = 0;
+            /*
+             * Save 16 input samples in the history buffer
+             */
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 16;
            pTempSrc += 16;

-            vecIn0 = vld1q(pSamples);
-            acc0 = vmladavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;                      /* 16-coefficient blocks left */
+            while (i > 0)
+            {
+                vecCoeffs = *(q7x16_t *) pCoeffsTmp;

-            vecIn0 = vld1q(&pSamples[1]);
-            acc1 = vmladavq(vecIn0, vecCoeffs);
+                vecIn0 = vld1q(pSamplesTmp);                /* window of 1st residual output */
+                acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
+
+                vecIn0 = vld1q(&pSamplesTmp[1]);            /* 2nd output: window shifted by one sample */
+                acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
+
+                pSamplesTmp += 16;                          /* next block of 16 samples / coefficients */
+                pCoeffsTmp += 16;
+                i--;
+            }

            *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
            *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
@ -165,13 +351,29 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t

    case 1:
        {
+            const q7_t     *pCoeffsTmp = pCoeffs;
+            const q7_t     *pSamplesTmp = pSamples;
+
+            acc0 = 0;
+            /*
+             * Save 16 input samples in the history buffer
+             */
            vst1q(pStateCur, vld1q(pTempSrc));
            pStateCur += 16;
            pTempSrc += 16;

-            vecIn0 = vld1q(pSamples);
-            acc0 = vmladavq(vecIn0, vecCoeffs);
+            int       i = tapsBlkCnt;
+            while (i > 0)
+            {
+                vecCoeffs = *(q7x16_t *) pCoeffsTmp;
+
+                vecIn0 = vld1q(pSamplesTmp);
+                acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);

+                pSamplesTmp += 16;
+                pCoeffsTmp += 16;
+                i--;
+            }
            *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
        }
        break;
@ -198,288 +400,6 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
        vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
    }
 }
-
-void arm_fir_q7(
-  const arm_fir_instance_q7 * S,
-  const q7_t * pSrc,
-        q7_t * pDst,
-        uint32_t blockSize)
-{
-    q7_t     *pState = S->pState;   /* State pointer */
-    const q7_t     *pCoeffs = S->pCoeffs; /* Coefficient pointer */
-    q7_t     *pStateCur;        /* Points to the current sample of the state */
-    const q7_t     *pSamples;         /* Temporary pointer to the sample buffer */
-    q7_t     *pOutput;          /* Temporary pointer to the output buffer */
-    const q7_t     *pTempSrc;         /* Temporary pointer to the source data */
-    q7_t     *pTempDest;        /* Temporary pointer to the destination buffer */
-    uint32_t  numTaps = S->numTaps; /* Number of filter coefficients in the filter */
-    uint32_t  blkCnt;
-    q7x16_t  vecIn0;
-    uint32_t  tapsBlkCnt = (numTaps + 15) / 16;
-    q31_t     acc0, acc1, acc2, acc3;
-    q7x16_t  vecCoeffs;
-
-    if (blockSize >= 20)
-    {
-        if (numTaps <= 16)
-        {
-            /*
-             * [1 to 16 taps] specialized routine
-             */
-            arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
-            return;
-        }
-    }
-
-    if (blockSize >= 20)
-    {
-      /*
-       * pState points to state array which contains previous frame (numTaps - 1) samples
-       * pStateCur points to the location where the new input data should be written
-       */
-      pStateCur   = &(pState[(numTaps - 1u)]);
-      pSamples    = pState;
-      pTempSrc    = pSrc;
-      pOutput     = pDst;
-      blkCnt      = blockSize >> 2;
-  
-      /*
-       * outer samples loop
-       */
-      while (blkCnt > 0U)
-      {
-          const q7_t     *pCoeffsTmp = pCoeffs;
-          const q7_t     *pSamplesTmp = pSamples;
-  
-          acc0 = 0;
-          acc1 = 0;
-          acc2 = 0;
-          acc3 = 0;
-          /*
-           * Save 16 input samples in the history buffer
-           */
-          vst1q(pStateCur, vld1q(pTempSrc));
-          pStateCur += 16;
-          pTempSrc += 16;
-  
-          /*
-           * inner coefficients loop
-           */
-          uint32_t       i = tapsBlkCnt;
-          while (i > 0U)
-          {
-              /*
-               * load 16 coefs
-               */
-              vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-              vecIn0 = vld1q(pSamplesTmp);
-              acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-              vecIn0 = vld1q(&pSamplesTmp[1]);
-              acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
-  
-              vecIn0 = vld1q(&pSamplesTmp[2]);
-              acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
-  
-              vecIn0 = vld1q(&pSamplesTmp[3]);
-              acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
-  
-              pSamplesTmp += 16;
-              pCoeffsTmp += 16;
-              /*
-               * Decrement the taps block loop counter
-               */
-              i--;
-          }
-          /*
-           * Store the 1.7 format filter output in destination buffer
-           */
-          *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-          *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
-          *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
-          *pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8);
-  
-          pSamples += 4;
-          /*
-           * Decrement the sample block loop counter
-           */
-          blkCnt--;
-      }
-  
-      uint32_t  residual = blockSize & 3;
-      switch (residual)
-      {
-      case 3:
-          {
-              const q7_t     *pCoeffsTmp = pCoeffs;
-              const q7_t     *pSamplesTmp = pSamples;
-  
-              acc0 = 0;
-              acc1 = 0;
-              acc2 = 0;
-              /*
-               * Save 16 input samples in the history buffer
-               */
-              vst1q(pStateCur, vld1q(pTempSrc));
-              pStateCur += 16;
-              pTempSrc += 16;
-  
-              uint32_t       i = tapsBlkCnt;
-              while (i > 0U)
-              {
-                  vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-                  vecIn0 = vld1q(pSamplesTmp);
-                  acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-                  vecIn0 = vld1q(&pSamplesTmp[1]);
-                  acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
-  
-                  vecIn0 = vld1q(&pSamplesTmp[2]);
-                  acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
-  
-                  pSamplesTmp += 16;
-                  pCoeffsTmp += 16;
-                  i--;
-              }
-  
-              *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-              *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
-              *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
-          }
-          break;
-  
-      case 2:
-          {
-              const q7_t     *pCoeffsTmp = pCoeffs;
-              const q7_t     *pSamplesTmp = pSamples;
-  
-              acc0 = 0;
-              acc1 = 0;
-              /*
-               * Save 16 input samples in the history buffer
-               */
-              vst1q(pStateCur, vld1q(pTempSrc));
-              pStateCur += 16;
-              pTempSrc += 16;
-  
-              uint32_t       i = tapsBlkCnt;
-              while (i > 0U)
-              {
-                  vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-                  vecIn0 = vld1q(pSamplesTmp);
-                  acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-                  vecIn0 = vld1q(&pSamplesTmp[1]);
-                  acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
-  
-                  pSamplesTmp += 16;
-                  pCoeffsTmp += 16;
-                  i--;
-              }
-  
-              *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-              *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
-          }
-          break;
-  
-      case 1:
-          {
-              const q7_t     *pCoeffsTmp = pCoeffs;
-              const q7_t     *pSamplesTmp = pSamples;
-  
-              acc0 = 0;
-              /*
-               * Save 16 input samples in the history buffer
-               */
-              vst1q(pStateCur, vld1q(pTempSrc));
-              pStateCur += 16;
-              pTempSrc += 16;
-  
-              uint32_t       i = tapsBlkCnt;
-              while (i > 0U)
-              {
-                  vecCoeffs = *(q7x16_t *) pCoeffsTmp;
-  
-                  vecIn0 = vld1q(pSamplesTmp);
-                  acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
-  
-                  pSamplesTmp += 16;
-                  pCoeffsTmp += 16;
-                  i--;
-              }
-              *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
-          }
-          break;
-      }
-    }
-    else
-    {
-        q7_t *pStateCurnt;                            /* Points to the current sample of the state */
-            q7_t *px;                                     /* Temporary pointer for state buffer */
-      const q7_t *pb;                                     /* Temporary pointer for coefficient buffer */
-            q31_t acc0;                                    /* Accumulator */
-            uint32_t  i,blkCnt;                    /* Loop counters */
-      pStateCurnt = &(S->pState[(numTaps - 1U)]);
-      blkCnt = blockSize;
-
-         while (blkCnt > 0U)
-           {
-             /* Copy one sample at a time into state buffer */
-             *pStateCurnt++ = *pSrc++;
-         
-             /* Set the accumulator to zero */
-             acc0 = 0;
-         
-             /* Initialize state pointer */
-             px = pState;
-         
-             /* Initialize Coefficient pointer */
-             pb = pCoeffs;
-         
-             i = numTaps;
-         
-             /* Perform the multiply-accumulates */
-             while (i > 0U)
-             {
-               acc0 += (q15_t) * (px++) * (*(pb++));
-               i--;
-             } 
-         
-             /* The result is in 2.14 format. Convert to 1.7
-                Then store the output in the destination buffer. */
-             *pDst++ = __SSAT((acc0 >> 7U), 8);
-         
-             /* Advance state pointer by 1 for the next sample */
-             pState = pState + 1U;
-         
-             /* Decrement loop counter */
-             blkCnt--;
-           }
-    }
-    /*
-     * Copy the samples back into the history buffer start
-     */
-    pTempSrc = &S->pState[blockSize];
-    pTempDest = S->pState;
-
-    blkCnt = numTaps >> 4;
-    while (blkCnt > 0U)
-    {
-        vst1q(pTempDest, vld1q(pTempSrc));
-        pTempSrc += 16;
-        pTempDest += 16;
-        blkCnt--;
-    }
-    blkCnt = numTaps & 0xF;
-    if (blkCnt > 0U)
-    {
-        mve_pred16_t p0 = vctp8q(blkCnt);
-        vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
-    }
-}
 #else
 void arm_fir_q7(
  const arm_fir_instance_q7 * S,
--- a/Testing/CMakeLists.txt
+++ b/Testing/CMakeLists.txt
@ -168,6 +168,7 @@ set (NNSRC
   Source/Benchmarks/FIRF32.cpp
   Source/Benchmarks/FIRQ31.cpp
   Source/Benchmarks/FIRQ15.cpp
+   Source/Benchmarks/FIRQ7.cpp
   Source/Benchmarks/MISCF32.cpp
   Source/Benchmarks/MISCQ31.cpp
   Source/Benchmarks/MISCQ15.cpp
--- a/Testing/Include/Benchmarks/FIRQ7.h
+++ b/Testing/Include/Benchmarks/FIRQ7.h
@ -0,0 +1,33 @@
+#include "Test.h"
+#include "Pattern.h"
+
+#include "dsp/filtering_functions.h"
+
+class FIRQ7:public Client::Suite   /* benchmark suite driving arm_fir_q7 */
+    {
+        public:
+            FIRQ7(Testing::testID_t id);
+            virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
+            virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
+        private:
+            #include "FIRQ7_decl.h"
+            Client::Pattern<q7_t> coefs;      /* filter coefficients, reloaded in setUp */
+            Client::Pattern<q7_t> samples;    /* input samples, reloaded in setUp */
+            Client::Pattern<q7_t> refs;       /* NOTE(review): not referenced by FIRQ7.cpp */
+
+            Client::LocalPattern<q7_t> output;   /* created in setUp with nbSamples entries */
+            Client::LocalPattern<q7_t> error;    /* NOTE(review): not referenced by FIRQ7.cpp */
+            Client::LocalPattern<q7_t> state;    /* created in setUp with nbSamples + nbTaps - 1 entries */
+
+            int nbTaps;       /* first benchmark parameter */
+            int nbSamples;    /* second benchmark parameter; block size passed to arm_fir_q7 */
+
+            arm_fir_instance_q7  instFir;   /* initialised by arm_fir_init_q7 in setUp */
+
+            const q7_t *pSrc;     /* input pointer handed to arm_fir_q7 */
+            const q7_t *pCoefs;   /* coefficient pointer selected in setUp */
+            q7_t *pDst;           /* output pointer handed to arm_fir_q7 */
+            const q7_t *pRef;     /* NOTE(review): never assigned in FIRQ7.cpp */
+            q7_t *pErr;           /* NOTE(review): never assigned in FIRQ7.cpp */
+            
+    };
--- a/Testing/Source/Benchmarks/FIRQ15.cpp
+++ b/Testing/Source/Benchmarks/FIRQ15.cpp
@ -1,6 +1,9 @@
 #include "FIRQ15.h"
 #include "Error.h"

+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) q15_t coeffArray[64];
+#endif 
   
    void FIRQ15::test_fir_q15()
    {
@ -35,10 +38,21 @@
       switch(id)
       {
           case TEST_FIR_Q15_1:
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+              /* Copy coefficients and pad to zero 
+              */
+              memset(coeffArray,0,32*sizeof(q15_t));
+              q15_t *ptr;
+
+              ptr=coefs.ptr();
+              memcpy(coeffArray,ptr,this->nbTaps*sizeof(q15_t));
+              this->pCoefs = coeffArray;
+#else
+              this->pCoefs=coefs.ptr();
+#endif
              arm_fir_init_q15(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);

              this->pSrc=samples.ptr();
-              this->pCoefs=coefs.ptr();
              this->pDst=output.ptr();
           break;

--- a/Testing/Source/Benchmarks/FIRQ31.cpp
+++ b/Testing/Source/Benchmarks/FIRQ31.cpp
@ -1,7 +1,7 @@
 #include "FIRQ31.h"
 #include "Error.h"

-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 static __ALIGNED(8) q31_t coeffArray[64];
 #endif 
   
@ -39,7 +39,7 @@ static __ALIGNED(8) q31_t coeffArray[64];
       switch(id)
       {
           case TEST_FIR_Q31_1:
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
              /* Copy coefficients and pad to zero 
              */
              memset(coeffArray,0,32*sizeof(q31_t));
--- a/Testing/Source/Benchmarks/FIRQ7.cpp
+++ b/Testing/Source/Benchmarks/FIRQ7.cpp
@ -0,0 +1,60 @@
+#include "FIRQ7.h"
+#include "Error.h"
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) q7_t coeffArray[64];
+#endif 
+   
+    void FIRQ7::test_fir_q7()   /* benchmarked body: one arm_fir_q7 call over nbSamples */
+    {
+       arm_fir_q7(&instFir, this->pSrc, this->pDst, this->nbSamples);   /* pointers and instance come from setUp */
+    } 
+
+   
+
+    
+    void FIRQ7::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
+    {
+       /* Benchmark fixture: params holds the tap count followed by the
+          sample count; patterns and buffers below are sized from them. */
+       std::vector<Testing::param_t>::iterator it = params.begin();
+       this->nbTaps = *it++;
+       this->nbSamples = *it;
+
+       samples.reload(FIRQ7::SAMPLES1_Q7_ID,mgr,this->nbSamples);
+       coefs.reload(FIRQ7::COEFS1_Q7_ID,mgr,this->nbTaps);
+
+       state.create(this->nbSamples + this->nbTaps - 1,FIRQ7::STATE_Q7_ID,mgr);
+       output.create(this->nbSamples,FIRQ7::OUT_SAMPLES_Q7_ID,mgr);
+
+       switch(id)
+       {
+           case TEST_FIR_Q7_1:
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+              /* Copy the coefficients into the zero-padded scratch array. Clear
+                 the whole array so taps from an earlier, longer run cannot remain. */
+              memset(coeffArray,0,sizeof(coeffArray));
+              q7_t *ptr;
+
+              ptr=coefs.ptr();
+              memcpy(coeffArray,ptr,this->nbTaps*sizeof(q7_t));
+              this->pCoefs = coeffArray;
+#else
+              this->pCoefs=coefs.ptr();
+#endif
+              /* Use pCoefs so the padded copy (when built) is what the filter reads */
+              arm_fir_init_q7(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
+              this->pSrc=samples.ptr();
+              this->pDst=output.ptr();
+           break;
+
+
+
+
+       }
+
+    }
+
+    void FIRQ7::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
+    {   /* intentionally empty: no per-test cleanup is done here; id and mgr are unused */
+    }
--- a/Testing/Source/Tests/FIRQ15.cpp
+++ b/Testing/Source/Tests/FIRQ15.cpp
@ -130,10 +130,10 @@ void checkInnerTail(q15_t *b)
       ref.reload(FIRQ15::FIRREFS_Q15_ID,mgr);

       output.create(ref.nbSamples(),FIRQ15::OUT_Q15_ID,mgr);
-       /* Max blockSize + numTaps as generated by Python script 
+       /* > Max blockSize + numTaps as generated by Python script 
          numTaps may be increased by 1 by Python script to force it to even values
          */
-       state.create(41,FIRQ15::OUT_Q15_ID,mgr);
+       state.create(3 * 41,FIRQ15::OUT_Q15_ID,mgr);
    }

    void FIRQ15::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
--- a/Testing/bench.txt
+++ b/Testing/bench.txt
@ -499,6 +499,35 @@ group Root {
                   Normalized LMS Filter:test_lms_norm_q15
                } -> PARAM1_ID
             }
+
+             suite FIR Q7 {
+                class = FIRQ7
+                folder = FIRQ7
+
+                ParamList {
+                  NumTaps, NB
+                  Summary NumTaps, NB
+                  Names "Number of taps","Number of samples"
+                  Formula "NumTaps * NB"
+                }
+
+                Pattern SAMPLES1_Q7_ID : Samples1_q7.txt 
+                Pattern REFS1_Q7_ID : Refs1_q7.txt 
+                Pattern COEFS1_Q7_ID : Coefs1_q7.txt 
+
+                Output  OUT_SAMPLES_Q7_ID : Output
+                Output  STATE_Q7_ID : State
+                Output  ERR_Q7_ID : Err
+
+                Params PARAM1_ID = {
+                  NumTaps = [16,32,64]
+                  NB = [64,128,256]
+                }
+
+                Functions {
+                   FIR Filter:test_fir_q7
+                } -> PARAM1_ID
+             }
           }

           group Convolutions / Correlations {