diff --git a/Source/FilteringFunctions/arm_fir_init_f16.c b/Source/FilteringFunctions/arm_fir_init_f16.c
index 35df2738..4b1ec39d 100755
--- a/Source/FilteringFunctions/arm_fir_init_f16.c
+++ b/Source/FilteringFunctions/arm_fir_init_f16.c
@@ -54,10 +54,10 @@
                    <code>pState</code> points to the array of state variables.
                    <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
   @par          Initialization of Helium version
-                 For Helium version the array of coefficients must be a multiple of 16 even if less
-                 then 16 coefficients are used. The additional coefficients must be set to 0.
-                 It does not mean that all the coefficients will be used in the filter (numTaps
-                 is still set to its right value in the init function.) It just means that
+                 For Helium version the length of the array of coefficients must be a multiple of 4 (4a) even if fewer
+                 than 4a coefficients are defined in the FIR. The additional coefficients
+                 (4a - numTaps) must be set to 0.
+                 numTaps is still set to its actual value in the init function. The padding only means that
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
diff --git a/Source/FilteringFunctions/arm_fir_init_f32.c b/Source/FilteringFunctions/arm_fir_init_f32.c
index 2ffa8273..e55c402a 100644
--- a/Source/FilteringFunctions/arm_fir_init_f32.c
+++ b/Source/FilteringFunctions/arm_fir_init_f32.c
@@ -55,10 +55,10 @@
                    <code>pState</code> points to the array of state variables and some working memory for the Helium version.
                    <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
   @par          Initialization of Helium version
-                 For Helium version the array of coefficients must be a multiple of 16 even if less
-                 then 16 coefficients are used. The additional coefficients must be set to 0.
-                 It does not mean that all the coefficients will be used in the filter (numTaps
-                 is still set to its right value in the init function.) It just means that
+                 For Helium version the length of the array of coefficients must be a multiple of 4 (4a) even if fewer
+                 than 4a coefficients are defined in the FIR. The additional coefficients
+                 (4a - numTaps) must be set to 0.
+                 numTaps is still set to its actual value in the init function. The padding only means that
                  the implementation may require to read more coefficients due to the vectorization and
                  to avoid having to manage too many different cases in the code.
 
diff --git a/Source/FilteringFunctions/arm_fir_init_q15.c b/Source/FilteringFunctions/arm_fir_init_q15.c
index ab14f7f4..793ca755 100644
--- a/Source/FilteringFunctions/arm_fir_init_q15.c
+++ b/Source/FilteringFunctions/arm_fir_init_q15.c
@@ -73,6 +73,14 @@
   </pre>
                    <code>pState</code> points to the array of state variables.
                    <code>pState</code> is of length <code>numTaps+blockSize</code>, when running on Cortex-M4 and Cortex-M3  and is of length <code>numTaps+blockSize-1</code>, when running on Cortex-M0 where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q15()</code>.
+ 
+  @par          Initialization of Helium version
+                   For Helium version the length of the array of coefficients must be a multiple of 8 (8a) even if fewer
+                   than 8a coefficients are defined in the FIR. The additional coefficients
+                   (8a - numTaps) must be set to 0.
+                   numTaps is still set to its actual value in the init function. The padding only means that
+                   the implementation may need to read more coefficients due to the vectorization and
+                   to avoid having to manage too many different cases in the code.
  */
 
 arm_status arm_fir_init_q15(
diff --git a/Source/FilteringFunctions/arm_fir_init_q31.c b/Source/FilteringFunctions/arm_fir_init_q31.c
index e491437e..f4285b44 100644
--- a/Source/FilteringFunctions/arm_fir_init_q31.c
+++ b/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -55,10 +55,10 @@
                    <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
 
    @par          Initialization of Helium version
-                   For Helium version the array of coefficients must be a multiple of 16 even if less
-                   then 16 coefficients are used. The additional coefficients must be set to 0.
-                   It does not mean that all the coefficients will be used in the filter (numTaps
-                   is still set to its right value in the init function.) It just means that
+                   For Helium version the length of the array of coefficients must be a multiple of 4 (4a) even if fewer
+                   than 4a coefficients are defined in the FIR. The additional coefficients
+                   (4a - numTaps) must be set to 0.
+                   numTaps is still set to its actual value in the init function. The padding only means that
                    the implementation may require to read more coefficients due to the vectorization and
                    to avoid having to manage too many different cases in the code.
   
diff --git a/Source/FilteringFunctions/arm_fir_init_q7.c b/Source/FilteringFunctions/arm_fir_init_q7.c
index 3ae5be23..7c88def2 100644
--- a/Source/FilteringFunctions/arm_fir_init_q7.c
+++ b/Source/FilteringFunctions/arm_fir_init_q7.c
@@ -54,6 +54,15 @@
   @par
                    <code>pState</code> points to the array of state variables.
                    <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q7()</code>.
+  
+  @par          Initialization of Helium version
+                   For Helium version the length of the array of coefficients must be a multiple of 16 (16a) even if fewer
+                   than 16a coefficients are defined in the FIR. The additional coefficients
+                   (16a - numTaps) must be set to 0.
+                   numTaps is still set to its actual value in the init function. The padding only means that
+                   the implementation may need to read more coefficients due to the vectorization and
+                   to avoid having to manage too many different cases in the code.
+
  */
 
 void arm_fir_init_q7(
diff --git a/Testing/Source/Tests/FIRF16.cpp b/Testing/Source/Tests/FIRF16.cpp
index a3a847fd..a6652593 100755
--- a/Testing/Source/Tests/FIRF16.cpp
+++ b/Testing/Source/Tests/FIRF16.cpp
@@ -24,6 +24,9 @@ static void checkInnerTail(float16_t *b)
     ASSERT_TRUE(b[3] == 0.0f);
 }
 
+// Coefficients must be padded to a multiple of 4 (1 << FIRCOEFPADDING)
+#define FIRCOEFPADDING 2
+
     void FIRF16::test_fir_f16()
     {
         
@@ -42,6 +45,7 @@ static void checkInnerTail(float16_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int round;
 
         
 
@@ -59,9 +63,20 @@ static void checkInnerTail(float16_t *b)
            numTaps = configp[1];
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-           /* Copy coefficients and pad to zero 
+            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32*sizeof(float16_t));
+           memset(coeffArray,127,32*sizeof(float16_t));
+           round = numTaps >> FIRCOEFPADDING;
+           if ((round << FIRCOEFPADDING) < numTaps)
+           {
+             round ++;
+           }
+           round = round<<FIRCOEFPADDING;
+           memset(coeffArray,0,round*sizeof(float16_t));
+
+           //printf("blockSize=%d, numTaps=%d, round=%d (%d)\n",blockSize,numTaps,round,round - numTaps);
+
+
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRF32.cpp b/Testing/Source/Tests/FIRF32.cpp
index 8db410e3..90e3cb1b 100644
--- a/Testing/Source/Tests/FIRF32.cpp
+++ b/Testing/Source/Tests/FIRF32.cpp
@@ -24,6 +24,8 @@ static void checkInnerTail(float32_t *b)
     ASSERT_TRUE(b[3] == 0.0f);
 }
 
+// Coefficients must be padded to a multiple of 4 (1 << FIRCOEFPADDING)
+#define FIRCOEFPADDING 2
 
     void FIRF32::test_fir_f32()
     {
@@ -44,6 +46,7 @@ static void checkInnerTail(float32_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int round;
         int nb=0;
 
         
@@ -61,12 +64,24 @@ static void checkInnerTail(float32_t *b)
            blockSize = configp[0];
            numTaps = configp[1];
 
+
            nb += 2*blockSize;
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32*sizeof(float32_t));
+           memset(coeffArray,127,32*sizeof(float32_t));
+           round = numTaps >> FIRCOEFPADDING;
+           if ((round << FIRCOEFPADDING) < numTaps)
+           {
+             round ++;
+           }
+           round = round<<FIRCOEFPADDING;
+           memset(coeffArray,0,round*sizeof(float32_t));
+
+           //printf("blockSize=%d, numTaps=%d, round=%d (%d)\n",blockSize,numTaps,round,round - numTaps);
+
+
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRQ15.cpp b/Testing/Source/Tests/FIRQ15.cpp
index 26123069..f4db51ea 100644
--- a/Testing/Source/Tests/FIRQ15.cpp
+++ b/Testing/Source/Tests/FIRQ15.cpp
@@ -18,6 +18,9 @@ static void checkInnerTail(q15_t *b)
     ASSERT_TRUE(b[3] == 0);
 }
 
+// Coefficients must be padded to a multiple of 8 (1 << FIRCOEFPADDING)
+#define FIRCOEFPADDING 3
+
     void FIRQ15::test_fir_q15()
     {
         
@@ -36,6 +39,7 @@ static void checkInnerTail(q15_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int round;
 
         /*
 
@@ -51,9 +55,19 @@ static void checkInnerTail(q15_t *b)
            numTaps = configp[1];
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           /* Copy coefficients and pad to zero 
+            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32*sizeof(q15_t));
+           memset(coeffArray,127,32*sizeof(q15_t));
+           round = numTaps >> FIRCOEFPADDING;
+           if ((round << FIRCOEFPADDING) < numTaps)
+           {
+             round ++;
+           }
+           round = round<<FIRCOEFPADDING;
+           memset(coeffArray,0,round*sizeof(q15_t));
+
+           //printf("blockSize=%d, numTaps=%d, round=%d (%d)\n",blockSize,numTaps,round,round - numTaps);
+
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp
index 0070c671..695e43b4 100644
--- a/Testing/Source/Tests/FIRQ31.cpp
+++ b/Testing/Source/Tests/FIRQ31.cpp
@@ -18,6 +18,8 @@ static void checkInnerTail(q31_t *b)
     ASSERT_TRUE(b[3] == 0);
 }
 
+// Coefficients must be padded to a multiple of 4 (1 << FIRCOEFPADDING)
+#define FIRCOEFPADDING 2
 
     void FIRQ31::test_fir_q31()
     {
@@ -37,6 +39,7 @@ static void checkInnerTail(q31_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int round;
         int nb=1;
 
         /*
@@ -55,7 +58,18 @@ static void checkInnerTail(q31_t *b)
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32*sizeof(q31_t));
+           memset(coeffArray,127,32*sizeof(q31_t));
+           round = numTaps >> FIRCOEFPADDING;
+           if ((round << FIRCOEFPADDING) < numTaps)
+           {
+             round ++;
+           }
+           round = round<<FIRCOEFPADDING;
+           memset(coeffArray,0,round*sizeof(q31_t));
+
+           //printf("blockSize=%d, numTaps=%d, round=%d (%d)\n",blockSize,numTaps,round,round - numTaps);
+
+
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];
diff --git a/Testing/Source/Tests/FIRQ7.cpp b/Testing/Source/Tests/FIRQ7.cpp
index 11d6ec7e..69423504 100644
--- a/Testing/Source/Tests/FIRQ7.cpp
+++ b/Testing/Source/Tests/FIRQ7.cpp
@@ -18,6 +18,9 @@ static void checkInnerTail(q7_t *b)
     ASSERT_TRUE(b[3] == 0);
 }
 
+// Coefficients must be padded to a multiple of 16 (1 << FIRCOEFPADDING)
+#define FIRCOEFPADDING 4
+
     void FIRQ7::test_fir_q7()
     {
         
@@ -36,6 +39,7 @@ static void checkInnerTail(q7_t *b)
 #endif
         int blockSize;
         int numTaps;
+        int round;
 
         /*
 
@@ -51,9 +55,20 @@ static void checkInnerTail(q7_t *b)
            numTaps = configp[1];
 
 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
-           /* Copy coefficients and pad to zero 
+            /* Copy coefficients and pad to zero 
            */
-           memset(coeffArray,0,32*sizeof(q7_t));
+           memset(coeffArray,127,32*sizeof(q7_t));
+           round = numTaps >> FIRCOEFPADDING;
+           if ((round << FIRCOEFPADDING) < numTaps)
+           {
+             round ++;
+           }
+           round = round<<FIRCOEFPADDING;
+           memset(coeffArray,0,round*sizeof(q7_t));
+
+           //printf("blockSize=%d, numTaps=%d, round=%d (%d)\n",blockSize,numTaps,round,round - numTaps);
+
+
            for(j=0;j < numTaps; j++)
            {
               coeffArray[j] = orgcoefsp[j];