diff --git a/Include/dsp/filtering_functions.h b/Include/dsp/filtering_functions.h
index ef7f2dd5..0d2f9ce8 100755
--- a/Include/dsp/filtering_functions.h
+++ b/Include/dsp/filtering_functions.h
@@ -1173,10 +1173,17 @@ arm_status arm_fir_decimate_init_f32(
 
 
 #if defined(ARM_MATH_NEON) 
+/**
+  @brief         Compute new coefficient arrays for use in vectorized filter (Neon only).
+  @param[in]     numStages         number of 2nd order stages in the filter.
+  @param[in]     pCoeffs           points to the original filter coefficients.
+  @param[out]    pComputedCoeffs   points to the new computed coefficients for the vectorized version.
+  @return        none
+*/
 void arm_biquad_cascade_df2T_compute_coefs_f32(
-  arm_biquad_cascade_df2T_instance_f32 * S,
   uint8_t numStages,
-  const float32_t * pCoeffs);
+  const float32_t * pCoeffs,
+  float32_t * pComputedCoeffs);
 #endif
   /**
    * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
index 27d1eef8..e3b350c3 100644
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
@@ -37,78 +37,32 @@
   @{
  */
 
-/**
-  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
-  @param[in,out] S           points to an instance of the filter data structure.
-  @param[in]     numStages   number of 2nd order stages in the filter.
-  @param[in]     pCoeffs     points to the filter coefficients.
-  @param[in]     pState      points to the state buffer.
-  @return        none
 
-  @par           Coefficient and State Ordering
-                   The coefficients are stored in the array <code>pCoeffs</code> in the following order
-                   in the not Neon version.
-  <pre>
-      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
-  </pre>
-                   
-  @par
-                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
-                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
-                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
-
-                   For Neon version, this array is bigger. If numstages = 4x + y, then the array has size:
-                   32*x + 5*y
-                   and it must be initialized using the function
-                   arm_biquad_cascade_df2T_compute_coefs_f32 which is taking the
-                   standard array coefficient as parameters.
-
-                   But, an array of 8*numstages is a good approximation.
-
-                   Then, the initialization can be done with:
-  <pre>
-                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, neonCoefs, stateNeon);
-                   arm_biquad_cascade_df2T_compute_coefs_f32(&SNeon,nbCascade,coefs);
-  </pre>
-
-  @par             In this example, neonCoefs is a bigger array of size 8 * numStages.
-                   coefs is the standard array:
-
-  <pre>
-      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
-  </pre>
-
-
-  @par
-                   The <code>pState</code> is a pointer to state array.
-                   Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
-                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
-                   The state array has a total length of <code>2*numStages</code> values.
-                   The state variables are updated after each block of data is processed; the coefficients are untouched.
- */
 
 #if defined(ARM_MATH_NEON) 
-/*
+/**
+  @brief         Compute new coefficient arrays for use in vectorized filter (Neon only).
+  @param[in]     numStages         number of 2nd order stages in the filter.
+  @param[in]     pCoeffs           points to the original filter coefficients.
+  @param[out]    pComputedCoeffs   points to the new computed coefficients for the vectorized Neon version.
+  @return        none
+
+  @par   Size of coefficient arrays:
+            pCoeffs has size 5 * numStages 
 
-Must be called after initializing the biquad instance.
-pCoeffs has size 5 * nbCascade
-Whereas the pCoeffs for the init has size (4*4 + 4*4)* nbCascade 
+            pComputedCoeffs has size 8 * numStages
 
-So this pCoeffs is the one which would be used for the not Neon version.
-The pCoeffs passed in init is bigger than the one for the not Neon version.
+            pComputedCoeffs is the array to be used in arm_biquad_cascade_df2T_init_f32.
 
 */
 void arm_biquad_cascade_df2T_compute_coefs_f32(
-  arm_biquad_cascade_df2T_instance_f32 * S,
   uint8_t numStages,
-  const float32_t * pCoeffs)
+  const float32_t * pCoeffs,
+  float32_t * pComputedCoeffs)
 {
    uint8_t cnt;
-   float32_t *pDstCoeffs;
    float32_t b0[4],b1[4],b2[4],a1[4],a2[4];
 
-   pDstCoeffs = (float32_t*)S->pCoeffs;
-
    cnt = numStages >> 2; 
    while(cnt > 0)
    {
@@ -123,52 +77,52 @@ void arm_biquad_cascade_df2T_compute_coefs_f32(
       }
 
       /* Vec 1 */
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = b0[1];
-      *pDstCoeffs++ = b0[2];
-      *pDstCoeffs++ = b0[3];
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = b0[1];
+      *pComputedCoeffs++ = b0[2];
+      *pComputedCoeffs++ = b0[3];
 
       /* Vec 2 */
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = b0[1] * b0[2];
-      *pDstCoeffs++ = b0[2] * b0[3];
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = b0[1] * b0[2];
+      *pComputedCoeffs++ = b0[2] * b0[3];
 
       /* Vec 3 */
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = b0[1] * b0[2] * b0[3];
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = b0[1] * b0[2] * b0[3];
       
       /* Vec 4 */
-      *pDstCoeffs++ = b0[0];
-      *pDstCoeffs++ = b0[0] * b0[1];
-      *pDstCoeffs++ = b0[0] * b0[1] * b0[2];
-      *pDstCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3];
+      *pComputedCoeffs++ = b0[0];
+      *pComputedCoeffs++ = b0[0] * b0[1];
+      *pComputedCoeffs++ = b0[0] * b0[1] * b0[2];
+      *pComputedCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3];
 
       /* Vec 5 */
-      *pDstCoeffs++ = b1[0];
-      *pDstCoeffs++ = b1[1];
-      *pDstCoeffs++ = b1[2];
-      *pDstCoeffs++ = b1[3];
+      *pComputedCoeffs++ = b1[0];
+      *pComputedCoeffs++ = b1[1];
+      *pComputedCoeffs++ = b1[2];
+      *pComputedCoeffs++ = b1[3];
 
       /* Vec 6 */
-      *pDstCoeffs++ = b2[0];
-      *pDstCoeffs++ = b2[1];
-      *pDstCoeffs++ = b2[2];
-      *pDstCoeffs++ = b2[3];
+      *pComputedCoeffs++ = b2[0];
+      *pComputedCoeffs++ = b2[1];
+      *pComputedCoeffs++ = b2[2];
+      *pComputedCoeffs++ = b2[3];
 
       /* Vec 7 */
-      *pDstCoeffs++ = a1[0];
-      *pDstCoeffs++ = a1[1];
-      *pDstCoeffs++ = a1[2];
-      *pDstCoeffs++ = a1[3];
+      *pComputedCoeffs++ = a1[0];
+      *pComputedCoeffs++ = a1[1];
+      *pComputedCoeffs++ = a1[2];
+      *pComputedCoeffs++ = a1[3];
 
       /* Vec 8 */
-      *pDstCoeffs++ = a2[0];
-      *pDstCoeffs++ = a2[1];
-      *pDstCoeffs++ = a2[2];
-      *pDstCoeffs++ = a2[3];
+      *pComputedCoeffs++ = a2[0];
+      *pComputedCoeffs++ = a2[1];
+      *pComputedCoeffs++ = a2[2];
+      *pComputedCoeffs++ = a2[3];
 
       cnt--;
    }
@@ -176,17 +130,66 @@ void arm_biquad_cascade_df2T_compute_coefs_f32(
    cnt = numStages & 0x3;
    while(cnt > 0)
    {
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
       cnt--;
    }
 
 }
 #endif 
 
+/**
+  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order
+                   in the non-Neon version.
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+                   
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+
+                   For the Neon version, this array is bigger. If numStages = 4x + y, then the array has size
+                   32*x + 5*y,
+                   and it must be filled in using the function
+                   arm_biquad_cascade_df2T_compute_coefs_f32, which takes the
+                   standard coefficient array as a parameter.
+
+                   An array of size 8*numStages is always large enough, since 32*x + 5*y <= 8*(4x + y).
+
+                   Then, the initialization can be done with:
+  <pre>
+                   arm_biquad_cascade_df2T_compute_coefs_f32(nbCascade,coefs,computedCoefs);
+                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, computedCoefs, stateNeon);
+  </pre>
+
+  @par             In this example, computedCoefs is a bigger array of size 8 * numStages.
+                   coefs is the standard array:
+
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+
+
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
 void arm_biquad_cascade_df2T_init_f32(
         arm_biquad_cascade_df2T_instance_f32 * S,
         uint8_t numStages,
diff --git a/Source/MatrixFunctions/arm_mat_mult_f32.c b/Source/MatrixFunctions/arm_mat_mult_f32.c
index 54481187..d1fd9eac 100644
--- a/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -28,6 +28,10 @@
 
 #include "dsp/matrix_functions.h"
 
+#if defined(ARM_MATH_NEON)
+#define GROUPOFROWS 8
+#endif
+
 /**
  * @ingroup groupMatrix
  */
@@ -54,14 +58,7 @@
  * @{
  */
 
-/**
- * @brief Floating-point matrix multiplication.
- * @param[in]       *pSrcA points to the first input matrix structure
- * @param[in]       *pSrcB points to the second input matrix structure
- * @param[out]      *pDst points to output matrix structure
- * @return     		The function returns either
- * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
- */
+
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
@@ -258,6 +255,14 @@ __STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve(
 }
 
 
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       pSrcA points to the first input matrix structure
+ * @param[in]       pSrcB points to the second input matrix structure
+ * @param[out]      pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
 arm_status arm_mat_mult_f32(
   const arm_matrix_instance_f32 * pSrcA,
   const arm_matrix_instance_f32 * pSrcB,
@@ -512,9 +517,14 @@ arm_status arm_mat_mult_f32(
 #else
 
 #if defined(ARM_MATH_NEON)
-
-#define GROUPOFROWS 8
-
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       pSrcA points to the first input matrix structure
+ * @param[in]       pSrcB points to the second input matrix structure
+ * @param[out]      pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
 arm_status arm_mat_mult_f32(
   const arm_matrix_instance_f32 * pSrcA,
   const arm_matrix_instance_f32 * pSrcB,
@@ -843,6 +853,14 @@ arm_status arm_mat_mult_f32(
   return (status);
 }
 #else
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       pSrcA points to the first input matrix structure
+ * @param[in]       pSrcB points to the second input matrix structure
+ * @param[out]      pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
 arm_status arm_mat_mult_f32(
   const arm_matrix_instance_f32 * pSrcA,
   const arm_matrix_instance_f32 * pSrcB,
diff --git a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
index 46bc689f..724f286d 100755
--- a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
+++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@@ -33,8 +33,28 @@
 #include <limits.h>
 #include <math.h>
 
+#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
 
+/*
+
+_Float16 is not supported in g++ so we avoid putting _Float16 definitions
+in the public headers.
 
+This function should at some point be moved into FastMath.
+
+*/
+__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb) /* returns x multiplied by itself until nb factors are accumulated */
+{
+    float16_t r = x; /* accumulator already holds one factor of x */
+    nb --; /* account for the factor stored in r above */
+    while(nb > 0) /* not entered when the nb passed in is <= 1, so x is returned unchanged in that case */
+    {
+        r = (_Float16)r * (_Float16)x; /* fold in one more factor of x */
+        nb--;
+    }
+    return(r);
+}
+#endif
 
 /**
  * @addtogroup polysvm
@@ -42,6 +62,13 @@
  */
 
 
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
 /**
  * @brief SVM polynomial prediction
  * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
@@ -50,12 +77,6 @@
  * @return none.
  *
  */
-
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_helium_utils.h"
-#include "arm_vec_math_f16.h"
-
 void arm_svm_polynomial_predict_f16(
     const arm_svm_polynomial_instance_f16 *S,
     const float16_t * in,
@@ -306,26 +327,15 @@ void arm_svm_polynomial_predict_f16(
 
 #else
 
-/*
-
-_Float16 is not supported in g++ so we avoid putting _Float16 definitions
-in the public headers.
-
-This function should at some point be moved in FastMath.
-
-*/
-__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
-{
-    float16_t r = x;
-    nb --;
-    while(nb > 0)
-    {
-        r = (_Float16)r * (_Float16)x;
-        nb--;
-    }
-    return(r);
-}
 
+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
 void arm_svm_polynomial_predict_f16(
     const arm_svm_polynomial_instance_f16 *S,
     const float16_t * in,
diff --git a/Testing/Source/Tests/BIQUADF32.cpp b/Testing/Source/Tests/BIQUADF32.cpp
index e9683bc1..25ef1d87 100755
--- a/Testing/Source/Tests/BIQUADF32.cpp
+++ b/Testing/Source/Tests/BIQUADF32.cpp
@@ -92,11 +92,8 @@ a double precision computation.
 
         float32_t *statep = state.ptr();
 
-#if !defined(ARM_MATH_NEON) 
         const float32_t *coefsp = coefs.ptr();
-#else
-        float32_t *coefsp = coefs.ptr();
-#endif
+
         
         const float32_t *inputp = inputs.ptr();
         float32_t *outp = output.ptr();
@@ -126,13 +123,15 @@ a double precision computation.
 #else
            float32_t *vecCoefsPtr = vecCoefs.ptr();
 
+           // Those Neon coefs must be computed from original coefs
+           arm_biquad_cascade_df2T_compute_coefs_f32(3,coefsp,vecCoefsPtr);
+
            arm_biquad_cascade_df2T_init_f32(&this->Sdf2T,
                     3,
                     vecCoefsPtr,
                     statep);
 
-           // Those Neon coefs must be computed from original coefs
-           arm_biquad_cascade_df2T_compute_coefs_f32(&this->Sdf2T,3,coefsp);
+           
 #endif
 
            /*
@@ -290,13 +289,15 @@ a double precision computation.
 #else
            float32_t *vecCoefsPtr = vecCoefs.ptr();
 
+           // Those Neon coefs must be computed from original coefs
+           arm_biquad_cascade_df2T_compute_coefs_f32(numStages,coefsp,vecCoefsPtr);
+
            arm_biquad_cascade_df2T_init_f32(&this->Sdf2T,
                     numStages,
                     vecCoefsPtr,
                     statep);
 
-           // Those Neon coefs must be computed from original coefs
-           arm_biquad_cascade_df2T_compute_coefs_f32(&this->Sdf2T,numStages,coefsp);
+           
 #endif
            coefsp += numStages * 5;