CMSIS-DSP: Correction of issue 1217

Fixed wrong initialization code for the Neon version of the biquad DF2T filter: the initialization code was trying to modify a const array. Also added the Neon function to the Doxygen output and made some corrections required by Doxygen.
5 years ago · 2a2f745bd3
parent 5fac45cc96
commit 2a2f745bd3
5 changed files with 181 additions and 142 deletions
--- a/Include/dsp/filtering_functions.h
+++ b/Include/dsp/filtering_functions.h
@ -1173,10 +1173,17 @@ arm_status arm_fir_decimate_init_f32(


 #if defined(ARM_MATH_NEON) 
+/**
+  @brief         Compute new coefficient arrays for use in vectorized filter (Neon only).
+  @param[in]     numStages         number of 2nd order stages in the filter.
+  @param[in]     pCoeffs           points to the original filter coefficients.
+  @param[out]    pComputedCoeffs   points to the newly computed coefficients for the vectorized version.
+  @return        none
+*/
 void arm_biquad_cascade_df2T_compute_coefs_f32(
-  arm_biquad_cascade_df2T_instance_f32 * S,
  uint8_t numStages,
-  const float32_t * pCoeffs);
+  const float32_t * pCoeffs,
+  float32_t * pComputedCoeffs);
 #endif
  /**
   * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
@ -37,78 +37,32 @@
  @{
 */

+
+
+#if defined(ARM_MATH_NEON) 
 /**
-  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
-  @param[in,out] S           points to an instance of the filter data structure.
+  @brief         Compute new coefficient arrays for use in vectorized filter (Neon only).
  @param[in]     numStages         number of 2nd order stages in the filter.
-  @param[in]     pCoeffs     points to the filter coefficients.
-  @param[in]     pState      points to the state buffer.
+  @param[in]     pCoeffs           points to the original filter coefficients.
+  @param[out]    pComputedCoeffs   points to the newly computed coefficients for the vectorized Neon version.
  @return        none

-  @par           Coefficient and State Ordering
-                   The coefficients are stored in the array <code>pCoeffs</code> in the following order
-                   in the not Neon version.
-  <pre>
-      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
-  </pre>
+  @par   Size of coefficient arrays:
+            pCoeffs has size 5 * numStages 

-  @par
-                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
-                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
-                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
-
-                   For Neon version, this array is bigger. If numstages = 4x + y, then the array has size:
-                   32*x + 5*y
-                   and it must be initialized using the function
-                   arm_biquad_cascade_df2T_compute_coefs_f32 which is taking the
-                   standard array coefficient as parameters.
+            pComputedCoeffs has size 8 * numStages

-                   But, an array of 8*numstages is a good approximation.
-
-                   Then, the initialization can be done with:
-  <pre>
-                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, neonCoefs, stateNeon);
-                   arm_biquad_cascade_df2T_compute_coefs_f32(&SNeon,nbCascade,coefs);
-  </pre>
-
-  @par             In this example, neonCoefs is a bigger array of size 8 * numStages.
-                   coefs is the standard array:
-
-  <pre>
-      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
-  </pre>
-
-
-  @par
-                   The <code>pState</code> is a pointer to state array.
-                   Each Biquad stage has 2 state variables <code>d1,</code> and <code>d2</code>.
-                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
-                   The state array has a total length of <code>2*numStages</code> values.
-                   The state variables are updated after each block of data is processed; the coefficients are untouched.
- */
-
-#if defined(ARM_MATH_NEON) 
-/*
-
-Must be called after initializing the biquad instance.
-pCoeffs has size 5 * nbCascade
-Whereas the pCoeffs for the init has size (4*4 + 4*4)* nbCascade 
-
-So this pCoeffs is the one which would be used for the not Neon version.
-The pCoeffs passed in init is bigger than the one for the not Neon version.
+            pComputedCoeffs is the array to be used in arm_biquad_cascade_df2T_init_f32.

 */
 void arm_biquad_cascade_df2T_compute_coefs_f32(
-  arm_biquad_cascade_df2T_instance_f32 * S,
  uint8_t numStages,
-  const float32_t * pCoeffs)
+  const float32_t * pCoeffs,
+  float32_t * pComputedCoeffs)
 {
   uint8_t cnt;
-   float32_t *pDstCoeffs;
   float32_t b0[4],b1[4],b2[4],a1[4],a2[4];

-   pDstCoeffs = (float32_t*)S->pCoeffs;
-
   cnt = numStages >> 2; 
   while(cnt > 0)
   {
@ -123,52 +77,52 @@ void arm_biquad_cascade_df2T_compute_coefs_f32(
      }

      /* Vec 1 */
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = b0[1];
-      *pDstCoeffs++ = b0[2];
-      *pDstCoeffs++ = b0[3];
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = b0[1];
+      *pComputedCoeffs++ = b0[2];
+      *pComputedCoeffs++ = b0[3];

      /* Vec 2 */
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = b0[1] * b0[2];
-      *pDstCoeffs++ = b0[2] * b0[3];
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = b0[1] * b0[2];
+      *pComputedCoeffs++ = b0[2] * b0[3];

      /* Vec 3 */
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = 0;
-      *pDstCoeffs++ = b0[1] * b0[2] * b0[3];
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = 0;
+      *pComputedCoeffs++ = b0[1] * b0[2] * b0[3];
      
      /* Vec 4 */
-      *pDstCoeffs++ = b0[0];
-      *pDstCoeffs++ = b0[0] * b0[1];
-      *pDstCoeffs++ = b0[0] * b0[1] * b0[2];
-      *pDstCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3];
+      *pComputedCoeffs++ = b0[0];
+      *pComputedCoeffs++ = b0[0] * b0[1];
+      *pComputedCoeffs++ = b0[0] * b0[1] * b0[2];
+      *pComputedCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3];

      /* Vec 5 */
-      *pDstCoeffs++ = b1[0];
-      *pDstCoeffs++ = b1[1];
-      *pDstCoeffs++ = b1[2];
-      *pDstCoeffs++ = b1[3];
+      *pComputedCoeffs++ = b1[0];
+      *pComputedCoeffs++ = b1[1];
+      *pComputedCoeffs++ = b1[2];
+      *pComputedCoeffs++ = b1[3];

      /* Vec 6 */
-      *pDstCoeffs++ = b2[0];
-      *pDstCoeffs++ = b2[1];
-      *pDstCoeffs++ = b2[2];
-      *pDstCoeffs++ = b2[3];
+      *pComputedCoeffs++ = b2[0];
+      *pComputedCoeffs++ = b2[1];
+      *pComputedCoeffs++ = b2[2];
+      *pComputedCoeffs++ = b2[3];

      /* Vec 7 */
-      *pDstCoeffs++ = a1[0];
-      *pDstCoeffs++ = a1[1];
-      *pDstCoeffs++ = a1[2];
-      *pDstCoeffs++ = a1[3];
+      *pComputedCoeffs++ = a1[0];
+      *pComputedCoeffs++ = a1[1];
+      *pComputedCoeffs++ = a1[2];
+      *pComputedCoeffs++ = a1[3];

      /* Vec 8 */
-      *pDstCoeffs++ = a2[0];
-      *pDstCoeffs++ = a2[1];
-      *pDstCoeffs++ = a2[2];
-      *pDstCoeffs++ = a2[3];
+      *pComputedCoeffs++ = a2[0];
+      *pComputedCoeffs++ = a2[1];
+      *pComputedCoeffs++ = a2[2];
+      *pComputedCoeffs++ = a2[3];

      cnt--;
   }
@ -176,17 +130,66 @@ void arm_biquad_cascade_df2T_compute_coefs_f32(
   cnt = numStages & 0x3;
   while(cnt > 0)
   {
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
-      *pDstCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
+      *pComputedCoeffs++ = *pCoeffs++;
      cnt--;
   }

 }
 #endif 

+/**
+  @brief         Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+  @param[in,out] S           points to an instance of the filter data structure.
+  @param[in]     numStages   number of 2nd order stages in the filter.
+  @param[in]     pCoeffs     points to the filter coefficients.
+  @param[in]     pState      points to the state buffer.
+  @return        none
+
+  @par           Coefficient and State Ordering
+                   The coefficients are stored in the array <code>pCoeffs</code> in the following order
+                   in the non-Neon version.
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+                   
+  @par
+                   where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,
+                   <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,
+                   and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.
+
+                   For the Neon version, this array is bigger. If numStages = 4x + y, then the array has size:
+                   32*x + 5*y
+                   and it must be initialized using the function
+                   arm_biquad_cascade_df2T_compute_coefs_f32, which takes the
+                   standard coefficient array as a parameter.
+
+                   An array of size 8*numStages is always large enough (since 32*x + 8*y >= 32*x + 5*y).
+
+                   Then, the initialization can be done with:
+  <pre>
+                   arm_biquad_cascade_df2T_compute_coefs_f32(nbCascade,coefs,computedCoefs);
+                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, computedCoefs, stateNeon);
+  </pre>
+
+  @par             In this example, computedCoefs is a bigger array of size 8 * numStages.
+                   coefs is the standard array:
+
+  <pre>
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  </pre>
+
+
+  @par
+                   The <code>pState</code> is a pointer to state array.
+                   Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.
+                   The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+                   The state array has a total length of <code>2*numStages</code> values.
+                   The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
 void arm_biquad_cascade_df2T_init_f32(
        arm_biquad_cascade_df2T_instance_f32 * S,
        uint8_t numStages,
--- a/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f32.c
@ -28,6 +28,10 @@

 #include "dsp/matrix_functions.h"

+#if defined(ARM_MATH_NEON)
+#define GROUPOFROWS 8
+#endif
+
 /**
 * @ingroup groupMatrix
 */
@ -54,14 +58,7 @@
 * @{
 */

-/**
- * @brief Floating-point matrix multiplication.
- * @param[in]       *pSrcA points to the first input matrix structure
- * @param[in]       *pSrcB points to the second input matrix structure
- * @param[out]      *pDst points to output matrix structure
- * @return     		The function returns either
- * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
- */
+

 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -258,6 +255,14 @@ __STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve(
 }


+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
 arm_status arm_mat_mult_f32(
  const arm_matrix_instance_f32 * pSrcA,
  const arm_matrix_instance_f32 * pSrcB,
@ -512,9 +517,14 @@ arm_status arm_mat_mult_f32(
 #else

 #if defined(ARM_MATH_NEON)
-
-#define GROUPOFROWS 8
-
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
 arm_status arm_mat_mult_f32(
  const arm_matrix_instance_f32 * pSrcA,
  const arm_matrix_instance_f32 * pSrcB,
@ -843,6 +853,14 @@ arm_status arm_mat_mult_f32(
  return (status);
 }
 #else
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
 arm_status arm_mat_mult_f32(
  const arm_matrix_instance_f32 * pSrcA,
  const arm_matrix_instance_f32 * pSrcB,
--- a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
+++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@ -33,8 +33,28 @@
 #include <limits.h>
 #include <math.h>

+#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)

+/*
+
+_Float16 is not supported in g++ so we avoid putting _Float16 definitions
+in the public headers.

+This function should at some point be moved into FastMath.
+
+*/
+__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
+{
+    float16_t r = x;
+    nb --;
+    while(nb > 0)
+    {
+        r = (_Float16)r * (_Float16)x;
+        nb--;
+    }
+    return(r);
+}
+#endif

 /**
 * @addtogroup polysvm
@ -42,6 +62,13 @@
 */


+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
 /**
 * @brief SVM polynomial prediction
 * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
@ -50,12 +77,6 @@
 * @return none.
 *
 */
-
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-#include "arm_helium_utils.h"
-#include "arm_vec_math_f16.h"
-
 void arm_svm_polynomial_predict_f16(
    const arm_svm_polynomial_instance_f16 *S,
    const float16_t * in,
@ -306,26 +327,15 @@ void arm_svm_polynomial_predict_f16(

 #else

-/*
-
-_Float16 is not supported in g++ so we avoid putting _Float16 definitions
-in the public headers.
-
-This function should at some point be moved in FastMath.

+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
 */
-__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
-{
-    float16_t r = x;
-    nb --;
-    while(nb > 0)
-    {
-        r = (_Float16)r * (_Float16)x;
-        nb--;
-    }
-    return(r);
-}
-
 void arm_svm_polynomial_predict_f16(
    const arm_svm_polynomial_instance_f16 *S,
    const float16_t * in,
--- a/Testing/Source/Tests/BIQUADF32.cpp
+++ b/Testing/Source/Tests/BIQUADF32.cpp
@ -92,11 +92,8 @@ a double precision computation.

        float32_t *statep = state.ptr();

-#if !defined(ARM_MATH_NEON) 
        const float32_t *coefsp = coefs.ptr();
-#else
-        float32_t *coefsp = coefs.ptr();
-#endif
+
        
        const float32_t *inputp = inputs.ptr();
        float32_t *outp = output.ptr();
@ -126,13 +123,15 @@ a double precision computation.
 #else
           float32_t *vecCoefsPtr = vecCoefs.ptr();

+           // The Neon coefs must be computed from the original coefs
+           arm_biquad_cascade_df2T_compute_coefs_f32(3,coefsp,vecCoefsPtr);
+
           arm_biquad_cascade_df2T_init_f32(&this->Sdf2T,
                    3,
                    vecCoefsPtr,
                    statep);

-           // Those Neon coefs must be computed from original coefs
-           arm_biquad_cascade_df2T_compute_coefs_f32(&this->Sdf2T,3,coefsp);
+           
 #endif

           /*
@ -290,13 +289,15 @@ a double precision computation.
 #else
           float32_t *vecCoefsPtr = vecCoefs.ptr();

+           // The Neon coefs must be computed from the original coefs
+           arm_biquad_cascade_df2T_compute_coefs_f32(numStages,coefsp,vecCoefsPtr);
+
           arm_biquad_cascade_df2T_init_f32(&this->Sdf2T,
                    numStages,
                    vecCoefsPtr,
                    statep);

-           // Those Neon coefs must be computed from original coefs
-           arm_biquad_cascade_df2T_compute_coefs_f32(&this->Sdf2T,numStages,coefsp);
+           
 #endif
           coefsp += numStages * 5;