diff --git a/Include/dsp/filtering_functions.h b/Include/dsp/filtering_functions.h index ef7f2dd5..0d2f9ce8 100755 --- a/Include/dsp/filtering_functions.h +++ b/Include/dsp/filtering_functions.h @@ -1173,10 +1173,17 @@ arm_status arm_fir_decimate_init_f32( #if defined(ARM_MATH_NEON) +/** + @brief Compute new coefficient arrays for use in vectorized filter (Neon only). + @param[in] numStages number of 2nd order stages in the filter. + @param[in] pCoeffs points to the original filter coefficients. + @param[in] pComputedCoeffs points to the new computed coefficients for the vectorized version. + @return none +*/ void arm_biquad_cascade_df2T_compute_coefs_f32( - arm_biquad_cascade_df2T_instance_f32 * S, uint8_t numStages, - const float32_t * pCoeffs); + const float32_t * pCoeffs, + float32_t * pComputedCoeffs); #endif /** * @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c index 27d1eef8..e3b350c3 100644 --- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c @@ -37,78 +37,32 @@ @{ */ -/** - @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. - @param[in,out] S points to an instance of the filter data structure. - @param[in] numStages number of 2nd order stages in the filter. - @param[in] pCoeffs points to the filter coefficients. - @param[in] pState points to the state buffer. - @return none - @par Coefficient and State Ordering - The coefficients are stored in the array pCoeffs in the following order - in the not Neon version. -
-      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
-  
- - @par - where b1x and a1x are the coefficients for the first stage, - b2x and a2x are the coefficients for the second stage, - and so on. The pCoeffs array contains a total of 5*numStages values. - - For Neon version, this array is bigger. If numstages = 4x + y, then the array has size: - 32*x + 5*y - and it must be initialized using the function - arm_biquad_cascade_df2T_compute_coefs_f32 which is taking the - standard array coefficient as parameters. - - But, an array of 8*numstages is a good approximation. - - Then, the initialization can be done with: -
-                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, neonCoefs, stateNeon);
-                   arm_biquad_cascade_df2T_compute_coefs_f32(&SNeon,nbCascade,coefs);
-  
- - @par In this example, neonCoefs is a bigger array of size 8 * numStages. - coefs is the standard array: - -
-      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
-  
- - - @par - The pState is a pointer to state array. - Each Biquad stage has 2 state variables d1, and d2. - The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on. - The state array has a total length of 2*numStages values. - The state variables are updated after each block of data is processed; the coefficients are untouched. - */ #if defined(ARM_MATH_NEON) -/* +/** + @brief Compute new coefficient arrays for use in vectorized filter (Neon only). + @param[in] numStages number of 2nd order stages in the filter. + @param[in] pCoeffs points to the original filter coefficients. + @param[out] pComputedCoeffs points to the new computed coefficients for the vectorized Neon version. + @return none + + @par Size of coefficient arrays: + pCoeffs has size 5 * numStages -Must be called after initializing the biquad instance. -pCoeffs has size 5 * nbCascade -Whereas the pCoeffs for the init has size (4*4 + 4*4)* nbCascade + pComputedCoeffs has size 8 * numStages -So this pCoeffs is the one which would be used for the not Neon version. -The pCoeffs passed in init is bigger than the one for the not Neon version. + pComputedCoeffs is the array to be used in arm_biquad_cascade_df2T_init_f32. 
*/ void arm_biquad_cascade_df2T_compute_coefs_f32( - arm_biquad_cascade_df2T_instance_f32 * S, uint8_t numStages, - const float32_t * pCoeffs) + const float32_t * pCoeffs, + float32_t * pComputedCoeffs) { uint8_t cnt; - float32_t *pDstCoeffs; float32_t b0[4],b1[4],b2[4],a1[4],a2[4]; - pDstCoeffs = (float32_t*)S->pCoeffs; - cnt = numStages >> 2; while(cnt > 0) { @@ -123,52 +77,52 @@ void arm_biquad_cascade_df2T_compute_coefs_f32( } /* Vec 1 */ - *pDstCoeffs++ = 0; - *pDstCoeffs++ = b0[1]; - *pDstCoeffs++ = b0[2]; - *pDstCoeffs++ = b0[3]; + *pComputedCoeffs++ = 0; + *pComputedCoeffs++ = b0[1]; + *pComputedCoeffs++ = b0[2]; + *pComputedCoeffs++ = b0[3]; /* Vec 2 */ - *pDstCoeffs++ = 0; - *pDstCoeffs++ = 0; - *pDstCoeffs++ = b0[1] * b0[2]; - *pDstCoeffs++ = b0[2] * b0[3]; + *pComputedCoeffs++ = 0; + *pComputedCoeffs++ = 0; + *pComputedCoeffs++ = b0[1] * b0[2]; + *pComputedCoeffs++ = b0[2] * b0[3]; /* Vec 3 */ - *pDstCoeffs++ = 0; - *pDstCoeffs++ = 0; - *pDstCoeffs++ = 0; - *pDstCoeffs++ = b0[1] * b0[2] * b0[3]; + *pComputedCoeffs++ = 0; + *pComputedCoeffs++ = 0; + *pComputedCoeffs++ = 0; + *pComputedCoeffs++ = b0[1] * b0[2] * b0[3]; /* Vec 4 */ - *pDstCoeffs++ = b0[0]; - *pDstCoeffs++ = b0[0] * b0[1]; - *pDstCoeffs++ = b0[0] * b0[1] * b0[2]; - *pDstCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3]; + *pComputedCoeffs++ = b0[0]; + *pComputedCoeffs++ = b0[0] * b0[1]; + *pComputedCoeffs++ = b0[0] * b0[1] * b0[2]; + *pComputedCoeffs++ = b0[0] * b0[1] * b0[2] * b0[3]; /* Vec 5 */ - *pDstCoeffs++ = b1[0]; - *pDstCoeffs++ = b1[1]; - *pDstCoeffs++ = b1[2]; - *pDstCoeffs++ = b1[3]; + *pComputedCoeffs++ = b1[0]; + *pComputedCoeffs++ = b1[1]; + *pComputedCoeffs++ = b1[2]; + *pComputedCoeffs++ = b1[3]; /* Vec 6 */ - *pDstCoeffs++ = b2[0]; - *pDstCoeffs++ = b2[1]; - *pDstCoeffs++ = b2[2]; - *pDstCoeffs++ = b2[3]; + *pComputedCoeffs++ = b2[0]; + *pComputedCoeffs++ = b2[1]; + *pComputedCoeffs++ = b2[2]; + *pComputedCoeffs++ = b2[3]; /* Vec 7 */ - *pDstCoeffs++ = a1[0]; - *pDstCoeffs++ = 
a1[1]; - *pDstCoeffs++ = a1[2]; - *pDstCoeffs++ = a1[3]; + *pComputedCoeffs++ = a1[0]; + *pComputedCoeffs++ = a1[1]; + *pComputedCoeffs++ = a1[2]; + *pComputedCoeffs++ = a1[3]; /* Vec 8 */ - *pDstCoeffs++ = a2[0]; - *pDstCoeffs++ = a2[1]; - *pDstCoeffs++ = a2[2]; - *pDstCoeffs++ = a2[3]; + *pComputedCoeffs++ = a2[0]; + *pComputedCoeffs++ = a2[1]; + *pComputedCoeffs++ = a2[2]; + *pComputedCoeffs++ = a2[3]; cnt--; } @@ -176,17 +130,66 @@ void arm_biquad_cascade_df2T_compute_coefs_f32( cnt = numStages & 0x3; while(cnt > 0) { - *pDstCoeffs++ = *pCoeffs++; - *pDstCoeffs++ = *pCoeffs++; - *pDstCoeffs++ = *pCoeffs++; - *pDstCoeffs++ = *pCoeffs++; - *pDstCoeffs++ = *pCoeffs++; + *pComputedCoeffs++ = *pCoeffs++; + *pComputedCoeffs++ = *pCoeffs++; + *pComputedCoeffs++ = *pCoeffs++; + *pComputedCoeffs++ = *pCoeffs++; + *pComputedCoeffs++ = *pCoeffs++; cnt--; } } #endif +/** + @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter. + @param[in,out] S points to an instance of the filter data structure. + @param[in] numStages number of 2nd order stages in the filter. + @param[in] pCoeffs points to the filter coefficients. + @param[in] pState points to the state buffer. + @return none + + @par Coefficient and State Ordering + The coefficients are stored in the array pCoeffs in the following order + in the not Neon version. +
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  
+ + @par + where b1x and a1x are the coefficients for the first stage, + b2x and a2x are the coefficients for the second stage, + and so on. The pCoeffs array contains a total of 5*numStages values. + + For the Neon version, this array is bigger. If numStages = 4x + y, then the array has size: + 32*x + 5*y + and it must be initialized using the function + arm_biquad_cascade_df2T_compute_coefs_f32, which takes the + standard coefficient array as input. + + An array of 8*numStages values is always large enough. + + Then, the initialization can be done with: +
+                   arm_biquad_cascade_df2T_compute_coefs_f32(nbCascade,coefs,computedCoefs);
+                   arm_biquad_cascade_df2T_init_f32(&SNeon, nbCascade, computedCoefs, stateNeon);
+  
+ + @par In this example, computedCoefs is a bigger array of size 8 * numStages. + coefs is the standard array: + +
+      {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+  
+ + + @par + The pState is a pointer to state array. + Each Biquad stage has 2 state variables d1, and d2. + The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on. + The state array has a total length of 2*numStages values. + The state variables are updated after each block of data is processed; the coefficients are untouched. + */ void arm_biquad_cascade_df2T_init_f32( arm_biquad_cascade_df2T_instance_f32 * S, uint8_t numStages, diff --git a/Source/MatrixFunctions/arm_mat_mult_f32.c b/Source/MatrixFunctions/arm_mat_mult_f32.c index 54481187..d1fd9eac 100644 --- a/Source/MatrixFunctions/arm_mat_mult_f32.c +++ b/Source/MatrixFunctions/arm_mat_mult_f32.c @@ -28,6 +28,10 @@ #include "dsp/matrix_functions.h" +#if defined(ARM_MATH_NEON) +#define GROUPOFROWS 8 +#endif + /** * @ingroup groupMatrix */ @@ -54,14 +58,7 @@ * @{ */ -/** - * @brief Floating-point matrix multiplication. - * @param[in] *pSrcA points to the first input matrix structure - * @param[in] *pSrcB points to the second input matrix structure - * @param[out] *pDst points to output matrix structure - * @return The function returns either - * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. - */ + #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) @@ -258,6 +255,14 @@ __STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve( } +/** + * @brief Floating-point matrix multiplication. + * @param[in] *pSrcA points to the first input matrix structure + * @param[in] *pSrcB points to the second input matrix structure + * @param[out] *pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. 
+ */ arm_status arm_mat_mult_f32( const arm_matrix_instance_f32 * pSrcA, const arm_matrix_instance_f32 * pSrcB, @@ -512,9 +517,14 @@ arm_status arm_mat_mult_f32( #else #if defined(ARM_MATH_NEON) - -#define GROUPOFROWS 8 - +/** + * @brief Floating-point matrix multiplication. + * @param[in] *pSrcA points to the first input matrix structure + * @param[in] *pSrcB points to the second input matrix structure + * @param[out] *pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ arm_status arm_mat_mult_f32( const arm_matrix_instance_f32 * pSrcA, const arm_matrix_instance_f32 * pSrcB, @@ -843,6 +853,14 @@ arm_status arm_mat_mult_f32( return (status); } #else +/** + * @brief Floating-point matrix multiplication. + * @param[in] *pSrcA points to the first input matrix structure + * @param[in] *pSrcB points to the second input matrix structure + * @param[out] *pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ arm_status arm_mat_mult_f32( const arm_matrix_instance_f32 * pSrcA, const arm_matrix_instance_f32 * pSrcB, diff --git a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c index 46bc689f..724f286d 100755 --- a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c +++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c @@ -33,8 +33,28 @@ #include #include +#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE) +/* + +_Float16 is not supported in g++ so we avoid putting _Float16 definitions +in the public headers. +This function should at some point be moved in FastMath. 
+ +*/ +__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb) +{ + float16_t r = x; + nb --; + while(nb > 0) + { + r = (_Float16)r * (_Float16)x; + nb--; + } + return(r); +} +#endif /** * @addtogroup polysvm @@ -42,6 +62,13 @@ */ + + +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_math_f16.h" + /** * @brief SVM polynomial prediction * @param[in] S Pointer to an instance of the polynomial SVM structure. @@ -50,12 +77,6 @@ * @return none. * */ - -#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) - -#include "arm_helium_utils.h" -#include "arm_vec_math_f16.h" - void arm_svm_polynomial_predict_f16( const arm_svm_polynomial_instance_f16 *S, const float16_t * in, @@ -306,26 +327,15 @@ void arm_svm_polynomial_predict_f16( #else -/* - -_Float16 is not supported in g++ so we avoid putting _Float16 definitions -in the public headers. - -This function should at some point be moved in FastMath. - -*/ -__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb) -{ - float16_t r = x; - nb --; - while(nb > 0) - { - r = (_Float16)r * (_Float16)x; - nb--; - } - return(r); -} +/** + * @brief SVM polynomial prediction + * @param[in] S Pointer to an instance of the polynomial SVM structure. + * @param[in] in Pointer to input vector + * @param[out] pResult Decision value + * @return none. + * + */ void arm_svm_polynomial_predict_f16( const arm_svm_polynomial_instance_f16 *S, const float16_t * in, diff --git a/Testing/Source/Tests/BIQUADF32.cpp b/Testing/Source/Tests/BIQUADF32.cpp index e9683bc1..25ef1d87 100755 --- a/Testing/Source/Tests/BIQUADF32.cpp +++ b/Testing/Source/Tests/BIQUADF32.cpp @@ -92,11 +92,8 @@ a double precision computation. 
float32_t *statep = state.ptr(); -#if !defined(ARM_MATH_NEON) const float32_t *coefsp = coefs.ptr(); -#else - float32_t *coefsp = coefs.ptr(); -#endif + const float32_t *inputp = inputs.ptr(); float32_t *outp = output.ptr(); @@ -126,13 +123,15 @@ a double precision computation. #else float32_t *vecCoefsPtr = vecCoefs.ptr(); + // Those Neon coefs must be computed from original coefs + arm_biquad_cascade_df2T_compute_coefs_f32(3,coefsp,vecCoefsPtr); + arm_biquad_cascade_df2T_init_f32(&this->Sdf2T, 3, vecCoefsPtr, statep); - // Those Neon coefs must be computed from original coefs - arm_biquad_cascade_df2T_compute_coefs_f32(&this->Sdf2T,3,coefsp); + #endif /* @@ -290,13 +289,15 @@ a double precision computation. #else float32_t *vecCoefsPtr = vecCoefs.ptr(); + // Those Neon coefs must be computed from original coefs + arm_biquad_cascade_df2T_compute_coefs_f32(numStages,coefsp,vecCoefsPtr); + arm_biquad_cascade_df2T_init_f32(&this->Sdf2T, numStages, vecCoefsPtr, statep); - // Those Neon coefs must be computed from original coefs - arm_biquad_cascade_df2T_compute_coefs_f32(&this->Sdf2T,numStages,coefsp); + #endif coefsp += numStages * 5;