From 433ecc8b2256fdf876090dbd91a174acf829100e Mon Sep 17 00:00:00 2001 From: Silfurion Date: Tue, 23 Aug 2022 16:14:10 +0200 Subject: [PATCH] Updated the indentation and added the optimized var function --- Source/BasicMathFunctions/arm_dot_prod_f64.c | 152 +-- .../arm_chebyshev_distance_f64.c | 80 +- .../arm_cityblock_distance_f64.c | 64 +- .../arm_euclidean_distance_f64.c | 52 +- Source/FastMathFunctions/arm_vexp_f64.c | 72 +- Source/FastMathFunctions/arm_vlog_f64.c | 70 +- .../arm_biquad_cascade_df2T_f64.c | 1090 ++++++++--------- .../arm_biquad_cascade_df2T_init_f64.c | 30 +- Source/FilteringFunctions/arm_correlate_f64.c | 726 +++++------ Source/FilteringFunctions/arm_fir_f64.c | 330 ++--- Source/FilteringFunctions/arm_fir_init_f64.c | 30 +- Source/MatrixFunctions/arm_mat_cholesky_f64.c | 416 +++---- Source/MatrixFunctions/arm_mat_init_f64.c | 24 +- .../arm_mat_solve_lower_triangular_f64.c | 332 ++--- .../arm_mat_solve_upper_triangular_f64.c | 310 ++--- Source/MatrixFunctions/arm_mat_trans_f64.c | 396 +++--- .../arm_absmax_no_idx_f64.c | 12 +- .../arm_absmin_no_idx_f64.c | 180 +-- Source/StatisticsFunctions/arm_entropy_f64.c | 68 +- .../arm_kullback_leibler_f64.c | 94 +- .../StatisticsFunctions/arm_max_no_idx_f64.c | 12 +- Source/StatisticsFunctions/arm_mean_f64.c | 114 +- .../StatisticsFunctions/arm_min_no_idx_f64.c | 82 +- Source/StatisticsFunctions/arm_mse_f64.c | 138 +-- Source/StatisticsFunctions/arm_power_f64.c | 140 +-- Source/StatisticsFunctions/arm_var_f64.c | 98 +- 26 files changed, 2557 insertions(+), 2555 deletions(-) diff --git a/Source/BasicMathFunctions/arm_dot_prod_f64.c b/Source/BasicMathFunctions/arm_dot_prod_f64.c index 4e300331..3fafd10c 100644 --- a/Source/BasicMathFunctions/arm_dot_prod_f64.c +++ b/Source/BasicMathFunctions/arm_dot_prod_f64.c @@ -47,85 +47,85 @@ */ #if defined(ARM_MATH_NEON) void arm_dot_prod_f64( - const float64_t * pSrcA, - const float64_t * pSrcB, - uint32_t blockSize, - float64_t * result) - { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary return variable */ - float64x2_t sumV ; /* Neon buffer for sum variable */ - - /* Neon Buffer Initialisation */ - sumV = vsetq_lane_f64(0.0f, sumV, 0); - sumV = vsetq_lane_f64(0.0f, sumV, 1); - - /* Neon Buffer for sources */ - float64x2_t pSrcAV; - float64x2_t pSrcBV; - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize >> 1U; - - while (blkCnt > 0U) - { - /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ - - /* Load source value in Neon Buffer */ - pSrcAV = vld1q_f64(pSrcA); - pSrcBV = vld1q_f64(pSrcB); - /* Calculate dot product and store result in a temporary buffer. */ - sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV); - - pSrcA+=2; - pSrcB+=2; - /* Decrement loop counter */ - blkCnt--; - } - /* Sum both 64 bits part in the float64x2 */ - sum = vaddvq_f64(sumV); - - - /* Tail */ - blkCnt = blockSize & 1 ; - - while(blkCnt > 0U) - { - sum += (*pSrcA++) * (*pSrcB++); - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result in destination buffer */ - *result = sum; - } + const float64_t * pSrcA, + const float64_t * pSrcB, + uint32_t blockSize, + float64_t * result) +{ + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary return variable */ + float64x2_t sumV ; /* Neon buffer for sum variable */ + + /* Neon Buffer Initialisation */ + sumV = vsetq_lane_f64(0.0f, sumV, 0); + sumV = vsetq_lane_f64(0.0f, sumV, 1); + + /* Neon Buffer for sources */ + float64x2_t pSrcAV; + float64x2_t pSrcBV; + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ + + /* Load source value in Neon Buffer */ + pSrcAV = vld1q_f64(pSrcA); + pSrcBV = vld1q_f64(pSrcB); + /* Calculate dot product and store result in a temporary buffer. */ + sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV); + + pSrcA+=2; + pSrcB+=2; + /* Decrement loop counter */ + blkCnt--; + } + /* Sum both 64 bits part in the float64x2 */ + sum = vaddvq_f64(sumV); + + + /* Tail */ + blkCnt = blockSize & 1 ; + + while(blkCnt > 0U) + { + sum += (*pSrcA++) * (*pSrcB++); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result in destination buffer */ + *result = sum; +} #else void arm_dot_prod_f64( - const float64_t * pSrcA, - const float64_t * pSrcB, - uint32_t blockSize, - float64_t * result) + const float64_t * pSrcA, + const float64_t * pSrcB, + uint32_t blockSize, + float64_t * result) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary return variable */ - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ - - /* Calculate dot product and store result in a temporary buffer. */ - sum += (*pSrcA++) * (*pSrcB++); - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result in destination buffer */ - *result = sum; + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary return variable */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ + + /* Calculate dot product and store result in a temporary buffer. */ + sum += (*pSrcA++) * (*pSrcB++); + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result in destination buffer */ + *result = sum; } #endif diff --git a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c index 63090948..76bd3a63 100644 --- a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c +++ b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c @@ -48,49 +48,49 @@ */ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize) { - - float64_t diff=0., maxVal,tmpA, tmpB; - uint32_t blkCnt; - maxVal = F64_MIN; + + float64_t diff=0., maxVal,tmpA, tmpB; + uint32_t blkCnt; + maxVal = F64_MIN; #if defined(ARM_MATH_NEON) - float64x2_t diffV , tmpAV , tmpBV , maxValV ; - maxValV = vdupq_n_f64(maxVal); - blkCnt = blockSize >> 1U ; - while(blkCnt > 0U) - { - tmpAV = vld1q_f64(pA); - tmpBV = vld1q_f64(pB); - diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV))); - maxValV = vmaxq_f64(maxValV, diffV); - pA+=2; - pB+=2; - blkCnt--; - } - maxVal =vgetq_lane_f64(maxValV, 0); - if(maxVal < vgetq_lane_f64(maxValV, 1)) - { - maxVal = vgetq_lane_f64(maxValV, 1); - } - blkCnt = blockSize & 1; - - + float64x2_t diffV , tmpAV , tmpBV , maxValV ; + maxValV = vdupq_n_f64(maxVal); + blkCnt = blockSize >> 1U ; + while(blkCnt > 0U) + { + tmpAV = vld1q_f64(pA); + tmpBV = vld1q_f64(pB); + diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV))); + maxValV = vmaxq_f64(maxValV, diffV); + pA+=2; + pB+=2; + blkCnt--; + } + maxVal =vgetq_lane_f64(maxValV, 0); + if(maxVal < vgetq_lane_f64(maxValV, 1)) + { + maxVal = vgetq_lane_f64(maxValV, 1); + } + blkCnt = blockSize & 1; + + #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - - while(blkCnt > 0) - { - tmpA = *pA++; - tmpB = *pB++; - diff = fabs(tmpA - tmpB); - if (diff > maxVal) - { - maxVal = diff; - } - blkCnt --; - } - - return(maxVal); + + while(blkCnt > 0) + { + tmpA = *pA++; + tmpB = *pB++; + diff = fabs(tmpA - tmpB); + if (diff > maxVal) + { + maxVal = diff; + } + blkCnt --; + } + + return(maxVal); } /** diff --git a/Source/DistanceFunctions/arm_cityblock_distance_f64.c b/Source/DistanceFunctions/arm_cityblock_distance_f64.c index 2f63ffae..9edb7041 100644 --- a/Source/DistanceFunctions/arm_cityblock_distance_f64.c +++ b/Source/DistanceFunctions/arm_cityblock_distance_f64.c @@ -47,41 +47,41 @@ */ float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize) { - float64_t accum,tmpA, tmpB; - uint32_t blkCnt; - accum = 0.; - + float64_t accum,tmpA, tmpB; + uint32_t blkCnt; + accum = 0.; + #if defined(ARM_MATH_NEON) - float64x2_t tmpAV, tmpBV,accumV , subV; - accumV = vdupq_n_f64(0.0f); - blkCnt = blockSize >> 1U; - - while(blkCnt > 0U) - { - tmpAV = vld1q_f64(pA); - tmpBV = vld1q_f64(pB); - subV = vabdq_f64(tmpAV, tmpBV); - accumV = vaddq_f64(accumV, subV); - pA+=2; - pB+=2; - blkCnt--; - } - accum = vaddvq_f64(accumV); - blkCnt = blockSize & 1 ; - + float64x2_t tmpAV, tmpBV,accumV , subV; + accumV = vdupq_n_f64(0.0f); + blkCnt = blockSize >> 1U; + + while(blkCnt > 0U) + { + tmpAV = vld1q_f64(pA); + tmpBV = vld1q_f64(pB); + subV = vabdq_f64(tmpAV, tmpBV); + accumV = vaddq_f64(accumV, subV); + pA+=2; + pB+=2; + blkCnt--; + } + accum = vaddvq_f64(accumV); + blkCnt = blockSize & 1 ; + #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - while(blkCnt > 0) - { - tmpA = *pA++; - tmpB = *pB++; - accum += fabs(tmpA - tmpB); - - blkCnt--; - } - - return(accum); + while(blkCnt > 0) + { + tmpA = *pA++; + tmpB = *pB++; + accum += fabs(tmpA - tmpB); + + blkCnt--; + } + + return(accum); } /** diff --git a/Source/DistanceFunctions/arm_euclidean_distance_f64.c b/Source/DistanceFunctions/arm_euclidean_distance_f64.c index 794bf93d..1b00c9bf 100644 --- a/Source/DistanceFunctions/arm_euclidean_distance_f64.c +++ b/Source/DistanceFunctions/arm_euclidean_distance_f64.c @@ -49,35 +49,35 @@ */ float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize) { - float64_t accum=0.,tmp; - uint32_t blkCnt; + float64_t accum=0.,tmp; + uint32_t blkCnt; #if defined(ARM_MATH_NEON) - float64x2_t accumV,tmpV , pAV ,pBV; - accumV = vdupq_n_f64(0.0f); - blkCnt = blockSize >> 1U; - while(blkCnt > 0U) - { - pAV = vld1q_f64(pA); - pBV = vld1q_f64(pB); - tmpV = vsubq_f64(pAV, pBV); - accumV = vmlaq_f64(accumV, tmpV, tmpV); - pA+=2; - pB+=2; - blkCnt--; - } - accum = vaddvq_f64(accumV); - blkCnt = blockSize & 1; + float64x2_t accumV,tmpV , pAV ,pBV; + accumV = vdupq_n_f64(0.0f); + blkCnt = blockSize >> 1U; + while(blkCnt > 0U) + { + pAV = vld1q_f64(pA); + pBV = vld1q_f64(pB); + tmpV = vsubq_f64(pAV, pBV); + accumV = vmlaq_f64(accumV, tmpV, tmpV); + pA+=2; + pB+=2; + blkCnt--; + } + accum = vaddvq_f64(accumV); + blkCnt = blockSize & 1; #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - while(blkCnt > 0) - { - tmp = *pA++ - *pB++; - accum += SQ(tmp); - blkCnt --; - } - tmp = sqrt(accum); - return(tmp); + while(blkCnt > 0) + { + tmp = *pA++ - *pB++; + accum += SQ(tmp); + blkCnt --; + } + tmp = sqrt(accum); + return(tmp); } /** diff --git a/Source/FastMathFunctions/arm_vexp_f64.c b/Source/FastMathFunctions/arm_vexp_f64.c index e6c8f359..d275e1d9 100644 --- a/Source/FastMathFunctions/arm_vexp_f64.c +++ b/Source/FastMathFunctions/arm_vexp_f64.c @@ -33,44 +33,44 @@ #endif void arm_vexp_f64( - const float64_t * pSrc, - float64_t * pDst, - uint32_t blockSize) + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) { - uint32_t blkCnt; + uint32_t blkCnt; #if defined(ARM_MATH_NEON) - - - float64x2_t src; - float64x2_t dst; - - blkCnt = blockSize >> 1U; - - while (blkCnt > 0U) - { - src = vld1q_f64(pSrc); - dst = vexpq_f64(src); - vst1q_f64(pDst, dst); - - pSrc += 2; - pDst += 2; - - blkCnt--; - } - - blkCnt = blockSize & 1; + + + float64x2_t src; + float64x2_t dst; + + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + src = vld1q_f64(pSrc); + dst = vexpq_f64(src); + vst1q_f64(pDst, dst); + + pSrc += 2; + pDst += 2; + + blkCnt--; + } + + blkCnt = blockSize & 1; #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - while (blkCnt > 0U) - { - /* C = log(A) */ - - - /* Calculate log and store result in destination buffer. */ - *pDst++ = exp(*pSrc++); - - /* Decrement loop counter */ - blkCnt--; - } + while (blkCnt > 0U) + { + /* C = log(A) */ + + + /* Calculate log and store result in destination buffer. */ + *pDst++ = exp(*pSrc++); + + /* Decrement loop counter */ + blkCnt--; + } } diff --git a/Source/FastMathFunctions/arm_vlog_f64.c b/Source/FastMathFunctions/arm_vlog_f64.c index b9ab25ec..008c22dc 100644 --- a/Source/FastMathFunctions/arm_vlog_f64.c +++ b/Source/FastMathFunctions/arm_vlog_f64.c @@ -34,43 +34,43 @@ #endif void arm_vlog_f64( - const float64_t * pSrc, - float64_t * pDst, - uint32_t blockSize) + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) { - uint32_t blkCnt; + uint32_t blkCnt; #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) - float64x2_t src; - float64x2_t dst; - - blkCnt = blockSize >> 1U; - - while (blkCnt > 0U) - { - src = vld1q_f64(pSrc); - dst = vlogq_f64(src); - vst1q_f64(pDst, dst); - - pSrc += 2; - pDst += 2; - /* Decrement loop counter */ - blkCnt--; - } - - blkCnt = blockSize & 1; + float64x2_t src; + float64x2_t dst; + + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + src = vld1q_f64(pSrc); + dst = vlogq_f64(src); + vst1q_f64(pDst, dst); + + pSrc += 2; + pDst += 2; + /* Decrement loop counter */ + blkCnt--; + } + + blkCnt = blockSize & 1; #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - - - while (blkCnt > 0U) - { - /* C = log(A) */ - - /* Calculate log and store result in destination buffer. */ - *pDst++ = log(*pSrc++); - - /* Decrement loop counter */ - blkCnt--; - } + + + while (blkCnt > 0U) + { + /* C = log(A) */ + + /* Calculate log and store result in destination buffer. */ + *pDst++ = log(*pSrc++); + + /* Decrement loop counter */ + blkCnt--; + } } diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c index e936c3fd..99fb2b41 100644 --- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c @@ -29,574 +29,574 @@ #include "dsp/filtering_functions.h" /** - @ingroup groupFilters -*/ + @ingroup groupFilters + */ /** - @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure - - This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure. - The filters are implemented as a cascade of second order Biquad sections. - These functions provide a slight memory savings as compared to the direct form I Biquad filter functions. - Only floating-point data is supported. - - This function operate on blocks of input and output data and each call to the function - processes blockSize samples through the filter. - pSrc points to the array of input data and - pDst points to the array of output data. - Both arrays contain blockSize values. - - @par Algorithm - Each Biquad stage implements a second order filter using the difference equation: -
-     y[n] = b0 * x[n] + d1
-     d1 = b1 * x[n] + a1 * y[n] + d2
-     d2 = b2 * x[n] + a2 * y[n]
-  
- where d1 and d2 represent the two state values. - @par - A Biquad filter using a transposed Direct Form II structure is shown below. - \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" - Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. - Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. - Pay careful attention to the sign of the feedback coefficients. - Some design tools flip the sign of the feedback coefficients: -
-     y[n] = b0 * x[n] + d1;
-     d1 = b1 * x[n] - a1 * y[n] + d2;
-     d2 = b2 * x[n] - a2 * y[n];
-  
- In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. - @par - Higher order filters are realized as a cascade of second order sections. - numStages refers to the number of second order stages used. - For example, an 8th order filter would be realized with numStages=4 second order stages. - A 9th order filter would be realized with numStages=5 second order stages with the - coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). - @par - pState points to the state variable array. - Each Biquad stage has 2 state variables d1 and d2. - The state variables are arranged in the pState array as: -
-      {d11, d12, d21, d22, ...}
-  
- where d1x refers to the state variables for the first Biquad and - d2x refers to the state variables for the second Biquad. - The state array has a total length of 2*numStages values. - The state variables are updated after each block of data is processed; the coefficients are untouched. - @par - The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II. - The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types. - That is why the Direct Form I structure supports Q15 and Q31 data types. - The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2. - Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad. - The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage. - - @par Instance Structure - The coefficients and state variables for a filter are stored together in an instance data structure. - A separate instance structure must be defined for each filter. - Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. - - @par Init Functions - There is also an associated initialization function. - The initialization function performs following operations: - - Sets the values of the internal structure fields. - - Zeros out the values in the state buffer. - To do this manually without calling the init function, assign the follow subfields of the instance structure: - numStages, pCoeffs, pState. Also set all of the values in pState to zero. - @par - Use of the initialization function is optional. - However, if the initialization function is used, then the instance structure cannot be placed into a const data section. - To place an instance structure into a const data section, the instance structure must be manually initialized. - Set the values in the state buffer to zeros before static initialization. - For example, to statically initialize the instance structure use -
-      arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
-      arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
-  
- where numStages is the number of Biquad stages in the filter; - pState is the address of the state buffer. - pCoeffs is the address of the coefficient buffer; -*/ + @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure + + This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure. + The filters are implemented as a cascade of second order Biquad sections. + These functions provide a slight memory savings as compared to the direct form I Biquad filter functions. + Only floating-point data is supported. + + This function operate on blocks of input and output data and each call to the function + processes blockSize samples through the filter. + pSrc points to the array of input data and + pDst points to the array of output data. + Both arrays contain blockSize values. + + @par Algorithm + Each Biquad stage implements a second order filter using the difference equation: +
+ y[n] = b0 * x[n] + d1
+ d1 = b1 * x[n] + a1 * y[n] + d2
+ d2 = b2 * x[n] + a2 * y[n]
+ 
+ where d1 and d2 represent the two state values. + @par + A Biquad filter using a transposed Direct Form II structure is shown below. + \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" + Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients. + Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients. + Pay careful attention to the sign of the feedback coefficients. + Some design tools flip the sign of the feedback coefficients: +
+ y[n] = b0 * x[n] + d1;
+ d1 = b1 * x[n] - a1 * y[n] + d2;
+ d2 = b2 * x[n] - a2 * y[n];
+ 
+ In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library. + @par + Higher order filters are realized as a cascade of second order sections. + numStages refers to the number of second order stages used. + For example, an 8th order filter would be realized with numStages=4 second order stages. + A 9th order filter would be realized with numStages=5 second order stages with the + coefficients for one of the stages configured as a first order filter (b2=0 and a2=0). + @par + pState points to the state variable array. + Each Biquad stage has 2 state variables d1 and d2. + The state variables are arranged in the pState array as: +
+ {d11, d12, d21, d22, ...}
+ 
+ where d1x refers to the state variables for the first Biquad and + d2x refers to the state variables for the second Biquad. + The state array has a total length of 2*numStages values. + The state variables are updated after each block of data is processed; the coefficients are untouched. + @par + The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II. + The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types. + That is why the Direct Form I structure supports Q15 and Q31 data types. + The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2. + Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad. + The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage. + + @par Instance Structure + The coefficients and state variables for a filter are stored together in an instance data structure. + A separate instance structure must be defined for each filter. + Coefficient arrays may be shared among several instances while state variable arrays cannot be shared. + + @par Init Functions + There is also an associated initialization function. + The initialization function performs following operations: + - Sets the values of the internal structure fields. + - Zeros out the values in the state buffer. + To do this manually without calling the init function, assign the follow subfields of the instance structure: + numStages, pCoeffs, pState. Also set all of the values in pState to zero. + @par + Use of the initialization function is optional. + However, if the initialization function is used, then the instance structure cannot be placed into a const data section. + To place an instance structure into a const data section, the instance structure must be manually initialized. + Set the values in the state buffer to zeros before static initialization. + For example, to statically initialize the instance structure use +
+ arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
+ arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
+ 
+ where numStages is the number of Biquad stages in the filter; + pState is the address of the state buffer. + pCoeffs is the address of the coefficient buffer; + */ /** - @addtogroup BiquadCascadeDF2T - @{ + @addtogroup BiquadCascadeDF2T + @{ */ /** - @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. - @param[in] S points to an instance of the filter data structure - @param[in] pSrc points to the block of input data - @param[out] pDst points to the block of output data - @param[in] blockSize number of samples to process - @return none + @brief Processing function for the floating-point transposed direct form II Biquad cascade filter. + @param[in] S points to an instance of the filter data structure + @param[in] pSrc points to the block of input data + @param[out] pDst points to the block of output data + @param[in] blockSize number of samples to process + @return none */ #if defined(ARM_MATH_NEON) void arm_biquad_cascade_df2T_f64( - const arm_biquad_cascade_df2T_instance_f64 * S, - const float64_t * pSrc, - float64_t * pDst, - uint32_t blockSize) + const arm_biquad_cascade_df2T_instance_f64 * S, + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) { - - - - const float64_t *pIn = pSrc; /* source pointer */ - float64_t Xn0, Xn1; - float64_t acc0, acc1; - float64_t *pOut = pDst; /* destination pointer */ - float64_t *pState = S->pState; /* State pointer */ - uint32_t sample, stage = S->numStages; /* loop counters */ - float64_t const *pCurCoeffs = /* coefficient pointer */ - (float64_t const *) S->pCoeffs; - float64x2_t b0Coeffs, a0Coeffs; /* Coefficients vector */ - float64x2_t state; /* State vector*/ - float64x2_t zeroV = vdupq_n_f64(0); - - float64_t b0 ; - - - do - { - - /* Reading the coefficients */ - b0 = *pCurCoeffs++ ; - b0Coeffs = vld1q_f64(pCurCoeffs); - pCurCoeffs += 2 ; - a0Coeffs = vld1q_f64(pCurCoeffs); - pCurCoeffs +=2 ; - - state = vld1q_f64(pState); - - sample = blockSize >> 1U; - while (sample > 0U) - { - - /* y[n] = b0 * x[n] + d1 */ - /* d1 = b1 * x[n] + a1 * y[n] + d2 */ - /* d2 = b2 * x[n] + a2 * y[n] */ - - Xn0 = *pIn++ ; - - /* Calculation of acc0*/ - acc0 = b0*Xn0+vgetq_lane_f64(state, 0); - - - /* - * final initial - * state b0Coeffs state a0Coeffs - * | | | | - * __ __ __ __ - * / \ / \ / \ / \ - * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0 - * | d2 | | b2 | | 0 | | a2 | - * \__/ \__/ \__/ \__/ - */ - - /* state -> initial state (see above) */ - - state = vextq_f64(state, zeroV, 1); - - /* Calculation of final state */ - state = vfmaq_n_f64(state, b0Coeffs, Xn0); - state = vfmaq_n_f64(state, a0Coeffs, acc0); - - *pOut++ = acc0 ; - - /* y[n] = b0 * x[n] + d1 */ - /* d1 = b1 * x[n] + a1 * y[n] + d2 */ - /* d2 = b2 * x[n] + a2 * y[n] */ - - Xn0 = *pIn++ ; - - /* Calculation of acc0*/ - acc0 = b0*Xn0+vgetq_lane_f64(state, 0); - - - /* - * final initial - * state b0Coeffs state a0Coeffs - * | | | | - * __ __ __ __ - * / \ / \ / \ / \ - * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0 - * | d2 | | b2 | | 0 | | a2 | - * \__/ \__/ \__/ \__/ - */ - - /* state -> initial state (see above) */ - - state = vextq_f64(state, zeroV, 1); - - /* Calculation of final state */ - state = vfmaq_n_f64(state, b0Coeffs, Xn0); - state = vfmaq_n_f64(state, a0Coeffs, acc0); - - *pOut++ = acc0; - sample--; - } - sample = blockSize & 1 ; - while (sample > 0U) - { - - /* y[n] = b0 * x[n] + d1 */ - /* d1 = b1 * x[n] + a1 * y[n] + d2 */ - /* d2 = b2 * x[n] + a2 * y[n] */ - - Xn0 = *pIn++ ; - - /* Calculation of acc0*/ - acc0 = b0*Xn0+vgetq_lane_f64(state, 0); - - - /* - * final initial - * state b0Coeffs state a0Coeffs - * | | | | - * __ __ __ __ - * / \ / \ / \ / \ - * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0 - * | d2 | | b2 | | 0 | | a2 | - * \__/ \__/ \__/ \__/ - */ - - /* state -> initial state (see above) */ - - state = vextq_f64(state, zeroV, 1); - - /* Calculation of final state */ - state = vfmaq_n_f64(state, b0Coeffs, Xn0); - state = vfmaq_n_f64(state, a0Coeffs, acc0); - - *pOut++ = acc0 ; - sample--; - } - /* Store the updated state variables back into the state array */ - pState[0] = vgetq_lane_f64(state, 0); - pState[1] = vgetq_lane_f64(state, 1); - - pState += 2U; - - /* The current stage output is given as the input to the next stage */ - pIn = pDst; - - /* Reset the output working pointer */ - pOut = pDst; - - stage--; - - } while (stage > 0U); - + + + + const float64_t *pIn = pSrc; /* source pointer */ + float64_t Xn0, Xn1; + float64_t acc0, acc1; + float64_t *pOut = pDst; /* destination pointer */ + float64_t *pState = S->pState; /* State pointer */ + uint32_t sample, stage = S->numStages; /* loop counters */ + float64_t const *pCurCoeffs = /* coefficient pointer */ + (float64_t const *) S->pCoeffs; + float64x2_t b0Coeffs, a0Coeffs; /* Coefficients vector */ + float64x2_t state; /* State vector*/ + float64x2_t zeroV = vdupq_n_f64(0); + + float64_t b0 ; + + + do + { + + /* Reading the coefficients */ + b0 = *pCurCoeffs++ ; + b0Coeffs = vld1q_f64(pCurCoeffs); + pCurCoeffs += 2 ; + a0Coeffs = vld1q_f64(pCurCoeffs); + pCurCoeffs +=2 ; + + state = vld1q_f64(pState); + + sample = blockSize >> 1U; + while (sample > 0U) + { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + Xn0 = *pIn++ ; + + /* Calculation of acc0*/ + acc0 = b0*Xn0+vgetq_lane_f64(state, 0); + + + /* + * final initial + * state b0Coeffs state a0Coeffs + * | | | | + * __ __ __ __ + * / \ / \ / \ / \ + * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0 + * | d2 | | b2 | | 0 | | a2 | + * \__/ \__/ \__/ \__/ + */ + + /* state -> initial state (see above) */ + + state = vextq_f64(state, zeroV, 1); + + /* Calculation of final state */ + state = vfmaq_n_f64(state, b0Coeffs, Xn0); + state = vfmaq_n_f64(state, a0Coeffs, acc0); + + *pOut++ = acc0 ; + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + Xn0 = *pIn++ ; + + /* Calculation of acc0*/ + acc0 = b0*Xn0+vgetq_lane_f64(state, 0); + + + /* + * final initial + * state b0Coeffs state a0Coeffs + * | | | | + * __ __ __ __ + * / \ / \ / \ / \ + * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0 + * | d2 | | b2 | | 0 | | a2 | + * \__/ \__/ \__/ \__/ + */ + + /* state -> initial state (see above) */ + + state = vextq_f64(state, zeroV, 1); + + /* Calculation of final state */ + state = vfmaq_n_f64(state, b0Coeffs, Xn0); + state = vfmaq_n_f64(state, a0Coeffs, acc0); + + *pOut++ = acc0; + sample--; + } + sample = blockSize & 1 ; + while (sample > 0U) + { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + Xn0 = *pIn++ ; + + /* Calculation of acc0*/ + acc0 = b0*Xn0+vgetq_lane_f64(state, 0); + + + /* + * final initial + * state b0Coeffs state a0Coeffs + * | | | | + * __ __ __ __ + * / \ / \ / \ / \ + * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0 + * | d2 | | b2 | | 0 | | a2 | + * \__/ \__/ \__/ \__/ + */ + + /* state -> initial state (see above) */ + + state = vextq_f64(state, zeroV, 1); + + /* Calculation of final state */ + state = vfmaq_n_f64(state, b0Coeffs, Xn0); + state = vfmaq_n_f64(state, a0Coeffs, acc0); + + *pOut++ = acc0 ; + sample--; + } + /* Store the updated state variables back into the state array */ + pState[0] = vgetq_lane_f64(state, 0); + pState[1] = vgetq_lane_f64(state, 1); + + pState += 2U; + + /* The current stage output is given as the input to the next stage */ + pIn = pDst; + + /* Reset the output working pointer */ + pOut = pDst; + + stage--; + + } while (stage > 0U); + } #else void arm_biquad_cascade_df2T_f64( - const arm_biquad_cascade_df2T_instance_f64 * S, - const float64_t * pSrc, - float64_t * pDst, - uint32_t blockSize) + const arm_biquad_cascade_df2T_instance_f64 * S, + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) { - - const float64_t *pIn = pSrc; /* Source pointer */ - float64_t *pOut = pDst; /* Destination pointer */ - float64_t *pState = S->pState; /* State pointer */ - const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - float64_t acc1; /* Accumulator */ - float64_t b0, b1, b2, a1, a2; /* Filter coefficients */ - float64_t Xn1; /* Temporary input */ - float64_t d1, d2; /* State variables */ - uint32_t sample, stage = S->numStages; /* Loop counters */ - - - do - { - /* Reading the coefficients */ - b0 = pCoeffs[0]; - b1 = pCoeffs[1]; - b2 = pCoeffs[2]; - a1 = pCoeffs[3]; - a2 = pCoeffs[4]; - - /* Reading the state values */ - d1 = pState[0]; - d2 = pState[1]; - - pCoeffs += 5U; - + + const float64_t *pIn = pSrc; /* Source pointer */ + float64_t *pOut = pDst; /* Destination pointer */ + float64_t *pState = S->pState; /* State pointer */ + const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float64_t acc1; /* Accumulator */ + float64_t b0, b1, b2, a1, a2; /* Filter coefficients */ + float64_t Xn1; /* Temporary input */ + float64_t d1, d2; /* State variables */ + uint32_t sample, stage = S->numStages; /* Loop counters */ + + + do + { + /* Reading the coefficients */ + b0 = pCoeffs[0]; + b1 = pCoeffs[1]; + b2 = pCoeffs[2]; + a1 = pCoeffs[3]; + a2 = pCoeffs[4]; + + /* Reading the state values */ + d1 = pState[0]; + d2 = pState[1]; + + pCoeffs += 5U; + #if defined (ARM_MATH_LOOPUNROLL) - - /* Loop unrolling: Compute 16 outputs at a time */ - sample = blockSize >> 4U; - - while (sample > 0U) { - - /* y[n] = b0 * x[n] + d1 */ - /* d1 = b1 * x[n] + a1 * y[n] + d2 */ - /* d2 = b2 * x[n] + a2 * y[n] */ - -/* 1 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - - -/* 2 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 3 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 4 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 5 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 6 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 7 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 8 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 9 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 10 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 11 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 12 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 13 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 14 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 15 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - -/* 16 */ - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - - /* decrement loop counter */ - sample--; - } - - /* Loop unrolling: Compute remaining outputs */ - sample = blockSize & 0xFU; - + + /* Loop unrolling: Compute 16 outputs at a time */ + sample = blockSize >> 4U; + + while (sample > 0U) { + + /* y[n] = b0 * x[n] + d1 */ + /* d1 = b1 * x[n] + a1 * y[n] + d2 */ + /* d2 = b2 * x[n] + a2 * y[n] */ + + /* 1 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + + /* 2 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 3 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 4 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 5 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 6 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 7 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 8 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 9 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 10 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 11 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 12 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 13 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 14 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 15 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* 16 */ + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* decrement loop counter */ + sample--; + } + + /* Loop unrolling: Compute remaining outputs */ + sample = blockSize & 0xFU; + #else - - /* Initialize blkCnt with number of samples */ - sample = blockSize; - + + /* Initialize blkCnt with number of samples */ + sample = blockSize; + #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ - - while (sample > 0U) { - Xn1 = *pIn++; - - acc1 = b0 * Xn1 + d1; - - d1 = b1 * Xn1 + d2; - d1 += a1 * acc1; - - d2 = b2 * Xn1; - d2 += a2 * acc1; - - *pOut++ = acc1; - - /* decrement loop counter */ - sample--; - } - - /* Store the updated state variables back into the state array */ - pState[0] = d1; - pState[1] = d2; - - pState += 2U; - - /* The current stage output is given as the input to the next stage */ - pIn = pDst; - - /* Reset the output working pointer */ - pOut = pDst; - - /* decrement loop counter */ - stage--; - - } while (stage > 0U); - + + while (sample > 0U) { + Xn1 = *pIn++; + + acc1 = b0 * Xn1 + d1; + + d1 = b1 * Xn1 + d2; + d1 += a1 * acc1; + + d2 = b2 * Xn1; + d2 += a2 * acc1; + + *pOut++ = acc1; + + /* decrement loop counter */ + sample--; + } + + /* Store the updated state variables back into the state array */ + pState[0] = d1; + pState[1] = d2; + + pState += 2U; + + /* The current stage output is given as the input to the next stage */ + pIn = pDst; + + /* Reset the output working pointer */ + pOut = pDst; + + /* decrement loop counter */ + stage--; + + } while (stage > 0U); + } #endif /** - @} end of BiquadCascadeDF2T group + @} end of BiquadCascadeDF2T group */ diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c index 5ab1c9a2..90e83df7 100644 --- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c @@ -63,22 +63,22 @@ */ void arm_biquad_cascade_df2T_init_f64( - arm_biquad_cascade_df2T_instance_f64 * S, - uint8_t numStages, - const float64_t * pCoeffs, - float64_t * pState) + arm_biquad_cascade_df2T_instance_f64 * S, + uint8_t numStages, + const float64_t * pCoeffs, + float64_t * pState) { - /* Assign filter stages */ - S->numStages = numStages; - - /* Assign coefficient pointer */ - S->pCoeffs = pCoeffs; - - /* Clear state buffer and size is always 2 * numStages */ - memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t)); - - /* Assign state pointer */ - S->pState = pState; + /* Assign filter stages */ + S->numStages = numStages; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer and size is always 2 * numStages */ + memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t)); + + /* Assign state pointer */ + S->pState = pState; } /** diff --git a/Source/FilteringFunctions/arm_correlate_f64.c b/Source/FilteringFunctions/arm_correlate_f64.c index 8d52bcf6..1d9c4bd7 100644 --- a/Source/FilteringFunctions/arm_correlate_f64.c +++ b/Source/FilteringFunctions/arm_correlate_f64.c @@ -48,384 +48,384 @@ */ void arm_correlate_f64( - const float64_t * pSrcA, - uint32_t srcALen, - const float64_t * pSrcB, - uint32_t srcBLen, - float64_t * pDst) + const float64_t * pSrcA, + uint32_t srcALen, + const float64_t * pSrcB, + uint32_t srcBLen, + float64_t * pDst) { - const float64_t *pIn1; /* InputA pointer */ - const float64_t *pIn2; /* InputB pointer */ - float64_t *pOut = pDst; /* Output pointer */ - const float64_t *px; /* Intermediate inputA pointer */ - const float64_t *py; /* Intermediate inputB pointer */ - const float64_t *pSrc1; - float64_t sum; - uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ - uint32_t j, k, count, blkCnt; /* Loop counters */ - uint32_t outBlockSize; /* Loop counter */ - int32_t inc = 1; /* Destination address modifier */ + const float64_t *pIn1; /* InputA pointer */ + const float64_t *pIn2; /* InputB pointer */ + float64_t *pOut = pDst; /* Output pointer */ + const float64_t *px; /* Intermediate inputA pointer */ + const float64_t *py; /* Intermediate inputB pointer */ + const float64_t *pSrc1; + float64_t sum; + uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ + uint32_t j, k, count, blkCnt; /* Loop counters */ + uint32_t outBlockSize; /* Loop counter */ + int32_t inc = 1; /* Destination address modifier */ #if defined(ARM_MATH_NEON) - float64x2_t sumV,pxV,pyV ; + float64x2_t sumV,pxV,pyV ; #endif - - /* The algorithm implementation is based on the lengths of the inputs. */ - /* srcB is always made to slide across srcA. */ - /* So srcBLen is always considered as shorter or equal to srcALen */ - /* But CORR(x, y) is reverse of CORR(y, x) */ - /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ - /* and the destination pointer modifier, inc is set to -1 */ - /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ - /* But to improve the performance, - * we assume zeroes in the output instead of zero padding either of the the inputs*/ - /* If srcALen > srcBLen, - * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ - /* If srcALen < srcBLen, - * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ - if (srcALen >= srcBLen) - { - /* Initialization of inputA pointer */ - pIn1 = pSrcA; - - /* Initialization of inputB pointer */ - pIn2 = pSrcB; - - /* Number of output samples is calculated */ - outBlockSize = (2U * srcALen) - 1U; - - /* When srcALen > srcBLen, zero padding has to be done to srcB - * to make their lengths equal. - * Instead, (outBlockSize - (srcALen + srcBLen - 1)) - * number of output samples are made zero */ - j = outBlockSize - (srcALen + (srcBLen - 1U)); - - /* Updating the pointer position to non zero value */ - pOut += j; - } - else - { - /* Initialization of inputA pointer */ - pIn1 = pSrcB; - - /* Initialization of inputB pointer */ - pIn2 = pSrcA; - - /* srcBLen is always considered as shorter or equal to srcALen */ - j = srcBLen; - srcBLen = srcALen; - srcALen = j; - - /* CORR(x, y) = Reverse order(CORR(y, x)) */ - /* Hence set the destination pointer to point to the last output sample */ - pOut = pDst + ((srcALen + srcBLen) - 2U); - - /* Destination address modifier is set to -1 */ - inc = -1; - } - - /* The function is internally - * divided into three stages according to the number of multiplications that has to be - * taken place between inputA samples and inputB samples. In the first stage of the - * algorithm, the multiplications increase by one for every iteration. - * In the second stage of the algorithm, srcBLen number of multiplications are done. - * In the third stage of the algorithm, the multiplications decrease by one - * for every iteration. */ - - /* The algorithm is implemented in three stages. - The loop counters of each stage is initiated here. */ - blockSize1 = srcBLen - 1U; - blockSize2 = srcALen - (srcBLen - 1U); - blockSize3 = blockSize1; - - /* -------------------------- - * Initializations of stage1 - * -------------------------*/ - - /* sum = x[0] * y[srcBlen - 1] - * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] - * .... - * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] - */ - - /* In this stage the MAC operations are increased by 1 for every iteration. - The count variable holds the number of MAC operations performed */ - count = 1U; - - /* Working pointer of inputA */ - px = pIn1; - - /* Working pointer of inputB */ - pSrc1 = pIn2 + (srcBLen - 1U); - py = pSrc1; - - /* ------------------------ - * Stage1 process - * ----------------------*/ - - /* The first stage starts here */ - while (blockSize1 > 0U) - { - /* Accumulator is made zero for every iteration */ - sum = 0.; + + /* The algorithm implementation is based on the lengths of the inputs. */ + /* srcB is always made to slide across srcA. */ + /* So srcBLen is always considered as shorter or equal to srcALen */ + /* But CORR(x, y) is reverse of CORR(y, x) */ + /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ + /* and the destination pointer modifier, inc is set to -1 */ + /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ + /* But to improve the performance, + * we assume zeroes in the output instead of zero padding either of the the inputs*/ + /* If srcALen > srcBLen, + * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ + /* If srcALen < srcBLen, + * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ + if (srcALen >= srcBLen) + { + /* Initialization of inputA pointer */ + pIn1 = pSrcA; + + /* Initialization of inputB pointer */ + pIn2 = pSrcB; + + /* Number of output samples is calculated */ + outBlockSize = (2U * srcALen) - 1U; + + /* When srcALen > srcBLen, zero padding has to be done to srcB + * to make their lengths equal. + * Instead, (outBlockSize - (srcALen + srcBLen - 1)) + * number of output samples are made zero */ + j = outBlockSize - (srcALen + (srcBLen - 1U)); + + /* Updating the pointer position to non zero value */ + pOut += j; + } + else + { + /* Initialization of inputA pointer */ + pIn1 = pSrcB; + + /* Initialization of inputB pointer */ + pIn2 = pSrcA; + + /* srcBLen is always considered as shorter or equal to srcALen */ + j = srcBLen; + srcBLen = srcALen; + srcALen = j; + + /* CORR(x, y) = Reverse order(CORR(y, x)) */ + /* Hence set the destination pointer to point to the last output sample */ + pOut = pDst + ((srcALen + srcBLen) - 2U); + + /* Destination address modifier is set to -1 */ + inc = -1; + } + + /* The function is internally + * divided into three stages according to the number of multiplications that has to be + * taken place between inputA samples and inputB samples. In the first stage of the + * algorithm, the multiplications increase by one for every iteration. + * In the second stage of the algorithm, srcBLen number of multiplications are done. + * In the third stage of the algorithm, the multiplications decrease by one + * for every iteration. */ + + /* The algorithm is implemented in three stages. + The loop counters of each stage is initiated here. */ + blockSize1 = srcBLen - 1U; + blockSize2 = srcALen - (srcBLen - 1U); + blockSize3 = blockSize1; + + /* -------------------------- + * Initializations of stage1 + * -------------------------*/ + + /* sum = x[0] * y[srcBlen - 1] + * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] + * .... + * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] + */ + + /* In this stage the MAC operations are increased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = 1U; + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + pSrc1 = pIn2 + (srcBLen - 1U); + py = pSrc1; + + /* ------------------------ + * Stage1 process + * ----------------------*/ + + /* The first stage starts here */ + while (blockSize1 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.; #if defined(ARM_MATH_NEON) - sumV = vdupq_n_f64(0.0f); - k = count >> 1U ; - - while(k > 0U) - { - pxV = vld1q_f64(px); - pyV = vld1q_f64(py); - sumV = vmlaq_f64(sumV, pxV, pyV); - px+=2; - py+=2; - k--; - } - sum =vaddvq_f64(sumV); - k = count & 1 ; + sumV = vdupq_n_f64(0.0f); + k = count >> 1U ; + + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = count & 1 ; #else - /* Initialize k with number of samples */ - k = count; + /* Initialize k with number of samples */ + k = count; #endif - while (k > 0U) - { - /* Perform the multiply-accumulate */ - /* x[0] * y[srcBLen - 1] */ - sum += *px++ * *py++; - - /* Decrement loop counter */ - k--; - } - - /* Store the result in the accumulator in the destination buffer. */ - - *pOut = sum; - /* Destination pointer is updated according to the address modifier, inc */ - pOut += inc; - - /* Update the inputA and inputB pointers for next MAC calculation */ - py = pSrc1 - count; - px = pIn1; - - /* Increment MAC count */ - count++; - - /* Decrement loop counter */ - blockSize1--; - - } - - /* -------------------------- - * Initializations of stage2 - * ------------------------*/ - - /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] - * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] - * .... - * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] - */ - - /* Working pointer of inputA */ - px = pIn1; - - /* Working pointer of inputB */ - py = pIn2; - - /* count is index by which the pointer pIn1 to be incremented */ - count = 0U; - - /* ------------------- - * Stage2 process - * ------------------*/ - - /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. - * So, to loop unroll over blockSize2, - * srcBLen should be greater than or equal to 4 */ - if (srcBLen >= 4U) - { - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize2; - - while (blkCnt > 0U) - { - /* Accumulator is made zero for every iteration */ - sum = 0.; + while (k > 0U) + { + /* Perform the multiply-accumulate */ + /* x[0] * y[srcBLen - 1] */ + sum += *px++ * *py++; + + /* Decrement loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + py = pSrc1 - count; + px = pIn1; + + /* Increment MAC count */ + count++; + + /* Decrement loop counter */ + blockSize1--; + + } + + /* -------------------------- + * Initializations of stage2 + * ------------------------*/ + + /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] + * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] + * .... + * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + */ + + /* Working pointer of inputA */ + px = pIn1; + + /* Working pointer of inputB */ + py = pIn2; + + /* count is index by which the pointer pIn1 to be incremented */ + count = 0U; + + /* ------------------- + * Stage2 process + * ------------------*/ + + /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. + * So, to loop unroll over blockSize2, + * srcBLen should be greater than or equal to 4 */ + if (srcBLen >= 4U) + { + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.; #if defined(ARM_MATH_NEON) - sumV = vdupq_n_f64(0.0f); - k = srcBLen >> 1U ; - while(k > 0U) - { - pxV = vld1q_f64(px); - pyV = vld1q_f64(py); - sumV = vmlaq_f64(sumV, pxV, pyV); - px+=2; - py+=2; - k--; - } - sum =vaddvq_f64(sumV); - k = srcBLen & 1 ; - + sumV = vdupq_n_f64(0.0f); + k = srcBLen >> 1U ; + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = srcBLen & 1 ; + #else - - /* Initialize blkCnt with number of samples */ - k = srcBLen; + + /* Initialize blkCnt with number of samples */ + k = srcBLen; #endif - while (k > 0U) - { - /* Perform the multiply-accumulate */ - sum += *px++ * *py++; - - /* Decrement the loop counter */ - k--; - } - - /* Store the result in the accumulator in the destination buffer. */ - *pOut = sum; - - /* Destination pointer is updated according to the address modifier, inc */ - pOut += inc; - - /* Increment the pointer pIn1 index, count by 1 */ - count++; - - /* Update the inputA and inputB pointers for next MAC calculation */ - px = pIn1 + count; - py = pIn2; - - /* Decrement the loop counter */ - blkCnt--; - } - } - else - { - /* If the srcBLen is not a multiple of 4, - * the blockSize2 loop cannot be unrolled by 4 */ - blkCnt = blockSize2; - - while (blkCnt > 0U) - { - /* Accumulator is made zero for every iteration */ - sum = 0.; + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + else + { + /* If the srcBLen is not a multiple of 4, + * the blockSize2 loop cannot be unrolled by 4 */ + blkCnt = blockSize2; + + while (blkCnt > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.; #if defined(ARM_MATH_NEON) - sumV = vdupq_n_f64(0.0f); - k = srcBLen >> 1U ; - while(k > 0U) - { - pxV = vld1q_f64(px); - pyV = vld1q_f64(py); - sumV = vmlaq_f64(sumV, pxV, pyV); - px+=2; - py+=2; - k--; - } - sum =vaddvq_f64(sumV); - k = srcBLen & 1 ; - + sumV = vdupq_n_f64(0.0f); + k = srcBLen >> 1U ; + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = srcBLen & 1 ; + #else - - /* Loop over srcBLen */ - k = srcBLen; + + /* Loop over srcBLen */ + k = srcBLen; #endif - while (k > 0U) - { - /* Perform the multiply-accumulate */ - sum += *px++ * *py++; - - /* Decrement the loop counter */ - k--; - } - - /* Store the result in the accumulator in the destination buffer. */ - *pOut = sum; - /* Destination pointer is updated according to the address modifier, inc */ - pOut += inc; - - /* Increment the pointer pIn1 index, count by 1 */ - count++; - - /* Update the inputA and inputB pointers for next MAC calculation */ - px = pIn1 + count; - py = pIn2; - - /* Decrement the loop counter */ - blkCnt--; - } - } - - - /* -------------------------- - * Initializations of stage3 - * -------------------------*/ - - /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] - * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] - * .... - * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] - * sum += x[srcALen-1] * y[0] - */ - - /* In this stage the MAC operations are decreased by 1 for every iteration. - The count variable holds the number of MAC operations performed */ - count = srcBLen - 1U; - - /* Working pointer of inputA */ - pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); - px = pSrc1; - - /* Working pointer of inputB */ - py = pIn2; - - /* ------------------- - * Stage3 process - * ------------------*/ - - while (blockSize3 > 0U) - { - /* Accumulator is made zero for every iteration */ - sum = 0.; + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement the loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Increment the pointer pIn1 index, count by 1 */ + count++; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = pIn1 + count; + py = pIn2; + + /* Decrement the loop counter */ + blkCnt--; + } + } + + + /* -------------------------- + * Initializations of stage3 + * -------------------------*/ + + /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] + * .... + * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] + * sum += x[srcALen-1] * y[0] + */ + + /* In this stage the MAC operations are decreased by 1 for every iteration. + The count variable holds the number of MAC operations performed */ + count = srcBLen - 1U; + + /* Working pointer of inputA */ + pSrc1 = pIn1 + (srcALen - (srcBLen - 1U)); + px = pSrc1; + + /* Working pointer of inputB */ + py = pIn2; + + /* ------------------- + * Stage3 process + * ------------------*/ + + while (blockSize3 > 0U) + { + /* Accumulator is made zero for every iteration */ + sum = 0.; #if defined(ARM_MATH_NEON) - sumV = vdupq_n_f64(0.0f); - k = count >> 1U ; - - while(k > 0U) - { - pxV = vld1q_f64(px); - pyV = vld1q_f64(py); - sumV = vmlaq_f64(sumV, pxV, pyV); - px+=2; - py+=2; - k--; - } - sum =vaddvq_f64(sumV); - k = count & 1 ; + sumV = vdupq_n_f64(0.0f); + k = count >> 1U ; + + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = count & 1 ; #else - - /* Initialize blkCnt with number of samples */ - k = count; + + /* Initialize blkCnt with number of samples */ + k = count; #endif - while (k > 0U) - { - /* Perform the multiply-accumulate */ - sum += *px++ * *py++; - - /* Decrement loop counter */ - k--; - } - - /* Store the result in the accumulator in the destination buffer. */ - *pOut = sum; - /* Destination pointer is updated according to the address modifier, inc */ - pOut += inc; - - /* Update the inputA and inputB pointers for next MAC calculation */ - px = ++pSrc1; - py = pIn2; - - /* Decrement MAC count */ - count--; - - /* Decrement the loop counter */ - blockSize3--; - } + while (k > 0U) + { + /* Perform the multiply-accumulate */ + sum += *px++ * *py++; + + /* Decrement loop counter */ + k--; + } + + /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; + /* Destination pointer is updated according to the address modifier, inc */ + pOut += inc; + + /* Update the inputA and inputB pointers for next MAC calculation */ + px = ++pSrc1; + py = pIn2; + + /* Decrement MAC count */ + count--; + + /* Decrement the loop counter */ + blockSize3--; + } } /** diff --git a/Source/FilteringFunctions/arm_fir_f64.c b/Source/FilteringFunctions/arm_fir_f64.c index 51fc8ece..9d598081 100644 --- a/Source/FilteringFunctions/arm_fir_f64.c +++ b/Source/FilteringFunctions/arm_fir_f64.c @@ -47,177 +47,177 @@ */ #if defined(ARM_MATH_NEON) void arm_fir_f64( - const arm_fir_instance_f64 * S, - const float64_t * pSrc, - float64_t * pDst, - uint32_t blockSize) + const arm_fir_instance_f64 * S, + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) { - float64_t *pState = S->pState; /* State pointer */ - const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - float64_t *pStateCurnt; /* Points to the current sample of the state */ - float64_t *px; /* Temporary pointer for state buffer */ - const float64_t *pb; /* Temporary pointer for coefficient buffer */ - float64x2_t pxV; - float64x2_t pbV; - float64x2_t acc0V; - float64_t acc0; /* Accumulator */ - uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ - uint32_t i, tapCnt, blkCnt; /* Loop counters */ - - /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ - /* pStateCurnt points to the location where the new input data should be written */ - pStateCurnt = &(S->pState[(numTaps - 1U)]); - - /* Initialize blkCnt with number of taps */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Copy one sample at a time into state buffer */ - *pStateCurnt++ = *pSrc++; - - /* Set the accumulator to zero */ - acc0 = 0.; - acc0V = vdupq_n_f64(0.0f); - - /* Initialize state pointer */ - px = pState; - - /* Initialize Coefficient pointer */ - pb = pCoeffs; - - i = numTaps >> 1U; - - /* Perform the multiply-accumulates */ - while (i > 0U) - { - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - pxV = vld1q_f64(px); - pbV = vld1q_f64(pb); - acc0V = vmlaq_f64(acc0V, pxV, pbV); - px+=2; - pb+=2; - - i--; - } - - acc0 = vaddvq_f64(acc0V); - i = numTaps%2 ; - while(i >0U) - { - acc0+= *px++ * *pb++ ; - i--; - } - - /* Store result in destination buffer. */ - - *pDst++ = acc0; - - /* Advance state pointer by 1 for the next sample */ - pState = pState + 1U; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Processing is complete. - Now copy the last numTaps - 1 samples to the start of the state buffer. - This prepares the state buffer for the next function call. */ - - /* Points to the start of the state buffer */ - pStateCurnt = S->pState; - - /* Initialize tapCnt with number of taps */ - tapCnt = (numTaps - 1U); - - /* Copy remaining data */ - while (tapCnt > 0U) - { - *pStateCurnt++ = *pState++; - - /* Decrement loop counter */ - tapCnt--; - } - + float64_t *pState = S->pState; /* State pointer */ + const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float64_t *pStateCurnt; /* Points to the current sample of the state */ + float64_t *px; /* Temporary pointer for state buffer */ + const float64_t *pb; /* Temporary pointer for coefficient buffer */ + float64x2_t pxV; + float64x2_t pbV; + float64x2_t acc0V; + float64_t acc0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initialize blkCnt with number of taps */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.; + acc0V = vdupq_n_f64(0.0f); + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps >> 1U; + + /* Perform the multiply-accumulates */ + while (i > 0U) + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + pxV = vld1q_f64(px); + pbV = vld1q_f64(pb); + acc0V = vmlaq_f64(acc0V, pxV, pbV); + px+=2; + pb+=2; + + i--; + } + + acc0 = vaddvq_f64(acc0V); + i = numTaps%2 ; + while(i >0U) + { + acc0+= *px++ * *pb++ ; + i--; + } + + /* Store result in destination buffer. */ + + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Processing is complete. + Now copy the last numTaps - 1 samples to the start of the state buffer. + This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Initialize tapCnt with number of taps */ + tapCnt = (numTaps - 1U); + + /* Copy remaining data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement loop counter */ + tapCnt--; + } + } #else void arm_fir_f64( - const arm_fir_instance_f64 * S, - const float64_t * pSrc, - float64_t * pDst, - uint32_t blockSize) + const arm_fir_instance_f64 * S, + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) { - float64_t *pState = S->pState; /* State pointer */ - const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - float64_t *pStateCurnt; /* Points to the current sample of the state */ - float64_t *px; /* Temporary pointer for state buffer */ - const float64_t *pb; /* Temporary pointer for coefficient buffer */ - float64_t acc0; /* Accumulator */ - uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ - uint32_t i, tapCnt, blkCnt; /* Loop counters */ - - /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ - /* pStateCurnt points to the location where the new input data should be written */ - pStateCurnt = &(S->pState[(numTaps - 1U)]); - - /* Initialize blkCnt with number of taps */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Copy one sample at a time into state buffer */ - *pStateCurnt++ = *pSrc++; - - /* Set the accumulator to zero */ - acc0 = 0.; - - /* Initialize state pointer */ - px = pState; - - /* Initialize Coefficient pointer */ - pb = pCoeffs; - - i = numTaps; - - /* Perform the multiply-accumulates */ - while (i > 0U) - { - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - acc0 += *px++ * *pb++; - - i--; - } - - /* Store result in destination buffer. */ - *pDst++ = acc0; - - /* Advance state pointer by 1 for the next sample */ - pState = pState + 1U; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Processing is complete. - Now copy the last numTaps - 1 samples to the start of the state buffer. - This prepares the state buffer for the next function call. */ - - /* Points to the start of the state buffer */ - pStateCurnt = S->pState; - - /* Initialize tapCnt with number of taps */ - tapCnt = (numTaps - 1U); - - /* Copy remaining data */ - while (tapCnt > 0U) - { - *pStateCurnt++ = *pState++; - - /* Decrement loop counter */ - tapCnt--; - } - + float64_t *pState = S->pState; /* State pointer */ + const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float64_t *pStateCurnt; /* Points to the current sample of the state */ + float64_t *px; /* Temporary pointer for state buffer */ + const float64_t *pb; /* Temporary pointer for coefficient buffer */ + float64_t acc0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initialize blkCnt with number of taps */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.; + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps; + + /* Perform the multiply-accumulates */ + while (i > 0U) + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + acc0 += *px++ * *pb++; + + i--; + } + + /* Store result in destination buffer. */ + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Processing is complete. + Now copy the last numTaps - 1 samples to the start of the state buffer. + This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Initialize tapCnt with number of taps */ + tapCnt = (numTaps - 1U); + + /* Copy remaining data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement loop counter */ + tapCnt--; + } + } #endif diff --git a/Source/FilteringFunctions/arm_fir_init_f64.c b/Source/FilteringFunctions/arm_fir_init_f64.c index 645ad9f6..27c56596 100644 --- a/Source/FilteringFunctions/arm_fir_init_f64.c +++ b/Source/FilteringFunctions/arm_fir_init_f64.c @@ -61,22 +61,22 @@ */ void arm_fir_init_f64( - arm_fir_instance_f64 * S, - uint16_t numTaps, - const float64_t * pCoeffs, - float64_t * pState, - uint32_t blockSize) + arm_fir_instance_f64 * S, + uint16_t numTaps, + const float64_t * pCoeffs, + float64_t * pState, + uint32_t blockSize) { - /* Assign filter taps */ - S->numTaps = numTaps; - - /* Assign coefficient pointer */ - S->pCoeffs = pCoeffs; - - /* Clear state buffer. The size is always (blockSize + numTaps - 1) */ - memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float64_t)); - /* Assign state pointer */ - S->pState = pState; + /* Assign filter taps */ + S->numTaps = numTaps; + + /* Assign coefficient pointer */ + S->pCoeffs = pCoeffs; + + /* Clear state buffer. The size is always (blockSize + numTaps - 1) */ + memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float64_t)); + /* Assign state pointer */ + S->pState = pState; } /** diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/Source/MatrixFunctions/arm_mat_cholesky_f64.c index b1605116..bfc4234f 100755 --- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c +++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c @@ -55,223 +55,223 @@ #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) arm_status arm_mat_cholesky_f64( - const arm_matrix_instance_f64 * pSrc, - arm_matrix_instance_f64 * pDst) + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst) { - - arm_status status; /* status of matrix inverse */ - - + + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((pSrc->numRows != pSrc->numCols) || - (pDst->numRows != pDst->numCols) || - (pSrc->numRows != pDst->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((pSrc->numRows != pSrc->numCols) || + (pDst->numRows != pDst->numCols) || + (pSrc->numRows != pDst->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - int i,j,k; - int n = pSrc->numRows; - float64_t invSqrtVj; - float64_t *pA,*pG; - int kCnt; - - - float64x2_t acc, acc0, acc1, acc2, acc3; - float64x2_t vecGi; - float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3; - - float64x2_t tmp = vdupq_n_f64(0.0f); - - float64_t sum=0.0f; - float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f; - - - pA = pSrc->pData; - pG = pDst->pData; - - for(i=0 ;i < n ; i++) - { - for(j=i ; j+3 < n ; j+=4) - { - pG[(j + 0) * n + i] = pA[(j + 0) * n + i]; - pG[(j + 1) * n + i] = pA[(j + 1) * n + i]; - pG[(j + 2) * n + i] = pA[(j + 2) * n + i]; - pG[(j + 3) * n + i] = pA[(j + 3) * n + i]; - - acc0 = vdupq_n_f64(0.0f); - acc1 = vdupq_n_f64(0.0f); - acc2 = vdupq_n_f64(0.0f); - acc3 = vdupq_n_f64(0.0f); - - kCnt = i >> 1U; - k=0; - while(kCnt > 0) - { - - vecGi=vld1q_f64(&pG[i * n + k]); - - vecGj0=vld1q_f64(&pG[(j + 0) * n + k]); - vecGj1=vld1q_f64(&pG[(j + 1) * n + k]); - vecGj2=vld1q_f64(&pG[(j + 2) * n + k]); - vecGj3=vld1q_f64(&pG[(j + 3) * n + k]); - - acc0 = vfmaq_f64(acc0, vecGi, vecGj0); - acc1 = vfmaq_f64(acc1, vecGi, vecGj1); - acc2 = vfmaq_f64(acc2, vecGi, vecGj2); - acc3 = vfmaq_f64(acc3, vecGi, vecGj3); - - kCnt--; - k+=2; - } - - - sum0 = vaddvq_f64(acc0); - sum1 = vaddvq_f64(acc1); - sum2 = vaddvq_f64(acc2); - sum3 = vaddvq_f64(acc3); - - - kCnt = i & 1; - while(kCnt > 0) - { - - sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k]; - sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k]; - sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k]; - sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k]; - kCnt--; - k++; - } - - pG[(j + 0) * n + i] -= sum0; - pG[(j + 1) * n + i] -= sum1; - pG[(j + 2) * n + i] -= sum2; - pG[(j + 3) * n + i] -= sum3; - } - - for(; j < n ; j++) - { - pG[j * n + i] = pA[j * n + i]; - - acc = vdupq_n_f64(0.0f); - - kCnt = i >> 1U; - k=0; - while(kCnt > 0) - { - - vecGi=vld1q_f64(&pG[i * n + k]); - vecGj=vld1q_f64(&pG[j * n + k]); - - acc = vfmaq_f64(acc, vecGi, vecGj); - - kCnt--; - k+=2; - } - - - sum = vaddvq_f64(acc); - - kCnt = i & 1; - while(kCnt > 0) - { - sum = sum + pG[i * n + k] * pG[(j + 0) * n + k]; - - - kCnt--; - k++; - } - - pG[j * n + i] -= sum; - } - - if (pG[i * n + i] <= 0.0f) - { - return(ARM_MATH_DECOMPOSITION_FAILURE); - } - - invSqrtVj = 1.0f/sqrtf(pG[i * n + i]); - SCALE_COL_F64(pDst,i,invSqrtVj,i); - } - - status = ARM_MATH_SUCCESS; - - } - - - /* Return to application */ - return (status); + + { + int i,j,k; + int n = pSrc->numRows; + float64_t invSqrtVj; + float64_t *pA,*pG; + int kCnt; + + + float64x2_t acc, acc0, acc1, acc2, acc3; + float64x2_t vecGi; + float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3; + + float64x2_t tmp = vdupq_n_f64(0.0f); + + float64_t sum=0.0f; + float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f; + + + pA = pSrc->pData; + pG = pDst->pData; + + for(i=0 ;i < n ; i++) + { + for(j=i ; j+3 < n ; j+=4) + { + pG[(j + 0) * n + i] = pA[(j + 0) * n + i]; + pG[(j + 1) * n + i] = pA[(j + 1) * n + i]; + pG[(j + 2) * n + i] = pA[(j + 2) * n + i]; + pG[(j + 3) * n + i] = pA[(j + 3) * n + i]; + + acc0 = vdupq_n_f64(0.0f); + acc1 = vdupq_n_f64(0.0f); + acc2 = vdupq_n_f64(0.0f); + acc3 = vdupq_n_f64(0.0f); + + kCnt = i >> 1U; + k=0; + while(kCnt > 0) + { + + vecGi=vld1q_f64(&pG[i * n + k]); + + vecGj0=vld1q_f64(&pG[(j + 0) * n + k]); + vecGj1=vld1q_f64(&pG[(j + 1) * n + k]); + vecGj2=vld1q_f64(&pG[(j + 2) * n + k]); + vecGj3=vld1q_f64(&pG[(j + 3) * n + k]); + + acc0 = vfmaq_f64(acc0, vecGi, vecGj0); + acc1 = vfmaq_f64(acc1, vecGi, vecGj1); + acc2 = vfmaq_f64(acc2, vecGi, vecGj2); + acc3 = vfmaq_f64(acc3, vecGi, vecGj3); + + kCnt--; + k+=2; + } + + + sum0 = vaddvq_f64(acc0); + sum1 = vaddvq_f64(acc1); + sum2 = vaddvq_f64(acc2); + sum3 = vaddvq_f64(acc3); + + + kCnt = i & 1; + while(kCnt > 0) + { + + sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k]; + sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k]; + sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k]; + sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k]; + kCnt--; + k++; + } + + pG[(j + 0) * n + i] -= sum0; + pG[(j + 1) * n + i] -= sum1; + pG[(j + 2) * n + i] -= sum2; + pG[(j + 3) * n + i] -= sum3; + } + + for(; j < n ; j++) + { + pG[j * n + i] = pA[j * n + i]; + + acc = vdupq_n_f64(0.0f); + + kCnt = i >> 1U; + k=0; + while(kCnt > 0) + { + + vecGi=vld1q_f64(&pG[i * n + k]); + vecGj=vld1q_f64(&pG[j * n + k]); + + acc = vfmaq_f64(acc, vecGi, vecGj); + + kCnt--; + k+=2; + } + + + sum = vaddvq_f64(acc); + + kCnt = i & 1; + while(kCnt > 0) + { + sum = sum + pG[i * n + k] * pG[(j + 0) * n + k]; + + + kCnt--; + k++; + } + + pG[j * n + i] -= sum; + } + + if (pG[i * n + i] <= 0.0f) + { + return(ARM_MATH_DECOMPOSITION_FAILURE); + } + + invSqrtVj = 1.0f/sqrtf(pG[i * n + i]); + SCALE_COL_F64(pDst,i,invSqrtVj,i); + } + + status = ARM_MATH_SUCCESS; + + } + + + /* Return to application */ + return (status); } #else arm_status arm_mat_cholesky_f64( - const arm_matrix_instance_f64 * pSrc, - arm_matrix_instance_f64 * pDst) + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst) { - - arm_status status; /* status of matrix inverse */ - - + + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((pSrc->numRows != pSrc->numCols) || - (pDst->numRows != pDst->numCols) || - (pSrc->numRows != pDst->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((pSrc->numRows != pSrc->numCols) || + (pDst->numRows != pDst->numCols) || + (pSrc->numRows != pDst->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - int i,j,k; - int n = pSrc->numRows; - float64_t invSqrtVj; - float64_t *pA,*pG; - - pA = pSrc->pData; - pG = pDst->pData; - - - for(i=0 ; i < n ; i++) - { - for(j=i ; j < n ; j++) - { - pG[j * n + i] = pA[j * n + i]; - - for(k=0; k < i ; k++) - { - pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k]; - } - } - - if (pG[i * n + i] <= 0.0) - { - return(ARM_MATH_DECOMPOSITION_FAILURE); - } - - invSqrtVj = 1.0/sqrt(pG[i * n + i]); - SCALE_COL_F64(pDst,i,invSqrtVj,i); - - } - - status = ARM_MATH_SUCCESS; - - } - - - /* Return to application */ - return (status); + + { + int i,j,k; + int n = pSrc->numRows; + float64_t invSqrtVj; + float64_t *pA,*pG; + + pA = pSrc->pData; + pG = pDst->pData; + + + for(i=0 ; i < n ; i++) + { + for(j=i ; j < n ; j++) + { + pG[j * n + i] = pA[j * n + i]; + + for(k=0; k < i ; k++) + { + pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k]; + } + } + + if (pG[i * n + i] <= 0.0) + { + return(ARM_MATH_DECOMPOSITION_FAILURE); + } + + invSqrtVj = 1.0/sqrt(pG[i * n + i]); + SCALE_COL_F64(pDst,i,invSqrtVj,i); + + } + + status = ARM_MATH_SUCCESS; + + } + + + /* Return to application */ + return (status); } #endif diff --git a/Source/MatrixFunctions/arm_mat_init_f64.c b/Source/MatrixFunctions/arm_mat_init_f64.c index 5abe302b..e3bff4e9 100644 --- a/Source/MatrixFunctions/arm_mat_init_f64.c +++ b/Source/MatrixFunctions/arm_mat_init_f64.c @@ -56,19 +56,19 @@ */ void arm_mat_init_f64( - arm_matrix_instance_f64 * S, - uint16_t nRows, - uint16_t nColumns, - float64_t * pData) + arm_matrix_instance_f64 * S, + uint16_t nRows, + uint16_t nColumns, + float64_t * pData) { - /* Assign Number of Rows */ - S->numRows = nRows; - - /* Assign Number of Columns */ - S->numCols = nColumns; - - /* Assign Data pointer */ - S->pData = pData; + /* Assign Number of Rows */ + S->numRows = nRows; + + /* Assign Number of Columns */ + S->numCols = nColumns; + + /* Assign Data pointer */ + S->pData = pData; } /** diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c index 49d3ac28..e7be7cc1 100755 --- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c +++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c @@ -48,179 +48,179 @@ */ #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) - arm_status arm_mat_solve_lower_triangular_f64( - const arm_matrix_instance_f64 * lt, - const arm_matrix_instance_f64 * a, - arm_matrix_instance_f64 * dst) - { - arm_status status; /* status of matrix inverse */ - - +arm_status arm_mat_solve_lower_triangular_f64( + const arm_matrix_instance_f64 * lt, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst) +{ + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((lt->numRows != lt->numCols) || - (lt->numRows != a->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((lt->numRows != lt->numCols) || + (lt->numRows != a->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - /* a1 b1 c1 x1 = a1 - b2 c2 x2 a2 - c3 x3 a3 - - x3 = a3 / c3 - x2 = (a2 - c2 x3) / b2 - - */ - int i,j,k,n,cols; - - n = dst->numRows; - cols = dst->numCols; - - float64_t *pX = dst->pData; - float64_t *pLT = lt->pData; - float64_t *pA = a->pData; - - float64_t *lt_row; - float64_t *a_col; - - float64_t invLT; - - float64x2_t vecA; - float64x2_t vecX; - - for(i=0; i < n ; i++) - { - - for(j=0; j+1 < cols; j += 2) - { - vecA = vld1q_f64(&pA[i * cols + j]); - - for(k=0; k < i; k++) - { - vecX = vld1q_f64(&pX[cols*k+j]); - vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX); - } - - if (pLT[n*i + i]==0.0f) - { - return(ARM_MATH_SINGULAR); - } - - invLT = 1.0f / pLT[n*i + i]; - vecA = vmulq_f64(vecA,vdupq_n_f64(invLT)); - vst1q_f64(&pX[i*cols+j],vecA); - - } - - for(; j < cols; j ++) - { - a_col = &pA[j]; - lt_row = &pLT[n*i]; - - float64_t tmp=a_col[i * cols]; - - for(k=0; k < i; k++) - { - tmp -= lt_row[k] * pX[cols*k+j]; - } - - if (lt_row[i]==0.0f) - { - return(ARM_MATH_SINGULAR); - } - tmp = tmp / lt_row[i]; - pX[i*cols+j] = tmp; - } - - } - status = ARM_MATH_SUCCESS; - - } - - /* Return to application */ - return (status); + + { + /* a1 b1 c1 x1 = a1 + b2 c2 x2 a2 + c3 x3 a3 + + x3 = a3 / c3 + x2 = (a2 - c2 x3) / b2 + + */ + int i,j,k,n,cols; + + n = dst->numRows; + cols = dst->numCols; + + float64_t *pX = dst->pData; + float64_t *pLT = lt->pData; + float64_t *pA = a->pData; + + float64_t *lt_row; + float64_t *a_col; + + float64_t invLT; + + float64x2_t vecA; + float64x2_t vecX; + + for(i=0; i < n ; i++) + { + + for(j=0; j+1 < cols; j += 2) + { + vecA = vld1q_f64(&pA[i * cols + j]); + + for(k=0; k < i; k++) + { + vecX = vld1q_f64(&pX[cols*k+j]); + vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX); + } + + if (pLT[n*i + i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + + invLT = 1.0f / pLT[n*i + i]; + vecA = vmulq_f64(vecA,vdupq_n_f64(invLT)); + vst1q_f64(&pX[i*cols+j],vecA); + + } + + for(; j < cols; j ++) + { + a_col = &pA[j]; + lt_row = &pLT[n*i]; + + float64_t tmp=a_col[i * cols]; + + for(k=0; k < i; k++) + { + tmp -= lt_row[k] * pX[cols*k+j]; + } + + if (lt_row[i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + tmp = tmp / lt_row[i]; + pX[i*cols+j] = tmp; + } + + } + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); } #else - arm_status arm_mat_solve_lower_triangular_f64( - const arm_matrix_instance_f64 * lt, - const arm_matrix_instance_f64 * a, - arm_matrix_instance_f64 * dst) - { - arm_status status; /* status of matrix inverse */ - - +arm_status arm_mat_solve_lower_triangular_f64( + const arm_matrix_instance_f64 * lt, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst) +{ + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((lt->numRows != lt->numCols) || - (lt->numRows != a->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((lt->numRows != lt->numCols) || + (lt->numRows != a->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - /* a1 b1 c1 x1 = a1 - b2 c2 x2 a2 - c3 x3 a3 - - x3 = a3 / c3 - x2 = (a2 - c2 x3) / b2 - - */ - int i,j,k,n,cols; - - float64_t *pX = dst->pData; - float64_t *pLT = lt->pData; - float64_t *pA = a->pData; - - float64_t *lt_row; - float64_t *a_col; - - n = dst->numRows; - cols = dst->numCols; - - for(j=0; j < cols; j ++) - { - a_col = &pA[j]; - - for(i=0; i < n ; i++) - { - float64_t tmp=a_col[i * cols]; - - lt_row = &pLT[n*i]; - - for(k=0; k < i; k++) - { - tmp -= lt_row[k] * pX[cols*k+j]; - } - - if (lt_row[i]==0.0) - { - return(ARM_MATH_SINGULAR); - } - tmp = tmp / lt_row[i]; - pX[i*cols+j] = tmp; - } - - } - status = ARM_MATH_SUCCESS; - - } - - /* Return to application */ - return (status); + + { + /* a1 b1 c1 x1 = a1 + b2 c2 x2 a2 + c3 x3 a3 + + x3 = a3 / c3 + x2 = (a2 - c2 x3) / b2 + + */ + int i,j,k,n,cols; + + float64_t *pX = dst->pData; + float64_t *pLT = lt->pData; + float64_t *pA = a->pData; + + float64_t *lt_row; + float64_t *a_col; + + n = dst->numRows; + cols = dst->numCols; + + for(j=0; j < cols; j ++) + { + a_col = &pA[j]; + + for(i=0; i < n ; i++) + { + float64_t tmp=a_col[i * cols]; + + lt_row = &pLT[n*i]; + + for(k=0; k < i; k++) + { + tmp -= lt_row[k] * pX[cols*k+j]; + } + + if (lt_row[i]==0.0) + { + return(ARM_MATH_SINGULAR); + } + tmp = tmp / lt_row[i]; + pX[i*cols+j] = tmp; + } + + } + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); } #endif /** diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c index 68c10194..73291722 100755 --- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c +++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c @@ -48,168 +48,168 @@ */ #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) - arm_status arm_mat_solve_upper_triangular_f64( - const arm_matrix_instance_f64 * ut, - const arm_matrix_instance_f64 * a, - arm_matrix_instance_f64 * dst) - { -arm_status status; /* status of matrix inverse */ - - +arm_status arm_mat_solve_upper_triangular_f64( + const arm_matrix_instance_f64 * ut, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst) +{ + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((ut->numRows != ut->numCols) || - (ut->numRows != a->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((ut->numRows != ut->numCols) || + (ut->numRows != a->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - - int i,j,k,n,cols; - - n = dst->numRows; - cols = dst->numCols; - - float64_t *pX = dst->pData; - float64_t *pUT = ut->pData; - float64_t *pA = a->pData; - - float64_t *ut_row; - float64_t *a_col; - - float64_t invUT; - - float64x2_t vecA; - float64x2_t vecX; - - for(i=n-1; i >= 0 ; i--) - { - for(j=0; j+1 < cols; j +=2) - { - vecA = vld1q_f64(&pA[i * cols + j]); - - for(k=n-1; k > i; k--) - { - vecX = vld1q_f64(&pX[cols*k+j]); - vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX); - } - - if (pUT[n*i + i]==0.0f) - { - return(ARM_MATH_SINGULAR); - } - - invUT = 1.0f / pUT[n*i + i]; - vecA = vmulq_f64(vecA,vdupq_n_f64(invUT)); - - - vst1q_f64(&pX[i*cols+j],vecA); - } - - for(; j < cols; j ++) - { - a_col = &pA[j]; - - ut_row = &pUT[n*i]; - - float64_t tmp=a_col[i * cols]; - - for(k=n-1; k > i; k--) - { - tmp -= ut_row[k] * pX[cols*k+j]; - } - - if (ut_row[i]==0.0f) - { - return(ARM_MATH_SINGULAR); - } - tmp = tmp / ut_row[i]; - pX[i*cols+j] = tmp; - } - - } - status = ARM_MATH_SUCCESS; - - } - - - /* Return to application */ - return (status); + + { + + int i,j,k,n,cols; + + n = dst->numRows; + cols = dst->numCols; + + float64_t *pX = dst->pData; + float64_t *pUT = ut->pData; + float64_t *pA = a->pData; + + float64_t *ut_row; + float64_t *a_col; + + float64_t invUT; + + float64x2_t vecA; + float64x2_t vecX; + + for(i=n-1; i >= 0 ; i--) + { + for(j=0; j+1 < cols; j +=2) + { + vecA = vld1q_f64(&pA[i * cols + j]); + + for(k=n-1; k > i; k--) + { + vecX = vld1q_f64(&pX[cols*k+j]); + vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX); + } + + if (pUT[n*i + i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + + invUT = 1.0f / pUT[n*i + i]; + vecA = vmulq_f64(vecA,vdupq_n_f64(invUT)); + + + vst1q_f64(&pX[i*cols+j],vecA); + } + + for(; j < cols; j ++) + { + a_col = &pA[j]; + + ut_row = &pUT[n*i]; + + float64_t tmp=a_col[i * cols]; + + for(k=n-1; k > i; k--) + { + tmp -= ut_row[k] * pX[cols*k+j]; + } + + if (ut_row[i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + tmp = tmp / ut_row[i]; + pX[i*cols+j] = tmp; + } + + } + status = ARM_MATH_SUCCESS; + + } + + + /* Return to application */ + return (status); } #else - arm_status arm_mat_solve_upper_triangular_f64( - const arm_matrix_instance_f64 * ut, - const arm_matrix_instance_f64 * a, - arm_matrix_instance_f64 * dst) - { -arm_status status; /* status of matrix inverse */ - - +arm_status arm_mat_solve_upper_triangular_f64( + const arm_matrix_instance_f64 * ut, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst) +{ + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((ut->numRows != ut->numCols) || - (ut->numRows != a->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((ut->numRows != ut->numCols) || + (ut->numRows != a->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - - int i,j,k,n,cols; - - float64_t *pX = dst->pData; - float64_t *pUT = ut->pData; - float64_t *pA = a->pData; - - float64_t *ut_row; - float64_t *a_col; - - n = dst->numRows; - cols = dst->numCols; - - for(j=0; j < cols; j ++) - { - a_col = &pA[j]; - - for(i=n-1; i >= 0 ; i--) - { - float64_t tmp=a_col[i * cols]; - - ut_row = &pUT[n*i]; - - for(k=n-1; k > i; k--) - { - tmp -= ut_row[k] * pX[cols*k+j]; - } - - if (ut_row[i]==0.0) - { - return(ARM_MATH_SINGULAR); - } - tmp = tmp / ut_row[i]; - pX[i*cols+j] = tmp; - } - - } - status = ARM_MATH_SUCCESS; - - } - - - /* Return to application */ - return (status); + + { + + int i,j,k,n,cols; + + float64_t *pX = dst->pData; + float64_t *pUT = ut->pData; + float64_t *pA = a->pData; + + float64_t *ut_row; + float64_t *a_col; + + n = dst->numRows; + cols = dst->numCols; + + for(j=0; j < cols; j ++) + { + a_col = &pA[j]; + + for(i=n-1; i >= 0 ; i--) + { + float64_t tmp=a_col[i * cols]; + + ut_row = &pUT[n*i]; + + for(k=n-1; k > i; k--) + { + tmp -= ut_row[k] * pX[cols*k+j]; + } + + if (ut_row[i]==0.0) + { + return(ARM_MATH_SINGULAR); + } + tmp = tmp / ut_row[i]; + pX[i*cols+j] = tmp; + } + + } + status = ARM_MATH_SUCCESS; + + } + + + /* Return to application */ + return (status); } #endif diff --git a/Source/MatrixFunctions/arm_mat_trans_f64.c b/Source/MatrixFunctions/arm_mat_trans_f64.c index 08184731..03737e8a 100755 --- a/Source/MatrixFunctions/arm_mat_trans_f64.c +++ b/Source/MatrixFunctions/arm_mat_trans_f64.c @@ -57,216 +57,216 @@ #if defined(ARM_MATH_NEON) arm_status arm_mat_trans_f64( - const arm_matrix_instance_f64 * pSrc, - arm_matrix_instance_f64 * pDst) + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst) { - float64_t *pIn = pSrc->pData; /* input data matrix pointer */ - float64_t *pOut = pDst->pData; /* output data matrix pointer */ - float64_t *px; /* Temporary output data matrix pointer */ - uint16_t nRows = pSrc->numRows; /* number of rows */ - uint16_t nColumns = pSrc->numCols; /* number of columns */ - - uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */ - arm_status status; /* status of matrix transpose */ - + float64_t *pIn = pSrc->pData; /* input data matrix pointer */ + float64_t *pOut = pDst->pData; /* output data matrix pointer */ + float64_t *px; /* Temporary output data matrix pointer */ + uint16_t nRows = pSrc->numRows; /* number of rows */ + uint16_t nColumns = pSrc->numCols; /* number of columns */ + + uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */ + arm_status status; /* status of matrix transpose */ + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else + + /* Check for matrix mismatch condition */ + if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - /* Matrix transpose by exchanging the rows with columns */ - /* Row loop */ - rowCnt = row >> 1; - while (rowCnt > 0U) - { - float64_t *row0,*row1; - float64x2x4_t raV; - - blkCnt = nColumns >> 2; - - /* The pointer px is set to starting address of the column being processed */ - px = pOut + i; - - /* Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0U) /* Column loop */ - { - row0 = pIn; - row1 = pIn+nColumns; - pIn+=4; - raV = vld4q_lane_f64(row0, raV, 0); - raV = vld4q_lane_f64(row1, raV, 1); - - vst1q_f64(px,raV.val[0]); - px += nRows; - - vst1q_f64(px,raV.val[1]); - px += nRows; - - vst1q_f64(px,raV.val[2]); - px += nRows; - - vst1q_f64(px,raV.val[3]); - px += nRows; - - /* Decrement the column loop counter */ - blkCnt--; - } - - /* Perform matrix transpose for last 3 samples here. */ - blkCnt = nColumns % 0x4U; - - while (blkCnt > 0U) - { - /* Read and store the input element in the destination */ - *px++ = *pIn; - *px++ = *(pIn + 1 * nColumns); - - px += (nRows - 2); - pIn++; - - /* Decrement the column loop counter */ - blkCnt--; - } - - i += 2; - pIn += 1 * nColumns; - - /* Decrement the row loop counter */ - rowCnt--; - - } /* Row loop end */ - - rowCnt = row & 1; - while (rowCnt > 0U) - { - blkCnt = nColumns ; - /* The pointer px is set to starting address of the column being processed */ - px = pOut + i; - - while (blkCnt > 0U) - { - /* Read and store the input element in the destination */ - *px = *pIn++; - - /* Update the pointer px to point to the next row of the transposed matrix */ - px += nRows; - - /* Decrement the column loop counter */ - blkCnt--; - } - i++; - rowCnt -- ; - } - - /* Set status as ARM_MATH_SUCCESS */ - status = ARM_MATH_SUCCESS; - } - - /* Return to application */ - return (status); + + { + /* Matrix transpose by exchanging the rows with columns */ + /* Row loop */ + rowCnt = row >> 1; + while (rowCnt > 0U) + { + float64_t *row0,*row1; + float64x2x4_t raV; + + blkCnt = nColumns >> 2; + + /* The pointer px is set to starting address of the column being processed */ + px = pOut + i; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) /* Column loop */ + { + row0 = pIn; + row1 = pIn+nColumns; + pIn+=4; + raV = vld4q_lane_f64(row0, raV, 0); + raV = vld4q_lane_f64(row1, raV, 1); + + vst1q_f64(px,raV.val[0]); + px += nRows; + + vst1q_f64(px,raV.val[1]); + px += nRows; + + vst1q_f64(px,raV.val[2]); + px += nRows; + + vst1q_f64(px,raV.val[3]); + px += nRows; + + /* Decrement the column loop counter */ + blkCnt--; + } + + /* Perform matrix transpose for last 3 samples here. */ + blkCnt = nColumns % 0x4U; + + while (blkCnt > 0U) + { + /* Read and store the input element in the destination */ + *px++ = *pIn; + *px++ = *(pIn + 1 * nColumns); + + px += (nRows - 2); + pIn++; + + /* Decrement the column loop counter */ + blkCnt--; + } + + i += 2; + pIn += 1 * nColumns; + + /* Decrement the row loop counter */ + rowCnt--; + + } /* Row loop end */ + + rowCnt = row & 1; + while (rowCnt > 0U) + { + blkCnt = nColumns ; + /* The pointer px is set to starting address of the column being processed */ + px = pOut + i; + + while (blkCnt > 0U) + { + /* Read and store the input element in the destination */ + *px = *pIn++; + + /* Update the pointer px to point to the next row of the transposed matrix */ + px += nRows; + + /* Decrement the column loop counter */ + blkCnt--; + } + i++; + rowCnt -- ; + } + + /* Set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); } #else arm_status arm_mat_trans_f64( - const arm_matrix_instance_f64 * pSrc, - arm_matrix_instance_f64 * pDst) + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst) { - float64_t *pIn = pSrc->pData; /* input data matrix pointer */ - float64_t *pOut = pDst->pData; /* output data matrix pointer */ - float64_t *px; /* Temporary output data matrix pointer */ - uint16_t nRows = pSrc->numRows; /* number of rows */ - uint16_t nCols = pSrc->numCols; /* number of columns */ - uint64_t col, row = nRows, i = 0U; /* Loop counters */ - arm_status status; /* status of matrix transpose */ - + float64_t *pIn = pSrc->pData; /* input data matrix pointer */ + float64_t *pOut = pDst->pData; /* output data matrix pointer */ + float64_t *px; /* Temporary output data matrix pointer */ + uint16_t nRows = pSrc->numRows; /* number of rows */ + uint16_t nCols = pSrc->numCols; /* number of columns */ + uint64_t col, row = nRows, i = 0U; /* Loop counters */ + arm_status status; /* status of matrix transpose */ + #ifdef ARM_MATH_MATRIX_CHECK - - /* Check for matrix mismatch condition */ - if ((pSrc->numRows != pDst->numCols) || - (pSrc->numCols != pDst->numRows) ) - { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } - else - + + /* Check for matrix mismatch condition */ + if ((pSrc->numRows != pDst->numCols) || + (pSrc->numCols != pDst->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - - { - /* Matrix transpose by exchanging the rows with columns */ - /* row loop */ - do - { - /* Pointer px is set to starting address of column being processed */ - px = pOut + i; - + + { + /* Matrix transpose by exchanging the rows with columns */ + /* row loop */ + do + { + /* Pointer px is set to starting address of column being processed */ + px = pOut + i; + #if defined (ARM_MATH_LOOPUNROLL) - - /* Loop unrolling: Compute 4 outputs at a time */ - col = nCols >> 2U; - - while (col > 0U) /* column loop */ - { - /* Read and store input element in destination */ - *px = *pIn++; - /* Update pointer px to point to next row of transposed matrix */ - px += nRows; - - *px = *pIn++; - px += nRows; - - *px = *pIn++; - px += nRows; - - *px = *pIn++; - px += nRows; - - /* Decrement column loop counter */ - col--; - } - - /* Loop unrolling: Compute remaining outputs */ - col = nCols % 0x4U; - + + /* Loop unrolling: Compute 4 outputs at a time */ + col = nCols >> 2U; + + while (col > 0U) /* column loop */ + { + /* Read and store input element in destination */ + *px = *pIn++; + /* Update pointer px to point to next row of transposed matrix */ + px += nRows; + + *px = *pIn++; + px += nRows; + + *px = *pIn++; + px += nRows; + + *px = *pIn++; + px += nRows; + + /* Decrement column loop counter */ + col--; + } + + /* Loop unrolling: Compute remaining outputs */ + col = nCols % 0x4U; + #else - - /* Initialize col with number of samples */ - col = nCols; - + + /* Initialize col with number of samples */ + col = nCols; + #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ - - while (col > 0U) - { - /* Read and store input element in destination */ - *px = *pIn++; - - /* Update pointer px to point to next row of transposed matrix */ - px += nRows; - - /* Decrement column loop counter */ - col--; - } - - i++; - - /* Decrement row loop counter */ - row--; - - } while (row > 0U); /* row loop end */ - - /* Set status as ARM_MATH_SUCCESS */ - status = ARM_MATH_SUCCESS; - } - - /* Return to application */ - return (status); + + while (col > 0U) + { + /* Read and store input element in destination */ + *px = *pIn++; + + /* Update pointer px to point to next row of transposed matrix */ + px += nRows; + + /* Decrement column loop counter */ + col--; + } + + i++; + + /* Decrement row loop counter */ + row--; + + } while (row > 0U); /* row loop end */ + + /* Set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); } #endif /** diff --git a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c index 38cf538f..2f17e023 100755 --- a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c @@ -49,9 +49,9 @@ #if defined(ARM_MATH_NEON) void arm_absmax_no_idx_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { float64_t maxVal , in; /* Temporary variables to store the output value. */ uint32_t blkCnt; /* Loop counter */ @@ -112,9 +112,9 @@ void arm_absmax_no_idx_f64( } #else void arm_absmax_no_idx_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { float64_t maxVal, out; /* Temporary variables to store the output value. */ uint32_t blkCnt; /* Loop counter */ diff --git a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c index ca521085..5e4abfcc 100755 --- a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c @@ -47,101 +47,101 @@ #if defined(ARM_MATH_NEON) void arm_absmin_no_idx_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - float64_t minVal , in; /* Temporary variables to store the output value. */ - uint32_t blkCnt; /* Loop counter */ - - float64x2_t minV; - float64x2_t pSrcV ; - pSrcV = vld1q_f64(pSrc); - pSrc += 2 ; - minV = vabsq_f64(pSrcV); - - - - - /* Load first input value that act as reference value for comparision */ - - - /* Initialize blkCnt with number of samples */ - blkCnt = (blockSize - 2U) >> 1U; - - while (blkCnt > 0U) - { - /* Initialize maxVal to the next consecutive values one by one */ - pSrcV = vld1q_f64(pSrc); - minV = vminq_f64(minV, vabsq_f64(pSrcV)); - - pSrc += 2 ; - - /* Decrement loop counter */ - blkCnt--; - } - minVal =vgetq_lane_f64(minV, 0); - if(minVal > vgetq_lane_f64(minV, 1)) - { - minVal = vgetq_lane_f64(minV, 1); - } - blkCnt = (blockSize - 2U) & 1; - - while (blkCnt > 0U) - { - /* Initialize maxVal to the next consecutive values one by one */ - in = fabs(*pSrc++); - - /* compare for the maximum value */ - if (minVal > in) - { - /* Update the maximum value and it's index */ - minVal = in; - } - - /* Decrement loop counter */ - blkCnt--; - } - *pResult = minVal; - - - /* Store the maximum value and it's index into destination pointers */ - + float64_t minVal , in; /* Temporary variables to store the output value. */ + uint32_t blkCnt; /* Loop counter */ + + float64x2_t minV; + float64x2_t pSrcV ; + pSrcV = vld1q_f64(pSrc); + pSrc += 2 ; + minV = vabsq_f64(pSrcV); + + + + + /* Load first input value that act as reference value for comparision */ + + + /* Initialize blkCnt with number of samples */ + blkCnt = (blockSize - 2U) >> 1U; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + pSrcV = vld1q_f64(pSrc); + minV = vminq_f64(minV, vabsq_f64(pSrcV)); + + pSrc += 2 ; + + /* Decrement loop counter */ + blkCnt--; + } + minVal =vgetq_lane_f64(minV, 0); + if(minVal > vgetq_lane_f64(minV, 1)) + { + minVal = vgetq_lane_f64(minV, 1); + } + blkCnt = (blockSize - 2U) & 1; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + in = fabs(*pSrc++); + + /* compare for the maximum value */ + if (minVal > in) + { + /* Update the maximum value and it's index */ + minVal = in; + } + + /* Decrement loop counter */ + blkCnt--; + } + *pResult = minVal; + + + /* Store the maximum value and it's index into destination pointers */ + } #else void arm_absmin_no_idx_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - float64_t minVal, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt; /* Loop counter */ - - - /* Load first input value that act as reference value for comparision */ - out = fabs(*pSrc++); - - /* Initialize blkCnt with number of samples */ - blkCnt = (blockSize - 1U); - - while (blkCnt > 0U) - { - /* Initialize minVal to the next consecutive values one by one */ - minVal = fabs(*pSrc++); - - /* compare for the minimum value */ - if (out > minVal) - { - /* Update the minimum value and it's index */ - out = minVal; - } - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store the minimum value and it's index into destination pointers */ - *pResult = out; + float64_t minVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt; /* Loop counter */ + + + /* Load first input value that act as reference value for comparision */ + out = fabs(*pSrc++); + + /* Initialize blkCnt with number of samples */ + blkCnt = (blockSize - 1U); + + while (blkCnt > 0U) + { + /* Initialize minVal to the next consecutive values one by one */ + minVal = fabs(*pSrc++); + + /* compare for the minimum value */ + if (out > minVal) + { + /* Update the minimum value and it's index */ + out = minVal; + } + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store the minimum value and it's index into destination pointers */ + *pResult = out; } #endif /** diff --git a/Source/StatisticsFunctions/arm_entropy_f64.c b/Source/StatisticsFunctions/arm_entropy_f64.c index a8fc7ed0..a321d2af 100755 --- a/Source/StatisticsFunctions/arm_entropy_f64.c +++ b/Source/StatisticsFunctions/arm_entropy_f64.c @@ -49,43 +49,43 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize) { - const float64_t *pIn; - uint32_t blkCnt; - float64_t accum, p; - - pIn = pSrcA; - - - accum = 0.0; + const float64_t *pIn; + uint32_t blkCnt; + float64_t accum, p; + + pIn = pSrcA; + + + accum = 0.0; #if defined(ARM_MATH_NEON) - float64x2_t sumV ,pInV ; - sumV = vdupq_n_f64(0.0f); - blkCnt = blockSize >> 1U ; - - while(blkCnt > 0){ - pInV = vld1q_f64(pIn); - sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) ); - pIn += 2 ; - blkCnt--; - - } - accum = vaddvq_f64(sumV); - blkCnt = blockSize & 1 ; + float64x2_t sumV ,pInV ; + sumV = vdupq_n_f64(0.0f); + blkCnt = blockSize >> 1U ; + + while(blkCnt > 0){ + pInV = vld1q_f64(pIn); + sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) ); + pIn += 2 ; + blkCnt--; + + } + accum = vaddvq_f64(sumV); + blkCnt = blockSize & 1 ; #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - - while(blkCnt > 0) - { - p = *pIn++; - - accum += p * log(p); - - blkCnt--; - - } - - return(-accum); + + while(blkCnt > 0) + { + p = *pIn++; + + accum += p * log(p); + + blkCnt--; + + } + + return(-accum); } /** diff --git a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c index a52059ed..867ad0cc 100755 --- a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c +++ b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c @@ -50,53 +50,53 @@ float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,const float64_t * pSrcB,uint32_t blockSize) { - const float64_t *pInA, *pInB; - uint32_t blkCnt; - float64_t accum, pA,pB; - - float64x2_t accumV; - float64x2_t tmpVA, tmpVB,tmpV; - - pInA = pSrcA; - pInB = pSrcB; - - accum = 0.0f; - accumV = vdupq_n_f64(0.0f); - - blkCnt = blockSize >> 1; - while(blkCnt > 0) - { - tmpVA = vld1q_f64(pInA); - pInA += 2; - - tmpVB = vld1q_f64(pInB); - pInB += 2; - - tmpV = vinvq_f64(tmpVA); - tmpVB = vmulq_f64(tmpVB, tmpV); - tmpVB = vlogq_f64(tmpVB); - - accumV = vmlaq_f64(accumV, tmpVA, tmpVB); - - blkCnt--; - - } - - - accum = vaddvq_f64(accumV); - - blkCnt = blockSize & 1; - while(blkCnt > 0) - { - pA = *pInA++; - pB = *pInB++; - accum += pA * logf(pB/pA); - - blkCnt--; - - } - - return(-accum); + const float64_t *pInA, *pInB; + uint32_t blkCnt; + float64_t accum, pA,pB; + + float64x2_t accumV; + float64x2_t tmpVA, tmpVB,tmpV; + + pInA = pSrcA; + pInB = pSrcB; + + accum = 0.0f; + accumV = vdupq_n_f64(0.0f); + + blkCnt = blockSize >> 1; + while(blkCnt > 0) + { + tmpVA = vld1q_f64(pInA); + pInA += 2; + + tmpVB = vld1q_f64(pInB); + pInB += 2; + + tmpV = vinvq_f64(tmpVA); + tmpVB = vmulq_f64(tmpVB, tmpV); + tmpVB = vlogq_f64(tmpVB); + + accumV = vmlaq_f64(accumV, tmpVA, tmpVB); + + blkCnt--; + + } + + + accum = vaddvq_f64(accumV); + + blkCnt = blockSize & 1; + while(blkCnt > 0) + { + pA = *pInA++; + pB = *pInB++; + accum += pA * logf(pB/pA); + + blkCnt--; + + } + + return(-accum); } #else diff --git a/Source/StatisticsFunctions/arm_max_no_idx_f64.c b/Source/StatisticsFunctions/arm_max_no_idx_f64.c index f6404c01..fcd05f94 100644 --- a/Source/StatisticsFunctions/arm_max_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_max_no_idx_f64.c @@ -47,9 +47,9 @@ */ #if defined(ARM_MATH_NEON) void arm_max_no_idx_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { float64_t maxVal , in; /* Temporary variables to store the output value. */ uint32_t blkCnt; /* Loop counter */ @@ -110,9 +110,9 @@ void arm_max_no_idx_f64( } #else void arm_max_no_idx_f64( - const float64_t *pSrc, - uint32_t blockSize, - float64_t *pResult) + const float64_t *pSrc, + uint32_t blockSize, + float64_t *pResult) { float64_t maxValue = F64_MIN; float64_t newVal; diff --git a/Source/StatisticsFunctions/arm_mean_f64.c b/Source/StatisticsFunctions/arm_mean_f64.c index 577ecf13..b4465bdc 100644 --- a/Source/StatisticsFunctions/arm_mean_f64.c +++ b/Source/StatisticsFunctions/arm_mean_f64.c @@ -49,68 +49,68 @@ #if defined(ARM_MATH_NEON) void arm_mean_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64x2_t vSum = vdupq_n_f64(0.0f); - float64_t sum = 0.; /* Temporary result storage */ - float64x2_t afterLoad ; - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize >> 1U; - - - while (blkCnt > 0U) - { - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - - afterLoad = vld1q_f64(pSrc); - vSum = vaddq_f64(vSum, afterLoad); - pSrc += 2; - /* Decrement loop counter */ - blkCnt--; - - - } - sum = vaddvq_f64(vSum); - - blkCnt = blockSize & 1; - - while (blkCnt > 0U) - { - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - sum += *pSrc++; - - /* Decrement loop counter */ - blkCnt--; - } - *pResult = (sum/blockSize); + uint32_t blkCnt; /* Loop counter */ + float64x2_t vSum = vdupq_n_f64(0.0f); + float64_t sum = 0.; /* Temporary result storage */ + float64x2_t afterLoad ; + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize >> 1U; + + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + + afterLoad = vld1q_f64(pSrc); + vSum = vaddq_f64(vSum, afterLoad); + pSrc += 2; + /* Decrement loop counter */ + blkCnt--; + + + } + sum = vaddvq_f64(vSum); + + blkCnt = blockSize & 1; + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + sum += *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + *pResult = (sum/blockSize); } #else void arm_mean_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary result storage */ - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - sum += *pSrc++; - - /* Decrement loop counter */ - blkCnt--; - } - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - /* Store result to destination */ - *pResult = (sum / blockSize); + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + sum += *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ + /* Store result to destination */ + *pResult = (sum / blockSize); } #endif /** diff --git a/Source/StatisticsFunctions/arm_min_no_idx_f64.c b/Source/StatisticsFunctions/arm_min_no_idx_f64.c index b9696ff8..903bd90a 100755 --- a/Source/StatisticsFunctions/arm_min_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_min_no_idx_f64.c @@ -46,51 +46,51 @@ @return none */ void arm_min_no_idx_f64( - const float64_t *pSrc, - uint32_t blockSize, - float64_t *pResult) + const float64_t *pSrc, + uint32_t blockSize, + float64_t *pResult) { - float64_t minValue = F64_MAX; - float64_t newVal; - uint32_t blkCnt ; + float64_t minValue = F64_MAX; + float64_t newVal; + uint32_t blkCnt ; #if defined(ARM_MATH_NEON) - float64x2_t minValueV , newValV ; - minValueV = vdupq_n_f64(F64_MAX); - blkCnt = blockSize >> 1U; - while(blkCnt > 0) - { - newValV = vld1q_f64(pSrc); - minValueV = vminq_f64(minValueV, newValV); - pSrc += 2 ; - blkCnt--; - - } - minValue =vgetq_lane_f64(minValueV, 0); - if(minValue > vgetq_lane_f64(minValueV, 1)) - { - minValue = vgetq_lane_f64(minValueV, 1); - } - - blkCnt = blockSize & 1 ; + float64x2_t minValueV , newValV ; + minValueV = vdupq_n_f64(F64_MAX); + blkCnt = blockSize >> 1U; + while(blkCnt > 0) + { + newValV = vld1q_f64(pSrc); + minValueV = vminq_f64(minValueV, newValV); + pSrc += 2 ; + blkCnt--; + + } + minValue =vgetq_lane_f64(minValueV, 0); + if(minValue > vgetq_lane_f64(minValueV, 1)) + { + minValue = vgetq_lane_f64(minValueV, 1); + } + + blkCnt = blockSize & 1 ; #else - blkCnt = blockSize; + blkCnt = blockSize; #endif - - while (blkCnt > 0U) - { - newVal = *pSrc++; - - /* compare for the minimum value */ - if (minValue > newVal) - { - /* Update the minimum value and it's index */ - minValue = newVal; - } - - blkCnt --; - } - - *pResult = minValue; + + while (blkCnt > 0U) + { + newVal = *pSrc++; + + /* compare for the minimum value */ + if (minValue > newVal) + { + /* Update the minimum value and it's index */ + minValue = newVal; + } + + blkCnt --; + } + + *pResult = minValue; } /** diff --git a/Source/StatisticsFunctions/arm_mse_f64.c b/Source/StatisticsFunctions/arm_mse_f64.c index 13daa2cb..b2a4ae4b 100755 --- a/Source/StatisticsFunctions/arm_mse_f64.c +++ b/Source/StatisticsFunctions/arm_mse_f64.c @@ -47,84 +47,84 @@ */ void arm_mse_f64( - const float64_t * pSrcA, - const float64_t * pSrcB, - uint32_t blockSize, - float64_t * result) + const float64_t * pSrcA, + const float64_t * pSrcB, + uint32_t blockSize, + float64_t * result) { - - uint32_t blkCnt; /* Loop counter */ - float64_t inA, inB; - float64_t sum = 0.0; + + uint32_t blkCnt; /* Loop counter */ + float64_t inA, inB; + float64_t sum = 0.0; #if defined (ARM_MATH_NEON) - - float64x2_t inAV , inBV , subV, sumV; - sumV = vdupq_n_f64(0.0f); - - blkCnt = blockSize >> 1U ; - - while (blkCnt > 0U) - { - inAV = vld1q_f64(pSrcA); - pSrcA+=2; - inBV = vld1q_f64(pSrcB); - pSrcB+=2; - subV = vsubq_f64(inAV, inBV); - sumV = vmlaq_f64(sumV, subV, subV); - - blkCnt--; - - } - sum = vaddvq_f64(sumV); - blkCnt = (blockSize) & 1; - + + float64x2_t inAV , inBV , subV, sumV; + sumV = vdupq_n_f64(0.0f); + + blkCnt = blockSize >> 1U ; + + while (blkCnt > 0U) + { + inAV = vld1q_f64(pSrcA); + pSrcA+=2; + inBV = vld1q_f64(pSrcB); + pSrcB+=2; + subV = vsubq_f64(inAV, inBV); + sumV = vmlaq_f64(sumV, subV, subV); + + blkCnt--; + + } + sum = vaddvq_f64(sumV); + blkCnt = (blockSize) & 1; + #else - /* Temporary return variable */ + /* Temporary return variable */ #if defined (ARM_MATH_LOOPUNROLL) - blkCnt = (blockSize) >> 1; - + blkCnt = (blockSize) >> 1; + #pragma clang loop vectorize(enable) - while (blkCnt > 0U) - { - - - inA = *pSrcA++; - inB = *pSrcB++; - inA = inA - inB; - sum += inA * inA; - - inA = *pSrcA++; - inB = *pSrcB++; - inA = inA - inB; - sum += inA * inA; - - /* Decrement loop counter */ - blkCnt--; - } - - - /* Loop unrolling: Compute remaining outputs */ - blkCnt = (blockSize) & 1; + while (blkCnt > 0U) + { + + + inA = *pSrcA++; + inB = *pSrcB++; + inA = inA - inB; + sum += inA * inA; + + inA = *pSrcA++; + inB = *pSrcB++; + inA = inA - inB; + sum += inA * inA; + + /* Decrement loop counter */ + blkCnt--; + } + + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = (blockSize) & 1; #else - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; #endif #endif -//#pragma clang loop vectorize(enable) unroll(disable) - while (blkCnt > 0U) - { - inA = *pSrcA++; - inB = *pSrcB++; - inA = inA - inB; - sum += inA * inA; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result in destination buffer */ - *result = sum / blockSize; + //#pragma clang loop vectorize(enable) unroll(disable) + while (blkCnt > 0U) + { + inA = *pSrcA++; + inB = *pSrcB++; + inA = inA - inB; + sum += inA * inA; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result in destination buffer */ + *result = sum / blockSize; } diff --git a/Source/StatisticsFunctions/arm_power_f64.c b/Source/StatisticsFunctions/arm_power_f64.c index 710dc834..5760a7f8 100644 --- a/Source/StatisticsFunctions/arm_power_f64.c +++ b/Source/StatisticsFunctions/arm_power_f64.c @@ -46,81 +46,81 @@ */ #if defined(ARM_MATH_NEON) void arm_power_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary result storage */ - float64x2_t sumV ; /* Temporary variable to store input value */ - sumV = vdupq_n_f64(0.0f); - float64x2_t pSrcV; - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize >> 1U; - - while (blkCnt > 0U) - { - /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ - - /* Compute Power and store result in a temporary variable, sum. */ - pSrcV = vld1q_f64(pSrc); - sumV = vmlaq_f64(sumV, pSrcV, pSrcV); - pSrc+= 2 ; - - - - /* Decrement loop counter */ - blkCnt--; - } - sum = vaddvq_f64(sumV); - - - float64_t in; - blkCnt = blockSize & 1; - - while (blkCnt > 0U) - { - /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ - - /* Compute Power and store result in a temporary variable, sum. */ - in = *pSrc++; - sum += in * in; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result to destination */ - *pResult = sum; + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + float64x2_t sumV ; /* Temporary variable to store input value */ + sumV = vdupq_n_f64(0.0f); + float64x2_t pSrcV; + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ + pSrcV = vld1q_f64(pSrc); + sumV = vmlaq_f64(sumV, pSrcV, pSrcV); + pSrc+= 2 ; + + + + /* Decrement loop counter */ + blkCnt--; + } + sum = vaddvq_f64(sumV); + + + float64_t in; + blkCnt = blockSize & 1; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ + in = *pSrc++; + sum += in * in; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result to destination */ + *pResult = sum; } #else void arm_power_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary result storage */ - float64_t in; /* Temporary variable to store input value */ - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ - - /* Compute Power and store result in a temporary variable, sum. */ - in = *pSrc++; - sum += in * in; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result to destination */ - *pResult = sum; + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + float64_t in; /* Temporary variable to store input value */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ + in = *pSrc++; + sum += in * in; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result to destination */ + *pResult = sum; } #endif /** diff --git a/Source/StatisticsFunctions/arm_var_f64.c b/Source/StatisticsFunctions/arm_var_f64.c index 6fd106aa..58a626b8 100644 --- a/Source/StatisticsFunctions/arm_var_f64.c +++ b/Source/StatisticsFunctions/arm_var_f64.c @@ -45,56 +45,58 @@ @return none */ void arm_var_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary result storage */ - float64_t fSum = 0.; - float64_t fMean, fValue; - const float64_t * pInput = pSrc; - - if (blockSize <= 1U) - { - *pResult = 0; - return; - } - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ - - sum += *pInput++; - - /* Decrement loop counter */ - blkCnt--; - } - - /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ - fMean = sum / (float64_t) blockSize; - - pInput = pSrc; - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - fValue = *pInput++ - fMean; - fSum += fValue * fValue; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Variance */ - *pResult = fSum / (float64_t)(blockSize - 1.); + + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + float64_t fSum = 0.; + float64_t fMean, fValue; + const float64_t * pInput = pSrc; + + if (blockSize <= 1U) + { + *pResult = 0; + return; + } + arm_mean_f64(pInput, blockSize, &fMean); +#if defined(ARM_MATH_NEON) + float64x2_t fValueV ,fsumV , pInputV , fMeanV; + fsumV = vdupq_n_f64(0.0f); + fMeanV = vdupq_n_f64(fMean); + blkCnt = blockSize >> 1U; + + while(blkCnt > 0U) + { + pInputV = vld1q_f64(pInput); + fValueV = vsubq_f64(pInputV, fMeanV); + fsumV = vmlaq_f64(fsumV, fValueV, fValueV); + pInput += 2 ; + blkCnt--; + } + fSum = vaddvq_f64(fsumV); + + blkCnt = blockSize & 1 ; + +#else + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; +#endif + while (blkCnt > 0U) + { + fValue = *pInput++ - fMean; + fSum += fValue * fValue; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Variance */ + *pResult = fSum / (float64_t)(blockSize - 1.); } /** - @} end of variance group + @} end of variance group */