diff --git a/Source/BasicMathFunctions/arm_dot_prod_f64.c b/Source/BasicMathFunctions/arm_dot_prod_f64.c
index 4e300331..3fafd10c 100644
--- a/Source/BasicMathFunctions/arm_dot_prod_f64.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f64.c
@@ -47,85 +47,85 @@
*/
#if defined(ARM_MATH_NEON)
void arm_dot_prod_f64(
- const float64_t * pSrcA,
- const float64_t * pSrcB,
- uint32_t blockSize,
- float64_t * result)
- {
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary return variable */
- float64x2_t sumV ; /* Neon buffer for sum variable */
-
- /* Neon Buffer Initialisation */
- sumV = vsetq_lane_f64(0.0f, sumV, 0);
- sumV = vsetq_lane_f64(0.0f, sumV, 1);
-
- /* Neon Buffer for sources */
- float64x2_t pSrcAV;
- float64x2_t pSrcBV;
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize >> 1U;
-
- while (blkCnt > 0U)
- {
- /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-
- /* Load source value in Neon Buffer */
- pSrcAV = vld1q_f64(pSrcA);
- pSrcBV = vld1q_f64(pSrcB);
- /* Calculate dot product and store result in a temporary buffer. */
- sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV);
-
- pSrcA+=2;
- pSrcB+=2;
- /* Decrement loop counter */
- blkCnt--;
- }
- /* Sum both 64 bits part in the float64x2 */
- sum = vaddvq_f64(sumV);
-
-
- /* Tail */
- blkCnt = blockSize & 1 ;
-
- while(blkCnt > 0U)
- {
- sum += (*pSrcA++) * (*pSrcB++);
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result in destination buffer */
- *result = sum;
- }
+ const float64_t * pSrcA,
+ const float64_t * pSrcB,
+ uint32_t blockSize,
+ float64_t * result)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary return variable */
+ float64x2_t sumV ; /* Neon buffer for sum variable */
+
+ /* Neon Buffer Initialisation */
+ sumV = vsetq_lane_f64(0.0f, sumV, 0);
+ sumV = vsetq_lane_f64(0.0f, sumV, 1);
+
+ /* Neon Buffer for sources */
+ float64x2_t pSrcAV;
+ float64x2_t pSrcBV;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+ /* Load source value in Neon Buffer */
+ pSrcAV = vld1q_f64(pSrcA);
+ pSrcBV = vld1q_f64(pSrcB);
+ /* Calculate dot product and store result in a temporary buffer. */
+ sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV);
+
+ pSrcA+=2;
+ pSrcB+=2;
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ /* Sum both 64 bits part in the float64x2 */
+ sum = vaddvq_f64(sumV);
+
+
+ /* Tail */
+ blkCnt = blockSize & 1 ;
+
+ while(blkCnt > 0U)
+ {
+ sum += (*pSrcA++) * (*pSrcB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *result = sum;
+}
#else
void arm_dot_prod_f64(
- const float64_t * pSrcA,
- const float64_t * pSrcB,
- uint32_t blockSize,
- float64_t * result)
+ const float64_t * pSrcA,
+ const float64_t * pSrcB,
+ uint32_t blockSize,
+ float64_t * result)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary return variable */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
-
- /* Calculate dot product and store result in a temporary buffer. */
- sum += (*pSrcA++) * (*pSrcB++);
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result in destination buffer */
- *result = sum;
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary return variable */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+ /* Calculate dot product and store result in a temporary buffer. */
+ sum += (*pSrcA++) * (*pSrcB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *result = sum;
}
#endif
diff --git a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
index 63090948..76bd3a63 100644
--- a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
+++ b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
@@ -48,49 +48,49 @@
*/
float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
-
- float64_t diff=0., maxVal,tmpA, tmpB;
- uint32_t blkCnt;
- maxVal = F64_MIN;
+
+ float64_t diff=0., maxVal,tmpA, tmpB;
+ uint32_t blkCnt;
+ maxVal = F64_MIN;
#if defined(ARM_MATH_NEON)
- float64x2_t diffV , tmpAV , tmpBV , maxValV ;
- maxValV = vdupq_n_f64(maxVal);
- blkCnt = blockSize >> 1U ;
- while(blkCnt > 0U)
- {
- tmpAV = vld1q_f64(pA);
- tmpBV = vld1q_f64(pB);
- diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV)));
- maxValV = vmaxq_f64(maxValV, diffV);
- pA+=2;
- pB+=2;
- blkCnt--;
- }
- maxVal =vgetq_lane_f64(maxValV, 0);
- if(maxVal < vgetq_lane_f64(maxValV, 1))
- {
- maxVal = vgetq_lane_f64(maxValV, 1);
- }
- blkCnt = blockSize & 1;
-
-
+ float64x2_t diffV , tmpAV , tmpBV , maxValV ;
+ maxValV = vdupq_n_f64(maxVal);
+ blkCnt = blockSize >> 1U ;
+ while(blkCnt > 0U)
+ {
+ tmpAV = vld1q_f64(pA);
+ tmpBV = vld1q_f64(pB);
+ diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV)));
+ maxValV = vmaxq_f64(maxValV, diffV);
+ pA+=2;
+ pB+=2;
+ blkCnt--;
+ }
+ maxVal =vgetq_lane_f64(maxValV, 0);
+ if(maxVal < vgetq_lane_f64(maxValV, 1))
+ {
+ maxVal = vgetq_lane_f64(maxValV, 1);
+ }
+ blkCnt = blockSize & 1;
+
+
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
-
- while(blkCnt > 0)
- {
- tmpA = *pA++;
- tmpB = *pB++;
- diff = fabs(tmpA - tmpB);
- if (diff > maxVal)
- {
- maxVal = diff;
- }
- blkCnt --;
- }
-
- return(maxVal);
+
+ while(blkCnt > 0)
+ {
+ tmpA = *pA++;
+ tmpB = *pB++;
+ diff = fabs(tmpA - tmpB);
+ if (diff > maxVal)
+ {
+ maxVal = diff;
+ }
+ blkCnt --;
+ }
+
+ return(maxVal);
}
/**
diff --git a/Source/DistanceFunctions/arm_cityblock_distance_f64.c b/Source/DistanceFunctions/arm_cityblock_distance_f64.c
index 2f63ffae..9edb7041 100644
--- a/Source/DistanceFunctions/arm_cityblock_distance_f64.c
+++ b/Source/DistanceFunctions/arm_cityblock_distance_f64.c
@@ -47,41 +47,41 @@
*/
float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
- float64_t accum,tmpA, tmpB;
- uint32_t blkCnt;
- accum = 0.;
-
+ float64_t accum,tmpA, tmpB;
+ uint32_t blkCnt;
+ accum = 0.;
+
#if defined(ARM_MATH_NEON)
- float64x2_t tmpAV, tmpBV,accumV , subV;
- accumV = vdupq_n_f64(0.0f);
- blkCnt = blockSize >> 1U;
-
- while(blkCnt > 0U)
- {
- tmpAV = vld1q_f64(pA);
- tmpBV = vld1q_f64(pB);
- subV = vabdq_f64(tmpAV, tmpBV);
- accumV = vaddq_f64(accumV, subV);
- pA+=2;
- pB+=2;
- blkCnt--;
- }
- accum = vaddvq_f64(accumV);
- blkCnt = blockSize & 1 ;
-
+ float64x2_t tmpAV, tmpBV,accumV , subV;
+ accumV = vdupq_n_f64(0.0f);
+ blkCnt = blockSize >> 1U;
+
+ while(blkCnt > 0U)
+ {
+ tmpAV = vld1q_f64(pA);
+ tmpBV = vld1q_f64(pB);
+ subV = vabdq_f64(tmpAV, tmpBV);
+ accumV = vaddq_f64(accumV, subV);
+ pA+=2;
+ pB+=2;
+ blkCnt--;
+ }
+ accum = vaddvq_f64(accumV);
+ blkCnt = blockSize & 1 ;
+
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
- while(blkCnt > 0)
- {
- tmpA = *pA++;
- tmpB = *pB++;
- accum += fabs(tmpA - tmpB);
-
- blkCnt--;
- }
-
- return(accum);
+ while(blkCnt > 0)
+ {
+ tmpA = *pA++;
+ tmpB = *pB++;
+ accum += fabs(tmpA - tmpB);
+
+ blkCnt--;
+ }
+
+ return(accum);
}
/**
diff --git a/Source/DistanceFunctions/arm_euclidean_distance_f64.c b/Source/DistanceFunctions/arm_euclidean_distance_f64.c
index 794bf93d..1b00c9bf 100644
--- a/Source/DistanceFunctions/arm_euclidean_distance_f64.c
+++ b/Source/DistanceFunctions/arm_euclidean_distance_f64.c
@@ -49,35 +49,35 @@
*/
float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
- float64_t accum=0.,tmp;
- uint32_t blkCnt;
+ float64_t accum=0.,tmp;
+ uint32_t blkCnt;
#if defined(ARM_MATH_NEON)
- float64x2_t accumV,tmpV , pAV ,pBV;
- accumV = vdupq_n_f64(0.0f);
- blkCnt = blockSize >> 1U;
- while(blkCnt > 0U)
- {
- pAV = vld1q_f64(pA);
- pBV = vld1q_f64(pB);
- tmpV = vsubq_f64(pAV, pBV);
- accumV = vmlaq_f64(accumV, tmpV, tmpV);
- pA+=2;
- pB+=2;
- blkCnt--;
- }
- accum = vaddvq_f64(accumV);
- blkCnt = blockSize & 1;
+ float64x2_t accumV,tmpV , pAV ,pBV;
+ accumV = vdupq_n_f64(0.0f);
+ blkCnt = blockSize >> 1U;
+ while(blkCnt > 0U)
+ {
+ pAV = vld1q_f64(pA);
+ pBV = vld1q_f64(pB);
+ tmpV = vsubq_f64(pAV, pBV);
+ accumV = vmlaq_f64(accumV, tmpV, tmpV);
+ pA+=2;
+ pB+=2;
+ blkCnt--;
+ }
+ accum = vaddvq_f64(accumV);
+ blkCnt = blockSize & 1;
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
- while(blkCnt > 0)
- {
- tmp = *pA++ - *pB++;
- accum += SQ(tmp);
- blkCnt --;
- }
- tmp = sqrt(accum);
- return(tmp);
+ while(blkCnt > 0)
+ {
+ tmp = *pA++ - *pB++;
+ accum += SQ(tmp);
+ blkCnt --;
+ }
+ tmp = sqrt(accum);
+ return(tmp);
}
/**
diff --git a/Source/FastMathFunctions/arm_vexp_f64.c b/Source/FastMathFunctions/arm_vexp_f64.c
index e6c8f359..d275e1d9 100644
--- a/Source/FastMathFunctions/arm_vexp_f64.c
+++ b/Source/FastMathFunctions/arm_vexp_f64.c
@@ -33,44 +33,44 @@
#endif
void arm_vexp_f64(
- const float64_t * pSrc,
- float64_t * pDst,
- uint32_t blockSize)
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt;
+ uint32_t blkCnt;
#if defined(ARM_MATH_NEON)
-
-
- float64x2_t src;
- float64x2_t dst;
-
- blkCnt = blockSize >> 1U;
-
- while (blkCnt > 0U)
- {
- src = vld1q_f64(pSrc);
- dst = vexpq_f64(src);
- vst1q_f64(pDst, dst);
-
- pSrc += 2;
- pDst += 2;
-
- blkCnt--;
- }
-
- blkCnt = blockSize & 1;
+
+
+ float64x2_t src;
+ float64x2_t dst;
+
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ src = vld1q_f64(pSrc);
+ dst = vexpq_f64(src);
+ vst1q_f64(pDst, dst);
+
+ pSrc += 2;
+ pDst += 2;
+
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 1;
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
- while (blkCnt > 0U)
- {
- /* C = log(A) */
-
-
- /* Calculate log and store result in destination buffer. */
- *pDst++ = exp(*pSrc++);
-
- /* Decrement loop counter */
- blkCnt--;
- }
+ while (blkCnt > 0U)
+ {
+ /* C = log(A) */
+
+
+ /* Calculate log and store result in destination buffer. */
+ *pDst++ = exp(*pSrc++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
}
diff --git a/Source/FastMathFunctions/arm_vlog_f64.c b/Source/FastMathFunctions/arm_vlog_f64.c
index b9ab25ec..008c22dc 100644
--- a/Source/FastMathFunctions/arm_vlog_f64.c
+++ b/Source/FastMathFunctions/arm_vlog_f64.c
@@ -34,43 +34,43 @@
#endif
void arm_vlog_f64(
- const float64_t * pSrc,
- float64_t * pDst,
- uint32_t blockSize)
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
{
- uint32_t blkCnt;
+ uint32_t blkCnt;
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
- float64x2_t src;
- float64x2_t dst;
-
- blkCnt = blockSize >> 1U;
-
- while (blkCnt > 0U)
- {
- src = vld1q_f64(pSrc);
- dst = vlogq_f64(src);
- vst1q_f64(pDst, dst);
-
- pSrc += 2;
- pDst += 2;
- /* Decrement loop counter */
- blkCnt--;
- }
-
- blkCnt = blockSize & 1;
+ float64x2_t src;
+ float64x2_t dst;
+
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ src = vld1q_f64(pSrc);
+ dst = vlogq_f64(src);
+ vst1q_f64(pDst, dst);
+
+ pSrc += 2;
+ pDst += 2;
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 1;
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
-
-
- while (blkCnt > 0U)
- {
- /* C = log(A) */
-
- /* Calculate log and store result in destination buffer. */
- *pDst++ = log(*pSrc++);
-
- /* Decrement loop counter */
- blkCnt--;
- }
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = log(A) */
+
+ /* Calculate log and store result in destination buffer. */
+ *pDst++ = log(*pSrc++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
}
diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
index e936c3fd..99fb2b41 100644
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
@@ -29,574 +29,574 @@
#include "dsp/filtering_functions.h"
/**
- @ingroup groupFilters
-*/
+ @ingroup groupFilters
+ */
/**
- @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
-
- This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
- The filters are implemented as a cascade of second order Biquad sections.
- These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
- Only floating-point data is supported.
-
- This function operate on blocks of input and output data and each call to the function
- processes blockSize samples through the filter.
- pSrc points to the array of input data and
- pDst points to the array of output data.
- Both arrays contain blockSize values.
-
- @par Algorithm
- Each Biquad stage implements a second order filter using the difference equation:
-
- y[n] = b0 * x[n] + d1 - d1 = b1 * x[n] + a1 * y[n] + d2 - d2 = b2 * x[n] + a2 * y[n] -- where d1 and d2 represent the two state values. - @par - A Biquad filter using a transposed Direct Form II structure is shown below. - \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" - Coefficients
b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
- Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
- Pay careful attention to the sign of the feedback coefficients.
- Some design tools flip the sign of the feedback coefficients:
- - y[n] = b0 * x[n] + d1; - d1 = b1 * x[n] - a1 * y[n] + d2; - d2 = b2 * x[n] - a2 * y[n]; -- In this case the feedback coefficients
a1 and a2 must be negated when used with the CMSIS DSP Library.
- @par
- Higher order filters are realized as a cascade of second order sections.
- numStages refers to the number of second order stages used.
- For example, an 8th order filter would be realized with numStages=4 second order stages.
- A 9th order filter would be realized with numStages=5 second order stages with the
- coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
- @par
- pState points to the state variable array.
- Each Biquad stage has 2 state variables d1 and d2.
- The state variables are arranged in the pState array as:
-
- {d11, d12, d21, d22, ...}
-
- where d1x refers to the state variables for the first Biquad and
- d2x refers to the state variables for the second Biquad.
- The state array has a total length of 2*numStages values.
- The state variables are updated after each block of data is processed; the coefficients are untouched.
- @par
- The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
- The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
- That is why the Direct Form I structure supports Q15 and Q31 data types.
- The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
- Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
- The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
-
- @par Instance Structure
- The coefficients and state variables for a filter are stored together in an instance data structure.
- A separate instance structure must be defined for each filter.
- Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
-
- @par Init Functions
- There is also an associated initialization function.
- The initialization function performs following operations:
- - Sets the values of the internal structure fields.
- - Zeros out the values in the state buffer.
- To do this manually without calling the init function, assign the follow subfields of the instance structure:
- numStages, pCoeffs, pState. Also set all of the values in pState to zero.
- @par
- Use of the initialization function is optional.
- However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
- To place an instance structure into a const data section, the instance structure must be manually initialized.
- Set the values in the state buffer to zeros before static initialization.
- For example, to statically initialize the instance structure use
-
- arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
- arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
-
- where numStages is the number of Biquad stages in the filter;
- pState is the address of the state buffer.
- pCoeffs is the address of the coefficient buffer;
-*/
+ @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
+
+ This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
+ The filters are implemented as a cascade of second order Biquad sections.
+ These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
+ Only floating-point data is supported.
+
+ This function operate on blocks of input and output data and each call to the function
+ processes blockSize samples through the filter.
+ pSrc points to the array of input data and
+ pDst points to the array of output data.
+ Both arrays contain blockSize values.
+
+ @par Algorithm
+ Each Biquad stage implements a second order filter using the difference equation:
+ + y[n] = b0 * x[n] + d1 + d1 = b1 * x[n] + a1 * y[n] + d2 + d2 = b2 * x[n] + a2 * y[n] ++ where d1 and d2 represent the two state values. + @par + A Biquad filter using a transposed Direct Form II structure is shown below. + \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad" + Coefficients
b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ Pay careful attention to the sign of the feedback coefficients.
+ Some design tools flip the sign of the feedback coefficients:
+ + y[n] = b0 * x[n] + d1; + d1 = b1 * x[n] - a1 * y[n] + d2; + d2 = b2 * x[n] - a2 * y[n]; ++ In this case the feedback coefficients
a1 and a2 must be negated when used with the CMSIS DSP Library.
+ @par
+ Higher order filters are realized as a cascade of second order sections.
+ numStages refers to the number of second order stages used.
+ For example, an 8th order filter would be realized with numStages=4 second order stages.
+ A 9th order filter would be realized with numStages=5 second order stages with the
+ coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
+ @par
+ pState points to the state variable array.
+ Each Biquad stage has 2 state variables d1 and d2.
+ The state variables are arranged in the pState array as:
+
+ {d11, d12, d21, d22, ...}
+
+ where d1x refers to the state variables for the first Biquad and
+ d2x refers to the state variables for the second Biquad.
+ The state array has a total length of 2*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+ @par
+ The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
+ The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
+ That is why the Direct Form I structure supports Q15 and Q31 data types.
+ The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
+ Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
+ The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+
+ @par Init Functions
+ There is also an associated initialization function.
+ The initialization function performs following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ For example, to statically initialize the instance structure use
+
+ arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};
+ arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};
+
+ where numStages is the number of Biquad stages in the filter;
+ pState is the address of the state buffer.
+ pCoeffs is the address of the coefficient buffer;
+ */
/**
- @addtogroup BiquadCascadeDF2T
- @{
+ @addtogroup BiquadCascadeDF2T
+ @{
*/
/**
- @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
- @param[in] S points to an instance of the filter data structure
- @param[in] pSrc points to the block of input data
- @param[out] pDst points to the block of output data
- @param[in] blockSize number of samples to process
- @return none
+ @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in] S points to an instance of the filter data structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
*/
#if defined(ARM_MATH_NEON)
void arm_biquad_cascade_df2T_f64(
- const arm_biquad_cascade_df2T_instance_f64 * S,
- const float64_t * pSrc,
- float64_t * pDst,
- uint32_t blockSize)
+ const arm_biquad_cascade_df2T_instance_f64 * S,
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
{
-
-
-
- const float64_t *pIn = pSrc; /* source pointer */
- float64_t Xn0, Xn1;
- float64_t acc0, acc1;
- float64_t *pOut = pDst; /* destination pointer */
- float64_t *pState = S->pState; /* State pointer */
- uint32_t sample, stage = S->numStages; /* loop counters */
- float64_t const *pCurCoeffs = /* coefficient pointer */
- (float64_t const *) S->pCoeffs;
- float64x2_t b0Coeffs, a0Coeffs; /* Coefficients vector */
- float64x2_t state; /* State vector*/
- float64x2_t zeroV = vdupq_n_f64(0);
-
- float64_t b0 ;
-
-
- do
- {
-
- /* Reading the coefficients */
- b0 = *pCurCoeffs++ ;
- b0Coeffs = vld1q_f64(pCurCoeffs);
- pCurCoeffs += 2 ;
- a0Coeffs = vld1q_f64(pCurCoeffs);
- pCurCoeffs +=2 ;
-
- state = vld1q_f64(pState);
-
- sample = blockSize >> 1U;
- while (sample > 0U)
- {
-
- /* y[n] = b0 * x[n] + d1 */
- /* d1 = b1 * x[n] + a1 * y[n] + d2 */
- /* d2 = b2 * x[n] + a2 * y[n] */
-
- Xn0 = *pIn++ ;
-
- /* Calculation of acc0*/
- acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
-
-
- /*
- * final initial
- * state b0Coeffs state a0Coeffs
- * | | | |
- * __ __ __ __
- * / \ / \ / \ / \
- * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
- * | d2 | | b2 | | 0 | | a2 |
- * \__/ \__/ \__/ \__/
- */
-
- /* state -> initial state (see above) */
-
- state = vextq_f64(state, zeroV, 1);
-
- /* Calculation of final state */
- state = vfmaq_n_f64(state, b0Coeffs, Xn0);
- state = vfmaq_n_f64(state, a0Coeffs, acc0);
-
- *pOut++ = acc0 ;
-
- /* y[n] = b0 * x[n] + d1 */
- /* d1 = b1 * x[n] + a1 * y[n] + d2 */
- /* d2 = b2 * x[n] + a2 * y[n] */
-
- Xn0 = *pIn++ ;
-
- /* Calculation of acc0*/
- acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
-
-
- /*
- * final initial
- * state b0Coeffs state a0Coeffs
- * | | | |
- * __ __ __ __
- * / \ / \ / \ / \
- * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
- * | d2 | | b2 | | 0 | | a2 |
- * \__/ \__/ \__/ \__/
- */
-
- /* state -> initial state (see above) */
-
- state = vextq_f64(state, zeroV, 1);
-
- /* Calculation of final state */
- state = vfmaq_n_f64(state, b0Coeffs, Xn0);
- state = vfmaq_n_f64(state, a0Coeffs, acc0);
-
- *pOut++ = acc0;
- sample--;
- }
- sample = blockSize & 1 ;
- while (sample > 0U)
- {
-
- /* y[n] = b0 * x[n] + d1 */
- /* d1 = b1 * x[n] + a1 * y[n] + d2 */
- /* d2 = b2 * x[n] + a2 * y[n] */
-
- Xn0 = *pIn++ ;
-
- /* Calculation of acc0*/
- acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
-
-
- /*
- * final initial
- * state b0Coeffs state a0Coeffs
- * | | | |
- * __ __ __ __
- * / \ / \ / \ / \
- * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
- * | d2 | | b2 | | 0 | | a2 |
- * \__/ \__/ \__/ \__/
- */
-
- /* state -> initial state (see above) */
-
- state = vextq_f64(state, zeroV, 1);
-
- /* Calculation of final state */
- state = vfmaq_n_f64(state, b0Coeffs, Xn0);
- state = vfmaq_n_f64(state, a0Coeffs, acc0);
-
- *pOut++ = acc0 ;
- sample--;
- }
- /* Store the updated state variables back into the state array */
- pState[0] = vgetq_lane_f64(state, 0);
- pState[1] = vgetq_lane_f64(state, 1);
-
- pState += 2U;
-
- /* The current stage output is given as the input to the next stage */
- pIn = pDst;
-
- /* Reset the output working pointer */
- pOut = pDst;
-
- stage--;
-
- } while (stage > 0U);
-
+
+
+
+ const float64_t *pIn = pSrc; /* source pointer */
+ float64_t Xn0, Xn1;
+ float64_t acc0, acc1;
+ float64_t *pOut = pDst; /* destination pointer */
+ float64_t *pState = S->pState; /* State pointer */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+ float64_t const *pCurCoeffs = /* coefficient pointer */
+ (float64_t const *) S->pCoeffs;
+ float64x2_t b0Coeffs, a0Coeffs; /* Coefficients vector */
+ float64x2_t state; /* State vector*/
+ float64x2_t zeroV = vdupq_n_f64(0);
+
+ float64_t b0 ;
+
+
+ do
+ {
+
+ /* Reading the coefficients */
+ b0 = *pCurCoeffs++ ;
+ b0Coeffs = vld1q_f64(pCurCoeffs);
+ pCurCoeffs += 2 ;
+ a0Coeffs = vld1q_f64(pCurCoeffs);
+ pCurCoeffs +=2 ;
+
+ state = vld1q_f64(pState);
+
+ sample = blockSize >> 1U;
+ while (sample > 0U)
+ {
+
+ /* y[n] = b0 * x[n] + d1 */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ /* d2 = b2 * x[n] + a2 * y[n] */
+
+ Xn0 = *pIn++ ;
+
+ /* Calculation of acc0*/
+ acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
+
+
+ /*
+ * final initial
+ * state b0Coeffs state a0Coeffs
+ * | | | |
+ * __ __ __ __
+ * / \ / \ / \ / \
+ * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
+ * | d2 | | b2 | | 0 | | a2 |
+ * \__/ \__/ \__/ \__/
+ */
+
+ /* state -> initial state (see above) */
+
+ state = vextq_f64(state, zeroV, 1);
+
+ /* Calculation of final state */
+ state = vfmaq_n_f64(state, b0Coeffs, Xn0);
+ state = vfmaq_n_f64(state, a0Coeffs, acc0);
+
+ *pOut++ = acc0 ;
+
+ /* y[n] = b0 * x[n] + d1 */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ /* d2 = b2 * x[n] + a2 * y[n] */
+
+ Xn0 = *pIn++ ;
+
+ /* Calculation of acc0*/
+ acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
+
+
+ /*
+ * final initial
+ * state b0Coeffs state a0Coeffs
+ * | | | |
+ * __ __ __ __
+ * / \ / \ / \ / \
+ * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
+ * | d2 | | b2 | | 0 | | a2 |
+ * \__/ \__/ \__/ \__/
+ */
+
+ /* state -> initial state (see above) */
+
+ state = vextq_f64(state, zeroV, 1);
+
+ /* Calculation of final state */
+ state = vfmaq_n_f64(state, b0Coeffs, Xn0);
+ state = vfmaq_n_f64(state, a0Coeffs, acc0);
+
+ *pOut++ = acc0;
+ sample--;
+ }
+ sample = blockSize & 1 ;
+ while (sample > 0U)
+ {
+
+ /* y[n] = b0 * x[n] + d1 */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ /* d2 = b2 * x[n] + a2 * y[n] */
+
+ Xn0 = *pIn++ ;
+
+ /* Calculation of acc0*/
+ acc0 = b0*Xn0+vgetq_lane_f64(state, 0);
+
+
+ /*
+ * final initial
+ * state b0Coeffs state a0Coeffs
+ * | | | |
+ * __ __ __ __
+ * / \ / \ / \ / \
+ * | d1 | = | b1 | * Xn0 + | d2 | + | a1 | x acc0
+ * | d2 | | b2 | | 0 | | a2 |
+ * \__/ \__/ \__/ \__/
+ */
+
+ /* state -> initial state (see above) */
+
+ state = vextq_f64(state, zeroV, 1);
+
+ /* Calculation of final state */
+ state = vfmaq_n_f64(state, b0Coeffs, Xn0);
+ state = vfmaq_n_f64(state, a0Coeffs, acc0);
+
+ *pOut++ = acc0 ;
+ sample--;
+ }
+ /* Store the updated state variables back into the state array */
+ pState[0] = vgetq_lane_f64(state, 0);
+ pState[1] = vgetq_lane_f64(state, 1);
+
+ pState += 2U;
+
+ /* The current stage output is given as the input to the next stage */
+ pIn = pDst;
+
+ /* Reset the output working pointer */
+ pOut = pDst;
+
+ stage--;
+
+ } while (stage > 0U);
+
}
#else
void arm_biquad_cascade_df2T_f64(
- const arm_biquad_cascade_df2T_instance_f64 * S,
- const float64_t * pSrc,
- float64_t * pDst,
- uint32_t blockSize)
+ const arm_biquad_cascade_df2T_instance_f64 * S,
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
{
-
- const float64_t *pIn = pSrc; /* Source pointer */
- float64_t *pOut = pDst; /* Destination pointer */
- float64_t *pState = S->pState; /* State pointer */
- const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float64_t acc1; /* Accumulator */
- float64_t b0, b1, b2, a1, a2; /* Filter coefficients */
- float64_t Xn1; /* Temporary input */
- float64_t d1, d2; /* State variables */
- uint32_t sample, stage = S->numStages; /* Loop counters */
-
-
- do
- {
- /* Reading the coefficients */
- b0 = pCoeffs[0];
- b1 = pCoeffs[1];
- b2 = pCoeffs[2];
- a1 = pCoeffs[3];
- a2 = pCoeffs[4];
-
- /* Reading the state values */
- d1 = pState[0];
- d2 = pState[1];
-
- pCoeffs += 5U;
-
+
+ const float64_t *pIn = pSrc; /* Source pointer */
+ float64_t *pOut = pDst; /* Destination pointer */
+ float64_t *pState = S->pState; /* State pointer */
+ const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float64_t acc1; /* Accumulator */
+ float64_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ float64_t Xn1; /* Temporary input */
+ float64_t d1, d2; /* State variables */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
+
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = pCoeffs[0];
+ b1 = pCoeffs[1];
+ b2 = pCoeffs[2];
+ a1 = pCoeffs[3];
+ a2 = pCoeffs[4];
+
+ /* Reading the state values */
+ d1 = pState[0];
+ d2 = pState[1];
+
+ pCoeffs += 5U;
+
#if defined (ARM_MATH_LOOPUNROLL)
-
- /* Loop unrolling: Compute 16 outputs at a time */
- sample = blockSize >> 4U;
-
- while (sample > 0U) {
-
- /* y[n] = b0 * x[n] + d1 */
- /* d1 = b1 * x[n] + a1 * y[n] + d2 */
- /* d2 = b2 * x[n] + a2 * y[n] */
-
-/* 1 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-
-/* 2 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 3 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 4 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 5 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 6 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 7 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 8 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 9 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 10 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 11 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 12 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 13 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 14 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 15 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
-/* 16 */
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
- /* decrement loop counter */
- sample--;
- }
-
- /* Loop unrolling: Compute remaining outputs */
- sample = blockSize & 0xFU;
-
+
+ /* Loop unrolling: Compute 16 outputs at a time */
+ sample = blockSize >> 4U;
+
+ while (sample > 0U) {
+
+ /* y[n] = b0 * x[n] + d1 */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ /* d2 = b2 * x[n] + a2 * y[n] */
+
+ /* 1 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+
+ /* 2 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 3 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 4 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 5 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 6 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 7 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 8 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 9 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 10 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 11 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 12 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 13 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 14 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 15 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* 16 */
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* decrement loop counter */
+ sample--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ sample = blockSize & 0xFU;
+
#else
-
- /* Initialize blkCnt with number of samples */
- sample = blockSize;
-
+
+ /* Initialize blkCnt with number of samples */
+ sample = blockSize;
+
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
- while (sample > 0U) {
- Xn1 = *pIn++;
-
- acc1 = b0 * Xn1 + d1;
-
- d1 = b1 * Xn1 + d2;
- d1 += a1 * acc1;
-
- d2 = b2 * Xn1;
- d2 += a2 * acc1;
-
- *pOut++ = acc1;
-
- /* decrement loop counter */
- sample--;
- }
-
- /* Store the updated state variables back into the state array */
- pState[0] = d1;
- pState[1] = d2;
-
- pState += 2U;
-
- /* The current stage output is given as the input to the next stage */
- pIn = pDst;
-
- /* Reset the output working pointer */
- pOut = pDst;
-
- /* decrement loop counter */
- stage--;
-
- } while (stage > 0U);
-
+
+ while (sample > 0U) {
+ Xn1 = *pIn++;
+
+ acc1 = b0 * Xn1 + d1;
+
+ d1 = b1 * Xn1 + d2;
+ d1 += a1 * acc1;
+
+ d2 = b2 * Xn1;
+ d2 += a2 * acc1;
+
+ *pOut++ = acc1;
+
+ /* decrement loop counter */
+ sample--;
+ }
+
+ /* Store the updated state variables back into the state array */
+ pState[0] = d1;
+ pState[1] = d2;
+
+ pState += 2U;
+
+ /* The current stage output is given as the input to the next stage */
+ pIn = pDst;
+
+ /* Reset the output working pointer */
+ pOut = pDst;
+
+ /* decrement loop counter */
+ stage--;
+
+ } while (stage > 0U);
+
}
#endif
/**
- @} end of BiquadCascadeDF2T group
+ @} end of BiquadCascadeDF2T group
*/
diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
index 5ab1c9a2..90e83df7 100644
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
@@ -63,22 +63,22 @@
*/
void arm_biquad_cascade_df2T_init_f64(
- arm_biquad_cascade_df2T_instance_f64 * S,
- uint8_t numStages,
- const float64_t * pCoeffs,
- float64_t * pState)
+ arm_biquad_cascade_df2T_instance_f64 * S,
+ uint8_t numStages,
+ const float64_t * pCoeffs,
+ float64_t * pState)
{
- /* Assign filter stages */
- S->numStages = numStages;
-
- /* Assign coefficient pointer */
- S->pCoeffs = pCoeffs;
-
- /* Clear state buffer and size is always 2 * numStages */
- memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t));
-
- /* Assign state pointer */
- S->pState = pState;
+ /* Assign filter stages */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always 2 * numStages */
+ memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
}
/**
diff --git a/Source/FilteringFunctions/arm_correlate_f64.c b/Source/FilteringFunctions/arm_correlate_f64.c
index 8d52bcf6..1d9c4bd7 100644
--- a/Source/FilteringFunctions/arm_correlate_f64.c
+++ b/Source/FilteringFunctions/arm_correlate_f64.c
@@ -48,384 +48,384 @@
*/
void arm_correlate_f64(
- const float64_t * pSrcA,
- uint32_t srcALen,
- const float64_t * pSrcB,
- uint32_t srcBLen,
- float64_t * pDst)
+ const float64_t * pSrcA,
+ uint32_t srcALen,
+ const float64_t * pSrcB,
+ uint32_t srcBLen,
+ float64_t * pDst)
{
- const float64_t *pIn1; /* InputA pointer */
- const float64_t *pIn2; /* InputB pointer */
- float64_t *pOut = pDst; /* Output pointer */
- const float64_t *px; /* Intermediate inputA pointer */
- const float64_t *py; /* Intermediate inputB pointer */
- const float64_t *pSrc1;
- float64_t sum;
- uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
- uint32_t j, k, count, blkCnt; /* Loop counters */
- uint32_t outBlockSize; /* Loop counter */
- int32_t inc = 1; /* Destination address modifier */
+ const float64_t *pIn1; /* InputA pointer */
+ const float64_t *pIn2; /* InputB pointer */
+ float64_t *pOut = pDst; /* Output pointer */
+ const float64_t *px; /* Intermediate inputA pointer */
+ const float64_t *py; /* Intermediate inputB pointer */
+ const float64_t *pSrc1;
+ float64_t sum;
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Destination address modifier */
#if defined(ARM_MATH_NEON)
- float64x2_t sumV,pxV,pyV ;
+ float64x2_t sumV,pxV,pyV ;
#endif
-
- /* The algorithm implementation is based on the lengths of the inputs. */
- /* srcB is always made to slide across srcA. */
- /* So srcBLen is always considered as shorter or equal to srcALen */
- /* But CORR(x, y) is reverse of CORR(y, x) */
- /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
- /* and the destination pointer modifier, inc is set to -1 */
- /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
- /* But to improve the performance,
- * we assume zeroes in the output instead of zero padding either of the the inputs*/
- /* If srcALen > srcBLen,
- * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
- /* If srcALen < srcBLen,
- * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
- if (srcALen >= srcBLen)
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcA;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcB;
-
- /* Number of output samples is calculated */
- outBlockSize = (2U * srcALen) - 1U;
-
- /* When srcALen > srcBLen, zero padding has to be done to srcB
- * to make their lengths equal.
- * Instead, (outBlockSize - (srcALen + srcBLen - 1))
- * number of output samples are made zero */
- j = outBlockSize - (srcALen + (srcBLen - 1U));
-
- /* Updating the pointer position to non zero value */
- pOut += j;
- }
- else
- {
- /* Initialization of inputA pointer */
- pIn1 = pSrcB;
-
- /* Initialization of inputB pointer */
- pIn2 = pSrcA;
-
- /* srcBLen is always considered as shorter or equal to srcALen */
- j = srcBLen;
- srcBLen = srcALen;
- srcALen = j;
-
- /* CORR(x, y) = Reverse order(CORR(y, x)) */
- /* Hence set the destination pointer to point to the last output sample */
- pOut = pDst + ((srcALen + srcBLen) - 2U);
-
- /* Destination address modifier is set to -1 */
- inc = -1;
- }
-
- /* The function is internally
- * divided into three stages according to the number of multiplications that has to be
- * taken place between inputA samples and inputB samples. In the first stage of the
- * algorithm, the multiplications increase by one for every iteration.
- * In the second stage of the algorithm, srcBLen number of multiplications are done.
- * In the third stage of the algorithm, the multiplications decrease by one
- * for every iteration. */
-
- /* The algorithm is implemented in three stages.
- The loop counters of each stage is initiated here. */
- blockSize1 = srcBLen - 1U;
- blockSize2 = srcALen - (srcBLen - 1U);
- blockSize3 = blockSize1;
-
- /* --------------------------
- * Initializations of stage1
- * -------------------------*/
-
- /* sum = x[0] * y[srcBlen - 1]
- * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
- * ....
- * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
- */
-
- /* In this stage the MAC operations are increased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
- count = 1U;
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- pSrc1 = pIn2 + (srcBLen - 1U);
- py = pSrc1;
-
- /* ------------------------
- * Stage1 process
- * ----------------------*/
-
- /* The first stage starts here */
- while (blockSize1 > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0.;
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we assume zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding has to be done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.;
#if defined(ARM_MATH_NEON)
- sumV = vdupq_n_f64(0.0f);
- k = count >> 1U ;
-
- while(k > 0U)
- {
- pxV = vld1q_f64(px);
- pyV = vld1q_f64(py);
- sumV = vmlaq_f64(sumV, pxV, pyV);
- px+=2;
- py+=2;
- k--;
- }
- sum =vaddvq_f64(sumV);
- k = count & 1 ;
+ sumV = vdupq_n_f64(0.0f);
+ k = count >> 1U ;
+
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = count & 1 ;
#else
- /* Initialize k with number of samples */
- k = count;
+ /* Initialize k with number of samples */
+ k = count;
#endif
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- /* x[0] * y[srcBLen - 1] */
- sum += *px++ * *py++;
-
- /* Decrement loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
-
- *pOut = sum;
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- py = pSrc1 - count;
- px = pIn1;
-
- /* Increment MAC count */
- count++;
-
- /* Decrement loop counter */
- blockSize1--;
-
- }
-
- /* --------------------------
- * Initializations of stage2
- * ------------------------*/
-
- /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
- * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
- * ....
- * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
- */
-
- /* Working pointer of inputA */
- px = pIn1;
-
- /* Working pointer of inputB */
- py = pIn2;
-
- /* count is index by which the pointer pIn1 to be incremented */
- count = 0U;
-
- /* -------------------
- * Stage2 process
- * ------------------*/
-
- /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
- * So, to loop unroll over blockSize2,
- * srcBLen should be greater than or equal to 4 */
- if (srcBLen >= 4U)
- {
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize2;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0.;
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.;
#if defined(ARM_MATH_NEON)
- sumV = vdupq_n_f64(0.0f);
- k = srcBLen >> 1U ;
- while(k > 0U)
- {
- pxV = vld1q_f64(px);
- pyV = vld1q_f64(py);
- sumV = vmlaq_f64(sumV, pxV, pyV);
- px+=2;
- py+=2;
- k--;
- }
- sum =vaddvq_f64(sumV);
- k = srcBLen & 1 ;
-
+ sumV = vdupq_n_f64(0.0f);
+ k = srcBLen >> 1U ;
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = srcBLen & 1 ;
+
#else
-
- /* Initialize blkCnt with number of samples */
- k = srcBLen;
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
#endif
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- sum += *px++ * *py++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = sum;
-
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Increment the pointer pIn1 index, count by 1 */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pIn2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
- else
- {
- /* If the srcBLen is not a multiple of 4,
- * the blockSize2 loop cannot be unrolled by 4 */
- blkCnt = blockSize2;
-
- while (blkCnt > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0.;
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.;
#if defined(ARM_MATH_NEON)
- sumV = vdupq_n_f64(0.0f);
- k = srcBLen >> 1U ;
- while(k > 0U)
- {
- pxV = vld1q_f64(px);
- pyV = vld1q_f64(py);
- sumV = vmlaq_f64(sumV, pxV, pyV);
- px+=2;
- py+=2;
- k--;
- }
- sum =vaddvq_f64(sumV);
- k = srcBLen & 1 ;
-
+ sumV = vdupq_n_f64(0.0f);
+ k = srcBLen >> 1U ;
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = srcBLen & 1 ;
+
#else
-
- /* Loop over srcBLen */
- k = srcBLen;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
#endif
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- sum += *px++ * *py++;
-
- /* Decrement the loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = sum;
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Increment the pointer pIn1 index, count by 1 */
- count++;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = pIn1 + count;
- py = pIn2;
-
- /* Decrement the loop counter */
- blkCnt--;
- }
- }
-
-
- /* --------------------------
- * Initializations of stage3
- * -------------------------*/
-
- /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
- * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
- * ....
- * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
- * sum += x[srcALen-1] * y[0]
- */
-
- /* In this stage the MAC operations are decreased by 1 for every iteration.
- The count variable holds the number of MAC operations performed */
- count = srcBLen - 1U;
-
- /* Working pointer of inputA */
- pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
- px = pSrc1;
-
- /* Working pointer of inputB */
- py = pIn2;
-
- /* -------------------
- * Stage3 process
- * ------------------*/
-
- while (blockSize3 > 0U)
- {
- /* Accumulator is made zero for every iteration */
- sum = 0.;
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.;
#if defined(ARM_MATH_NEON)
- sumV = vdupq_n_f64(0.0f);
- k = count >> 1U ;
-
- while(k > 0U)
- {
- pxV = vld1q_f64(px);
- pyV = vld1q_f64(py);
- sumV = vmlaq_f64(sumV, pxV, pyV);
- px+=2;
- py+=2;
- k--;
- }
- sum =vaddvq_f64(sumV);
- k = count & 1 ;
+ sumV = vdupq_n_f64(0.0f);
+ k = count >> 1U ;
+
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = count & 1 ;
#else
-
- /* Initialize blkCnt with number of samples */
- k = count;
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
#endif
- while (k > 0U)
- {
- /* Perform the multiply-accumulate */
- sum += *px++ * *py++;
-
- /* Decrement loop counter */
- k--;
- }
-
- /* Store the result in the accumulator in the destination buffer. */
- *pOut = sum;
- /* Destination pointer is updated according to the address modifier, inc */
- pOut += inc;
-
- /* Update the inputA and inputB pointers for next MAC calculation */
- px = ++pSrc1;
- py = pIn2;
-
- /* Decrement MAC count */
- count--;
-
- /* Decrement the loop counter */
- blockSize3--;
- }
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
}
/**
diff --git a/Source/FilteringFunctions/arm_fir_f64.c b/Source/FilteringFunctions/arm_fir_f64.c
index 51fc8ece..9d598081 100644
--- a/Source/FilteringFunctions/arm_fir_f64.c
+++ b/Source/FilteringFunctions/arm_fir_f64.c
@@ -47,177 +47,177 @@
*/
#if defined(ARM_MATH_NEON)
void arm_fir_f64(
- const arm_fir_instance_f64 * S,
- const float64_t * pSrc,
- float64_t * pDst,
- uint32_t blockSize)
+ const arm_fir_instance_f64 * S,
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
{
- float64_t *pState = S->pState; /* State pointer */
- const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float64_t *pStateCurnt; /* Points to the current sample of the state */
- float64_t *px; /* Temporary pointer for state buffer */
- const float64_t *pb; /* Temporary pointer for coefficient buffer */
- float64x2_t pxV;
- float64x2_t pbV;
- float64x2_t acc0V;
- float64_t acc0; /* Accumulator */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
-
- /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Initialize blkCnt with number of taps */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0.;
- acc0V = vdupq_n_f64(0.0f);
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = pCoeffs;
-
- i = numTaps >> 1U;
-
- /* Perform the multiply-accumulates */
- while (i > 0U)
- {
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- pxV = vld1q_f64(px);
- pbV = vld1q_f64(pb);
- acc0V = vmlaq_f64(acc0V, pxV, pbV);
- px+=2;
- pb+=2;
-
- i--;
- }
-
- acc0 = vaddvq_f64(acc0V);
- i = numTaps%2 ;
- while(i >0U)
- {
- acc0+= *px++ * *pb++ ;
- i--;
- }
-
- /* Store result in destination buffer. */
-
- *pDst++ = acc0;
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1U;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Processing is complete.
- Now copy the last numTaps - 1 samples to the start of the state buffer.
- This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- /* Initialize tapCnt with number of taps */
- tapCnt = (numTaps - 1U);
-
- /* Copy remaining data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement loop counter */
- tapCnt--;
- }
-
+ float64_t *pState = S->pState; /* State pointer */
+ const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float64_t *pStateCurnt; /* Points to the current sample of the state */
+ float64_t *px; /* Temporary pointer for state buffer */
+ const float64_t *pb; /* Temporary pointer for coefficient buffer */
+ float64x2_t pxV;
+ float64x2_t pbV;
+ float64x2_t acc0V;
+ float64_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0.;
+ acc0V = vdupq_n_f64(0.0f);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps >> 1U;
+
+ /* Perform the multiply-accumulates */
+ while (i > 0U)
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ pxV = vld1q_f64(px);
+ pbV = vld1q_f64(pb);
+ acc0V = vmlaq_f64(acc0V, pxV, pbV);
+ px+=2;
+ pb+=2;
+
+ i--;
+ }
+
+ acc0 = vaddvq_f64(acc0V);
+ i = numTaps%2 ;
+ while(i >0U)
+ {
+ acc0+= *px++ * *pb++ ;
+ i--;
+ }
+
+ /* Store result in destination buffer. */
+
+ *pDst++ = acc0;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
}
#else
void arm_fir_f64(
- const arm_fir_instance_f64 * S,
- const float64_t * pSrc,
- float64_t * pDst,
- uint32_t blockSize)
+ const arm_fir_instance_f64 * S,
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
{
- float64_t *pState = S->pState; /* State pointer */
- const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float64_t *pStateCurnt; /* Points to the current sample of the state */
- float64_t *px; /* Temporary pointer for state buffer */
- const float64_t *pb; /* Temporary pointer for coefficient buffer */
- float64_t acc0; /* Accumulator */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t i, tapCnt, blkCnt; /* Loop counters */
-
- /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
- /* pStateCurnt points to the location where the new input data should be written */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
-
- /* Initialize blkCnt with number of taps */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0.;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = pCoeffs;
-
- i = numTaps;
-
- /* Perform the multiply-accumulates */
- while (i > 0U)
- {
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc0 += *px++ * *pb++;
-
- i--;
- }
-
- /* Store result in destination buffer. */
- *pDst++ = acc0;
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1U;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Processing is complete.
- Now copy the last numTaps - 1 samples to the start of the state buffer.
- This prepares the state buffer for the next function call. */
-
- /* Points to the start of the state buffer */
- pStateCurnt = S->pState;
-
- /* Initialize tapCnt with number of taps */
- tapCnt = (numTaps - 1U);
-
- /* Copy remaining data */
- while (tapCnt > 0U)
- {
- *pStateCurnt++ = *pState++;
-
- /* Decrement loop counter */
- tapCnt--;
- }
-
+ float64_t *pState = S->pState; /* State pointer */
+ const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float64_t *pStateCurnt; /* Points to the current sample of the state */
+ float64_t *px; /* Temporary pointer for state buffer */
+ const float64_t *pb; /* Temporary pointer for coefficient buffer */
+ float64_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0.;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ while (i > 0U)
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc0 += *px++ * *pb++;
+
+ i--;
+ }
+
+ /* Store result in destination buffer. */
+ *pDst++ = acc0;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
}
#endif
diff --git a/Source/FilteringFunctions/arm_fir_init_f64.c b/Source/FilteringFunctions/arm_fir_init_f64.c
index 645ad9f6..27c56596 100644
--- a/Source/FilteringFunctions/arm_fir_init_f64.c
+++ b/Source/FilteringFunctions/arm_fir_init_f64.c
@@ -61,22 +61,22 @@
*/
void arm_fir_init_f64(
- arm_fir_instance_f64 * S,
- uint16_t numTaps,
- const float64_t * pCoeffs,
- float64_t * pState,
- uint32_t blockSize)
+ arm_fir_instance_f64 * S,
+ uint16_t numTaps,
+ const float64_t * pCoeffs,
+ float64_t * pState,
+ uint32_t blockSize)
{
- /* Assign filter taps */
- S->numTaps = numTaps;
-
- /* Assign coefficient pointer */
- S->pCoeffs = pCoeffs;
-
- /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
- memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float64_t));
- /* Assign state pointer */
- S->pState = pState;
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float64_t));
+ /* Assign state pointer */
+ S->pState = pState;
}
/**
diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
index b1605116..bfc4234f 100755
--- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -55,223 +55,223 @@
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_cholesky_f64(
- const arm_matrix_instance_f64 * pSrc,
- arm_matrix_instance_f64 * pDst)
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
{
-
- arm_status status; /* status of matrix inverse */
-
-
+
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pSrc->numCols) ||
- (pDst->numRows != pDst->numCols) ||
- (pSrc->numRows != pDst->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- int i,j,k;
- int n = pSrc->numRows;
- float64_t invSqrtVj;
- float64_t *pA,*pG;
- int kCnt;
-
-
- float64x2_t acc, acc0, acc1, acc2, acc3;
- float64x2_t vecGi;
- float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
-
- float64x2_t tmp = vdupq_n_f64(0.0f);
-
- float64_t sum=0.0f;
- float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
-
-
- pA = pSrc->pData;
- pG = pDst->pData;
-
- for(i=0 ;i < n ; i++)
- {
- for(j=i ; j+3 < n ; j+=4)
- {
- pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
- pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
- pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
- pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
-
- acc0 = vdupq_n_f64(0.0f);
- acc1 = vdupq_n_f64(0.0f);
- acc2 = vdupq_n_f64(0.0f);
- acc3 = vdupq_n_f64(0.0f);
-
- kCnt = i >> 1U;
- k=0;
- while(kCnt > 0)
- {
-
- vecGi=vld1q_f64(&pG[i * n + k]);
-
- vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
- vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
- vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
- vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
-
- acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
- acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
- acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
- acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
-
- kCnt--;
- k+=2;
- }
-
-
- sum0 = vaddvq_f64(acc0);
- sum1 = vaddvq_f64(acc1);
- sum2 = vaddvq_f64(acc2);
- sum3 = vaddvq_f64(acc3);
-
-
- kCnt = i & 1;
- while(kCnt > 0)
- {
-
- sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
- sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
- sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
- sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
- kCnt--;
- k++;
- }
-
- pG[(j + 0) * n + i] -= sum0;
- pG[(j + 1) * n + i] -= sum1;
- pG[(j + 2) * n + i] -= sum2;
- pG[(j + 3) * n + i] -= sum3;
- }
-
- for(; j < n ; j++)
- {
- pG[j * n + i] = pA[j * n + i];
-
- acc = vdupq_n_f64(0.0f);
-
- kCnt = i >> 1U;
- k=0;
- while(kCnt > 0)
- {
-
- vecGi=vld1q_f64(&pG[i * n + k]);
- vecGj=vld1q_f64(&pG[j * n + k]);
-
- acc = vfmaq_f64(acc, vecGi, vecGj);
-
- kCnt--;
- k+=2;
- }
-
-
- sum = vaddvq_f64(acc);
-
- kCnt = i & 1;
- while(kCnt > 0)
- {
- sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
-
-
- kCnt--;
- k++;
- }
-
- pG[j * n + i] -= sum;
- }
-
- if (pG[i * n + i] <= 0.0f)
- {
- return(ARM_MATH_DECOMPOSITION_FAILURE);
- }
-
- invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
- SCALE_COL_F64(pDst,i,invSqrtVj,i);
- }
-
- status = ARM_MATH_SUCCESS;
-
- }
-
-
- /* Return to application */
- return (status);
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float64_t invSqrtVj;
+ float64_t *pA,*pG;
+ int kCnt;
+
+
+ float64x2_t acc, acc0, acc1, acc2, acc3;
+ float64x2_t vecGi;
+ float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+ float64x2_t tmp = vdupq_n_f64(0.0f);
+
+ float64_t sum=0.0f;
+ float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
+
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+ for(i=0 ;i < n ; i++)
+ {
+ for(j=i ; j+3 < n ; j+=4)
+ {
+ pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+ pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+ pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+ pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+ acc0 = vdupq_n_f64(0.0f);
+ acc1 = vdupq_n_f64(0.0f);
+ acc2 = vdupq_n_f64(0.0f);
+ acc3 = vdupq_n_f64(0.0f);
+
+ kCnt = i >> 1U;
+ k=0;
+ while(kCnt > 0)
+ {
+
+ vecGi=vld1q_f64(&pG[i * n + k]);
+
+ vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
+ vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
+ vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
+ vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
+
+ acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
+ acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
+ acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
+ acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
+
+ kCnt--;
+ k+=2;
+ }
+
+
+ sum0 = vaddvq_f64(acc0);
+ sum1 = vaddvq_f64(acc1);
+ sum2 = vaddvq_f64(acc2);
+ sum3 = vaddvq_f64(acc3);
+
+
+ kCnt = i & 1;
+ while(kCnt > 0)
+ {
+
+ sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
+ sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
+ sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
+ sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
+ kCnt--;
+ k++;
+ }
+
+ pG[(j + 0) * n + i] -= sum0;
+ pG[(j + 1) * n + i] -= sum1;
+ pG[(j + 2) * n + i] -= sum2;
+ pG[(j + 3) * n + i] -= sum3;
+ }
+
+ for(; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ acc = vdupq_n_f64(0.0f);
+
+ kCnt = i >> 1U;
+ k=0;
+ while(kCnt > 0)
+ {
+
+ vecGi=vld1q_f64(&pG[i * n + k]);
+ vecGj=vld1q_f64(&pG[j * n + k]);
+
+ acc = vfmaq_f64(acc, vecGi, vecGj);
+
+ kCnt--;
+ k+=2;
+ }
+
+
+ sum = vaddvq_f64(acc);
+
+ kCnt = i & 1;
+ while(kCnt > 0)
+ {
+ sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
+
+
+ kCnt--;
+ k++;
+ }
+
+ pG[j * n + i] -= sum;
+ }
+
+ if (pG[i * n + i] <= 0.0f)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+ SCALE_COL_F64(pDst,i,invSqrtVj,i);
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
}
#else
arm_status arm_mat_cholesky_f64(
- const arm_matrix_instance_f64 * pSrc,
- arm_matrix_instance_f64 * pDst)
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
{
-
- arm_status status; /* status of matrix inverse */
-
-
+
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pSrc->numCols) ||
- (pDst->numRows != pDst->numCols) ||
- (pSrc->numRows != pDst->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- int i,j,k;
- int n = pSrc->numRows;
- float64_t invSqrtVj;
- float64_t *pA,*pG;
-
- pA = pSrc->pData;
- pG = pDst->pData;
-
-
- for(i=0 ; i < n ; i++)
- {
- for(j=i ; j < n ; j++)
- {
- pG[j * n + i] = pA[j * n + i];
-
- for(k=0; k < i ; k++)
- {
- pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
- }
- }
-
- if (pG[i * n + i] <= 0.0)
- {
- return(ARM_MATH_DECOMPOSITION_FAILURE);
- }
-
- invSqrtVj = 1.0/sqrt(pG[i * n + i]);
- SCALE_COL_F64(pDst,i,invSqrtVj,i);
-
- }
-
- status = ARM_MATH_SUCCESS;
-
- }
-
-
- /* Return to application */
- return (status);
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float64_t invSqrtVj;
+ float64_t *pA,*pG;
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+
+ for(i=0 ; i < n ; i++)
+ {
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ for(k=0; k < i ; k++)
+ {
+ pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+ }
+ }
+
+ if (pG[i * n + i] <= 0.0)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ invSqrtVj = 1.0/sqrt(pG[i * n + i]);
+ SCALE_COL_F64(pDst,i,invSqrtVj,i);
+
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
}
#endif
diff --git a/Source/MatrixFunctions/arm_mat_init_f64.c b/Source/MatrixFunctions/arm_mat_init_f64.c
index 5abe302b..e3bff4e9 100644
--- a/Source/MatrixFunctions/arm_mat_init_f64.c
+++ b/Source/MatrixFunctions/arm_mat_init_f64.c
@@ -56,19 +56,19 @@
*/
void arm_mat_init_f64(
- arm_matrix_instance_f64 * S,
- uint16_t nRows,
- uint16_t nColumns,
- float64_t * pData)
+ arm_matrix_instance_f64 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ float64_t * pData)
{
- /* Assign Number of Rows */
- S->numRows = nRows;
-
- /* Assign Number of Columns */
- S->numCols = nColumns;
-
- /* Assign Data pointer */
- S->pData = pData;
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
}
/**
diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
index 49d3ac28..e7be7cc1 100755
--- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@@ -48,179 +48,179 @@
*/
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
- arm_status arm_mat_solve_lower_triangular_f64(
- const arm_matrix_instance_f64 * lt,
- const arm_matrix_instance_f64 * a,
- arm_matrix_instance_f64 * dst)
- {
- arm_status status; /* status of matrix inverse */
-
-
+arm_status arm_mat_solve_lower_triangular_f64(
+ const arm_matrix_instance_f64 * lt,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+{
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((lt->numRows != lt->numCols) ||
- (lt->numRows != a->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* a1 b1 c1 x1 = a1
- b2 c2 x2 a2
- c3 x3 a3
-
- x3 = a3 / c3
- x2 = (a2 - c2 x3) / b2
-
- */
- int i,j,k,n,cols;
-
- n = dst->numRows;
- cols = dst->numCols;
-
- float64_t *pX = dst->pData;
- float64_t *pLT = lt->pData;
- float64_t *pA = a->pData;
-
- float64_t *lt_row;
- float64_t *a_col;
-
- float64_t invLT;
-
- float64x2_t vecA;
- float64x2_t vecX;
-
- for(i=0; i < n ; i++)
- {
-
- for(j=0; j+1 < cols; j += 2)
- {
- vecA = vld1q_f64(&pA[i * cols + j]);
-
- for(k=0; k < i; k++)
- {
- vecX = vld1q_f64(&pX[cols*k+j]);
- vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
- }
-
- if (pLT[n*i + i]==0.0f)
- {
- return(ARM_MATH_SINGULAR);
- }
-
- invLT = 1.0f / pLT[n*i + i];
- vecA = vmulq_f64(vecA,vdupq_n_f64(invLT));
- vst1q_f64(&pX[i*cols+j],vecA);
-
- }
-
- for(; j < cols; j ++)
- {
- a_col = &pA[j];
- lt_row = &pLT[n*i];
-
- float64_t tmp=a_col[i * cols];
-
- for(k=0; k < i; k++)
- {
- tmp -= lt_row[k] * pX[cols*k+j];
- }
-
- if (lt_row[i]==0.0f)
- {
- return(ARM_MATH_SINGULAR);
- }
- tmp = tmp / lt_row[i];
- pX[i*cols+j] = tmp;
- }
-
- }
- status = ARM_MATH_SUCCESS;
-
- }
-
- /* Return to application */
- return (status);
+
+ {
+ /* a1 b1 c1 x1 = a1
+ b2 c2 x2 a2
+ c3 x3 a3
+
+ x3 = a3 / c3
+ x2 = (a2 - c2 x3) / b2
+
+ */
+ int i,j,k,n,cols;
+
+ n = dst->numRows;
+ cols = dst->numCols;
+
+ float64_t *pX = dst->pData;
+ float64_t *pLT = lt->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *lt_row;
+ float64_t *a_col;
+
+ float64_t invLT;
+
+ float64x2_t vecA;
+ float64x2_t vecX;
+
+ for(i=0; i < n ; i++)
+ {
+
+ for(j=0; j+1 < cols; j += 2)
+ {
+ vecA = vld1q_f64(&pA[i * cols + j]);
+
+ for(k=0; k < i; k++)
+ {
+ vecX = vld1q_f64(&pX[cols*k+j]);
+ vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
+ }
+
+ if (pLT[n*i + i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invLT = 1.0f / pLT[n*i + i];
+ vecA = vmulq_f64(vecA,vdupq_n_f64(invLT));
+ vst1q_f64(&pX[i*cols+j],vecA);
+
+ }
+
+ for(; j < cols; j ++)
+ {
+ a_col = &pA[j];
+ lt_row = &pLT[n*i];
+
+ float64_t tmp=a_col[i * cols];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= lt_row[k] * pX[cols*k+j];
+ }
+
+ if (lt_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*cols+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
}
#else
- arm_status arm_mat_solve_lower_triangular_f64(
- const arm_matrix_instance_f64 * lt,
- const arm_matrix_instance_f64 * a,
- arm_matrix_instance_f64 * dst)
- {
- arm_status status; /* status of matrix inverse */
-
-
+arm_status arm_mat_solve_lower_triangular_f64(
+ const arm_matrix_instance_f64 * lt,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+{
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((lt->numRows != lt->numCols) ||
- (lt->numRows != a->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* a1 b1 c1 x1 = a1
- b2 c2 x2 a2
- c3 x3 a3
-
- x3 = a3 / c3
- x2 = (a2 - c2 x3) / b2
-
- */
- int i,j,k,n,cols;
-
- float64_t *pX = dst->pData;
- float64_t *pLT = lt->pData;
- float64_t *pA = a->pData;
-
- float64_t *lt_row;
- float64_t *a_col;
-
- n = dst->numRows;
- cols = dst->numCols;
-
- for(j=0; j < cols; j ++)
- {
- a_col = &pA[j];
-
- for(i=0; i < n ; i++)
- {
- float64_t tmp=a_col[i * cols];
-
- lt_row = &pLT[n*i];
-
- for(k=0; k < i; k++)
- {
- tmp -= lt_row[k] * pX[cols*k+j];
- }
-
- if (lt_row[i]==0.0)
- {
- return(ARM_MATH_SINGULAR);
- }
- tmp = tmp / lt_row[i];
- pX[i*cols+j] = tmp;
- }
-
- }
- status = ARM_MATH_SUCCESS;
-
- }
-
- /* Return to application */
- return (status);
+
+ {
+ /* a1 b1 c1 x1 = a1
+ b2 c2 x2 a2
+ c3 x3 a3
+
+ x3 = a3 / c3
+ x2 = (a2 - c2 x3) / b2
+
+ */
+ int i,j,k,n,cols;
+
+ float64_t *pX = dst->pData;
+ float64_t *pLT = lt->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *lt_row;
+ float64_t *a_col;
+
+ n = dst->numRows;
+ cols = dst->numCols;
+
+ for(j=0; j < cols; j ++)
+ {
+ a_col = &pA[j];
+
+ for(i=0; i < n ; i++)
+ {
+ float64_t tmp=a_col[i * cols];
+
+ lt_row = &pLT[n*i];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= lt_row[k] * pX[cols*k+j];
+ }
+
+ if (lt_row[i]==0.0)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*cols+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
}
#endif
/**
diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
index 68c10194..73291722 100755
--- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@@ -48,168 +48,168 @@
*/
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
- arm_status arm_mat_solve_upper_triangular_f64(
- const arm_matrix_instance_f64 * ut,
- const arm_matrix_instance_f64 * a,
- arm_matrix_instance_f64 * dst)
- {
-arm_status status; /* status of matrix inverse */
-
-
+arm_status arm_mat_solve_upper_triangular_f64(
+ const arm_matrix_instance_f64 * ut,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+{
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((ut->numRows != ut->numCols) ||
- (ut->numRows != a->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
-
- int i,j,k,n,cols;
-
- n = dst->numRows;
- cols = dst->numCols;
-
- float64_t *pX = dst->pData;
- float64_t *pUT = ut->pData;
- float64_t *pA = a->pData;
-
- float64_t *ut_row;
- float64_t *a_col;
-
- float64_t invUT;
-
- float64x2_t vecA;
- float64x2_t vecX;
-
- for(i=n-1; i >= 0 ; i--)
- {
- for(j=0; j+1 < cols; j +=2)
- {
- vecA = vld1q_f64(&pA[i * cols + j]);
-
- for(k=n-1; k > i; k--)
- {
- vecX = vld1q_f64(&pX[cols*k+j]);
- vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
- }
-
- if (pUT[n*i + i]==0.0f)
- {
- return(ARM_MATH_SINGULAR);
- }
-
- invUT = 1.0f / pUT[n*i + i];
- vecA = vmulq_f64(vecA,vdupq_n_f64(invUT));
-
-
- vst1q_f64(&pX[i*cols+j],vecA);
- }
-
- for(; j < cols; j ++)
- {
- a_col = &pA[j];
-
- ut_row = &pUT[n*i];
-
- float64_t tmp=a_col[i * cols];
-
- for(k=n-1; k > i; k--)
- {
- tmp -= ut_row[k] * pX[cols*k+j];
- }
-
- if (ut_row[i]==0.0f)
- {
- return(ARM_MATH_SINGULAR);
- }
- tmp = tmp / ut_row[i];
- pX[i*cols+j] = tmp;
- }
-
- }
- status = ARM_MATH_SUCCESS;
-
- }
-
-
- /* Return to application */
- return (status);
+
+ {
+
+ int i,j,k,n,cols;
+
+ n = dst->numRows;
+ cols = dst->numCols;
+
+ float64_t *pX = dst->pData;
+ float64_t *pUT = ut->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *ut_row;
+ float64_t *a_col;
+
+ float64_t invUT;
+
+ float64x2_t vecA;
+ float64x2_t vecX;
+
+ for(i=n-1; i >= 0 ; i--)
+ {
+ for(j=0; j+1 < cols; j +=2)
+ {
+ vecA = vld1q_f64(&pA[i * cols + j]);
+
+ for(k=n-1; k > i; k--)
+ {
+ vecX = vld1q_f64(&pX[cols*k+j]);
+ vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
+ }
+
+ if (pUT[n*i + i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invUT = 1.0f / pUT[n*i + i];
+ vecA = vmulq_f64(vecA,vdupq_n_f64(invUT));
+
+
+ vst1q_f64(&pX[i*cols+j],vecA);
+ }
+
+ for(; j < cols; j ++)
+ {
+ a_col = &pA[j];
+
+ ut_row = &pUT[n*i];
+
+ float64_t tmp=a_col[i * cols];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= ut_row[k] * pX[cols*k+j];
+ }
+
+ if (ut_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*cols+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
}
#else
- arm_status arm_mat_solve_upper_triangular_f64(
- const arm_matrix_instance_f64 * ut,
- const arm_matrix_instance_f64 * a,
- arm_matrix_instance_f64 * dst)
- {
-arm_status status; /* status of matrix inverse */
-
-
+arm_status arm_mat_solve_upper_triangular_f64(
+ const arm_matrix_instance_f64 * ut,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+{
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((ut->numRows != ut->numCols) ||
- (ut->numRows != a->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
-
- int i,j,k,n,cols;
-
- float64_t *pX = dst->pData;
- float64_t *pUT = ut->pData;
- float64_t *pA = a->pData;
-
- float64_t *ut_row;
- float64_t *a_col;
-
- n = dst->numRows;
- cols = dst->numCols;
-
- for(j=0; j < cols; j ++)
- {
- a_col = &pA[j];
-
- for(i=n-1; i >= 0 ; i--)
- {
- float64_t tmp=a_col[i * cols];
-
- ut_row = &pUT[n*i];
-
- for(k=n-1; k > i; k--)
- {
- tmp -= ut_row[k] * pX[cols*k+j];
- }
-
- if (ut_row[i]==0.0)
- {
- return(ARM_MATH_SINGULAR);
- }
- tmp = tmp / ut_row[i];
- pX[i*cols+j] = tmp;
- }
-
- }
- status = ARM_MATH_SUCCESS;
-
- }
-
-
- /* Return to application */
- return (status);
+
+ {
+
+ int i,j,k,n,cols;
+
+ float64_t *pX = dst->pData;
+ float64_t *pUT = ut->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *ut_row;
+ float64_t *a_col;
+
+ n = dst->numRows;
+ cols = dst->numCols;
+
+ for(j=0; j < cols; j ++)
+ {
+ a_col = &pA[j];
+
+ for(i=n-1; i >= 0 ; i--)
+ {
+ float64_t tmp=a_col[i * cols];
+
+ ut_row = &pUT[n*i];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= ut_row[k] * pX[cols*k+j];
+ }
+
+ if (ut_row[i]==0.0)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*cols+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
}
#endif
diff --git a/Source/MatrixFunctions/arm_mat_trans_f64.c b/Source/MatrixFunctions/arm_mat_trans_f64.c
index 08184731..03737e8a 100755
--- a/Source/MatrixFunctions/arm_mat_trans_f64.c
+++ b/Source/MatrixFunctions/arm_mat_trans_f64.c
@@ -57,216 +57,216 @@
#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f64(
- const arm_matrix_instance_f64 * pSrc,
- arm_matrix_instance_f64 * pDst)
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
{
- float64_t *pIn = pSrc->pData; /* input data matrix pointer */
- float64_t *pOut = pDst->pData; /* output data matrix pointer */
- float64_t *px; /* Temporary output data matrix pointer */
- uint16_t nRows = pSrc->numRows; /* number of rows */
- uint16_t nColumns = pSrc->numCols; /* number of columns */
-
- uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
- arm_status status; /* status of matrix transpose */
-
+ float64_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float64_t *pOut = pDst->pData; /* output data matrix pointer */
+ float64_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nColumns = pSrc->numCols; /* number of columns */
+
+ uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* Matrix transpose by exchanging the rows with columns */
- /* Row loop */
- rowCnt = row >> 1;
- while (rowCnt > 0U)
- {
- float64_t *row0,*row1;
- float64x2x4_t raV;
-
- blkCnt = nColumns >> 2;
-
- /* The pointer px is set to starting address of the column being processed */
- px = pOut + i;
-
- /* Compute 4 outputs at a time.
- ** a second loop below computes the remaining 1 to 3 samples. */
- while (blkCnt > 0U) /* Column loop */
- {
- row0 = pIn;
- row1 = pIn+nColumns;
- pIn+=4;
- raV = vld4q_lane_f64(row0, raV, 0);
- raV = vld4q_lane_f64(row1, raV, 1);
-
- vst1q_f64(px,raV.val[0]);
- px += nRows;
-
- vst1q_f64(px,raV.val[1]);
- px += nRows;
-
- vst1q_f64(px,raV.val[2]);
- px += nRows;
-
- vst1q_f64(px,raV.val[3]);
- px += nRows;
-
- /* Decrement the column loop counter */
- blkCnt--;
- }
-
- /* Perform matrix transpose for last 3 samples here. */
- blkCnt = nColumns % 0x4U;
-
- while (blkCnt > 0U)
- {
- /* Read and store the input element in the destination */
- *px++ = *pIn;
- *px++ = *(pIn + 1 * nColumns);
-
- px += (nRows - 2);
- pIn++;
-
- /* Decrement the column loop counter */
- blkCnt--;
- }
-
- i += 2;
- pIn += 1 * nColumns;
-
- /* Decrement the row loop counter */
- rowCnt--;
-
- } /* Row loop end */
-
- rowCnt = row & 1;
- while (rowCnt > 0U)
- {
- blkCnt = nColumns ;
- /* The pointer px is set to starting address of the column being processed */
- px = pOut + i;
-
- while (blkCnt > 0U)
- {
- /* Read and store the input element in the destination */
- *px = *pIn++;
-
- /* Update the pointer px to point to the next row of the transposed matrix */
- px += nRows;
-
- /* Decrement the column loop counter */
- blkCnt--;
- }
- i++;
- rowCnt -- ;
- }
-
- /* Set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
- }
-
- /* Return to application */
- return (status);
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* Row loop */
+ rowCnt = row >> 1;
+ while (rowCnt > 0U)
+ {
+ float64_t *row0,*row1;
+ float64x2x4_t raV;
+
+ blkCnt = nColumns >> 2;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U) /* Column loop */
+ {
+ row0 = pIn;
+ row1 = pIn+nColumns;
+ pIn+=4;
+ raV = vld4q_lane_f64(row0, raV, 0);
+ raV = vld4q_lane_f64(row1, raV, 1);
+
+ vst1q_f64(px,raV.val[0]);
+ px += nRows;
+
+ vst1q_f64(px,raV.val[1]);
+ px += nRows;
+
+ vst1q_f64(px,raV.val[2]);
+ px += nRows;
+
+ vst1q_f64(px,raV.val[3]);
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ /* Perform matrix transpose for last 3 samples here. */
+ blkCnt = nColumns % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Read and store the input element in the destination */
+ *px++ = *pIn;
+ *px++ = *(pIn + 1 * nColumns);
+
+ px += (nRows - 2);
+ pIn++;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ i += 2;
+ pIn += 1 * nColumns;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+
+ } /* Row loop end */
+
+ rowCnt = row & 1;
+ while (rowCnt > 0U)
+ {
+ blkCnt = nColumns ;
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ while (blkCnt > 0U)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+ i++;
+ rowCnt -- ;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
}
#else
arm_status arm_mat_trans_f64(
- const arm_matrix_instance_f64 * pSrc,
- arm_matrix_instance_f64 * pDst)
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
{
- float64_t *pIn = pSrc->pData; /* input data matrix pointer */
- float64_t *pOut = pDst->pData; /* output data matrix pointer */
- float64_t *px; /* Temporary output data matrix pointer */
- uint16_t nRows = pSrc->numRows; /* number of rows */
- uint16_t nCols = pSrc->numCols; /* number of columns */
- uint64_t col, row = nRows, i = 0U; /* Loop counters */
- arm_status status; /* status of matrix transpose */
-
+ float64_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float64_t *pOut = pDst->pData; /* output data matrix pointer */
+ float64_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint64_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
#ifdef ARM_MATH_MATRIX_CHECK
-
- /* Check for matrix mismatch condition */
- if ((pSrc->numRows != pDst->numCols) ||
- (pSrc->numCols != pDst->numRows) )
- {
- /* Set status as ARM_MATH_SIZE_MISMATCH */
- status = ARM_MATH_SIZE_MISMATCH;
- }
- else
-
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
- {
- /* Matrix transpose by exchanging the rows with columns */
- /* row loop */
- do
- {
- /* Pointer px is set to starting address of column being processed */
- px = pOut + i;
-
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer px is set to starting address of column being processed */
+ px = pOut + i;
+
#if defined (ARM_MATH_LOOPUNROLL)
-
- /* Loop unrolling: Compute 4 outputs at a time */
- col = nCols >> 2U;
-
- while (col > 0U) /* column loop */
- {
- /* Read and store input element in destination */
- *px = *pIn++;
- /* Update pointer px to point to next row of transposed matrix */
- px += nRows;
-
- *px = *pIn++;
- px += nRows;
-
- *px = *pIn++;
- px += nRows;
-
- *px = *pIn++;
- px += nRows;
-
- /* Decrement column loop counter */
- col--;
- }
-
- /* Loop unrolling: Compute remaining outputs */
- col = nCols % 0x4U;
-
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
#else
-
- /* Initialize col with number of samples */
- col = nCols;
-
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
- while (col > 0U)
- {
- /* Read and store input element in destination */
- *px = *pIn++;
-
- /* Update pointer px to point to next row of transposed matrix */
- px += nRows;
-
- /* Decrement column loop counter */
- col--;
- }
-
- i++;
-
- /* Decrement row loop counter */
- row--;
-
- } while (row > 0U); /* row loop end */
-
- /* Set status as ARM_MATH_SUCCESS */
- status = ARM_MATH_SUCCESS;
- }
-
- /* Return to application */
- return (status);
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
}
#endif
/**
diff --git a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
index 38cf538f..2f17e023 100755
--- a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
@@ -49,9 +49,9 @@
#if defined(ARM_MATH_NEON)
void arm_absmax_no_idx_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
float64_t maxVal , in; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
@@ -112,9 +112,9 @@ void arm_absmax_no_idx_f64(
}
#else
void arm_absmax_no_idx_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
float64_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
diff --git a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
index ca521085..5e4abfcc 100755
--- a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
@@ -47,101 +47,101 @@
#if defined(ARM_MATH_NEON)
void arm_absmin_no_idx_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- float64_t minVal , in; /* Temporary variables to store the output value. */
- uint32_t blkCnt; /* Loop counter */
-
- float64x2_t minV;
- float64x2_t pSrcV ;
- pSrcV = vld1q_f64(pSrc);
- pSrc += 2 ;
- minV = vabsq_f64(pSrcV);
-
-
-
-
- /* Load first input value that act as reference value for comparision */
-
-
- /* Initialize blkCnt with number of samples */
- blkCnt = (blockSize - 2U) >> 1U;
-
- while (blkCnt > 0U)
- {
- /* Initialize maxVal to the next consecutive values one by one */
- pSrcV = vld1q_f64(pSrc);
- minV = vminq_f64(minV, vabsq_f64(pSrcV));
-
- pSrc += 2 ;
-
- /* Decrement loop counter */
- blkCnt--;
- }
- minVal =vgetq_lane_f64(minV, 0);
- if(minVal > vgetq_lane_f64(minV, 1))
- {
- minVal = vgetq_lane_f64(minV, 1);
- }
- blkCnt = (blockSize - 2U) & 1;
-
- while (blkCnt > 0U)
- {
- /* Initialize maxVal to the next consecutive values one by one */
- in = fabs(*pSrc++);
-
- /* compare for the maximum value */
- if (minVal > in)
- {
- /* Update the maximum value and it's index */
- minVal = in;
- }
-
- /* Decrement loop counter */
- blkCnt--;
- }
- *pResult = minVal;
-
-
- /* Store the maximum value and it's index into destination pointers */
-
+ float64_t minVal , in; /* Temporary variables to store the output value. */
+ uint32_t blkCnt; /* Loop counter */
+
+ float64x2_t minV;
+ float64x2_t pSrcV ;
+ pSrcV = vld1q_f64(pSrc);
+ pSrc += 2 ;
+ minV = vabsq_f64(pSrcV);
+
+
+
+
+ /* Load first input value that act as reference value for comparision */
+
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 2U) >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ pSrcV = vld1q_f64(pSrc);
+ minV = vminq_f64(minV, vabsq_f64(pSrcV));
+
+ pSrc += 2 ;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ minVal =vgetq_lane_f64(minV, 0);
+ if(minVal > vgetq_lane_f64(minV, 1))
+ {
+ minVal = vgetq_lane_f64(minV, 1);
+ }
+ blkCnt = (blockSize - 2U) & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ in = fabs(*pSrc++);
+
+ /* compare for the maximum value */
+ if (minVal > in)
+ {
+ /* Update the maximum value and it's index */
+ minVal = in;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ *pResult = minVal;
+
+
+ /* Store the maximum value and it's index into destination pointers */
+
}
#else
void arm_absmin_no_idx_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- float64_t minVal, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt; /* Loop counter */
-
-
- /* Load first input value that act as reference value for comparision */
- out = fabs(*pSrc++);
-
- /* Initialize blkCnt with number of samples */
- blkCnt = (blockSize - 1U);
-
- while (blkCnt > 0U)
- {
- /* Initialize minVal to the next consecutive values one by one */
- minVal = fabs(*pSrc++);
-
- /* compare for the minimum value */
- if (out > minVal)
- {
- /* Update the minimum value and it's index */
- out = minVal;
- }
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store the minimum value and it's index into destination pointers */
- *pResult = out;
+ float64_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt; /* Loop counter */
+
+
+ /* Load first input value that act as reference value for comparision */
+ out = fabs(*pSrc++);
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = fabs(*pSrc++);
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
}
#endif
/**
diff --git a/Source/StatisticsFunctions/arm_entropy_f64.c b/Source/StatisticsFunctions/arm_entropy_f64.c
index a8fc7ed0..a321d2af 100755
--- a/Source/StatisticsFunctions/arm_entropy_f64.c
+++ b/Source/StatisticsFunctions/arm_entropy_f64.c
@@ -49,43 +49,43 @@
float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
{
- const float64_t *pIn;
- uint32_t blkCnt;
- float64_t accum, p;
-
- pIn = pSrcA;
-
-
- accum = 0.0;
+ const float64_t *pIn;
+ uint32_t blkCnt;
+ float64_t accum, p;
+
+ pIn = pSrcA;
+
+
+ accum = 0.0;
#if defined(ARM_MATH_NEON)
- float64x2_t sumV ,pInV ;
- sumV = vdupq_n_f64(0.0f);
- blkCnt = blockSize >> 1U ;
-
- while(blkCnt > 0){
- pInV = vld1q_f64(pIn);
- sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) );
- pIn += 2 ;
- blkCnt--;
-
- }
- accum = vaddvq_f64(sumV);
- blkCnt = blockSize & 1 ;
+ float64x2_t sumV ,pInV ;
+ sumV = vdupq_n_f64(0.0f);
+ blkCnt = blockSize >> 1U ;
+
+ while(blkCnt > 0){
+ pInV = vld1q_f64(pIn);
+ sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) );
+ pIn += 2 ;
+ blkCnt--;
+
+ }
+ accum = vaddvq_f64(sumV);
+ blkCnt = blockSize & 1 ;
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
-
- while(blkCnt > 0)
- {
- p = *pIn++;
-
- accum += p * log(p);
-
- blkCnt--;
-
- }
-
- return(-accum);
+
+ while(blkCnt > 0)
+ {
+ p = *pIn++;
+
+ accum += p * log(p);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
}
/**
diff --git a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
index a52059ed..867ad0cc 100755
--- a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
+++ b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
@@ -50,53 +50,53 @@
float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,const float64_t * pSrcB,uint32_t blockSize)
{
- const float64_t *pInA, *pInB;
- uint32_t blkCnt;
- float64_t accum, pA,pB;
-
- float64x2_t accumV;
- float64x2_t tmpVA, tmpVB,tmpV;
-
- pInA = pSrcA;
- pInB = pSrcB;
-
- accum = 0.0f;
- accumV = vdupq_n_f64(0.0f);
-
- blkCnt = blockSize >> 1;
- while(blkCnt > 0)
- {
- tmpVA = vld1q_f64(pInA);
- pInA += 2;
-
- tmpVB = vld1q_f64(pInB);
- pInB += 2;
-
- tmpV = vinvq_f64(tmpVA);
- tmpVB = vmulq_f64(tmpVB, tmpV);
- tmpVB = vlogq_f64(tmpVB);
-
- accumV = vmlaq_f64(accumV, tmpVA, tmpVB);
-
- blkCnt--;
-
- }
-
-
- accum = vaddvq_f64(accumV);
-
- blkCnt = blockSize & 1;
- while(blkCnt > 0)
- {
- pA = *pInA++;
- pB = *pInB++;
- accum += pA * logf(pB/pA);
-
- blkCnt--;
-
- }
-
- return(-accum);
+ const float64_t *pInA, *pInB;
+ uint32_t blkCnt;
+ float64_t accum, pA,pB;
+
+ float64x2_t accumV;
+ float64x2_t tmpVA, tmpVB,tmpV;
+
+ pInA = pSrcA;
+ pInB = pSrcB;
+
+ accum = 0.0f;
+ accumV = vdupq_n_f64(0.0f);
+
+ blkCnt = blockSize >> 1;
+ while(blkCnt > 0)
+ {
+ tmpVA = vld1q_f64(pInA);
+ pInA += 2;
+
+ tmpVB = vld1q_f64(pInB);
+ pInB += 2;
+
+ tmpV = vinvq_f64(tmpVA);
+ tmpVB = vmulq_f64(tmpVB, tmpV);
+ tmpVB = vlogq_f64(tmpVB);
+
+ accumV = vmlaq_f64(accumV, tmpVA, tmpVB);
+
+ blkCnt--;
+
+ }
+
+
+ accum = vaddvq_f64(accumV);
+
+ blkCnt = blockSize & 1;
+ while(blkCnt > 0)
+ {
+ pA = *pInA++;
+ pB = *pInB++;
+ accum += pA * logf(pB/pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
}
#else
diff --git a/Source/StatisticsFunctions/arm_max_no_idx_f64.c b/Source/StatisticsFunctions/arm_max_no_idx_f64.c
index f6404c01..fcd05f94 100644
--- a/Source/StatisticsFunctions/arm_max_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_max_no_idx_f64.c
@@ -47,9 +47,9 @@
*/
#if defined(ARM_MATH_NEON)
void arm_max_no_idx_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
float64_t maxVal , in; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
@@ -110,9 +110,9 @@ void arm_max_no_idx_f64(
}
#else
void arm_max_no_idx_f64(
- const float64_t *pSrc,
- uint32_t blockSize,
- float64_t *pResult)
+ const float64_t *pSrc,
+ uint32_t blockSize,
+ float64_t *pResult)
{
float64_t maxValue = F64_MIN;
float64_t newVal;
diff --git a/Source/StatisticsFunctions/arm_mean_f64.c b/Source/StatisticsFunctions/arm_mean_f64.c
index 577ecf13..b4465bdc 100644
--- a/Source/StatisticsFunctions/arm_mean_f64.c
+++ b/Source/StatisticsFunctions/arm_mean_f64.c
@@ -49,68 +49,68 @@
#if defined(ARM_MATH_NEON)
void arm_mean_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64x2_t vSum = vdupq_n_f64(0.0f);
- float64_t sum = 0.; /* Temporary result storage */
- float64x2_t afterLoad ;
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize >> 1U;
-
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
-
- afterLoad = vld1q_f64(pSrc);
- vSum = vaddq_f64(vSum, afterLoad);
- pSrc += 2;
- /* Decrement loop counter */
- blkCnt--;
-
-
- }
- sum = vaddvq_f64(vSum);
-
- blkCnt = blockSize & 1;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- sum += *pSrc++;
-
- /* Decrement loop counter */
- blkCnt--;
- }
- *pResult = (sum/blockSize);
+ uint32_t blkCnt; /* Loop counter */
+ float64x2_t vSum = vdupq_n_f64(0.0f);
+ float64_t sum = 0.; /* Temporary result storage */
+ float64x2_t afterLoad ;
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize >> 1U;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ afterLoad = vld1q_f64(pSrc);
+ vSum = vaddq_f64(vSum, afterLoad);
+ pSrc += 2;
+ /* Decrement loop counter */
+ blkCnt--;
+
+
+ }
+ sum = vaddvq_f64(vSum);
+
+ blkCnt = blockSize & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ *pResult = (sum/blockSize);
}
#else
void arm_mean_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary result storage */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
- sum += *pSrc++;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- /* Store result to destination */
- *pResult = (sum / blockSize);
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store result to destination */
+ *pResult = (sum / blockSize);
}
#endif
/**
diff --git a/Source/StatisticsFunctions/arm_min_no_idx_f64.c b/Source/StatisticsFunctions/arm_min_no_idx_f64.c
index b9696ff8..903bd90a 100755
--- a/Source/StatisticsFunctions/arm_min_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_min_no_idx_f64.c
@@ -46,51 +46,51 @@
@return none
*/
void arm_min_no_idx_f64(
- const float64_t *pSrc,
- uint32_t blockSize,
- float64_t *pResult)
+ const float64_t *pSrc,
+ uint32_t blockSize,
+ float64_t *pResult)
{
- float64_t minValue = F64_MAX;
- float64_t newVal;
- uint32_t blkCnt ;
+ float64_t minValue = F64_MAX;
+ float64_t newVal;
+ uint32_t blkCnt ;
#if defined(ARM_MATH_NEON)
- float64x2_t minValueV , newValV ;
- minValueV = vdupq_n_f64(F64_MAX);
- blkCnt = blockSize >> 1U;
- while(blkCnt > 0)
- {
- newValV = vld1q_f64(pSrc);
- minValueV = vminq_f64(minValueV, newValV);
- pSrc += 2 ;
- blkCnt--;
-
- }
- minValue =vgetq_lane_f64(minValueV, 0);
- if(minValue > vgetq_lane_f64(minValueV, 1))
- {
- minValue = vgetq_lane_f64(minValueV, 1);
- }
-
- blkCnt = blockSize & 1 ;
+ float64x2_t minValueV , newValV ;
+ minValueV = vdupq_n_f64(F64_MAX);
+ blkCnt = blockSize >> 1U;
+ while(blkCnt > 0)
+ {
+ newValV = vld1q_f64(pSrc);
+ minValueV = vminq_f64(minValueV, newValV);
+ pSrc += 2 ;
+ blkCnt--;
+
+ }
+ minValue =vgetq_lane_f64(minValueV, 0);
+ if(minValue > vgetq_lane_f64(minValueV, 1))
+ {
+ minValue = vgetq_lane_f64(minValueV, 1);
+ }
+
+ blkCnt = blockSize & 1 ;
#else
- blkCnt = blockSize;
+ blkCnt = blockSize;
#endif
-
- while (blkCnt > 0U)
- {
- newVal = *pSrc++;
-
- /* compare for the minimum value */
- if (minValue > newVal)
- {
- /* Update the minimum value and it's index */
- minValue = newVal;
- }
-
- blkCnt --;
- }
-
- *pResult = minValue;
+
+ while (blkCnt > 0U)
+ {
+ newVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (minValue > newVal)
+ {
+ /* Update the minimum value and it's index */
+ minValue = newVal;
+ }
+
+ blkCnt --;
+ }
+
+ *pResult = minValue;
}
/**
diff --git a/Source/StatisticsFunctions/arm_mse_f64.c b/Source/StatisticsFunctions/arm_mse_f64.c
index 13daa2cb..b2a4ae4b 100755
--- a/Source/StatisticsFunctions/arm_mse_f64.c
+++ b/Source/StatisticsFunctions/arm_mse_f64.c
@@ -47,84 +47,84 @@
*/
void arm_mse_f64(
- const float64_t * pSrcA,
- const float64_t * pSrcB,
- uint32_t blockSize,
- float64_t * result)
+ const float64_t * pSrcA,
+ const float64_t * pSrcB,
+ uint32_t blockSize,
+ float64_t * result)
{
-
- uint32_t blkCnt; /* Loop counter */
- float64_t inA, inB;
- float64_t sum = 0.0;
+
+ uint32_t blkCnt; /* Loop counter */
+ float64_t inA, inB;
+ float64_t sum = 0.0;
#if defined (ARM_MATH_NEON)
-
- float64x2_t inAV , inBV , subV, sumV;
- sumV = vdupq_n_f64(0.0f);
-
- blkCnt = blockSize >> 1U ;
-
- while (blkCnt > 0U)
- {
- inAV = vld1q_f64(pSrcA);
- pSrcA+=2;
- inBV = vld1q_f64(pSrcB);
- pSrcB+=2;
- subV = vsubq_f64(inAV, inBV);
- sumV = vmlaq_f64(sumV, subV, subV);
-
- blkCnt--;
-
- }
- sum = vaddvq_f64(sumV);
- blkCnt = (blockSize) & 1;
-
+
+ float64x2_t inAV , inBV , subV, sumV;
+ sumV = vdupq_n_f64(0.0f);
+
+ blkCnt = blockSize >> 1U ;
+
+ while (blkCnt > 0U)
+ {
+ inAV = vld1q_f64(pSrcA);
+ pSrcA+=2;
+ inBV = vld1q_f64(pSrcB);
+ pSrcB+=2;
+ subV = vsubq_f64(inAV, inBV);
+ sumV = vmlaq_f64(sumV, subV, subV);
+
+ blkCnt--;
+
+ }
+ sum = vaddvq_f64(sumV);
+ blkCnt = (blockSize) & 1;
+
#else
- /* Temporary return variable */
+ /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
- blkCnt = (blockSize) >> 1;
-
+ blkCnt = (blockSize) >> 1;
+
#pragma clang loop vectorize(enable)
- while (blkCnt > 0U)
- {
-
-
- inA = *pSrcA++;
- inB = *pSrcB++;
- inA = inA - inB;
- sum += inA * inA;
-
- inA = *pSrcA++;
- inB = *pSrcB++;
- inA = inA - inB;
- sum += inA * inA;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
-
- /* Loop unrolling: Compute remaining outputs */
- blkCnt = (blockSize) & 1;
+ while (blkCnt > 0U)
+ {
+
+
+ inA = *pSrcA++;
+ inB = *pSrcB++;
+ inA = inA - inB;
+ sum += inA * inA;
+
+ inA = *pSrcA++;
+ inB = *pSrcB++;
+ inA = inA - inB;
+ sum += inA * inA;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize) & 1;
#else
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
#endif
#endif
-//#pragma clang loop vectorize(enable) unroll(disable)
- while (blkCnt > 0U)
- {
- inA = *pSrcA++;
- inB = *pSrcB++;
- inA = inA - inB;
- sum += inA * inA;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result in destination buffer */
- *result = sum / blockSize;
+ //#pragma clang loop vectorize(enable) unroll(disable)
+ while (blkCnt > 0U)
+ {
+ inA = *pSrcA++;
+ inB = *pSrcB++;
+ inA = inA - inB;
+ sum += inA * inA;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *result = sum / blockSize;
}
diff --git a/Source/StatisticsFunctions/arm_power_f64.c b/Source/StatisticsFunctions/arm_power_f64.c
index 710dc834..5760a7f8 100644
--- a/Source/StatisticsFunctions/arm_power_f64.c
+++ b/Source/StatisticsFunctions/arm_power_f64.c
@@ -46,81 +46,81 @@
*/
#if defined(ARM_MATH_NEON)
void arm_power_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary result storage */
- float64x2_t sumV ; /* Temporary variable to store input value */
- sumV = vdupq_n_f64(0.0f);
- float64x2_t pSrcV;
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize >> 1U;
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-
- /* Compute Power and store result in a temporary variable, sum. */
- pSrcV = vld1q_f64(pSrc);
- sumV = vmlaq_f64(sumV, pSrcV, pSrcV);
- pSrc+= 2 ;
-
-
-
- /* Decrement loop counter */
- blkCnt--;
- }
- sum = vaddvq_f64(sumV);
-
-
- float64_t in;
- blkCnt = blockSize & 1;
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-
- /* Compute Power and store result in a temporary variable, sum. */
- in = *pSrc++;
- sum += in * in;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result to destination */
- *pResult = sum;
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+ float64x2_t sumV ; /* Temporary variable to store input value */
+ sumV = vdupq_n_f64(0.0f);
+ float64x2_t pSrcV;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ pSrcV = vld1q_f64(pSrc);
+ sumV = vmlaq_f64(sumV, pSrcV, pSrcV);
+ pSrc+= 2 ;
+
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ sum = vaddvq_f64(sumV);
+
+
+ float64_t in;
+ blkCnt = blockSize & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result to destination */
+ *pResult = sum;
}
#else
void arm_power_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary result storage */
- float64_t in; /* Temporary variable to store input value */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-
- /* Compute Power and store result in a temporary variable, sum. */
- in = *pSrc++;
- sum += in * in;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result to destination */
- *pResult = sum;
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+ float64_t in; /* Temporary variable to store input value */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result to destination */
+ *pResult = sum;
}
#endif
/**
diff --git a/Source/StatisticsFunctions/arm_var_f64.c b/Source/StatisticsFunctions/arm_var_f64.c
index 6fd106aa..58a626b8 100644
--- a/Source/StatisticsFunctions/arm_var_f64.c
+++ b/Source/StatisticsFunctions/arm_var_f64.c
@@ -45,56 +45,58 @@
@return none
*/
void arm_var_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary result storage */
- float64_t fSum = 0.;
- float64_t fMean, fValue;
- const float64_t * pInput = pSrc;
-
- if (blockSize <= 1U)
- {
- *pResult = 0;
- return;
- }
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
-
- sum += *pInput++;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
- fMean = sum / (float64_t) blockSize;
-
- pInput = pSrc;
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- fValue = *pInput++ - fMean;
- fSum += fValue * fValue;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Variance */
- *pResult = fSum / (float64_t)(blockSize - 1.);
+
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+ float64_t fSum = 0.;
+ float64_t fMean, fValue;
+ const float64_t * pInput = pSrc;
+
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+ arm_mean_f64(pInput, blockSize, &fMean);
+#if defined(ARM_MATH_NEON)
+ float64x2_t fValueV ,fsumV , pInputV , fMeanV;
+ fsumV = vdupq_n_f64(0.0f);
+ fMeanV = vdupq_n_f64(fMean);
+ blkCnt = blockSize >> 1U;
+
+ while(blkCnt > 0U)
+ {
+ pInputV = vld1q_f64(pInput);
+ fValueV = vsubq_f64(pInputV, fMeanV);
+ fsumV = vmlaq_f64(fsumV, fValueV, fValueV);
+ pInput += 2 ;
+ blkCnt--;
+ }
+ fSum = vaddvq_f64(fsumV);
+
+ blkCnt = blockSize & 1 ;
+
+#else
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+#endif
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = fSum / (float64_t)(blockSize - 1.);
}
/**
- @} end of variance group
+ @} end of variance group
*/