Removed comments about loop unrolling in NEON code to avoid confusion

pull/19/head
ClaudioMartino 7 years ago
parent d4de6207ff
commit 7c1fca8052

@ -71,14 +71,14 @@ void arm_abs_f32(
float32x4_t vec1; float32x4_t vec1;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = |A| */ /* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc); vec1 = vld1q_f32(pSrc);
res = vabsq_f32(vec1); res = vabsq_f32(vec1);
vst1q_f32(pDst, res); vst1q_f32(pDst, res);

@ -71,14 +71,14 @@ void arm_add_f32(
float32x4_t vec2; float32x4_t vec2;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = A + B */ /* C = A + B */
/* Add and then store the results in the destination buffer. */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA); vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB); vec2 = vld1q_f32(pSrcB);
res = vaddq_f32(vec1, vec2); res = vaddq_f32(vec1, vec2);

@ -74,11 +74,12 @@ void arm_dot_prod_f32(
float32x4_t res; float32x4_t res;
float32x2_t accum = vdup_n_f32(0); float32x2_t accum = vdup_n_f32(0);
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
vec1 = vld1q_f32(pSrcA); vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB); vec2 = vld1q_f32(pSrcB);
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */ /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */

@ -71,14 +71,14 @@ void arm_mult_f32(
float32x4_t vec2; float32x4_t vec2;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = A * B */ /* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
/* Multiply the inputs and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA); vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB); vec2 = vld1q_f32(pSrcB);
res = vmulq_f32(vec1, vec2); res = vmulq_f32(vec1, vec2);

@ -70,14 +70,14 @@ void arm_negate_f32(
float32x4_t vec1; float32x4_t vec1;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = -A */ /* C = -A */
/* Negate and then store the results in the destination buffer. */
/* Negate and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc); vec1 = vld1q_f32(pSrc);
res = vnegq_f32(vec1); res = vnegq_f32(vec1);
vst1q_f32(pDst, res); vst1q_f32(pDst, res);

@ -72,14 +72,14 @@ void arm_offset_f32(
float32x4_t vec1; float32x4_t vec1;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = A + offset */ /* C = A + offset */
/* Add offset and then store the results in the destination buffer. */ /* Add offset and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc); vec1 = vld1q_f32(pSrc);
res = vaddq_f32(vec1,vdupq_n_f32(offset)); res = vaddq_f32(vec1,vdupq_n_f32(offset));
vst1q_f32(pDst, res); vst1q_f32(pDst, res);

@ -84,13 +84,14 @@ void arm_scale_f32(
float32x4_t vec1; float32x4_t vec1;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = A * scale */ /* C = A * scale */
/* Scale the input and then store the results in the destination buffer. */
/* Scale the input and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc); vec1 = vld1q_f32(pSrc);
res = vmulq_f32(vec1, vdupq_n_f32(scale)); res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pDst, res); vst1q_f32(pDst, res);

@ -71,14 +71,14 @@ void arm_sub_f32(
float32x4_t vec2; float32x4_t vec2;
float32x4_t res; float32x4_t res;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = A - B */ /* C = A - B */
/* Subtract and then store the results in the destination buffer. */
/* Subtract and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA); vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB); vec2 = vld1q_f32(pSrcB);
res = vsubq_f32(vec1, vec2); res = vsubq_f32(vec1, vec2);

@ -82,7 +82,7 @@ void arm_cmplx_conj_f32(
zero = vdupq_n_f32(0.0); zero = vdupq_n_f32(0.0);
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U; blkCnt = numSamples >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -93,7 +93,7 @@ void arm_cmplx_dot_prod_f32(
accR = vdupq_n_f32(0.0); accR = vdupq_n_f32(0.0);
accI = vdupq_n_f32(0.0); accI = vdupq_n_f32(0.0);
/* Loop unrolling */ /* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3U; blkCnt = numSamples >> 3U;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -89,7 +89,7 @@ void arm_cmplx_mag_f32(
float32x4_t vImagB; float32x4_t vImagB;
float32x4_t vMagSqB; float32x4_t vMagSqB;
/* Loop unrolling */ /* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3; blkCnt = numSamples >> 3;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -88,7 +88,7 @@ void arm_cmplx_mag_squared_f32(
float32x4_t vImagB; float32x4_t vImagB;
float32x4_t vMagSqB; float32x4_t vMagSqB;
/* Loop unrolling */ /* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3; blkCnt = numSamples >> 3;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -82,7 +82,7 @@ void arm_cmplx_mult_cmplx_f32(
float32x4_t real, imag; float32x4_t real, imag;
float32x4x2_t outCplx; float32x4x2_t outCplx;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U; blkCnt = numSamples >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -82,7 +82,7 @@ void arm_cmplx_mult_real_f32(
float32x4_t r; float32x4_t r;
float32x4x2_t ab,outCplx; float32x4x2_t ab,outCplx;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U; blkCnt = numSamples >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -206,10 +206,10 @@ void arm_conv_f32(
res = vdupq_n_f32(0) ; res = vdupq_n_f32(0) ;
accum = vdup_n_f32(0); accum = vdup_n_f32(0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Compute 4 MACs simultaneously. */
k = count >> 2U; k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U) while (k > 0U)
@ -556,7 +556,7 @@ void arm_conv_f32(
float32x4_t y = vdupq_n_f32(0) ; float32x4_t y = vdupq_n_f32(0) ;
float32x2_t accum = vdup_n_f32(0) ; float32x2_t accum = vdup_n_f32(0) ;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U) while (k > 0U)
{ {

@ -359,7 +359,7 @@ void arm_correlate_f32(
acc3 = 0.0f; acc3 = 0.0f;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON)
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Compute 4 MACs simultaneously. */
k = srcBLen >> 2U; k = srcBLen >> 2U;
res = vdupq_n_f32(0) ; res = vdupq_n_f32(0) ;

@ -187,10 +187,10 @@ void arm_fir_decimate_f32(
/* Initialize coeff pointer */ /* Initialize coeff pointer */
pb = pCoeffs; pb = pCoeffs;
/* Loop unrolling. Process 4 taps at a time. */ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2; tapCnt = numTaps >> 2;
/* Loop over the number of taps. Unroll by a factor of 4. /* Loop over the number of taps.
** Repeat until we've computed numTaps-4 coefficients. */ ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U) while (tapCnt > 0U)
@ -287,10 +287,10 @@ void arm_fir_decimate_f32(
/* Initialize coeff pointer */ /* Initialize coeff pointer */
pb = pCoeffs; pb = pCoeffs;
/* Loop unrolling. Process 4 taps at a time. */ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2; tapCnt = numTaps >> 2;
/* Loop over the number of taps. Unroll by a factor of 4. /* Loop over the number of taps.
** Repeat until we've computed numTaps-4 coefficients. */ ** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U) while (tapCnt > 0U)
{ {

@ -163,7 +163,7 @@ void arm_fir_interpolate_f32(
blkCnt = blockSize >> 3; blkCnt = blockSize >> 3;
blkCntN4 = blockSize & 7; blkCntN4 = blockSize & 7;
/* Samples loop unrolled by 8 */ /* Loop unrolling */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* Copy new input samples into the state buffer */ /* Copy new input samples into the state buffer */

@ -208,7 +208,7 @@ void arm_lms_f32(
sum = 0.0f; sum = 0.0f;
sumV = vdupq_n_f32(0.0); sumV = vdupq_n_f32(0.0);
/* Loop unrolling. Process 4 taps at a time. */ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2; tapCnt = numTaps >> 2;
while (tapCnt > 0U) while (tapCnt > 0U)
@ -257,7 +257,7 @@ void arm_lms_f32(
/* Initialize coeff pointer */ /* Initialize coeff pointer */
pb = (pCoeffs); pb = (pCoeffs);
/* Loop unrolling. Process 4 taps at a time. */ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2; tapCnt = numTaps >> 2;
/* Update filter coefficients */ /* Update filter coefficients */
@ -305,7 +305,7 @@ void arm_lms_f32(
/* Points to the start of the pState buffer */ /* Points to the start of the pState buffer */
pStateCurnt = S->pState; pStateCurnt = S->pState;
/* Loop unrolling for (numTaps - 1U) samples copy */ /* Process 4 taps at a time for (numTaps - 1U) samples copy */
tapCnt = (numTaps - 1U) >> 2U; tapCnt = (numTaps - 1U) >> 2U;
/* copy data */ /* copy data */

@ -216,7 +216,7 @@ void arm_lms_norm_f32(
sum = 0.0f; sum = 0.0f;
sumV = vdupq_n_f32(0.0); sumV = vdupq_n_f32(0.0);
/* Loop unrolling. Process 4 taps at a time. */ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2; tapCnt = numTaps >> 2;
while (tapCnt > 0U) while (tapCnt > 0U)
@ -265,7 +265,7 @@ void arm_lms_norm_f32(
/* Initialize coeff pointer */ /* Initialize coeff pointer */
pb = (pCoeffs); pb = (pCoeffs);
/* Loop unrolling. Process 4 taps at a time. */ /* Process 4 taps at a time. */
tapCnt = numTaps >> 2; tapCnt = numTaps >> 2;
/* Update filter coefficients */ /* Update filter coefficients */
@ -317,7 +317,7 @@ void arm_lms_norm_f32(
/* Points to the start of the pState buffer */ /* Points to the start of the pState buffer */
pStateCurnt = S->pState; pStateCurnt = S->pState;
/* Loop unrolling for (numTaps - 1U)/4 samples copy */ /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
tapCnt = (numTaps - 1U) >> 2U; tapCnt = (numTaps - 1U) >> 2U;
/* copy data */ /* copy data */

@ -100,10 +100,9 @@ arm_status arm_mat_add_f32(
/* Total number of samples in the input matrix */ /* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols; numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
/* Loop unrolling */
blkCnt = numSamples >> 2U; blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -149,7 +149,7 @@ arm_status arm_mat_cmplx_mult_f32(
accR1 = vdupq_n_f32(0.0); accR1 = vdupq_n_f32(0.0);
accI1 = vdupq_n_f32(0.0); accI1 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2; colCnt = numColsA >> 2;
/* Matrix multiplication */ /* Matrix multiplication */
@ -302,7 +302,7 @@ arm_status arm_mat_cmplx_mult_f32(
accR0 = vdupq_n_f32(0.0); accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0); accI0 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2; colCnt = numColsA >> 2;
/* Matrix multiplication */ /* Matrix multiplication */

@ -109,7 +109,7 @@ arm_status arm_mat_inverse_f32(
* *
* 1. First combine the identity matrix and the input matrix separated by a bar to form an * 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows: * augmented matrix as follows:
* _ _ _ _ * _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 | * | a11 a12 | 1 0 | | X11 X12 |
* | | | = | | * | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X22 _| * |_ a21 a22 | 0 1 _| |_ X21 X22 _|
@ -299,7 +299,6 @@ arm_status arm_mat_inverse_f32(
* to the right of the pilot element */ * to the right of the pilot element */
j = (numCols - l) >> 2; j = (numCols - l) >> 2;
/* Loop unrolling */
while (j > 0U) while (j > 0U)
{ {
/* Divide each element of the row of the input matrix /* Divide each element of the row of the input matrix
@ -331,7 +330,6 @@ arm_status arm_mat_inverse_f32(
/* Loop over number of columns of the destination matrix */ /* Loop over number of columns of the destination matrix */
j = numCols >> 2; j = numCols >> 2;
/* Loop unrolling */
while (j > 0U) while (j > 0U)
{ {
/* Divide each element of the row of the destination matrix /* Divide each element of the row of the destination matrix
@ -399,7 +397,6 @@ arm_status arm_mat_inverse_f32(
to replace the elements in the input matrix */ to replace the elements in the input matrix */
j = (numCols - l) >> 2; j = (numCols - l) >> 2;
/* Loop unrolling */
while (j > 0U) while (j > 0U)
{ {
/* Replace the element by the sum of that row /* Replace the element by the sum of that row
@ -433,7 +430,6 @@ arm_status arm_mat_inverse_f32(
replace the elements in the destination matrix */ replace the elements in the destination matrix */
j = numCols >> 2; j = numCols >> 2;
/* Loop unrolling */
while (j > 0U) while (j > 0U)
{ {
/* Replace the element by the sum of that row /* Replace the element by the sum of that row

@ -169,7 +169,7 @@ arm_status arm_mat_mult_f32(
acc6 = vdupq_n_f32(0.0); acc6 = vdupq_n_f32(0.0);
acc7 = vdupq_n_f32(0.0); acc7 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U; colCnt = numColsA >> 2U;
/* Matrix multiplication */ /* Matrix multiplication */
@ -184,7 +184,8 @@ arm_status arm_mat_mult_f32(
a5V = vld1q_f32(pIn1F); a5V = vld1q_f32(pIn1F);
a6V = vld1q_f32(pIn1G); a6V = vld1q_f32(pIn1G);
a7V = vld1q_f32(pIn1H); a7V = vld1q_f32(pIn1H);
pIn1 += 4;
pIn1 += 4;
pIn1B += 4; pIn1B += 4;
pIn1C += 4; pIn1C += 4;
pIn1D += 4; pIn1D += 4;
@ -323,7 +324,7 @@ arm_status arm_mat_mult_f32(
acc0 = vdupq_n_f32(0.0); acc0 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U; colCnt = numColsA >> 2U;
/* Matrix multiplication */ /* Matrix multiplication */

@ -97,10 +97,9 @@ arm_status arm_mat_scale_f32(
/* Total number of samples in the input matrix */ /* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols; numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
/* Loop unrolling */
blkCnt = numSamples >> 2; blkCnt = numSamples >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -95,10 +95,9 @@ arm_status arm_mat_sub_f32(
/* Total number of samples in the input matrix */ /* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols; numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
/* Loop Unrolling */
blkCnt = numSamples >> 2U; blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
@ -110,7 +109,7 @@ arm_status arm_mat_sub_f32(
res = vsubq_f32(vec1, vec2); res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res); vst1q_f32(pOut, res);
/* update pointers to process next sampels */ /* Update pointers to process next samples */
pIn1 += 4U; pIn1 += 4U;
pIn2 += 4U; pIn2 += 4U;
pOut += 4U; pOut += 4U;

@ -90,13 +90,12 @@ arm_status arm_mat_trans_f32(
float32x4_t row0V,row1V,row2V,row3V; float32x4_t row0V,row1V,row2V,row3V;
float32x4x2_t ra0,ra1,rb0,rb1; float32x4x2_t ra0,ra1,rb0,rb1;
/* Loop Unrolling */
blkCnt = nColumns >> 2; blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */ /* The pointer px is set to starting address of the column being processed */
px = pOut + i; px = pOut + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */ while (blkCnt > 0U) /* Column loop */
{ {

@ -111,12 +111,11 @@ void arm_max_f32(
outV = vld1q_f32(pSrc); outV = vld1q_f32(pSrc);
pSrc += 4; pSrc += 4;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U; blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
srcV = vld1q_f32(pSrc); srcV = vld1q_f32(pSrc);
pSrc += 4; pSrc += 4;

@ -72,10 +72,9 @@ void arm_mean_f32(
float32_t in1, in2, in3, in4; float32_t in1, in2, in3, in4;
float32x4_t inV; float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -109,7 +109,7 @@ void arm_min_f32(
outV = vld1q_f32(pSrc); outV = vld1q_f32(pSrc);
pSrc += 4; pSrc += 4;
/* Loop unrolling */ /* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U; blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -71,10 +71,9 @@ void arm_power_f32(
float32x2_t sumV2; float32x2_t sumV2;
float32x4_t inV; float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -71,10 +71,9 @@ void arm_rms_f32(
float32x2_t sumV2; float32x2_t sumV2;
float32x4_t inV; float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -79,10 +79,9 @@ void arm_var_f32(
arm_mean_f32(pSrc,blockSize,&mean); arm_mean_f32(pSrc,blockSize,&mean);
avg = vdupq_n_f32(mean); avg = vdupq_n_f32(mean);
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -67,10 +67,9 @@ void arm_copy_f32(
float32x4_t inV; float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -68,11 +68,10 @@ void arm_fill_f32(
float32x4_t inV = vdupq_n_f32(value); float32x4_t inV = vdupq_n_f32(value);
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
* ** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* C = value */ /* C = value */
@ -85,7 +84,7 @@ void arm_fill_f32(
} }
/* If the blockSize is not a multiple of 4, compute any remaining output samples here. /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
* ** No loop unrolling is used. */ ** No loop unrolling is used. */
blkCnt = blockSize & 3; blkCnt = blockSize & 3;
while (blkCnt > 0U) while (blkCnt > 0U)

@ -80,10 +80,9 @@ void arm_float_to_q15(
int32x4_t cvt; int32x4_t cvt;
int16x4_t outV; int16x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -84,10 +84,9 @@ void arm_float_to_q31(
int32x4_t outV; int32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -82,10 +82,9 @@ void arm_float_to_q7(
int16x4_t cvt1,cvt2; int16x4_t cvt1,cvt2;
int8x8_t outV; int8x8_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 3U; blkCnt = blockSize >> 3U;
/* First part of the processing with loop unrolling. Compute 8 outputs at a time. /* Compute 8 outputs at a time.
** a second loop below computes the remaining 1 to 7 samples. */ ** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -68,10 +68,9 @@ void arm_q15_to_float(
int32x4_t inV0, inV1; int32x4_t inV0, inV1;
float32x4_t outV; float32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 3U; blkCnt = blockSize >> 3U;
/* First part of the processing with loop unrolling. Compute 8 outputs at a time. /* Compute 8 outputs at a time.
** a second loop below computes the remaining 1 to 7 samples. */ ** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -67,10 +67,9 @@ void arm_q31_to_float(
int32x4_t inV; int32x4_t inV;
float32x4_t outV; float32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 2U; blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time. /* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */ ** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

@ -69,10 +69,9 @@ void arm_q7_to_float(
int32x4_t inVLL, inVLH, inVHL, inVHH; int32x4_t inVLL, inVLH, inVHL, inVHH;
float32x4_t outV; float32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 4U; blkCnt = blockSize >> 4U;
/* First part of the processing with loop unrolling. Compute 16 outputs at a time. /* Compute 16 outputs at a time.
** a second loop below computes the remaining 1 to 15 samples. */ ** a second loop below computes the remaining 1 to 15 samples. */
while (blkCnt > 0U) while (blkCnt > 0U)
{ {

Loading…
Cancel
Save