diff --git a/Source/BasicMathFunctions/arm_abs_f32.c b/Source/BasicMathFunctions/arm_abs_f32.c index 5e94eddc..a7d2624e 100644 --- a/Source/BasicMathFunctions/arm_abs_f32.c +++ b/Source/BasicMathFunctions/arm_abs_f32.c @@ -71,14 +71,14 @@ void arm_abs_f32( float32x4_t vec1; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = |A| */ - /* Calculate absolute values and then store the results in the destination buffer. */ + /* Calculate absolute values and then store the results in the destination buffer. */ vec1 = vld1q_f32(pSrc); res = vabsq_f32(vec1); vst1q_f32(pDst, res); diff --git a/Source/BasicMathFunctions/arm_add_f32.c b/Source/BasicMathFunctions/arm_add_f32.c index 521c0021..1c66a24c 100644 --- a/Source/BasicMathFunctions/arm_add_f32.c +++ b/Source/BasicMathFunctions/arm_add_f32.c @@ -71,14 +71,14 @@ void arm_add_f32( float32x4_t vec2; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = A + B */ - /* Add and then store the results in the destination buffer. */ + /* Add and then store the results in the destination buffer. */ vec1 = vld1q_f32(pSrcA); vec2 = vld1q_f32(pSrcB); res = vaddq_f32(vec1, vec2); diff --git a/Source/BasicMathFunctions/arm_dot_prod_f32.c b/Source/BasicMathFunctions/arm_dot_prod_f32.c index 97f99030..8510c022 100644 --- a/Source/BasicMathFunctions/arm_dot_prod_f32.c +++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c @@ -74,11 +74,12 @@ void arm_dot_prod_f32( float32x4_t res; float32x2_t accum = vdup_n_f32(0); - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; vec1 = vld1q_f32(pSrcA); vec2 = vld1q_f32(pSrcB); + while (blkCnt > 0U) { /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */ diff --git a/Source/BasicMathFunctions/arm_mult_f32.c b/Source/BasicMathFunctions/arm_mult_f32.c index b2abdb39..53ad73c8 100644 --- a/Source/BasicMathFunctions/arm_mult_f32.c +++ b/Source/BasicMathFunctions/arm_mult_f32.c @@ -71,14 +71,14 @@ void arm_mult_f32( float32x4_t vec2; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = A * B */ - /* Multiply the inputs and then store the results in the destination buffer. */ + /* Multiply the inputs and then store the results in the destination buffer. */ vec1 = vld1q_f32(pSrcA); vec2 = vld1q_f32(pSrcB); res = vmulq_f32(vec1, vec2); diff --git a/Source/BasicMathFunctions/arm_negate_f32.c b/Source/BasicMathFunctions/arm_negate_f32.c index 01433653..f807112c 100644 --- a/Source/BasicMathFunctions/arm_negate_f32.c +++ b/Source/BasicMathFunctions/arm_negate_f32.c @@ -70,14 +70,14 @@ void arm_negate_f32( float32x4_t vec1; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = -A */ - /* Negate and then store the results in the destination buffer. */ + /* Negate and then store the results in the destination buffer. */ vec1 = vld1q_f32(pSrc); res = vnegq_f32(vec1); vst1q_f32(pDst, res); diff --git a/Source/BasicMathFunctions/arm_offset_f32.c b/Source/BasicMathFunctions/arm_offset_f32.c index 288e0f5f..b10e3f1d 100644 --- a/Source/BasicMathFunctions/arm_offset_f32.c +++ b/Source/BasicMathFunctions/arm_offset_f32.c @@ -72,14 +72,14 @@ void arm_offset_f32( float32x4_t vec1; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = A + offset */ + /* Add offset and then store the results in the destination buffer. */ - vec1 = vld1q_f32(pSrc); res = vaddq_f32(vec1,vdupq_n_f32(offset)); vst1q_f32(pDst, res); diff --git a/Source/BasicMathFunctions/arm_scale_f32.c b/Source/BasicMathFunctions/arm_scale_f32.c index 4bd9e3e4..72ecbe5c 100644 --- a/Source/BasicMathFunctions/arm_scale_f32.c +++ b/Source/BasicMathFunctions/arm_scale_f32.c @@ -84,13 +84,14 @@ void arm_scale_f32( float32x4_t vec1; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = A * scale */ - /* Scale the input and then store the results in the destination buffer. */ + + /* Scale the input and then store the results in the destination buffer. */ vec1 = vld1q_f32(pSrc); res = vmulq_f32(vec1, vdupq_n_f32(scale)); vst1q_f32(pDst, res); diff --git a/Source/BasicMathFunctions/arm_sub_f32.c b/Source/BasicMathFunctions/arm_sub_f32.c index 512a4b28..4c97af30 100644 --- a/Source/BasicMathFunctions/arm_sub_f32.c +++ b/Source/BasicMathFunctions/arm_sub_f32.c @@ -71,14 +71,14 @@ void arm_sub_f32( float32x4_t vec2; float32x4_t res; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; while (blkCnt > 0U) { /* C = A - B */ - /* Subtract and then store the results in the destination buffer. */ + /* Subtract and then store the results in the destination buffer. */ vec1 = vld1q_f32(pSrcA); vec2 = vld1q_f32(pSrcB); res = vsubq_f32(vec1, vec2); diff --git a/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c b/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c index 5259d6ad..df5db003 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c @@ -82,7 +82,7 @@ void arm_cmplx_conj_f32( zero = vdupq_n_f32(0.0); - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = numSamples >> 2U; while (blkCnt > 0U) diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c index 3fde9050..06f1bfa1 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c @@ -93,7 +93,7 @@ void arm_cmplx_dot_prod_f32( accR = vdupq_n_f32(0.0); accI = vdupq_n_f32(0.0); - /* Loop unrolling */ + /* Loop unrolling: Compute 8 outputs at a time */ blkCnt = numSamples >> 3U; while (blkCnt > 0U) diff --git a/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c index 8b8f1938..84812dcf 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c @@ -89,7 +89,7 @@ void arm_cmplx_mag_f32( float32x4_t vImagB; float32x4_t vMagSqB; - /* Loop unrolling */ + /* Loop unrolling: Compute 8 outputs at a time */ blkCnt = numSamples >> 3; while (blkCnt > 0U) diff --git a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c index 69128c91..99f051c3 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c @@ -88,7 +88,7 @@ void arm_cmplx_mag_squared_f32( float32x4_t vImagB; float32x4_t vMagSqB; - /* Loop unrolling */ + /* Loop unrolling: Compute 8 outputs at a time */ blkCnt = numSamples >> 3; while (blkCnt > 0U) diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c index 12b5ca7d..8d148216 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c @@ -82,7 +82,7 @@ void arm_cmplx_mult_cmplx_f32( float32x4_t real, imag; float32x4x2_t outCplx; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = numSamples >> 2U; while (blkCnt > 0U) diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c index ff9b112f..9651999e 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c @@ -82,7 +82,7 @@ void arm_cmplx_mult_real_f32( float32x4_t r; float32x4x2_t ab,outCplx; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = numSamples >> 2U; while (blkCnt > 0U) diff --git a/Source/FilteringFunctions/arm_conv_f32.c b/Source/FilteringFunctions/arm_conv_f32.c index cfada7d7..8fa13085 100644 --- a/Source/FilteringFunctions/arm_conv_f32.c +++ b/Source/FilteringFunctions/arm_conv_f32.c @@ -206,10 +206,10 @@ void arm_conv_f32( res = vdupq_n_f32(0) ; accum = vdup_n_f32(0); - /* Apply loop unrolling and compute 4 MACs simultaneously. */ + /* Compute 4 MACs simultaneously. */ k = count >> 2U; - /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + /* First part of the processing. Compute 4 MACs at a time. ** a second loop below computes MACs for the remaining 1 to 3 samples. */ while (k > 0U) @@ -556,7 +556,7 @@ void arm_conv_f32( float32x4_t y = vdupq_n_f32(0) ; float32x2_t accum = vdup_n_f32(0) ; - /* First part of the processing with loop unrolling. Compute 4 MACs at a time. + /* First part of the processing. Compute 4 MACs at a time. ** a second loop below computes MACs for the remaining 1 to 3 samples. */ while (k > 0U) { diff --git a/Source/FilteringFunctions/arm_correlate_f32.c b/Source/FilteringFunctions/arm_correlate_f32.c index 41b5b801..60a43f50 100644 --- a/Source/FilteringFunctions/arm_correlate_f32.c +++ b/Source/FilteringFunctions/arm_correlate_f32.c @@ -359,7 +359,7 @@ void arm_correlate_f32( acc3 = 0.0f; #if defined(ARM_MATH_NEON) - /* Apply loop unrolling and compute 4 MACs simultaneously. */ + /* Compute 4 MACs simultaneously. */ k = srcBLen >> 2U; res = vdupq_n_f32(0) ; diff --git a/Source/FilteringFunctions/arm_fir_decimate_f32.c b/Source/FilteringFunctions/arm_fir_decimate_f32.c index d829826a..218ca34f 100644 --- a/Source/FilteringFunctions/arm_fir_decimate_f32.c +++ b/Source/FilteringFunctions/arm_fir_decimate_f32.c @@ -187,10 +187,10 @@ void arm_fir_decimate_f32( /* Initialize coeff pointer */ pb = pCoeffs; - /* Loop unrolling. Process 4 taps at a time. */ + /* Process 4 taps at a time. */ tapCnt = numTaps >> 2; - /* Loop over the number of taps. Unroll by a factor of 4. + /* Loop over the number of taps. ** Repeat until we've computed numTaps-4 coefficients. */ while (tapCnt > 0U) @@ -287,10 +287,10 @@ void arm_fir_decimate_f32( /* Initialize coeff pointer */ pb = pCoeffs; - /* Loop unrolling. Process 4 taps at a time. */ + /* Process 4 taps at a time. */ tapCnt = numTaps >> 2; - /* Loop over the number of taps. Unroll by a factor of 4. + /* Loop over the number of taps. ** Repeat until we've computed numTaps-4 coefficients. */ while (tapCnt > 0U) { diff --git a/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/Source/FilteringFunctions/arm_fir_interpolate_f32.c index 44659356..ee0ed270 100644 --- a/Source/FilteringFunctions/arm_fir_interpolate_f32.c +++ b/Source/FilteringFunctions/arm_fir_interpolate_f32.c @@ -163,7 +163,7 @@ void arm_fir_interpolate_f32( blkCnt = blockSize >> 3; blkCntN4 = blockSize & 7; - /* Samples loop unrolled by 8 */ + /* Loop unrolling */ while (blkCnt > 0U) { /* Copy new input samples into the state buffer */ diff --git a/Source/FilteringFunctions/arm_lms_f32.c b/Source/FilteringFunctions/arm_lms_f32.c index c2594ac8..4fc6e7e2 100644 --- a/Source/FilteringFunctions/arm_lms_f32.c +++ b/Source/FilteringFunctions/arm_lms_f32.c @@ -208,7 +208,7 @@ void arm_lms_f32( sum = 0.0f; sumV = vdupq_n_f32(0.0); - /* Loop unrolling. Process 4 taps at a time. */ + /* Process 4 taps at a time. */ tapCnt = numTaps >> 2; while (tapCnt > 0U) @@ -257,7 +257,7 @@ void arm_lms_f32( /* Initialize coeff pointer */ pb = (pCoeffs); - /* Loop unrolling. Process 4 taps at a time. */ + /* Process 4 taps at a time. */ tapCnt = numTaps >> 2; /* Update filter coefficients */ @@ -305,7 +305,7 @@ void arm_lms_f32( /* Points to the start of the pState buffer */ pStateCurnt = S->pState; - /* Loop unrolling for (numTaps - 1U) samples copy */ + /* Process 4 taps at a time for (numTaps - 1U) samples copy */ tapCnt = (numTaps - 1U) >> 2U; /* copy data */ diff --git a/Source/FilteringFunctions/arm_lms_norm_f32.c b/Source/FilteringFunctions/arm_lms_norm_f32.c index bcdf92ec..28ab04a2 100644 --- a/Source/FilteringFunctions/arm_lms_norm_f32.c +++ b/Source/FilteringFunctions/arm_lms_norm_f32.c @@ -216,7 +216,7 @@ void arm_lms_norm_f32( sum = 0.0f; sumV = vdupq_n_f32(0.0); - /* Loop unrolling. Process 4 taps at a time. */ + /* Process 4 taps at a time. */ tapCnt = numTaps >> 2; while (tapCnt > 0U) @@ -265,7 +265,7 @@ void arm_lms_norm_f32( /* Initialize coeff pointer */ pb = (pCoeffs); - /* Loop unrolling. Process 4 taps at a time. */ + /* Process 4 taps at a time. */ tapCnt = numTaps >> 2; /* Update filter coefficients */ @@ -317,7 +317,7 @@ void arm_lms_norm_f32( /* Points to the start of the pState buffer */ pStateCurnt = S->pState; - /* Loop unrolling for (numTaps - 1U)/4 samples copy */ + /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */ tapCnt = (numTaps - 1U) >> 2U; /* copy data */ diff --git a/Source/MatrixFunctions/arm_mat_add_f32.c b/Source/MatrixFunctions/arm_mat_add_f32.c index 0aa8f395..8e1246c0 100644 --- a/Source/MatrixFunctions/arm_mat_add_f32.c +++ b/Source/MatrixFunctions/arm_mat_add_f32.c @@ -100,10 +100,9 @@ arm_status arm_mat_add_f32( /* Total number of samples in the input matrix */ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols; - /* Loop unrolling */ blkCnt = numSamples >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c index b31fbb14..8e2af317 100644 --- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c +++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c @@ -149,7 +149,7 @@ arm_status arm_mat_cmplx_mult_f32( accR1 = vdupq_n_f32(0.0); accI1 = vdupq_n_f32(0.0); - /* Apply loop unrolling and compute 4 MACs simultaneously. */ + /* Compute 4 MACs simultaneously. */ colCnt = numColsA >> 2; /* Matrix multiplication */ @@ -302,7 +302,7 @@ arm_status arm_mat_cmplx_mult_f32( accR0 = vdupq_n_f32(0.0); accI0 = vdupq_n_f32(0.0); - /* Apply loop unrolling and compute 4 MACs simultaneously. */ + /* Compute 4 MACs simultaneously. */ colCnt = numColsA >> 2; /* Matrix multiplication */ diff --git a/Source/MatrixFunctions/arm_mat_inverse_f32.c b/Source/MatrixFunctions/arm_mat_inverse_f32.c index 2a55964e..d602b98b 100644 --- a/Source/MatrixFunctions/arm_mat_inverse_f32.c +++ b/Source/MatrixFunctions/arm_mat_inverse_f32.c @@ -109,7 +109,7 @@ arm_status arm_mat_inverse_f32( * * 1. First combine the identity matrix and the input matrix separated by a bar to form an * augmented matrix as follows: - * _ _ _ _ + * _ _ _ _ * | a11 a12 | 1 0 | | X11 X12 | * | | | = | | * |_ a21 a22 | 0 1 _| |_ X21 X21 _| @@ -299,7 +299,6 @@ arm_status arm_mat_inverse_f32( * to the right of the pilot element */ j = (numCols - l) >> 2; - /* Loop unrolling */ while (j > 0U) { /* Divide each element of the row of the input matrix @@ -331,7 +330,6 @@ arm_status arm_mat_inverse_f32( /* Loop over number of columns of the destination matrix */ j = numCols >> 2; - /* Loop unrolling */ while (j > 0U) { /* Divide each element of the row of the destination matrix @@ -399,7 +397,6 @@ arm_status arm_mat_inverse_f32( to replace the elements in the input matrix */ j = (numCols - l) >> 2; - /* Loop unrolling */ while (j > 0U) { /* Replace the element by the sum of that row @@ -433,7 +430,6 @@ arm_status arm_mat_inverse_f32( replace the elements in the destination matrix */ j = numCols >> 2; - /* Loop unrolling */ while (j > 0U) { /* Replace the element by the sum of that row diff --git a/Source/MatrixFunctions/arm_mat_mult_f32.c b/Source/MatrixFunctions/arm_mat_mult_f32.c index 0e47035e..ffddf999 100644 --- a/Source/MatrixFunctions/arm_mat_mult_f32.c +++ b/Source/MatrixFunctions/arm_mat_mult_f32.c @@ -169,7 +169,7 @@ arm_status arm_mat_mult_f32( acc6 = vdupq_n_f32(0.0); acc7 = vdupq_n_f32(0.0); - /* Apply loop unrolling and compute 4 MACs simultaneously. */ + /* Compute 4 MACs simultaneously. */ colCnt = numColsA >> 2U; /* Matrix multiplication */ @@ -184,7 +184,8 @@ arm_status arm_mat_mult_f32( a5V = vld1q_f32(pIn1F); a6V = vld1q_f32(pIn1G); a7V = vld1q_f32(pIn1H); - pIn1 += 4; + + pIn1 += 4; pIn1B += 4; pIn1C += 4; pIn1D += 4; @@ -323,7 +324,7 @@ arm_status arm_mat_mult_f32( acc0 = vdupq_n_f32(0.0); - /* Apply loop unrolling and compute 4 MACs simultaneously. */ + /* Compute 4 MACs simultaneously. */ colCnt = numColsA >> 2U; /* Matrix multiplication */ diff --git a/Source/MatrixFunctions/arm_mat_scale_f32.c b/Source/MatrixFunctions/arm_mat_scale_f32.c index 4eeedad2..a0097b1a 100644 --- a/Source/MatrixFunctions/arm_mat_scale_f32.c +++ b/Source/MatrixFunctions/arm_mat_scale_f32.c @@ -97,10 +97,9 @@ arm_status arm_mat_scale_f32( /* Total number of samples in the input matrix */ numSamples = (uint32_t) pSrc->numRows * pSrc->numCols; - /* Loop unrolling */ blkCnt = numSamples >> 2; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/MatrixFunctions/arm_mat_sub_f32.c b/Source/MatrixFunctions/arm_mat_sub_f32.c index 33ac2456..cb576477 100644 --- a/Source/MatrixFunctions/arm_mat_sub_f32.c +++ b/Source/MatrixFunctions/arm_mat_sub_f32.c @@ -95,10 +95,9 @@ arm_status arm_mat_sub_f32( /* Total number of samples in the input matrix */ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols; - /* Loop Unrolling */ blkCnt = numSamples >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { @@ -110,7 +109,7 @@ arm_status arm_mat_sub_f32( res = vsubq_f32(vec1, vec2); vst1q_f32(pOut, res); - /* update pointers to process next sampels */ + /* Update pointers to process next samples */ pIn1 += 4U; pIn2 += 4U; pOut += 4U; diff --git a/Source/MatrixFunctions/arm_mat_trans_f32.c b/Source/MatrixFunctions/arm_mat_trans_f32.c index 0be38069..71748bf2 100644 --- a/Source/MatrixFunctions/arm_mat_trans_f32.c +++ b/Source/MatrixFunctions/arm_mat_trans_f32.c @@ -90,13 +90,12 @@ arm_status arm_mat_trans_f32( float32x4_t row0V,row1V,row2V,row3V; float32x4x2_t ra0,ra1,rb0,rb1; - /* Loop Unrolling */ blkCnt = nColumns >> 2; /* The pointer px is set to starting address of the column being processed */ px = pOut + i; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) /* Column loop */ { diff --git a/Source/StatisticsFunctions/arm_max_f32.c b/Source/StatisticsFunctions/arm_max_f32.c index 97378ddd..cd54e2a1 100644 --- a/Source/StatisticsFunctions/arm_max_f32.c +++ b/Source/StatisticsFunctions/arm_max_f32.c @@ -111,12 +111,11 @@ void arm_max_f32( outV = vld1q_f32(pSrc); pSrc += 4; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = (blockSize - 4 ) >> 2U; while (blkCnt > 0U) { - srcV = vld1q_f32(pSrc); pSrc += 4; diff --git a/Source/StatisticsFunctions/arm_mean_f32.c b/Source/StatisticsFunctions/arm_mean_f32.c index 7589d993..63d96525 100644 --- a/Source/StatisticsFunctions/arm_mean_f32.c +++ b/Source/StatisticsFunctions/arm_mean_f32.c @@ -72,10 +72,9 @@ void arm_mean_f32( float32_t in1, in2, in3, in4; float32x4_t inV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/StatisticsFunctions/arm_min_f32.c b/Source/StatisticsFunctions/arm_min_f32.c index d37d0d8b..6e9ff4b5 100644 --- a/Source/StatisticsFunctions/arm_min_f32.c +++ b/Source/StatisticsFunctions/arm_min_f32.c @@ -109,7 +109,7 @@ void arm_min_f32( outV = vld1q_f32(pSrc); pSrc += 4; - /* Loop unrolling */ + /* Compute 4 outputs at a time */ blkCnt = (blockSize - 4 ) >> 2U; while (blkCnt > 0U) diff --git a/Source/StatisticsFunctions/arm_power_f32.c b/Source/StatisticsFunctions/arm_power_f32.c index a9f9df72..a4825a53 100644 --- a/Source/StatisticsFunctions/arm_power_f32.c +++ b/Source/StatisticsFunctions/arm_power_f32.c @@ -71,10 +71,9 @@ void arm_power_f32( float32x2_t sumV2; float32x4_t inV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/StatisticsFunctions/arm_rms_f32.c b/Source/StatisticsFunctions/arm_rms_f32.c index 045f8741..45465107 100644 --- a/Source/StatisticsFunctions/arm_rms_f32.c +++ b/Source/StatisticsFunctions/arm_rms_f32.c @@ -71,10 +71,9 @@ void arm_rms_f32( float32x2_t sumV2; float32x4_t inV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/StatisticsFunctions/arm_var_f32.c b/Source/StatisticsFunctions/arm_var_f32.c index 394df2aa..3c325b13 100644 --- a/Source/StatisticsFunctions/arm_var_f32.c +++ b/Source/StatisticsFunctions/arm_var_f32.c @@ -79,10 +79,9 @@ void arm_var_f32( arm_mean_f32(pSrc,blockSize,&mean); avg = vdupq_n_f32(mean); - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_copy_f32.c b/Source/SupportFunctions/arm_copy_f32.c index 2bcf9b74..707adc4c 100644 --- a/Source/SupportFunctions/arm_copy_f32.c +++ b/Source/SupportFunctions/arm_copy_f32.c @@ -67,10 +67,9 @@ void arm_copy_f32( float32x4_t inV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_fill_f32.c b/Source/SupportFunctions/arm_fill_f32.c index e0e65ff0..29f62862 100644 --- a/Source/SupportFunctions/arm_fill_f32.c +++ b/Source/SupportFunctions/arm_fill_f32.c @@ -68,11 +68,10 @@ void arm_fill_f32( float32x4_t inV = vdupq_n_f32(value); - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - * ** a second loop below computes the remaining 1 to 3 samples. */ + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { /* C = value */ @@ -85,7 +84,7 @@ void arm_fill_f32( } /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - * ** No loop unrolling is used. */ + ** No loop unrolling is used. */ blkCnt = blockSize & 3; while (blkCnt > 0U) diff --git a/Source/SupportFunctions/arm_float_to_q15.c b/Source/SupportFunctions/arm_float_to_q15.c index 548cb60c..68c1ad09 100644 --- a/Source/SupportFunctions/arm_float_to_q15.c +++ b/Source/SupportFunctions/arm_float_to_q15.c @@ -80,10 +80,9 @@ void arm_float_to_q15( int32x4_t cvt; int16x4_t outV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_float_to_q31.c b/Source/SupportFunctions/arm_float_to_q31.c index f64b8757..479f8c5b 100644 --- a/Source/SupportFunctions/arm_float_to_q31.c +++ b/Source/SupportFunctions/arm_float_to_q31.c @@ -84,10 +84,9 @@ void arm_float_to_q31( int32x4_t outV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_float_to_q7.c b/Source/SupportFunctions/arm_float_to_q7.c index ddc472ba..5f2a7eb0 100644 --- a/Source/SupportFunctions/arm_float_to_q7.c +++ b/Source/SupportFunctions/arm_float_to_q7.c @@ -82,10 +82,9 @@ void arm_float_to_q7( int16x4_t cvt1,cvt2; int8x8_t outV; - /* Loop unrolling */ blkCnt = blockSize >> 3U; - /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + /* Compute 8 outputs at a time. ** a second loop below computes the remaining 1 to 7 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_q15_to_float.c b/Source/SupportFunctions/arm_q15_to_float.c index 15790b3a..f49d9b77 100644 --- a/Source/SupportFunctions/arm_q15_to_float.c +++ b/Source/SupportFunctions/arm_q15_to_float.c @@ -68,10 +68,9 @@ void arm_q15_to_float( int32x4_t inV0, inV1; float32x4_t outV; - /* Loop unrolling */ blkCnt = blockSize >> 3U; - /* First part of the processing with loop unrolling. Compute 8 outputs at a time. + /* Compute 8 outputs at a time. ** a second loop below computes the remaining 1 to 7 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_q31_to_float.c b/Source/SupportFunctions/arm_q31_to_float.c index 932bfb2a..03e7ec6f 100644 --- a/Source/SupportFunctions/arm_q31_to_float.c +++ b/Source/SupportFunctions/arm_q31_to_float.c @@ -67,10 +67,9 @@ void arm_q31_to_float( int32x4_t inV; float32x4_t outV; - /* Loop unrolling */ blkCnt = blockSize >> 2U; - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. + /* Compute 4 outputs at a time. ** a second loop below computes the remaining 1 to 3 samples. */ while (blkCnt > 0U) { diff --git a/Source/SupportFunctions/arm_q7_to_float.c b/Source/SupportFunctions/arm_q7_to_float.c index 927b345e..6bd86bfe 100644 --- a/Source/SupportFunctions/arm_q7_to_float.c +++ b/Source/SupportFunctions/arm_q7_to_float.c @@ -69,10 +69,9 @@ void arm_q7_to_float( int32x4_t inVLL, inVLH, inVHL, inVHH; float32x4_t outV; - /* Loop unrolling */ blkCnt = blockSize >> 4U; - /* First part of the processing with loop unrolling. Compute 16 outputs at a time. + /* Compute 16 outputs at a time. ** a second loop below computes the remaining 1 to 15 samples. */ while (blkCnt > 0U) {