Removed comments about loop unrolling in NEON code to avoid confusion

pull/19/head
ClaudioMartino 7 years ago
parent d4de6207ff
commit 7c1fca8052

@ -71,14 +71,14 @@ void arm_abs_f32(
float32x4_t vec1;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vabsq_f32(vec1);
vst1q_f32(pDst, res);

@ -71,14 +71,14 @@ void arm_add_f32(
float32x4_t vec2;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vaddq_f32(vec1, vec2);

@ -74,11 +74,12 @@ void arm_dot_prod_f32(
float32x4_t res;
float32x2_t accum = vdup_n_f32(0);
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
while (blkCnt > 0U)
{
/* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */

@ -71,14 +71,14 @@ void arm_mult_f32(
float32x4_t vec2;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
/* Multiply the inputs and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vmulq_f32(vec1, vec2);

@ -70,14 +70,14 @@ void arm_negate_f32(
float32x4_t vec1;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
/* Negate and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vnegq_f32(vec1);
vst1q_f32(pDst, res);

@ -72,14 +72,14 @@ void arm_offset_f32(
float32x4_t vec1;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vaddq_f32(vec1,vdupq_n_f32(offset));
vst1q_f32(pDst, res);

@ -84,12 +84,13 @@ void arm_scale_f32(
float32x4_t vec1;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * scale */
/* Scale the input and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vmulq_f32(vec1, vdupq_n_f32(scale));

@ -71,14 +71,14 @@ void arm_sub_f32(
float32x4_t vec2;
float32x4_t res;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A - B */
/* Subtract and then store the results in the destination buffer. */
/* Subtract and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vsubq_f32(vec1, vec2);

@ -82,7 +82,7 @@ void arm_cmplx_conj_f32(
zero = vdupq_n_f32(0.0);
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)

@ -93,7 +93,7 @@ void arm_cmplx_dot_prod_f32(
accR = vdupq_n_f32(0.0);
accI = vdupq_n_f32(0.0);
/* Loop unrolling */
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3U;
while (blkCnt > 0U)

@ -89,7 +89,7 @@ void arm_cmplx_mag_f32(
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling */
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
while (blkCnt > 0U)

@ -88,7 +88,7 @@ void arm_cmplx_mag_squared_f32(
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling */
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
while (blkCnt > 0U)

@ -82,7 +82,7 @@ void arm_cmplx_mult_cmplx_f32(
float32x4_t real, imag;
float32x4x2_t outCplx;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)

@ -82,7 +82,7 @@ void arm_cmplx_mult_real_f32(
float32x4_t r;
float32x4x2_t ab,outCplx;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)

@ -206,10 +206,10 @@ void arm_conv_f32(
res = vdupq_n_f32(0) ;
accum = vdup_n_f32(0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */
/* Compute 4 MACs simultaneously. */
k = count >> 2U;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
/* First part of the processing. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
@ -556,7 +556,7 @@ void arm_conv_f32(
float32x4_t y = vdupq_n_f32(0) ;
float32x2_t accum = vdup_n_f32(0) ;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
/* First part of the processing. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */
while (k > 0U)
{

@ -359,7 +359,7 @@ void arm_correlate_f32(
acc3 = 0.0f;
#if defined(ARM_MATH_NEON)
/* Apply loop unrolling and compute 4 MACs simultaneously. */
/* Compute 4 MACs simultaneously. */
k = srcBLen >> 2U;
res = vdupq_n_f32(0) ;

@ -187,10 +187,10 @@ void arm_fir_decimate_f32(
/* Initialize coeff pointer */
pb = pCoeffs;
/* Loop unrolling. Process 4 taps at a time. */
/* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
/* Loop over the number of taps. Unroll by a factor of 4.
/* Loop over the number of taps.
** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
@ -287,10 +287,10 @@ void arm_fir_decimate_f32(
/* Initialize coeff pointer */
pb = pCoeffs;
/* Loop unrolling. Process 4 taps at a time. */
/* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
/* Loop over the number of taps. Unroll by a factor of 4.
/* Loop over the number of taps.
** Repeat until we've computed numTaps-4 coefficients. */
while (tapCnt > 0U)
{

@ -163,7 +163,7 @@ void arm_fir_interpolate_f32(
blkCnt = blockSize >> 3;
blkCntN4 = blockSize & 7;
/* Samples loop unrolled by 8 */
/* Loop unrolling */
while (blkCnt > 0U)
{
/* Copy new input samples into the state buffer */

@ -208,7 +208,7 @@ void arm_lms_f32(
sum = 0.0f;
sumV = vdupq_n_f32(0.0);
/* Loop unrolling. Process 4 taps at a time. */
/* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
while (tapCnt > 0U)
@ -257,7 +257,7 @@ void arm_lms_f32(
/* Initialize coeff pointer */
pb = (pCoeffs);
/* Loop unrolling. Process 4 taps at a time. */
/* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
/* Update filter coefficients */
@ -305,7 +305,7 @@ void arm_lms_f32(
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
/* Loop unrolling for (numTaps - 1U) samples copy */
/* Process 4 taps at a time for (numTaps - 1U) samples copy */
tapCnt = (numTaps - 1U) >> 2U;
/* copy data */

@ -216,7 +216,7 @@ void arm_lms_norm_f32(
sum = 0.0f;
sumV = vdupq_n_f32(0.0);
/* Loop unrolling. Process 4 taps at a time. */
/* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
while (tapCnt > 0U)
@ -265,7 +265,7 @@ void arm_lms_norm_f32(
/* Initialize coeff pointer */
pb = (pCoeffs);
/* Loop unrolling. Process 4 taps at a time. */
/* Process 4 taps at a time. */
tapCnt = numTaps >> 2;
/* Update filter coefficients */
@ -317,7 +317,7 @@ void arm_lms_norm_f32(
/* Points to the start of the pState buffer */
pStateCurnt = S->pState;
/* Loop unrolling for (numTaps - 1U)/4 samples copy */
/* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
tapCnt = (numTaps - 1U) >> 2U;
/* copy data */

@ -100,10 +100,9 @@ arm_status arm_mat_add_f32(
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
/* Loop unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -149,7 +149,7 @@ arm_status arm_mat_cmplx_mult_f32(
accR1 = vdupq_n_f32(0.0);
accI1 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Matrix multiplication */
@ -302,7 +302,7 @@ arm_status arm_mat_cmplx_mult_f32(
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Matrix multiplication */

@ -299,7 +299,6 @@ arm_status arm_mat_inverse_f32(
* to the right of the pilot element */
j = (numCols - l) >> 2;
/* Loop unrolling */
while (j > 0U)
{
/* Divide each element of the row of the input matrix
@ -331,7 +330,6 @@ arm_status arm_mat_inverse_f32(
/* Loop over number of columns of the destination matrix */
j = numCols >> 2;
/* Loop unrolling */
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
@ -399,7 +397,6 @@ arm_status arm_mat_inverse_f32(
to replace the elements in the input matrix */
j = (numCols - l) >> 2;
/* Loop unrolling */
while (j > 0U)
{
/* Replace the element by the sum of that row
@ -433,7 +430,6 @@ arm_status arm_mat_inverse_f32(
replace the elements in the destination matrix */
j = numCols >> 2;
/* Loop unrolling */
while (j > 0U)
{
/* Replace the element by the sum of that row

@ -169,7 +169,7 @@ arm_status arm_mat_mult_f32(
acc6 = vdupq_n_f32(0.0);
acc7 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
@ -184,6 +184,7 @@ arm_status arm_mat_mult_f32(
a5V = vld1q_f32(pIn1F);
a6V = vld1q_f32(pIn1G);
a7V = vld1q_f32(pIn1H);
pIn1 += 4;
pIn1B += 4;
pIn1C += 4;
@ -323,7 +324,7 @@ arm_status arm_mat_mult_f32(
acc0 = vdupq_n_f32(0.0);
/* Apply loop unrolling and compute 4 MACs simultaneously. */
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */

@ -97,10 +97,9 @@ arm_status arm_mat_scale_f32(
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
/* Loop unrolling */
blkCnt = numSamples >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -95,10 +95,9 @@ arm_status arm_mat_sub_f32(
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
/* Loop Unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
@ -110,7 +109,7 @@ arm_status arm_mat_sub_f32(
res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* update pointers to process next sampels */
/* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;

@ -90,13 +90,12 @@ arm_status arm_mat_trans_f32(
float32x4_t row0V,row1V,row2V,row3V;
float32x4x2_t ra0,ra1,rb0,rb1;
/* Loop Unrolling */
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */
{

@ -111,12 +111,11 @@ void arm_max_f32(
outV = vld1q_f32(pSrc);
pSrc += 4;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U)
{
srcV = vld1q_f32(pSrc);
pSrc += 4;

@ -72,10 +72,9 @@ void arm_mean_f32(
float32_t in1, in2, in3, in4;
float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -109,7 +109,7 @@ void arm_min_f32(
outV = vld1q_f32(pSrc);
pSrc += 4;
/* Loop unrolling */
/* Compute 4 outputs at a time */
blkCnt = (blockSize - 4 ) >> 2U;
while (blkCnt > 0U)

@ -71,10 +71,9 @@ void arm_power_f32(
float32x2_t sumV2;
float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -71,10 +71,9 @@ void arm_rms_f32(
float32x2_t sumV2;
float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -79,10 +79,9 @@ void arm_var_f32(
arm_mean_f32(pSrc,blockSize,&mean);
avg = vdupq_n_f32(mean);
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -67,10 +67,9 @@ void arm_copy_f32(
float32x4_t inV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -68,11 +68,10 @@ void arm_fill_f32(
float32x4_t inV = vdupq_n_f32(value);
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
* ** a second loop below computes the remaining 1 to 3 samples. */
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = value */
@ -85,7 +84,7 @@ void arm_fill_f32(
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
* ** No loop unrolling is used. */
** No loop unrolling is used. */
blkCnt = blockSize & 3;
while (blkCnt > 0U)

@ -80,10 +80,9 @@ void arm_float_to_q15(
int32x4_t cvt;
int16x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -84,10 +84,9 @@ void arm_float_to_q31(
int32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -82,10 +82,9 @@ void arm_float_to_q7(
int16x4_t cvt1,cvt2;
int8x8_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 3U;
/* First part of the processing with loop unrolling. Compute 8 outputs at a time.
/* Compute 8 outputs at a time.
** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U)
{

@ -68,10 +68,9 @@ void arm_q15_to_float(
int32x4_t inV0, inV1;
float32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 3U;
/* First part of the processing with loop unrolling. Compute 8 outputs at a time.
/* Compute 8 outputs at a time.
** a second loop below computes the remaining 1 to 7 samples. */
while (blkCnt > 0U)
{

@ -67,10 +67,9 @@ void arm_q31_to_float(
int32x4_t inV;
float32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{

@ -69,10 +69,9 @@ void arm_q7_to_float(
int32x4_t inVLL, inVLH, inVHL, inVHH;
float32x4_t outV;
/* Loop unrolling */
blkCnt = blockSize >> 4U;
/* First part of the processing with loop unrolling. Compute 16 outputs at a time.
/* Compute 16 outputs at a time.
** a second loop below computes the remaining 1 to 15 samples. */
while (blkCnt > 0U)
{

Loading…
Cancel
Save