Removed comments about loop unrolling in NEON code to avoid confusion

7 years ago · 7c1fca8052
parent d4de6207ff
commit 7c1fca8052
41 changed files with 66 additions and 84 deletions
--- a/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/Source/BasicMathFunctions/arm_abs_f32.c
@ -71,14 +71,14 @@ void arm_abs_f32(
    float32x4_t vec1;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = |A| */
-        /* Calculate absolute values and then store the results in the destination buffer. */

+    	/* Calculate absolute values and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrc);
        res = vabsq_f32(vec1);
        vst1q_f32(pDst, res);
--- a/Source/BasicMathFunctions/arm_add_f32.c
+++ b/Source/BasicMathFunctions/arm_add_f32.c
@ -71,14 +71,14 @@ void arm_add_f32(
    float32x4_t vec2;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A + B */
-        /* Add and then store the results in the destination buffer. */

+    	/* Add and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrcA);
        vec2 = vld1q_f32(pSrcB);
        res = vaddq_f32(vec1, vec2);
--- a/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c
@ -74,11 +74,12 @@ void arm_dot_prod_f32(
    float32x4_t res;
    float32x2_t accum = vdup_n_f32(0);

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    vec1 = vld1q_f32(pSrcA);
    vec2 = vld1q_f32(pSrcB);
+
    while (blkCnt > 0U)
    {
        /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
--- a/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/Source/BasicMathFunctions/arm_mult_f32.c
@ -71,14 +71,14 @@ void arm_mult_f32(
    float32x4_t vec2;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A * B */
-        /* Multiply the inputs and then store the results in the destination buffer. */

+    	/* Multiply the inputs and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrcA);
        vec2 = vld1q_f32(pSrcB);
        res = vmulq_f32(vec1, vec2);
--- a/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/Source/BasicMathFunctions/arm_negate_f32.c
@ -70,14 +70,14 @@ void arm_negate_f32(
    float32x4_t vec1;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = -A */
-        /* Negate and then store the results in the destination buffer. */

+    	/* Negate and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrc);
        res = vnegq_f32(vec1);
        vst1q_f32(pDst, res);
--- a/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/Source/BasicMathFunctions/arm_offset_f32.c
@ -72,14 +72,14 @@ void arm_offset_f32(
    float32x4_t vec1;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A + offset */
-        /* Add offset and then store the results in the destination buffer. */
 
+        /* Add offset and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrc);
        res = vaddq_f32(vec1,vdupq_n_f32(offset));
        vst1q_f32(pDst, res);
--- a/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/Source/BasicMathFunctions/arm_scale_f32.c
@ -84,12 +84,13 @@ void arm_scale_f32(
    float32x4_t vec1;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A * scale */
+
    	/* Scale the input and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrc);
        res = vmulq_f32(vec1, vdupq_n_f32(scale));
--- a/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/Source/BasicMathFunctions/arm_sub_f32.c
@ -71,14 +71,14 @@ void arm_sub_f32(
    float32x4_t vec2;
    float32x4_t res;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = A - B */
-        /* Subtract and then store the results in the destination buffer. */

+        /* Subtract and then store the results in the destination buffer. */
        vec1 = vld1q_f32(pSrcA);
        vec2 = vld1q_f32(pSrcB);
        res = vsubq_f32(vec1, vec2);
--- a/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@ -82,7 +82,7 @@ void arm_cmplx_conj_f32(

   zero = vdupq_n_f32(0.0);

-   /* Loop unrolling */
+   /* Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;

   while (blkCnt > 0U)
--- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@ -93,7 +93,7 @@ void arm_cmplx_dot_prod_f32(
    accR = vdupq_n_f32(0.0);
    accI = vdupq_n_f32(0.0);

-    /* Loop unrolling */
+    /* Loop unrolling: Compute 8 outputs at a time */
    blkCnt = numSamples >> 3U;

    while (blkCnt > 0U)
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@ -89,7 +89,7 @@ void arm_cmplx_mag_f32(
  float32x4_t vImagB;
  float32x4_t vMagSqB;

-  /* Loop unrolling */
+  /* Loop unrolling: Compute 8 outputs at a time */
  blkCnt = numSamples >> 3;

  while (blkCnt > 0U)
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@ -88,7 +88,7 @@ void arm_cmplx_mag_squared_f32(
  float32x4_t vImagB;
  float32x4_t vMagSqB;

-  /* Loop unrolling */
+  /* Loop unrolling: Compute 8 outputs at a time */
  blkCnt = numSamples >> 3;

  while (blkCnt > 0U)
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@ -82,7 +82,7 @@ void arm_cmplx_mult_cmplx_f32(
    float32x4_t real, imag;
    float32x4x2_t outCplx;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@ -82,7 +82,7 @@ void arm_cmplx_mult_real_f32(
    float32x4_t r;
    float32x4x2_t ab,outCplx;

-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
--- a/Source/FilteringFunctions/arm_conv_f32.c
+++ b/Source/FilteringFunctions/arm_conv_f32.c
@ -206,10 +206,10 @@ void arm_conv_f32(
    res = vdupq_n_f32(0) ;
    accum = vdup_n_f32(0);

-    /* Apply loop unrolling and compute 4 MACs simultaneously. */
+    /* Compute 4 MACs simultaneously. */
    k = count >> 2U;

-    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+    /* First part of the processing.  Compute 4 MACs at a time.
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */

    while (k > 0U)
@ -556,7 +556,7 @@ void arm_conv_f32(
      float32x4_t y = vdupq_n_f32(0) ;
      float32x2_t accum = vdup_n_f32(0) ;

-      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+      /* First part of the processing.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while (k > 0U)
      {
--- a/Source/FilteringFunctions/arm_correlate_f32.c
+++ b/Source/FilteringFunctions/arm_correlate_f32.c
@ -359,7 +359,7 @@ void arm_correlate_f32(
      acc3 = 0.0f;

 #if defined(ARM_MATH_NEON)
-      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+      /* Compute 4 MACs simultaneously. */
      k = srcBLen >> 2U;

      res = vdupq_n_f32(0) ;
--- a/Source/FilteringFunctions/arm_fir_decimate_f32.c
+++ b/Source/FilteringFunctions/arm_fir_decimate_f32.c
@ -187,10 +187,10 @@ void arm_fir_decimate_f32(
    /* Initialize coeff pointer */
    pb = pCoeffs;

-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

-    /* Loop over the number of taps.  Unroll by a factor of 4.
+    /* Loop over the number of taps. 
     ** Repeat until we've computed numTaps-4 coefficients. */

    while (tapCnt > 0U)
@ -287,10 +287,10 @@ void arm_fir_decimate_f32(
    /* Initialize coeff pointer */
    pb = pCoeffs;

-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

-    /* Loop over the number of taps.  Unroll by a factor of 4.
+    /* Loop over the number of taps.
     ** Repeat until we've computed numTaps-4 coefficients. */
    while (tapCnt > 0U)
    {
--- a/Source/FilteringFunctions/arm_fir_interpolate_f32.c
+++ b/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@ -163,7 +163,7 @@ void arm_fir_interpolate_f32(
  blkCnt = blockSize >> 3;
  blkCntN4 = blockSize & 7;

-  /* Samples loop unrolled by 8 */
+  /* Loop unrolling */
  while (blkCnt > 0U)
  {
    /* Copy new input samples into the state buffer */
--- a/Source/FilteringFunctions/arm_lms_f32.c
+++ b/Source/FilteringFunctions/arm_lms_f32.c
@ -208,7 +208,7 @@ void arm_lms_f32(
    sum = 0.0f;
    sumV = vdupq_n_f32(0.0);

-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

    while (tapCnt > 0U)
@ -257,7 +257,7 @@ void arm_lms_f32(
    /* Initialize coeff pointer */
    pb = (pCoeffs);

-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

    /* Update filter coefficients */
@ -305,7 +305,7 @@ void arm_lms_f32(
  /* Points to the start of the pState buffer */
  pStateCurnt = S->pState;

-  /* Loop unrolling for (numTaps - 1U) samples copy */
+  /* Process 4 taps at a time for (numTaps - 1U) samples copy */
  tapCnt = (numTaps - 1U) >> 2U;

  /* copy data */
--- a/Source/FilteringFunctions/arm_lms_norm_f32.c
+++ b/Source/FilteringFunctions/arm_lms_norm_f32.c
@ -216,7 +216,7 @@ void arm_lms_norm_f32(
    sum = 0.0f;
    sumV = vdupq_n_f32(0.0);

-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

    while (tapCnt > 0U)
@ -265,7 +265,7 @@ void arm_lms_norm_f32(
    /* Initialize coeff pointer */
    pb = (pCoeffs);

-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

    /* Update filter coefficients */
@ -317,7 +317,7 @@ void arm_lms_norm_f32(
  /* Points to the start of the pState buffer */
  pStateCurnt = S->pState;

-  /* Loop unrolling for (numTaps - 1U)/4 samples copy */
+  /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
  tapCnt = (numTaps - 1U) >> 2U;

  /* copy data */
--- a/Source/MatrixFunctions/arm_mat_add_f32.c
+++ b/Source/MatrixFunctions/arm_mat_add_f32.c
@ -100,10 +100,9 @@ arm_status arm_mat_add_f32(
    /* Total number of samples in the input matrix */
    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;

-    /* Loop unrolling */
    blkCnt = numSamples >> 2U;

-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+    /* Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
@ -149,7 +149,7 @@ arm_status arm_mat_cmplx_mult_f32(
        accR1 = vdupq_n_f32(0.0);
        accI1 = vdupq_n_f32(0.0);

-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2;

        /* Matrix multiplication */
@ -302,7 +302,7 @@ arm_status arm_mat_cmplx_mult_f32(
        accR0 = vdupq_n_f32(0.0);
        accI0 = vdupq_n_f32(0.0);

-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2;

        /* Matrix multiplication */
--- a/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f32.c
@ -299,7 +299,6 @@ arm_status arm_mat_inverse_f32(
       * to the right of the pilot element */
      j = (numCols - l) >> 2;

-      /* Loop unrolling */
      while (j > 0U)
      {
        /* Divide each element of the row of the input matrix
@ -331,7 +330,6 @@ arm_status arm_mat_inverse_f32(
      /* Loop over number of columns of the destination matrix */
      j = numCols >> 2;

-      /* Loop unrolling */
      while (j > 0U)
      {
        /* Divide each element of the row of the destination matrix
@ -399,7 +397,6 @@ arm_status arm_mat_inverse_f32(
             to replace the elements in the input matrix */
          j = (numCols - l) >> 2;
 	  
-	  /* Loop unrolling */
          while (j > 0U)
          {
            /* Replace the element by the sum of that row
@ -433,7 +430,6 @@ arm_status arm_mat_inverse_f32(
             replace the elements in the destination matrix */
          j = numCols >> 2;

-	  /* Loop unrolling */
          while (j > 0U)
          {
            /* Replace the element by the sum of that row
--- a/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f32.c
@ -169,7 +169,7 @@ arm_status arm_mat_mult_f32(
        acc6 = vdupq_n_f32(0.0);
        acc7 = vdupq_n_f32(0.0);

-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2U;

        /* Matrix multiplication */
@ -184,6 +184,7 @@ arm_status arm_mat_mult_f32(
          a5V = vld1q_f32(pIn1F); 
          a6V = vld1q_f32(pIn1G); 
          a7V = vld1q_f32(pIn1H); 
+
 	  pIn1 += 4;
          pIn1B += 4;
          pIn1C += 4;
@ -323,7 +324,7 @@ arm_status arm_mat_mult_f32(

        acc0 = vdupq_n_f32(0.0);

-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2U;

        /* Matrix multiplication   */
--- a/Source/MatrixFunctions/arm_mat_scale_f32.c
+++ b/Source/MatrixFunctions/arm_mat_scale_f32.c
@ -97,10 +97,9 @@ arm_status arm_mat_scale_f32(
    /* Total number of samples in the input matrix */
    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;

-    /* Loop unrolling */
    blkCnt = numSamples >> 2;

-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+    /* Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
--- a/Source/MatrixFunctions/arm_mat_sub_f32.c
+++ b/Source/MatrixFunctions/arm_mat_sub_f32.c
@ -95,10 +95,9 @@ arm_status arm_mat_sub_f32(
    /* Total number of samples in the input matrix */
    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;

-    /* Loop Unrolling */
    blkCnt = numSamples >> 2U;

-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+    /* Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while (blkCnt > 0U)
    {
@ -110,7 +109,7 @@ arm_status arm_mat_sub_f32(
      res = vsubq_f32(vec1, vec2);
      vst1q_f32(pOut, res);

-      /* update pointers to process next sampels */
+      /* Update pointers to process next samples */
      pIn1 += 4U;
      pIn2 += 4U;
      pOut += 4U;
--- a/Source/MatrixFunctions/arm_mat_trans_f32.c
+++ b/Source/MatrixFunctions/arm_mat_trans_f32.c
@ -90,13 +90,12 @@ arm_status arm_mat_trans_f32(
      float32x4_t row0V,row1V,row2V,row3V;
      float32x4x2_t ra0,ra1,rb0,rb1;

-      /* Loop Unrolling */
      blkCnt = nColumns >> 2;

      /* The pointer px is set to starting address of the column being processed */
      px = pOut + i;

-      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+      /* Compute 4 outputs at a time.
       ** a second loop below computes the remaining 1 to 3 samples. */
      while (blkCnt > 0U)        /* Column loop */
      {
--- a/Source/StatisticsFunctions/arm_max_f32.c
+++ b/Source/StatisticsFunctions/arm_max_f32.c
@ -111,12 +111,11 @@ void arm_max_f32(
      outV = vld1q_f32(pSrc);
      pSrc += 4;
 
-      /* Loop unrolling */
+      /* Compute 4 outputs at a time */
      blkCnt = (blockSize - 4 ) >> 2U;
    
      while (blkCnt > 0U)
      {
-    
        srcV = vld1q_f32(pSrc);
        pSrc += 4;
    
--- a/Source/StatisticsFunctions/arm_mean_f32.c
+++ b/Source/StatisticsFunctions/arm_mean_f32.c
@ -72,10 +72,9 @@ void arm_mean_f32(
  float32_t in1, in2, in3, in4;
  float32x4_t inV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/StatisticsFunctions/arm_min_f32.c
+++ b/Source/StatisticsFunctions/arm_min_f32.c
@ -109,7 +109,7 @@ void arm_min_f32(
      outV = vld1q_f32(pSrc);
      pSrc += 4;
    
-      /* Loop unrolling */
+      /* Compute 4 outputs at a time */
      blkCnt = (blockSize - 4 ) >> 2U;
    
      while (blkCnt > 0U)
--- a/Source/StatisticsFunctions/arm_power_f32.c
+++ b/Source/StatisticsFunctions/arm_power_f32.c
@ -71,10 +71,9 @@ void arm_power_f32(
  float32x2_t sumV2;
  float32x4_t inV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/StatisticsFunctions/arm_rms_f32.c
+++ b/Source/StatisticsFunctions/arm_rms_f32.c
@ -71,10 +71,9 @@ void arm_rms_f32(
  float32x2_t sumV2;
  float32x4_t inV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/StatisticsFunctions/arm_var_f32.c
+++ b/Source/StatisticsFunctions/arm_var_f32.c
@ -79,10 +79,9 @@ void arm_var_f32(
  arm_mean_f32(pSrc,blockSize,&mean);
  avg = vdupq_n_f32(mean);

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_copy_f32.c
+++ b/Source/SupportFunctions/arm_copy_f32.c
@ -67,10 +67,9 @@ void arm_copy_f32(

  float32x4_t inV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_fill_f32.c
+++ b/Source/SupportFunctions/arm_fill_f32.c
@ -68,11 +68,10 @@ void arm_fill_f32(

  float32x4_t inV = vdupq_n_f32(value);

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
- *    ** a second loop below computes the remaining 1 to 3 samples. */
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = value */
@ -85,7 +84,7 @@ void arm_fill_f32(
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- *    ** No loop unrolling is used. */
+   ** No loop unrolling is used. */
  blkCnt = blockSize & 3;

  while (blkCnt > 0U)
--- a/Source/SupportFunctions/arm_float_to_q15.c
+++ b/Source/SupportFunctions/arm_float_to_q15.c
@ -80,10 +80,9 @@ void arm_float_to_q15(
  int32x4_t cvt;
  int16x4_t outV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_float_to_q31.c
+++ b/Source/SupportFunctions/arm_float_to_q31.c
@ -84,10 +84,9 @@ void arm_float_to_q31(

  int32x4_t outV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_float_to_q7.c
+++ b/Source/SupportFunctions/arm_float_to_q7.c
@ -82,10 +82,9 @@ void arm_float_to_q7(
  int16x4_t cvt1,cvt2;
  int8x8_t outV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 3U;

-  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
+  /* Compute 8 outputs at a time.
   ** a second loop below computes the remaining 1 to 7 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_q15_to_float.c
+++ b/Source/SupportFunctions/arm_q15_to_float.c
@ -68,10 +68,9 @@ void arm_q15_to_float(
  int32x4_t inV0, inV1;
  float32x4_t outV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 3U;

-  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
+  /* Compute 8 outputs at a time.
   ** a second loop below computes the remaining 1 to 7 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_q31_to_float.c
+++ b/Source/SupportFunctions/arm_q31_to_float.c
@ -67,10 +67,9 @@ void arm_q31_to_float(
  int32x4_t inV;
  float32x4_t outV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 2U;

-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
--- a/Source/SupportFunctions/arm_q7_to_float.c
+++ b/Source/SupportFunctions/arm_q7_to_float.c
@ -69,10 +69,9 @@ void arm_q7_to_float(
  int32x4_t inVLL, inVLH, inVHL, inVHH;
  float32x4_t outV;

-  /* Loop unrolling */
  blkCnt = blockSize >> 4U;

-  /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.
+  /* Compute 16 outputs at a time.
   ** a second loop below computes the remaining 1 to 15 samples. */
  while (blkCnt > 0U)
  {