diff --git a/Source/BasicMathFunctions/arm_abs_f32.c b/Source/BasicMathFunctions/arm_abs_f32.c
index 5e94eddc..a7d2624e 100644
--- a/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/Source/BasicMathFunctions/arm_abs_f32.c
@@ -71,14 +71,14 @@ void arm_abs_f32(
     float32x4_t vec1;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = |A| */
-        /* Calculate absolute values and then store the results in the destination buffer. */
 
+        /* Calculate absolute values and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrc);
         res = vabsq_f32(vec1);
         vst1q_f32(pDst, res);
diff --git a/Source/BasicMathFunctions/arm_add_f32.c b/Source/BasicMathFunctions/arm_add_f32.c
index 521c0021..1c66a24c 100644
--- a/Source/BasicMathFunctions/arm_add_f32.c
+++ b/Source/BasicMathFunctions/arm_add_f32.c
@@ -71,14 +71,14 @@ void arm_add_f32(
     float32x4_t vec2;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = A + B */
-        /* Add and then store the results in the destination buffer. */
 
+        /* Add and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrcA);
         vec2 = vld1q_f32(pSrcB);
         res = vaddq_f32(vec1, vec2);
diff --git a/Source/BasicMathFunctions/arm_dot_prod_f32.c b/Source/BasicMathFunctions/arm_dot_prod_f32.c
index 97f99030..8510c022 100644
--- a/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -74,11 +74,12 @@ void arm_dot_prod_f32(
     float32x4_t res;
     float32x2_t accum = vdup_n_f32(0);
 
-    /* Loop unrolling */
+    /* Process 4 samples (4 multiply-accumulates) at a time */
     blkCnt = blockSize >> 2U;
 
     vec1 = vld1q_f32(pSrcA);
     vec2 = vld1q_f32(pSrcB);
+
     while (blkCnt > 0U)
     {
         /* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
diff --git a/Source/BasicMathFunctions/arm_mult_f32.c b/Source/BasicMathFunctions/arm_mult_f32.c
index b2abdb39..53ad73c8 100644
--- a/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/Source/BasicMathFunctions/arm_mult_f32.c
@@ -71,14 +71,14 @@ void arm_mult_f32(
     float32x4_t vec2;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = A * B */
-        /* Multiply the inputs and then store the results in the destination buffer. */
 
+        /* Multiply the inputs and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrcA);
         vec2 = vld1q_f32(pSrcB);
         res = vmulq_f32(vec1, vec2);
diff --git a/Source/BasicMathFunctions/arm_negate_f32.c b/Source/BasicMathFunctions/arm_negate_f32.c
index 01433653..f807112c 100644
--- a/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/Source/BasicMathFunctions/arm_negate_f32.c
@@ -70,14 +70,14 @@ void arm_negate_f32(
     float32x4_t vec1;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = -A */
-        /* Negate and then store the results in the destination buffer. */
 
+        /* Negate and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrc);
         res = vnegq_f32(vec1);
         vst1q_f32(pDst, res);
diff --git a/Source/BasicMathFunctions/arm_offset_f32.c b/Source/BasicMathFunctions/arm_offset_f32.c
index 288e0f5f..b10e3f1d 100644
--- a/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/Source/BasicMathFunctions/arm_offset_f32.c
@@ -72,14 +72,14 @@ void arm_offset_f32(
     float32x4_t vec1;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = A + offset */
+
         /* Add offset and then store the results in the destination buffer. */
-
         vec1 = vld1q_f32(pSrc);
         res = vaddq_f32(vec1,vdupq_n_f32(offset));
         vst1q_f32(pDst, res);
diff --git a/Source/BasicMathFunctions/arm_scale_f32.c b/Source/BasicMathFunctions/arm_scale_f32.c
index 4bd9e3e4..72ecbe5c 100644
--- a/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/Source/BasicMathFunctions/arm_scale_f32.c
@@ -84,13 +84,14 @@ void arm_scale_f32(
     float32x4_t vec1;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = A * scale */
-        /* Scale the input and then store the results in the destination buffer. */
+
+        /* Scale the input and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrc);
         res = vmulq_f32(vec1, vdupq_n_f32(scale));
         vst1q_f32(pDst, res);
diff --git a/Source/BasicMathFunctions/arm_sub_f32.c b/Source/BasicMathFunctions/arm_sub_f32.c
index 512a4b28..4c97af30 100644
--- a/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/Source/BasicMathFunctions/arm_sub_f32.c
@@ -71,14 +71,14 @@ void arm_sub_f32(
     float32x4_t vec2;
     float32x4_t res;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = blockSize >> 2U;
 
     while (blkCnt > 0U)
     {
         /* C = A - B */
-        /* Subtract and then store the results in the destination buffer. */
 
+        /* Subtract and then store the results in the destination buffer. */
         vec1 = vld1q_f32(pSrcA);
         vec2 = vld1q_f32(pSrcB);
         res = vsubq_f32(vec1, vec2);
diff --git a/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c b/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
index 5259d6ad..df5db003 100644
--- a/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -82,7 +82,7 @@ void arm_cmplx_conj_f32(
 
    zero = vdupq_n_f32(0.0);
 
-   /* Loop unrolling */
+   /* Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;
 
    while (blkCnt > 0U)
diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
index 3fde9050..06f1bfa1 100644
--- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -93,7 +93,7 @@ void arm_cmplx_dot_prod_f32(
     accR = vdupq_n_f32(0.0);
     accI = vdupq_n_f32(0.0);
 
-    /* Loop unrolling */
+    /* Loop unrolling: Compute 8 outputs at a time */
     blkCnt = numSamples >> 3U;
 
     while (blkCnt > 0U)
diff --git a/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
index 8b8f1938..84812dcf 100644
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -89,7 +89,7 @@ void arm_cmplx_mag_f32(
   float32x4_t vImagB;
   float32x4_t vMagSqB;
 
-  /* Loop unrolling */
+  /* Loop unrolling: Compute 8 outputs at a time */
   blkCnt = numSamples >> 3;
 
   while (blkCnt > 0U)
diff --git a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
index 69128c91..99f051c3 100644
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -88,7 +88,7 @@ void arm_cmplx_mag_squared_f32(
   float32x4_t vImagB;
   float32x4_t vMagSqB;
 
-  /* Loop unrolling */
+  /* Loop unrolling: Compute 8 outputs at a time */
   blkCnt = numSamples >> 3;
 
   while (blkCnt > 0U)
diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
index 12b5ca7d..8d148216 100644
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -82,7 +82,7 @@ void arm_cmplx_mult_cmplx_f32(
     float32x4_t real, imag;
     float32x4x2_t outCplx;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = numSamples >> 2U;
 
     while (blkCnt > 0U)
diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
index ff9b112f..9651999e 100644
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -82,7 +82,7 @@ void arm_cmplx_mult_real_f32(
     float32x4_t r;
     float32x4x2_t ab,outCplx;
 
-    /* Loop unrolling */
+    /* Compute 4 outputs at a time */
     blkCnt = numSamples >> 2U;
 
     while (blkCnt > 0U)
diff --git a/Source/FilteringFunctions/arm_conv_f32.c b/Source/FilteringFunctions/arm_conv_f32.c
index cfada7d7..8fa13085 100644
--- a/Source/FilteringFunctions/arm_conv_f32.c
+++ b/Source/FilteringFunctions/arm_conv_f32.c
@@ -206,10 +206,10 @@ void arm_conv_f32(
     res = vdupq_n_f32(0) ;
     accum = vdup_n_f32(0);
 
-    /* Apply loop unrolling and compute 4 MACs simultaneously. */
+    /* Compute 4 MACs simultaneously. */
     k = count >> 2U;
 
-    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+    /* First part of the processing.  Compute 4 MACs at a time.
      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 
     while (k > 0U)
@@ -556,7 +556,7 @@ void arm_conv_f32(
       float32x4_t y = vdupq_n_f32(0) ;
       float32x2_t accum = vdup_n_f32(0) ;
 
-      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+      /* First part of the processing.  Compute 4 MACs at a time.
        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
       while (k > 0U)
       {
diff --git a/Source/FilteringFunctions/arm_correlate_f32.c b/Source/FilteringFunctions/arm_correlate_f32.c
index 41b5b801..60a43f50 100644
--- a/Source/FilteringFunctions/arm_correlate_f32.c
+++ b/Source/FilteringFunctions/arm_correlate_f32.c
@@ -359,7 +359,7 @@ void arm_correlate_f32(
       acc3 = 0.0f;
 
 #if defined(ARM_MATH_NEON)
-      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+      /* Compute 4 MACs simultaneously. */
       k = srcBLen >> 2U;
 
       res = vdupq_n_f32(0) ;
diff --git a/Source/FilteringFunctions/arm_fir_decimate_f32.c b/Source/FilteringFunctions/arm_fir_decimate_f32.c
index d829826a..218ca34f 100644
--- a/Source/FilteringFunctions/arm_fir_decimate_f32.c
+++ b/Source/FilteringFunctions/arm_fir_decimate_f32.c
@@ -187,10 +187,10 @@ void arm_fir_decimate_f32(
     /* Initialize coeff pointer */
     pb = pCoeffs;
 
-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
     tapCnt = numTaps >> 2;
 
-    /* Loop over the number of taps.  Unroll by a factor of 4.
+    /* Loop over the number of taps.
      ** Repeat until we've computed numTaps-4 coefficients. */
 
     while (tapCnt > 0U)
@@ -287,10 +287,10 @@ void arm_fir_decimate_f32(
     /* Initialize coeff pointer */
     pb = pCoeffs;
 
-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
     tapCnt = numTaps >> 2;
 
-    /* Loop over the number of taps.  Unroll by a factor of 4.
+    /* Loop over the number of taps.
      ** Repeat until we've computed numTaps-4 coefficients. */
     while (tapCnt > 0U)
     {
diff --git a/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/Source/FilteringFunctions/arm_fir_interpolate_f32.c
index 44659356..ee0ed270 100644
--- a/Source/FilteringFunctions/arm_fir_interpolate_f32.c
+++ b/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@@ -163,7 +163,7 @@ void arm_fir_interpolate_f32(
   blkCnt = blockSize >> 3;
   blkCntN4 = blockSize & 7;
 
-  /* Samples loop unrolled by 8 */
+  /* Loop unrolling: Process 8 samples at a time */
   while (blkCnt > 0U)
   {
     /* Copy new input samples into the state buffer */
diff --git a/Source/FilteringFunctions/arm_lms_f32.c b/Source/FilteringFunctions/arm_lms_f32.c
index c2594ac8..4fc6e7e2 100644
--- a/Source/FilteringFunctions/arm_lms_f32.c
+++ b/Source/FilteringFunctions/arm_lms_f32.c
@@ -208,7 +208,7 @@ void arm_lms_f32(
     sum = 0.0f;
     sumV = vdupq_n_f32(0.0);
 
-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
     tapCnt = numTaps >> 2;
 
     while (tapCnt > 0U)
@@ -257,7 +257,7 @@ void arm_lms_f32(
     /* Initialize coeff pointer */
     pb = (pCoeffs);
 
-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
     tapCnt = numTaps >> 2;
 
     /* Update filter coefficients */
@@ -305,7 +305,7 @@ void arm_lms_f32(
   /* Points to the start of the pState buffer */
   pStateCurnt = S->pState;
 
-  /* Loop unrolling for (numTaps - 1U) samples copy */
+  /* Copy of (numTaps - 1U) samples: process 4 samples at a time */
   tapCnt = (numTaps - 1U) >> 2U;
 
   /* copy data */
diff --git a/Source/FilteringFunctions/arm_lms_norm_f32.c b/Source/FilteringFunctions/arm_lms_norm_f32.c
index bcdf92ec..28ab04a2 100644
--- a/Source/FilteringFunctions/arm_lms_norm_f32.c
+++ b/Source/FilteringFunctions/arm_lms_norm_f32.c
@@ -216,7 +216,7 @@ void arm_lms_norm_f32(
     sum = 0.0f;
     sumV = vdupq_n_f32(0.0);
 
-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
     tapCnt = numTaps >> 2;
 
     while (tapCnt > 0U)
@@ -265,7 +265,7 @@ void arm_lms_norm_f32(
     /* Initialize coeff pointer */
     pb = (pCoeffs);
 
-    /* Loop unrolling.  Process 4 taps at a time. */
+    /* Process 4 taps at a time. */
     tapCnt = numTaps >> 2;
 
     /* Update filter coefficients */
@@ -317,7 +317,7 @@ void arm_lms_norm_f32(
   /* Points to the start of the pState buffer */
   pStateCurnt = S->pState;
 
-  /* Loop unrolling for (numTaps - 1U)/4 samples copy */
+  /* Copy of (numTaps - 1U) samples: process 4 samples at a time */
   tapCnt = (numTaps - 1U) >> 2U;
 
   /* copy data */
diff --git a/Source/MatrixFunctions/arm_mat_add_f32.c b/Source/MatrixFunctions/arm_mat_add_f32.c
index 0aa8f395..8e1246c0 100644
--- a/Source/MatrixFunctions/arm_mat_add_f32.c
+++ b/Source/MatrixFunctions/arm_mat_add_f32.c
@@ -100,10 +100,9 @@ arm_status arm_mat_add_f32(
     /* Total number of samples in the input matrix */
     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
 
-    /* Loop unrolling */
     blkCnt = numSamples >> 2U;
 
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+    /* Compute 4 outputs at a time.
      ** a second loop below computes the remaining 1 to 3 samples. */
     while (blkCnt > 0U)
     {
diff --git a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
index b31fbb14..8e2af317 100644
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
@@ -149,7 +149,7 @@ arm_status arm_mat_cmplx_mult_f32(
         accR1 = vdupq_n_f32(0.0);
         accI1 = vdupq_n_f32(0.0);
 
-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
         colCnt = numColsA >> 2;
 
         /* Matrix multiplication */
@@ -302,7 +302,7 @@ arm_status arm_mat_cmplx_mult_f32(
         accR0 = vdupq_n_f32(0.0);
         accI0 = vdupq_n_f32(0.0);
 
-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
         colCnt = numColsA >> 2;
 
         /* Matrix multiplication */
diff --git a/Source/MatrixFunctions/arm_mat_inverse_f32.c b/Source/MatrixFunctions/arm_mat_inverse_f32.c
index 2a55964e..d602b98b 100644
--- a/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -109,7 +109,7 @@ arm_status arm_mat_inverse_f32(
    *
    *     1. First combine the identity matrix and the input matrix separated by a bar to form an
    *        augmented matrix as follows:
-   *                _                  _         _         _
+   *              _                  _         _         _
    *             |  a11  a12 | 1   0  |       |  X11 X12  |
    *             |           |        |   =   |           |
    *             |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
@@ -299,7 +299,6 @@ arm_status arm_mat_inverse_f32(
        * to the right of the pilot element */
       j = (numCols - l) >> 2;
 
-      /* Loop unrolling */
       while (j > 0U)
       {
         /* Divide each element of the row of the input matrix
@@ -331,7 +330,6 @@ arm_status arm_mat_inverse_f32(
       /* Loop over number of columns of the destination matrix */
       j = numCols >> 2;
 
-      /* Loop unrolling */
       while (j > 0U)
       {
         /* Divide each element of the row of the destination matrix
@@ -399,7 +397,6 @@ arm_status arm_mat_inverse_f32(
              to replace the elements in the input matrix */
           j = (numCols - l) >> 2;
 	  
-	  /* Loop unrolling */
           while (j > 0U)
           {
             /* Replace the element by the sum of that row
@@ -433,7 +430,6 @@ arm_status arm_mat_inverse_f32(
              replace the elements in the destination matrix */
           j = numCols >> 2;
 
-	  /* Loop unrolling */
           while (j > 0U)
           {
             /* Replace the element by the sum of that row
diff --git a/Source/MatrixFunctions/arm_mat_mult_f32.c b/Source/MatrixFunctions/arm_mat_mult_f32.c
index 0e47035e..ffddf999 100644
--- a/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -169,7 +169,7 @@ arm_status arm_mat_mult_f32(
         acc6 = vdupq_n_f32(0.0);
         acc7 = vdupq_n_f32(0.0);
 
-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
         colCnt = numColsA >> 2U;
 
         /* Matrix multiplication */
@@ -184,7 +184,8 @@ arm_status arm_mat_mult_f32(
           a5V = vld1q_f32(pIn1F); 
           a6V = vld1q_f32(pIn1G); 
           a7V = vld1q_f32(pIn1H); 
-          pIn1 += 4;
+
+	  pIn1 += 4;
           pIn1B += 4;
           pIn1C += 4;
           pIn1D += 4;
@@ -323,7 +324,7 @@ arm_status arm_mat_mult_f32(
 
         acc0 = vdupq_n_f32(0.0);
 
-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Compute 4 MACs simultaneously. */
         colCnt = numColsA >> 2U;
 
         /* Matrix multiplication   */
diff --git a/Source/MatrixFunctions/arm_mat_scale_f32.c b/Source/MatrixFunctions/arm_mat_scale_f32.c
index 4eeedad2..a0097b1a 100644
--- a/Source/MatrixFunctions/arm_mat_scale_f32.c
+++ b/Source/MatrixFunctions/arm_mat_scale_f32.c
@@ -97,10 +97,9 @@ arm_status arm_mat_scale_f32(
     /* Total number of samples in the input matrix */
     numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
 
-    /* Loop unrolling */
     blkCnt = numSamples >> 2;
 
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+    /* Compute 4 outputs at a time.
      ** a second loop below computes the remaining 1 to 3 samples. */
     while (blkCnt > 0U)
     {
diff --git a/Source/MatrixFunctions/arm_mat_sub_f32.c b/Source/MatrixFunctions/arm_mat_sub_f32.c
index 33ac2456..cb576477 100644
--- a/Source/MatrixFunctions/arm_mat_sub_f32.c
+++ b/Source/MatrixFunctions/arm_mat_sub_f32.c
@@ -95,10 +95,9 @@ arm_status arm_mat_sub_f32(
     /* Total number of samples in the input matrix */
     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
 
-    /* Loop Unrolling */
     blkCnt = numSamples >> 2U;
 
-    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+    /* Compute 4 outputs at a time.
      ** a second loop below computes the remaining 1 to 3 samples. */
     while (blkCnt > 0U)
     {
@@ -110,7 +109,7 @@ arm_status arm_mat_sub_f32(
       res = vsubq_f32(vec1, vec2);
       vst1q_f32(pOut, res);
 
-      /* update pointers to process next sampels */
+      /* Update pointers to process next samples */
       pIn1 += 4U;
       pIn2 += 4U;
       pOut += 4U;
diff --git a/Source/MatrixFunctions/arm_mat_trans_f32.c b/Source/MatrixFunctions/arm_mat_trans_f32.c
index 0be38069..71748bf2 100644
--- a/Source/MatrixFunctions/arm_mat_trans_f32.c
+++ b/Source/MatrixFunctions/arm_mat_trans_f32.c
@@ -90,13 +90,12 @@ arm_status arm_mat_trans_f32(
       float32x4_t row0V,row1V,row2V,row3V;
       float32x4x2_t ra0,ra1,rb0,rb1;
 
-      /* Loop Unrolling */
       blkCnt = nColumns >> 2;
 
       /* The pointer px is set to starting address of the column being processed */
       px = pOut + i;
 
-      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+      /* Compute 4 outputs at a time.
        ** a second loop below computes the remaining 1 to 3 samples. */
       while (blkCnt > 0U)        /* Column loop */
       {
diff --git a/Source/StatisticsFunctions/arm_max_f32.c b/Source/StatisticsFunctions/arm_max_f32.c
index 97378ddd..cd54e2a1 100644
--- a/Source/StatisticsFunctions/arm_max_f32.c
+++ b/Source/StatisticsFunctions/arm_max_f32.c
@@ -111,12 +111,11 @@ void arm_max_f32(
       outV = vld1q_f32(pSrc);
       pSrc += 4;
  
-      /* Loop unrolling */
+      /* Compute 4 outputs at a time */
       blkCnt = (blockSize - 4 ) >> 2U;
     
       while (blkCnt > 0U)
       {
-    
         srcV = vld1q_f32(pSrc);
         pSrc += 4;
     
diff --git a/Source/StatisticsFunctions/arm_mean_f32.c b/Source/StatisticsFunctions/arm_mean_f32.c
index 7589d993..63d96525 100644
--- a/Source/StatisticsFunctions/arm_mean_f32.c
+++ b/Source/StatisticsFunctions/arm_mean_f32.c
@@ -72,10 +72,9 @@ void arm_mean_f32(
   float32_t in1, in2, in3, in4;
   float32x4_t inV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/StatisticsFunctions/arm_min_f32.c b/Source/StatisticsFunctions/arm_min_f32.c
index d37d0d8b..6e9ff4b5 100644
--- a/Source/StatisticsFunctions/arm_min_f32.c
+++ b/Source/StatisticsFunctions/arm_min_f32.c
@@ -109,7 +109,7 @@ void arm_min_f32(
       outV = vld1q_f32(pSrc);
       pSrc += 4;
     
-      /* Loop unrolling */
+      /* Compute 4 outputs at a time */
       blkCnt = (blockSize - 4 ) >> 2U;
     
       while (blkCnt > 0U)
diff --git a/Source/StatisticsFunctions/arm_power_f32.c b/Source/StatisticsFunctions/arm_power_f32.c
index a9f9df72..a4825a53 100644
--- a/Source/StatisticsFunctions/arm_power_f32.c
+++ b/Source/StatisticsFunctions/arm_power_f32.c
@@ -71,10 +71,9 @@ void arm_power_f32(
   float32x2_t sumV2;
   float32x4_t inV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/StatisticsFunctions/arm_rms_f32.c b/Source/StatisticsFunctions/arm_rms_f32.c
index 045f8741..45465107 100644
--- a/Source/StatisticsFunctions/arm_rms_f32.c
+++ b/Source/StatisticsFunctions/arm_rms_f32.c
@@ -71,10 +71,9 @@ void arm_rms_f32(
   float32x2_t sumV2;
   float32x4_t inV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/StatisticsFunctions/arm_var_f32.c b/Source/StatisticsFunctions/arm_var_f32.c
index 394df2aa..3c325b13 100644
--- a/Source/StatisticsFunctions/arm_var_f32.c
+++ b/Source/StatisticsFunctions/arm_var_f32.c
@@ -79,10 +79,9 @@ void arm_var_f32(
   arm_mean_f32(pSrc,blockSize,&mean);
   avg = vdupq_n_f32(mean);
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_copy_f32.c b/Source/SupportFunctions/arm_copy_f32.c
index 2bcf9b74..707adc4c 100644
--- a/Source/SupportFunctions/arm_copy_f32.c
+++ b/Source/SupportFunctions/arm_copy_f32.c
@@ -67,10 +67,9 @@ void arm_copy_f32(
 
   float32x4_t inV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_fill_f32.c b/Source/SupportFunctions/arm_fill_f32.c
index e0e65ff0..29f62862 100644
--- a/Source/SupportFunctions/arm_fill_f32.c
+++ b/Source/SupportFunctions/arm_fill_f32.c
@@ -68,11 +68,10 @@ void arm_fill_f32(
 
   float32x4_t inV = vdupq_n_f32(value);
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
- *    ** a second loop below computes the remaining 1 to 3 samples. */
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
     /* C = value */
@@ -85,7 +84,7 @@ void arm_fill_f32(
   }
 
   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
- *    ** No loop unrolling is used. */
+   ** No loop unrolling is used. */
   blkCnt = blockSize & 3;
 
   while (blkCnt > 0U)
diff --git a/Source/SupportFunctions/arm_float_to_q15.c b/Source/SupportFunctions/arm_float_to_q15.c
index 548cb60c..68c1ad09 100644
--- a/Source/SupportFunctions/arm_float_to_q15.c
+++ b/Source/SupportFunctions/arm_float_to_q15.c
@@ -80,10 +80,9 @@ void arm_float_to_q15(
   int32x4_t cvt;
   int16x4_t outV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_float_to_q31.c b/Source/SupportFunctions/arm_float_to_q31.c
index f64b8757..479f8c5b 100644
--- a/Source/SupportFunctions/arm_float_to_q31.c
+++ b/Source/SupportFunctions/arm_float_to_q31.c
@@ -84,10 +84,9 @@ void arm_float_to_q31(
 
   int32x4_t outV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_float_to_q7.c b/Source/SupportFunctions/arm_float_to_q7.c
index ddc472ba..5f2a7eb0 100644
--- a/Source/SupportFunctions/arm_float_to_q7.c
+++ b/Source/SupportFunctions/arm_float_to_q7.c
@@ -82,10 +82,9 @@ void arm_float_to_q7(
   int16x4_t cvt1,cvt2;
   int8x8_t outV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 3U;
 
-  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
+  /* Compute 8 outputs at a time.
    ** a second loop below computes the remaining 1 to 7 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_q15_to_float.c b/Source/SupportFunctions/arm_q15_to_float.c
index 15790b3a..f49d9b77 100644
--- a/Source/SupportFunctions/arm_q15_to_float.c
+++ b/Source/SupportFunctions/arm_q15_to_float.c
@@ -68,10 +68,9 @@ void arm_q15_to_float(
   int32x4_t inV0, inV1;
   float32x4_t outV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 3U;
 
-  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
+  /* Compute 8 outputs at a time.
    ** a second loop below computes the remaining 1 to 7 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_q31_to_float.c b/Source/SupportFunctions/arm_q31_to_float.c
index 932bfb2a..03e7ec6f 100644
--- a/Source/SupportFunctions/arm_q31_to_float.c
+++ b/Source/SupportFunctions/arm_q31_to_float.c
@@ -67,10 +67,9 @@ void arm_q31_to_float(
   int32x4_t inV;
   float32x4_t outV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 2U;
 
-  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+  /* Compute 4 outputs at a time.
    ** a second loop below computes the remaining 1 to 3 samples. */
   while (blkCnt > 0U)
   {
diff --git a/Source/SupportFunctions/arm_q7_to_float.c b/Source/SupportFunctions/arm_q7_to_float.c
index 927b345e..6bd86bfe 100644
--- a/Source/SupportFunctions/arm_q7_to_float.c
+++ b/Source/SupportFunctions/arm_q7_to_float.c
@@ -69,10 +69,9 @@ void arm_q7_to_float(
   int32x4_t inVLL, inVLH, inVHL, inVHH;
   float32x4_t outV;
 
-  /* Loop unrolling */
   blkCnt = blockSize >> 4U;
 
-  /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.
+  /* Compute 16 outputs at a time.
    ** a second loop below computes the remaining 1 to 15 samples. */
   while (blkCnt > 0U)
   {