diff --git a/Include/dsp/matrix_utils.h b/Include/dsp/matrix_utils.h
index 92ed2793..4e1defa8 100755
--- a/Include/dsp/matrix_utils.h
+++ b/Include/dsp/matrix_utils.h
@@ -42,29 +42,29 @@ extern "C"
 
 #define SCALE_COL_T(T,CAST,A,ROW,v,i)        \
 {                                       \
-  int32_t w;                            \
+  int32_t _w;                            \
   T *data = (A)->pData;                 \
-  const int32_t numCols = (A)->numCols; \
+  const int32_t _numCols = (A)->numCols; \
   const int32_t nb = (A)->numRows - ROW;\
                                         \
-  data += i + numCols * (ROW);          \
+  data += i + _numCols * (ROW);          \
                                         \
-  for(w=0;w < nb; w++)                  \
+  for(_w=0;_w < nb; _w++)                  \
   {                                     \
      *data *= CAST v;                   \
-     data += numCols;                   \
+     data += _numCols;                   \
   }                                     \
 }
 
 #define COPY_COL_T(T,A,ROW,COL,DST)               \
 {                                                 \
-    uint32_t row;                                 \
-    T *pb=DST;                                    \
-    T *pa = (A)->pData + ROW * (A)->numCols + COL;\
-    for(row = ROW; row < (A)->numRows; row ++)    \
+    uint32_t _row;                                \
+    T *_pb=DST;                                    \
+    T *_pa = (A)->pData + ROW * (A)->numCols + COL;\
+    for(_row = ROW; _row < (A)->numRows; _row ++) \
     {                                             \
-         *pb++ = *pa;                             \
-         pa += (A)->numCols;                      \
+         *_pb++ = *_pa;                             \
+         _pa += (A)->numCols;                      \
     }                                             \
 }
 
@@ -74,20 +74,20 @@ extern "C"
 #define SWAP_ROWS_F16(A,COL,i,j)                  \
   {                                               \
     int cnt = ((A)->numCols)-(COL);               \
-    int32_t w;                                   \
+    int32_t _w;                                    \
     float16_t *data = (A)->pData;                 \
-    const int32_t numCols = (A)->numCols;        \
+    const int32_t _numCols = (A)->numCols;        \
                                                   \
-    for(w=(COL);w < numCols; w+=8)                \
+    for(_w=(COL);_w < _numCols; _w+=8)               \
     {                                             \
        f16x8_t tmpa,tmpb;                         \
        mve_pred16_t p0 = vctp16q(cnt);            \
                                                   \
-       tmpa=vldrhq_z_f16(&data[i*numCols + w],p0);\
-       tmpb=vldrhq_z_f16(&data[j*numCols + w],p0);\
+       tmpa=vldrhq_z_f16(&data[i*_numCols + _w],p0);\
+       tmpb=vldrhq_z_f16(&data[j*_numCols + _w],p0);\
                                                   \
-       vstrhq_p(&data[i*numCols + w], tmpb, p0);  \
-       vstrhq_p(&data[j*numCols + w], tmpa, p0);  \
+       vstrhq_p(&data[i*_numCols + _w], tmpb, p0);  \
+       vstrhq_p(&data[j*_numCols + _w], tmpa, p0);  \
                                                   \
        cnt -= 8;                                  \
     }                                             \
@@ -96,17 +96,17 @@ extern "C"
 #define SCALE_ROW_F16(A,COL,v,i)                   \
 {                                                   \
   int cnt = ((A)->numCols)-(COL);                   \
-  int32_t w;                                       \
+  int32_t _w;                                       \
   float16_t *data = (A)->pData;                     \
-  const int32_t numCols = (A)->numCols;            \
+  const int32_t _numCols = (A)->numCols;            \
                                                     \
-  for(w=(COL);w < numCols; w+=8)                    \
+  for(_w=(COL);_w < _numCols; _w+=8)                    \
   {                                                 \
        f16x8_t tmpa;                                \
        mve_pred16_t p0 = vctp16q(cnt);              \
-       tmpa = vldrhq_z_f16(&data[i*numCols + w],p0);\
+       tmpa = vldrhq_z_f16(&data[i*_numCols + _w],p0);\
        tmpa = vmulq_n_f16(tmpa,(_Float16)v);                  \
-       vstrhq_p(&data[i*numCols + w], tmpa, p0);    \
+       vstrhq_p(&data[i*_numCols + _w], tmpa, p0);    \
        cnt -= 8;                                    \
   }                                                 \
                                                     \
@@ -115,19 +115,19 @@ extern "C"
 #define MAC_ROW_F16(COL,A,i,v,B,j)                   \
 {                                                    \
   int cnt = ((A)->numCols)-(COL);                    \
-  int32_t w;                                        \
+  int32_t _w;                                        \
   float16_t *dataA = (A)->pData;                     \
   float16_t *dataB = (B)->pData;                     \
-  const int32_t numCols = (A)->numCols;             \
+  const int32_t _numCols = (A)->numCols;             \
                                                      \
-  for(w=(COL);w < numCols; w+=8)                     \
+  for(_w=(COL);_w < _numCols; _w+=8)                     \
   {                                                  \
        f16x8_t tmpa,tmpb;                            \
        mve_pred16_t p0 = vctp16q(cnt);               \
-       tmpa = vldrhq_z_f16(&dataA[i*numCols + w],p0);\
-       tmpb = vldrhq_z_f16(&dataB[j*numCols + w],p0);\
+       tmpa = vldrhq_z_f16(&dataA[i*_numCols + _w],p0);\
+       tmpb = vldrhq_z_f16(&dataB[j*_numCols + _w],p0);\
        tmpa = vfmaq_n_f16(tmpa,tmpb,v);              \
-       vstrhq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       vstrhq_p(&dataA[i*_numCols + _w], tmpa, p0);    \
        cnt -= 8;                                     \
   }                                                  \
                                                      \
@@ -136,20 +136,20 @@ extern "C"
 #define MAS_ROW_F16(COL,A,i,v,B,j)                   \
 {                                                    \
   int cnt = ((A)->numCols)-(COL);                    \
-  int32_t w;                                        \
+  int32_t _w;                                        \
   float16_t *dataA = (A)->pData;                     \
   float16_t *dataB = (B)->pData;                     \
-  const int32_t numCols = (A)->numCols;             \
+  const int32_t _numCols = (A)->numCols;             \
   f16x8_t vec=vdupq_n_f16(v);                        \
                                                      \
-  for(w=(COL);w < numCols; w+=8)                     \
+  for(_w=(COL);_w < _numCols; _w+=8)                     \
   {                                                  \
        f16x8_t tmpa,tmpb;                            \
        mve_pred16_t p0 = vctp16q(cnt);               \
-       tmpa = vldrhq_z_f16(&dataA[i*numCols + w],p0);\
-       tmpb = vldrhq_z_f16(&dataB[j*numCols + w],p0);\
+       tmpa = vldrhq_z_f16(&dataA[i*_numCols + _w],p0);\
+       tmpb = vldrhq_z_f16(&dataB[j*_numCols + _w],p0);\
        tmpa = vfmsq_f16(tmpa,tmpb,vec);              \
-       vstrhq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       vstrhq_p(&dataA[i*_numCols + _w], tmpa, p0);    \
        cnt -= 8;                                     \
   }                                                  \
                                                      \
@@ -160,16 +160,16 @@ extern "C"
 
 #define SWAP_ROWS_F16(A,COL,i,j)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float16_t *dataI = (A)->pData;       \
   float16_t *dataJ = (A)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);    \
                                        \
-  dataI += i*numCols + (COL);          \
-  dataJ += j*numCols + (COL);          \
+  dataI += i*_numCols + (COL);          \
+  dataJ += j*_numCols + (COL);          \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      float16_t tmp;                    \
      tmp = *dataI;                     \
@@ -180,14 +180,14 @@ extern "C"
 
 #define SCALE_ROW_F16(A,COL,v,i)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float16_t *data = (A)->pData;        \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);    \
                                        \
-  data += i*numCols + (COL);           \
+  data += i*_numCols + (COL);           \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      *data++ *= (_Float16)v;           \
   }                                    \
@@ -196,16 +196,16 @@ extern "C"
 
 #define MAC_ROW_F16(COL,A,i,v,B,j)                \
 {                                                 \
-  int32_t w;                                      \
+  int32_t _w;                                      \
   float16_t *dataA = (A)->pData;                  \
   float16_t *dataB = (B)->pData;                  \
-  const int32_t numCols = (A)->numCols;           \
-  const int32_t nb = numCols-(COL);               \
+  const int32_t _numCols = (A)->numCols;           \
+  const int32_t nb = _numCols-(COL);               \
                                                   \
-  dataA += i*numCols + (COL);                     \
-  dataB += j*numCols + (COL);                     \
+  dataA += i*_numCols + (COL);                     \
+  dataB += j*_numCols + (COL);                     \
                                                   \
-  for(w=0;w < nb; w++)                            \
+  for(_w=0;_w < nb; _w++)                            \
   {                                               \
      *dataA++ += (_Float16)v * (_Float16)*dataB++;\
   }                                               \
@@ -213,16 +213,16 @@ extern "C"
 
 #define MAS_ROW_F16(COL,A,i,v,B,j)                \
 {                                                 \
-  int32_t w;                                      \
+  int32_t _w;                                      \
   float16_t *dataA = (A)->pData;                  \
   float16_t *dataB = (B)->pData;                  \
-  const int32_t numCols = (A)->numCols;           \
-  const int32_t nb = numCols-(COL);               \
+  const int32_t _numCols = (A)->numCols;           \
+  const int32_t nb = _numCols-(COL);               \
                                                   \
-  dataA += i*numCols + (COL);                     \
-  dataB += j*numCols + (COL);                     \
+  dataA += i*_numCols + (COL);                     \
+  dataB += j*_numCols + (COL);                     \
                                                   \
-  for(w=0;w < nb; w++)                            \
+  for(_w=0;_w < nb; _w++)                            \
   {                                               \
      *dataA++ -= (_Float16)v * (_Float16)*dataB++;\
   }                                               \
@@ -245,19 +245,19 @@ extern "C"
   {                                               \
     int cnt = ((A)->numCols)-(COL);               \
     float32_t *data = (A)->pData;                 \
-    const int32_t numCols = (A)->numCols;        \
-    int32_t w;                                   \
+    const int32_t _numCols = (A)->numCols;        \
+    int32_t _w;                                   \
                                                   \
-    for(w=(COL);w < numCols; w+=4)                \
+    for(_w=(COL);_w < _numCols; _w+=4)                \
     {                                             \
        f32x4_t tmpa,tmpb;                         \
        mve_pred16_t p0 = vctp32q(cnt);            \
                                                   \
-       tmpa=vldrwq_z_f32(&data[i*numCols + w],p0);\
-       tmpb=vldrwq_z_f32(&data[j*numCols + w],p0);\
+       tmpa=vldrwq_z_f32(&data[i*_numCols + _w],p0);\
+       tmpb=vldrwq_z_f32(&data[j*_numCols + _w],p0);\
                                                   \
-       vstrwq_p(&data[i*numCols + w], tmpb, p0);  \
-       vstrwq_p(&data[j*numCols + w], tmpa, p0);  \
+       vstrwq_p(&data[i*_numCols + _w], tmpb, p0);  \
+       vstrwq_p(&data[j*_numCols + _w], tmpa, p0);  \
                                                   \
        cnt -= 4;                                  \
     }                                             \
@@ -268,17 +268,17 @@ extern "C"
   int cnt = ((A)->numCols)-(COL);                    \
   float32_t *dataA = (A)->pData;                     \
   float32_t *dataB = (B)->pData;                     \
-  const int32_t numCols = (A)->numCols;             \
-  int32_t w;                                        \
+  const int32_t _numCols = (A)->numCols;             \
+  int32_t _w;                                        \
                                                      \
-  for(w=(COL);w < numCols; w+=4)                     \
+  for(_w=(COL);_w < _numCols; _w+=4)                     \
   {                                                  \
        f32x4_t tmpa,tmpb;                            \
        mve_pred16_t p0 = vctp32q(cnt);               \
-       tmpa = vldrwq_z_f32(&dataA[i*numCols + w],p0);\
-       tmpb = vldrwq_z_f32(&dataB[j*numCols + w],p0);\
+       tmpa = vldrwq_z_f32(&dataA[i*_numCols + _w],p0);\
+       tmpb = vldrwq_z_f32(&dataB[j*_numCols + _w],p0);\
        tmpa = vfmaq_n_f32(tmpa,tmpb,v);              \
-       vstrwq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       vstrwq_p(&dataA[i*_numCols + _w], tmpa, p0);    \
        cnt -= 4;                                     \
   }                                                  \
                                                      \
@@ -289,18 +289,18 @@ extern "C"
   int cnt = ((A)->numCols)-(COL);                    \
   float32_t *dataA = (A)->pData;                     \
   float32_t *dataB = (B)->pData;                     \
-  const int32_t numCols = (A)->numCols;             \
-  int32_t w;                                        \
+  const int32_t _numCols = (A)->numCols;             \
+  int32_t _w;                                        \
   f32x4_t vec=vdupq_n_f32(v);                        \
                                                      \
-  for(w=(COL);w < numCols; w+=4)                     \
+  for(_w=(COL);_w < _numCols; _w+=4)                     \
   {                                                  \
        f32x4_t tmpa,tmpb;                            \
        mve_pred16_t p0 = vctp32q(cnt);               \
-       tmpa = vldrwq_z_f32(&dataA[i*numCols + w],p0);\
-       tmpb = vldrwq_z_f32(&dataB[j*numCols + w],p0);\
+       tmpa = vldrwq_z_f32(&dataA[i*_numCols + _w],p0);\
+       tmpb = vldrwq_z_f32(&dataB[j*_numCols + _w],p0);\
        tmpa = vfmsq_f32(tmpa,tmpb,vec);              \
-       vstrwq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       vstrwq_p(&dataA[i*_numCols + _w], tmpa, p0);    \
        cnt -= 4;                                     \
   }                                                  \
                                                      \
@@ -310,16 +310,16 @@ extern "C"
 {                                                   \
   int cnt = ((A)->numCols)-(COL);                   \
   float32_t *data = (A)->pData;                     \
-  const int32_t numCols = (A)->numCols;            \
-  int32_t w;                                       \
+  const int32_t _numCols = (A)->numCols;            \
+  int32_t _w;                                       \
                                                     \
-  for(w=(COL);w < numCols; w+=4)                    \
+  for(_w=(COL);_w < _numCols; _w+=4)                    \
   {                                                 \
        f32x4_t tmpa;                                \
        mve_pred16_t p0 = vctp32q(cnt);              \
-       tmpa = vldrwq_z_f32(&data[i*numCols + w],p0);\
+       tmpa = vldrwq_z_f32(&data[i*_numCols + _w],p0);\
        tmpa = vmulq_n_f32(tmpa,v);                  \
-       vstrwq_p(&data[i*numCols + w], tmpa, p0);    \
+       vstrwq_p(&data[i*_numCols + _w], tmpa, p0);    \
        cnt -= 4;                                    \
   }                                                 \
                                                     \
@@ -329,18 +329,18 @@ extern "C"
 
 #define SWAP_ROWS_F32(A,COL,i,j)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float32_t *dataI = (A)->pData;       \
   float32_t *dataJ = (A)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols - COL;    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols - COL;    \
                                        \
-  dataI += i*numCols + (COL);          \
-  dataJ += j*numCols + (COL);          \
+  dataI += i*_numCols + (COL);          \
+  dataJ += j*_numCols + (COL);          \
                                        \
   float32_t tmp;                       \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      tmp = *dataI;                     \
      *dataI++ = *dataJ;                \
@@ -352,15 +352,15 @@ extern "C"
 {                                      \
   float32_t *dataA = (A)->pData;       \
   float32_t *dataB = (B)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols - (COL);  \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols - (COL);  \
   int32_t nbElems;                     \
   f32x4_t vec = vdupq_n_f32(v);        \
                                        \
   nbElems = nb >> 2;                   \
                                        \
-  dataA += i*numCols + (COL);          \
-  dataB += j*numCols + (COL);          \
+  dataA += i*_numCols + (COL);          \
+  dataB += j*_numCols + (COL);          \
                                        \
   while(nbElems>0)                     \
   {                                    \
@@ -386,15 +386,15 @@ extern "C"
 {                                      \
   float32_t *dataA = (A)->pData;       \
   float32_t *dataB = (B)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols - (COL);  \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols - (COL);  \
   int32_t nbElems;                     \
   f32x4_t vec = vdupq_n_f32(v);        \
                                        \
   nbElems = nb >> 2;                   \
                                        \
-  dataA += i*numCols + (COL);          \
-  dataB += j*numCols + (COL);          \
+  dataA += i*_numCols + (COL);          \
+  dataB += j*_numCols + (COL);          \
                                        \
   while(nbElems>0)                     \
   {                                    \
@@ -419,14 +419,14 @@ extern "C"
 #define SCALE_ROW_F32(A,COL,v,i)        \
 {                                       \
   float32_t *data = (A)->pData;         \
-  const int32_t numCols = (A)->numCols; \
-  const int32_t nb = numCols - (COL);   \
+  const int32_t _numCols = (A)->numCols; \
+  const int32_t nb = _numCols - (COL);   \
   int32_t nbElems;                      \
   f32x4_t vec = vdupq_n_f32(v);         \
                                         \
   nbElems = nb >> 2;                    \
                                         \
-  data += i*numCols + (COL);            \
+  data += i*_numCols + (COL);            \
   while(nbElems>0)                      \
   {                                     \
        f32x4_t tmpa;                    \
@@ -450,18 +450,18 @@ extern "C"
 
 #define SWAP_ROWS_F32(A,COL,i,j)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float32_t tmp;                       \
   float32_t *dataI = (A)->pData;       \
   float32_t *dataJ = (A)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols - COL;    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols - COL;    \
                                        \
-  dataI += i*numCols + (COL);          \
-  dataJ += j*numCols + (COL);          \
+  dataI += i*_numCols + (COL);          \
+  dataJ += j*_numCols + (COL);          \
                                        \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      tmp = *dataI;                     \
      *dataI++ = *dataJ;                \
@@ -471,14 +471,14 @@ extern "C"
 
 #define SCALE_ROW_F32(A,COL,v,i)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float32_t *data = (A)->pData;        \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols - COL;    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols - COL;    \
                                        \
-  data += i*numCols + (COL);           \
+  data += i*_numCols + (COL);           \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      *data++ *= v;                     \
   }                                    \
@@ -487,16 +487,16 @@ extern "C"
 
 #define MAC_ROW_F32(COL,A,i,v,B,j)     \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float32_t *dataA = (A)->pData;       \
   float32_t *dataB = (B)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);    \
                                        \
-  dataA = dataA + i*numCols + (COL);   \
-  dataB = dataB + j*numCols + (COL);   \
+  dataA = dataA + i*_numCols + (COL);   \
+  dataB = dataB + j*_numCols + (COL);   \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      *dataA++ += v* *dataB++;          \
   }                                    \
@@ -504,16 +504,16 @@ extern "C"
 
 #define MAS_ROW_F32(COL,A,i,v,B,j)     \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float32_t *dataA = (A)->pData;       \
   float32_t *dataB = (B)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);    \
                                        \
-  dataA = dataA + i*numCols + (COL);   \
-  dataB = dataB + j*numCols + (COL);   \
+  dataA = dataA + i*_numCols + (COL);   \
+  dataB = dataB + j*_numCols + (COL);   \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      *dataA++ -= v* *dataB++;          \
   }                                    \
@@ -522,7 +522,7 @@ extern "C"
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 
-/* Functions with only a scalar version */
+/* Functions with only a scalar version */
 
 #define COPY_COL_F32(A,ROW,COL,DST) \
   COPY_COL_T(float32_t,A,ROW,COL,DST)
@@ -532,15 +532,15 @@ extern "C"
 
 #define SWAP_COLS_F32(A,COL,i,j)               \
 {                                              \
-  int32_t w;                                  \
+  int32_t _w;                                  \
   float32_t *data = (A)->pData;                \
-  const int32_t numCols = (A)->numCols;       \
-  for(w=(COL);w < numCols; w++)                \
+  const int32_t _numCols = (A)->numCols;       \
+  for(_w=(COL);_w < _numCols; _w++)                \
   {                                            \
      float32_t tmp;                            \
-     tmp = data[w*numCols + i];                \
-     data[w*numCols + i] = data[w*numCols + j];\
-     data[w*numCols + j] = tmp;                \
+     tmp = data[_w*_numCols + i];                \
+     data[_w*_numCols + i] = data[_w*_numCols + j];\
+     data[_w*_numCols + j] = tmp;                \
   }                                            \
 }
 
@@ -549,16 +549,16 @@ extern "C"
 
 #define SWAP_ROWS_F64(A,COL,i,j)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float64_t *dataI = (A)->pData;       \
   float64_t *dataJ = (A)->pData;       \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);    \
                                        \
-  dataI += i*numCols + (COL);          \
-  dataJ += j*numCols + (COL);          \
+  dataI += i*_numCols + (COL);          \
+  dataJ += j*_numCols + (COL);          \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      float64_t tmp;                    \
      tmp = *dataI;                     \
@@ -569,28 +569,28 @@ extern "C"
 
 #define SWAP_COLS_F64(A,COL,i,j)               \
 {                                              \
-  int32_t w;                                  \
+  int32_t _w;                                  \
   float64_t *data = (A)->pData;                \
-  const int32_t numCols = (A)->numCols;       \
-  for(w=(COL);w < numCols; w++)                \
+  const int32_t _numCols = (A)->numCols;       \
+  for(_w=(COL);_w < _numCols; _w++)                \
   {                                            \
      float64_t tmp;                            \
-     tmp = data[w*numCols + i];                \
-     data[w*numCols + i] = data[w*numCols + j];\
-     data[w*numCols + j] = tmp;                \
+     tmp = data[_w*_numCols + i];                \
+     data[_w*_numCols + i] = data[_w*_numCols + j];\
+     data[_w*_numCols + j] = tmp;                \
   }                                            \
 }
 
 #define SCALE_ROW_F64(A,COL,v,i)       \
 {                                      \
-  int32_t w;                           \
+  int32_t _w;                           \
   float64_t *data = (A)->pData;        \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);    \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);    \
                                        \
-  data += i*numCols + (COL);           \
+  data += i*_numCols + (COL);           \
                                        \
-  for(w=0;w < nb; w++)                 \
+  for(_w=0;_w < nb; _w++)                 \
   {                                    \
      *data++ *= v;                     \
   }                                    \
@@ -601,16 +601,16 @@ extern "C"
 
 #define MAC_ROW_F64(COL,A,i,v,B,j)      \
 {                                       \
-  int32_t w;                           \
+  int32_t _w;                           \
   float64_t *dataA = (A)->pData;        \
   float64_t *dataB = (B)->pData;        \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);     \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);     \
                                         \
-  dataA += i*numCols + (COL);           \
-  dataB += j*numCols + (COL);           \
+  dataA += i*_numCols + (COL);           \
+  dataB += j*_numCols + (COL);           \
                                         \
-  for(w=0;w < nb; w++)                  \
+  for(_w=0;_w < nb; _w++)                  \
   {                                     \
      *dataA++ += v* *dataB++;           \
   }                                     \
@@ -618,16 +618,16 @@ extern "C"
 
 #define MAS_ROW_F64(COL,A,i,v,B,j)      \
 {                                       \
-  int32_t w;                           \
+  int32_t _w;                           \
   float64_t *dataA = (A)->pData;        \
   float64_t *dataB = (B)->pData;        \
-  const int32_t numCols = (A)->numCols;\
-  const int32_t nb = numCols-(COL);     \
+  const int32_t _numCols = (A)->numCols;\
+  const int32_t nb = _numCols-(COL);     \
                                         \
-  dataA += i*numCols + (COL);           \
-  dataB += j*numCols + (COL);           \
+  dataA += i*_numCols + (COL);           \
+  dataB += j*_numCols + (COL);           \
                                         \
-  for(w=0;w < nb; w++)                  \
+  for(_w=0;_w < nb; _w++)                  \
   {                                     \
      *dataA++ -= v* *dataB++;           \
   }                                     \
diff --git a/Source/DistanceFunctions/arm_cosine_distance_f64.c b/Source/DistanceFunctions/arm_cosine_distance_f64.c
index c7443a6b..6ceffab9 100644
--- a/Source/DistanceFunctions/arm_cosine_distance_f64.c
+++ b/Source/DistanceFunctions/arm_cosine_distance_f64.c
@@ -59,7 +59,7 @@ float64_t arm_cosine_distance_f64(const float64_t *pA,const float64_t *pB, uint3
     arm_dot_prod_f64(pA,pB,blockSize,&dot);
 
     tmp = sqrt(pwra * pwrb);
-    return(1. - dot / tmp);
+    return(1.0L - dot / tmp);
 
 }
 
diff --git a/Source/DistanceFunctions/arm_dtw_distance_f32.c b/Source/DistanceFunctions/arm_dtw_distance_f32.c
index 53e45a09..68ed1106 100644
--- a/Source/DistanceFunctions/arm_dtw_distance_f32.c
+++ b/Source/DistanceFunctions/arm_dtw_distance_f32.c
@@ -28,6 +28,7 @@
  */
 
 #include "dsp/distance_functions.h"
+#include "dsp/matrix_utils.h"
 #include <limits.h>
 #include <math.h>
 
diff --git a/Source/FilteringFunctions/arm_conv_partial_q15.c b/Source/FilteringFunctions/arm_conv_partial_q15.c
index cfab5168..5f587fac 100644
--- a/Source/FilteringFunctions/arm_conv_partial_q15.c
+++ b/Source/FilteringFunctions/arm_conv_partial_q15.c
@@ -453,13 +453,19 @@ arm_status arm_conv_partial_q15(
         }
 
         /* Store the results in the accumulators in the destination buffer. */
-#ifndef ARM_MATH_BIG_ENDIAN
-        write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
-        write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+        {
+          int32_t sat0 = __SSAT((acc0 >> 15), 16);
+          int32_t sat1 = __SSAT((acc1 >> 15), 16);
+          int32_t sat2 = __SSAT((acc2 >> 15), 16);
+          int32_t sat3 = __SSAT((acc3 >> 15), 16);
+#ifndef  ARM_MATH_BIG_ENDIAN
+          write_q15x2_ia (&pOut, __PKHBT(sat0, sat1, 16));
+          write_q15x2_ia (&pOut, __PKHBT(sat2, sat3, 16));
 #else
-        write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
-        write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
-#endif /* #ifndef  ARM_MATH_BIG_ENDIAN */
+          write_q15x2_ia (&pOut, __PKHBT(sat1, sat0, 16));
+          write_q15x2_ia (&pOut, __PKHBT(sat3, sat2, 16));
+#endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
+        }
 
         /* Increment the pointer pIn1 index, count by 4 */
         count += 4U;
diff --git a/Source/FilteringFunctions/arm_conv_q15.c b/Source/FilteringFunctions/arm_conv_q15.c
index 0a28b959..ede1ce34 100644
--- a/Source/FilteringFunctions/arm_conv_q15.c
+++ b/Source/FilteringFunctions/arm_conv_q15.c
@@ -586,14 +586,19 @@ void arm_conv_q15(
       }
 
       /* Store the result in the accumulator in the destination buffer. */
+      {
+        int32_t sat0 = __SSAT((acc0 >> 15), 16);
+        int32_t sat1 = __SSAT((acc1 >> 15), 16);
+        int32_t sat2 = __SSAT((acc2 >> 15), 16);
+        int32_t sat3 = __SSAT((acc3 >> 15), 16);
 #ifndef  ARM_MATH_BIG_ENDIAN
-      write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
-      write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+        write_q15x2_ia (&pOut, __PKHBT(sat0, sat1, 16));
+        write_q15x2_ia (&pOut, __PKHBT(sat2, sat3, 16));
 #else
-      write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
-      write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+        write_q15x2_ia (&pOut, __PKHBT(sat1, sat0, 16));
+        write_q15x2_ia (&pOut, __PKHBT(sat3, sat2, 16));
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
-
+      }
       /* Increment the pointer pIn1 index, count by 4 */
       count += 4U;
 
diff --git a/Source/MatrixFunctions/arm_householder_f64.c b/Source/MatrixFunctions/arm_householder_f64.c
index be4f3e4c..8c04a908 100644
--- a/Source/MatrixFunctions/arm_householder_f64.c
+++ b/Source/MatrixFunctions/arm_householder_f64.c
@@ -93,12 +93,12 @@ float64_t arm_householder_f64(
     beta =  alpha * alpha + x1norm2;
     beta=sqrt(beta);
 
-    if (alpha > 0.0)
+    if (alpha > 0.0L)
     {
       beta = -beta;
     }
 
-    r = 1.0 / (alpha -beta);
+    r = 1.0L / (alpha -beta);
     arm_scale_f64(pOut,r,pOut,blockSize);
     pOut[0] = 1.0;
 
diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
index 0e284c21..4404dc42 100755
--- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -192,12 +192,12 @@ arm_status arm_mat_cholesky_f64(
                 pG[j * n + i] -= sum;
             }
             
-            if (pG[i * n + i] <= 0.0)
+            if (pG[i * n + i] <= 0.0L)
             {
                 return(ARM_MATH_DECOMPOSITION_FAILURE);
             }
             
-            invSqrtVj = 1.0/sqrt(pG[i * n + i]);
+            invSqrtVj = 1.0L/sqrt(pG[i * n + i]);
             SCALE_COL_F64(pDst,i,invSqrtVj,i);
         }
         
@@ -254,12 +254,12 @@ arm_status arm_mat_cholesky_f64(
                 }
             }
             
-            if (pG[i * n + i] <= 0.0)
+            if (pG[i * n + i] <= 0.0L)
             {
                 return(ARM_MATH_DECOMPOSITION_FAILURE);
             }
             
-            invSqrtVj = 1.0/sqrt(pG[i * n + i]);
+            invSqrtVj = 1.0L/sqrt(pG[i * n + i]);
             SCALE_COL_F64(pDst,i,invSqrtVj,i);
             
         }
diff --git a/Source/MatrixFunctions/arm_mat_inverse_f64.c b/Source/MatrixFunctions/arm_mat_inverse_f64.c
index f41356f2..ff073ed1 100644
--- a/Source/MatrixFunctions/arm_mat_inverse_f64.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f64.c
@@ -61,7 +61,7 @@ arm_status arm_mat_inverse_f64(
   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
 
 
-  float64_t pivot = 0.0, newPivot=0.0;                /* Temporary input values  */
+  float64_t pivot = 0.0L, newPivot=0.0L;                /* Temporary input values  */
   uint32_t selectedRow,pivotRow,i, rowNb, rowCnt, flag = 0U, j,column;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
@@ -182,7 +182,7 @@ arm_status arm_mat_inverse_f64(
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
-      if ((pivot != 0.0) && (selectedRow != column))
+      if ((pivot != 0.0L) && (selectedRow != column))
       {
             /* Loop over number of columns
              * to the right of the pilot element */
@@ -198,14 +198,14 @@ arm_status arm_mat_inverse_f64(
 
 
       /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (pivot == 0.0))
+      if ((flag != 1U) && (pivot == 0.0L))
       {
         return ARM_MATH_SINGULAR;
       }
 
      
       /* Pivot element of the row */
-      pivot = 1.0 / pivot;
+      pivot = 1.0L / pivot;
 
       SCALE_ROW_F64(pSrc,column,pivot,pivotRow);
       SCALE_ROW_F64(pDst,0,pivot,pivotRow);
@@ -241,12 +241,12 @@ arm_status arm_mat_inverse_f64(
     /* Set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
 
-    if ((flag != 1U) && (pivot == 0.0))
+    if ((flag != 1U) && (pivot == 0.0L))
     {
       pIn = pSrc->pData;
       for (i = 0; i < numRows * numCols; i++)
       {
-        if (pIn[i] != 0.0)
+        if (pIn[i] != 0.0L)
             break;
       }
 
diff --git a/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
index dae2ee0b..435b27f7 100755
--- a/Source/MatrixFunctions/arm_mat_ldlt_f32.c
+++ b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -365,7 +365,6 @@ arm_status arm_mat_ldlt_f32(
 
 
         int r;
-        int w;
 
         for(r=k;r<n;r++)
         {
@@ -394,7 +393,7 @@ arm_status arm_mat_ldlt_f32(
             break;
         }
 
-        for(w=k+1;w<n;w++)
+        for(int w=k+1;w<n;w++)
         {
           int x;
           for(x=k+1;x<n;x++)
@@ -403,7 +402,7 @@ arm_status arm_mat_ldlt_f32(
           }
         }
 
-        for(w=k+1;w<n;w++)
+        for(int w=k+1;w<n;w++)
         {
                pA[w*n+k] = pA[w*n+k] / a;
         }
diff --git a/Source/MatrixFunctions/arm_mat_ldlt_f64.c b/Source/MatrixFunctions/arm_mat_ldlt_f64.c
index 46432718..d2ac547e 100755
--- a/Source/MatrixFunctions/arm_mat_ldlt_f64.c
+++ b/Source/MatrixFunctions/arm_mat_ldlt_f64.c
@@ -102,7 +102,7 @@ arm_status arm_mat_ldlt_f64(
     {
         /* Find pivot */
         float64_t m=F64_MIN,a;
-        int w,r,j=k; 
+        int r,j=k;
 
 
         for(r=k;r<n;r++)
@@ -125,14 +125,14 @@ arm_status arm_mat_ldlt_f64(
 
         a = pA[k*n+k];
 
-        if (fabs(a) < 1.0e-18)
+        if (fabs(a) < 1.0e-18L)
         {
 
             fullRank = 0;
             break;
         }
 
-        for(w=k+1;w<n;w++)
+        for(int w=k+1;w<n;w++)
         {
           int x;
           for(x=k+1;x<n;x++)
@@ -141,7 +141,7 @@ arm_status arm_mat_ldlt_f64(
           }
         }
 
-        for(w=k+1;w<n;w++)
+        for(int w=k+1;w<n;w++)
         {
                pA[w*n+k] = pA[w*n+k] / a;
         }
diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
index cd71243b..fbd8f5ae 100755
--- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@@ -108,7 +108,7 @@ arm_status arm_mat_solve_lower_triangular_f64(
                     vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
                 }
                 
-                if (pLT[n*i + i]==0.0)
+                if (pLT[n*i + i]==0.0L)
                 {
                     return(ARM_MATH_SINGULAR);
                 }
@@ -131,7 +131,7 @@ arm_status arm_mat_solve_lower_triangular_f64(
                     tmp -= lt_row[k] * pX[cols*k+j];
                 }
                 
-                if (lt_row[i]==0.0)
+                if (lt_row[i]==0.0L)
                 {
                     return(ARM_MATH_SINGULAR);
                 }
@@ -206,7 +206,7 @@ arm_status arm_mat_solve_lower_triangular_f64(
                     tmp -= lt_row[k] * pX[cols*k+j];
                 }
                 
-                if (lt_row[i]==0.0)
+                if (lt_row[i]==0.0L)
                 {
                     return(ARM_MATH_SINGULAR);
                 }
diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
index 3a227ed9..c3ae2b9e 100755
--- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@@ -100,7 +100,7 @@ arm_status arm_mat_solve_upper_triangular_f64(
                     vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
                 }
                 
-                if (pUT[n*i + i]==0.0)
+                if (pUT[n*i + i]==0.0L)
                 {
                     return(ARM_MATH_SINGULAR);
                 }
@@ -125,7 +125,7 @@ arm_status arm_mat_solve_upper_triangular_f64(
                     tmp -= ut_row[k] * pX[cols*k+j];
                 }
                 
-                if (ut_row[i]==0.0)
+                if (ut_row[i]==0.0L)
                 {
                     return(ARM_MATH_SINGULAR);
                 }
@@ -194,7 +194,7 @@ arm_status arm_mat_solve_upper_triangular_f64(
                     tmp -= ut_row[k] * pX[cols*k+j];
                 }
                 
-                if (ut_row[i]==0.0)
+                if (ut_row[i]==0.0L)
                 {
                     return(ARM_MATH_SINGULAR);
                 }
diff --git a/Source/TransformFunctions/arm_cfft_f64.c b/Source/TransformFunctions/arm_cfft_f64.c
index e8c0a6b8..6694db1d 100644
--- a/Source/TransformFunctions/arm_cfft_f64.c
+++ b/Source/TransformFunctions/arm_cfft_f64.c
@@ -297,7 +297,7 @@ void arm_cfft_f64(
 
     if (ifftFlag == 1U)
     {
-        invL = 1.0 / (float64_t)L;
+        invL = 1.0L / (float64_t)L;
         /*  Conjugate and scale output data */
         pSrc = p1;
         for(l=0; l<L; l++)
diff --git a/Source/TransformFunctions/arm_rfft_fast_f64.c b/Source/TransformFunctions/arm_rfft_fast_f64.c
index a1e4ed01..c4cfc177 100755
--- a/Source/TransformFunctions/arm_rfft_fast_f64.c
+++ b/Source/TransformFunctions/arm_rfft_fast_f64.c
@@ -63,8 +63,8 @@ void stage_rfft_f64(
 
    // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
    // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
-   *pOut++ = 0.5 * ( t1a + t1b );
-   *pOut++ = 0.5 * ( t1a - t1b );
+   *pOut++ = 0.5L * ( t1a + t1b );
+   *pOut++ = 0.5L * ( t1a - t1b );
 
    // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
    pB  = p + 2*k;
@@ -105,8 +105,8 @@ void stage_rfft_f64(
       p2 = twR * t1b;
       p3 = twI * t1b;
 
-      *pOut++ = 0.5 * (xAR + xBR + p0 + p3 ); //xAR
-      *pOut++ = 0.5 * (xAI - xBI + p1 - p2 ); //xAI
+      *pOut++ = 0.5L * (xAR + xBR + p0 + p3 ); //xAR
+      *pOut++ = 0.5L * (xAI - xBI + p1 - p2 ); //xAI
 
       pA += 2;
       pB -= 2;
@@ -135,8 +135,8 @@ void merge_rfft_f64(
 
    pCoeff += 2 ;
 
-   *pOut++ = 0.5 * ( xAR + xAI );
-   *pOut++ = 0.5 * ( xAR - xAI );
+   *pOut++ = 0.5L * ( xAR + xAI );
+   *pOut++ = 0.5L * ( xAR - xAI );
 
    pB  =  p + 2*k ;
    pA +=  2	   ;
@@ -164,8 +164,8 @@ void merge_rfft_f64(
 
       // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
       // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
-      *pOut++ = 0.5 * (xAR + xBR - r - s ); //xAR
-      *pOut++ = 0.5 * (xAI - xBI + t - u ); //xAI
+      *pOut++ = 0.5L * (xAR + xBR - r - s ); //xAR
+      *pOut++ = 0.5L * (xAI - xBI + t - u ); //xAI
 
       pA += 2;
       pB -= 2;