diff --git a/Include/dsp/matrix_utils.h b/Include/dsp/matrix_utils.h
new file mode 100755
index 00000000..a02cb6d1
--- /dev/null
+++ b/Include/dsp/matrix_utils.h
@@ -0,0 +1,615 @@
+/******************************************************************************
+ * @file     matrix_utils.h
+ * @brief    Public header file for CMSIS DSP Library
+ * @version  V1.11.0
+ * @date     30 May 2022
+ * Target Processor: Cortex-M and Cortex-A cores
+ ******************************************************************************/
+/*
+ * Copyright (c) 2010-2022 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ 
+#ifndef _MATRIX_UTILS_H_
+#define _MATRIX_UTILS_H_
+
+#include "arm_math_types.h"
+#include "arm_math_memory.h"
+
+#include "dsp/none.h"
+#include "dsp/utils.h"
+
+#ifdef   __cplusplus
+extern "C"
+{
+#endif
+
+#define ELEM(A,ROW,COL) &((A)->pData[(A)->numCols* (ROW) + (COL)])
+
+#define SCALE_COL_T(T,CAST,A,ROW,v,i)        \
+{                                       \
+  int32_t w;                            \
+  T *data = (A)->pData;                 \
+  const int32_t numCols = (A)->numCols; \
+  const int32_t nb = (A)->numRows - ROW;\
+                                        \
+  data += i + numCols * (ROW);          \
+                                        \
+  for(w=0;w < nb; w++)                  \
+  {                                     \
+     *data *= CAST v;                   \
+     data += numCols;                   \
+  }                                     \
+}
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define SWAP_ROWS_F16(A,COL,i,j)                  \
+  {                                               \
+    int cnt = ((A)->numCols)-(COL);               \
+    int32_t w;                                   \
+    float16_t *data = (A)->pData;                 \
+    const int32_t numCols = (A)->numCols;        \
+                                                  \
+    for(w=(COL);w < numCols; w+=8)                \
+    {                                             \
+       f16x8_t tmpa,tmpb;                         \
+       mve_pred16_t p0 = vctp16q(cnt);            \
+                                                  \
+       tmpa=vldrhq_z_f16(&data[i*numCols + w],p0);\
+       tmpb=vldrhq_z_f16(&data[j*numCols + w],p0);\
+                                                  \
+       vstrhq_p(&data[i*numCols + w], tmpb, p0);  \
+       vstrhq_p(&data[j*numCols + w], tmpa, p0);  \
+                                                  \
+       cnt -= 8;                                  \
+    }                                             \
+  }
+
+#define SCALE_ROW_F16(A,COL,v,i)                   \
+{                                                   \
+  int cnt = ((A)->numCols)-(COL);                   \
+  int32_t w;                                       \
+  float16_t *data = (A)->pData;                     \
+  const int32_t numCols = (A)->numCols;            \
+                                                    \
+  for(w=(COL);w < numCols; w+=8)                    \
+  {                                                 \
+       f16x8_t tmpa;                                \
+       mve_pred16_t p0 = vctp16q(cnt);              \
+       tmpa = vldrhq_z_f16(&data[i*numCols + w],p0);\
+       tmpa = vmulq_n_f16(tmpa,(_Float16)v);                  \
+       vstrhq_p(&data[i*numCols + w], tmpa, p0);    \
+       cnt -= 8;                                    \
+  }                                                 \
+                                                    \
+}
+
+#define MAC_ROW_F16(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  int32_t w;                                        \
+  float16_t *dataA = (A)->pData;                     \
+  float16_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+                                                     \
+  for(w=(COL);w < numCols; w+=8)                     \
+  {                                                  \
+       f16x8_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp16q(cnt);               \
+       tmpa = vldrhq_z_f16(&dataA[i*numCols + w],p0);\
+       tmpb = vldrhq_z_f16(&dataB[j*numCols + w],p0);\
+       tmpa = vfmaq_n_f16(tmpa,tmpb,v);              \
+       vstrhq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 8;                                     \
+  }                                                  \
+                                                     \
+}
+
+#define MAS_ROW_F16(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  int32_t w;                                        \
+  float16_t *dataA = (A)->pData;                     \
+  float16_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+  f16x8_t vec=vdupq_n_f16(v);                        \
+                                                     \
+  for(w=(COL);w < numCols; w+=8)                     \
+  {                                                  \
+       f16x8_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp16q(cnt);               \
+       tmpa = vldrhq_z_f16(&dataA[i*numCols + w],p0);\
+       tmpb = vldrhq_z_f16(&dataB[j*numCols + w],p0);\
+       tmpa = vfmsq_f16(tmpa,tmpb,vec);              \
+       vstrhq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 8;                                     \
+  }                                                  \
+                                                     \
+}
+
+#else
+
+#define SWAP_ROWS_F16(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float16_t *dataI = (A)->pData;       \
+  float16_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     float16_t tmp;                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define SCALE_ROW_F16(A,COL,v,i)       \
+{                                      \
+  int32_t w;                           \
+  float16_t *data = (A)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  data += i*numCols + (COL);           \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *data++ *= (_Float16)v;           \
+  }                                    \
+}
+
+
+#define MAC_ROW_F16(COL,A,i,v,B,j)                \
+{                                                 \
+  int32_t w;                                      \
+  float16_t *dataA = (A)->pData;                  \
+  float16_t *dataB = (B)->pData;                  \
+  const int32_t numCols = (A)->numCols;           \
+  const int32_t nb = numCols-(COL);               \
+                                                  \
+  dataA += i*numCols + (COL);                     \
+  dataB += j*numCols + (COL);                     \
+                                                  \
+  for(w=0;w < nb; w++)                            \
+  {                                               \
+     *dataA++ += (_Float16)v * (_Float16)*dataB++;\
+  }                                               \
+}
+
+#define MAS_ROW_F16(COL,A,i,v,B,j)                \
+{                                                 \
+  int32_t w;                                      \
+  float16_t *dataA = (A)->pData;                  \
+  float16_t *dataB = (B)->pData;                  \
+  const int32_t numCols = (A)->numCols;           \
+  const int32_t nb = numCols-(COL);               \
+                                                  \
+  dataA += i*numCols + (COL);                     \
+  dataB += j*numCols + (COL);                     \
+                                                  \
+  for(w=0;w < nb; w++)                            \
+  {                                               \
+     *dataA++ -= (_Float16)v * (_Float16)*dataB++;\
+  }                                               \
+}
+
+#endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/
+
+
+#define SCALE_COL_F16(A,ROW,v,i)        \
+  SCALE_COL_T(float16_t,(_Float16),A,ROW,v,i)
+  
+#endif /* defined(ARM_FLOAT16_SUPPORTED)*/
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define SWAP_ROWS_F32(A,COL,i,j)                  \
+  {                                               \
+    int cnt = ((A)->numCols)-(COL);               \
+    float32_t *data = (A)->pData;                 \
+    const int32_t numCols = (A)->numCols;        \
+    int32_t w;                                   \
+                                                  \
+    for(w=(COL);w < numCols; w+=4)                \
+    {                                             \
+       f32x4_t tmpa,tmpb;                         \
+       mve_pred16_t p0 = vctp32q(cnt);            \
+                                                  \
+       tmpa=vldrwq_z_f32(&data[i*numCols + w],p0);\
+       tmpb=vldrwq_z_f32(&data[j*numCols + w],p0);\
+                                                  \
+       vstrwq_p(&data[i*numCols + w], tmpb, p0);  \
+       vstrwq_p(&data[j*numCols + w], tmpa, p0);  \
+                                                  \
+       cnt -= 4;                                  \
+    }                                             \
+  }
+
+#define MAC_ROW_F32(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  float32_t *dataA = (A)->pData;                     \
+  float32_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+  int32_t w;                                        \
+                                                     \
+  for(w=(COL);w < numCols; w+=4)                     \
+  {                                                  \
+       f32x4_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp32q(cnt);               \
+       tmpa = vldrwq_z_f32(&dataA[i*numCols + w],p0);\
+       tmpb = vldrwq_z_f32(&dataB[j*numCols + w],p0);\
+       tmpa = vfmaq_n_f32(tmpa,tmpb,v);              \
+       vstrwq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 4;                                     \
+  }                                                  \
+                                                     \
+}
+
+#define MAS_ROW_F32(COL,A,i,v,B,j)                   \
+{                                                    \
+  int cnt = ((A)->numCols)-(COL);                    \
+  float32_t *dataA = (A)->pData;                     \
+  float32_t *dataB = (B)->pData;                     \
+  const int32_t numCols = (A)->numCols;             \
+  int32_t w;                                        \
+  f32x4_t vec=vdupq_n_f32(v);                        \
+                                                     \
+  for(w=(COL);w < numCols; w+=4)                     \
+  {                                                  \
+       f32x4_t tmpa,tmpb;                            \
+       mve_pred16_t p0 = vctp32q(cnt);               \
+       tmpa = vldrwq_z_f32(&dataA[i*numCols + w],p0);\
+       tmpb = vldrwq_z_f32(&dataB[j*numCols + w],p0);\
+       tmpa = vfmsq_f32(tmpa,tmpb,vec);              \
+       vstrwq_p(&dataA[i*numCols + w], tmpa, p0);    \
+       cnt -= 4;                                     \
+  }                                                  \
+                                                     \
+}
+
+#define SCALE_ROW_F32(A,COL,v,i)                    \
+{                                                   \
+  int cnt = ((A)->numCols)-(COL);                   \
+  float32_t *data = (A)->pData;                     \
+  const int32_t numCols = (A)->numCols;            \
+  int32_t w;                                       \
+                                                    \
+  for(w=(COL);w < numCols; w+=4)                    \
+  {                                                 \
+       f32x4_t tmpa;                                \
+       mve_pred16_t p0 = vctp32q(cnt);              \
+       tmpa = vldrwq_z_f32(&data[i*numCols + w],p0);\
+       tmpa = vmulq_n_f32(tmpa,v);                  \
+       vstrwq_p(&data[i*numCols + w], tmpa, p0);    \
+       cnt -= 4;                                    \
+  }                                                 \
+                                                    \
+}
+
+#elif defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define SWAP_ROWS_F32(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float32_t *dataI = (A)->pData;       \
+  float32_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - COL;    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+  float32_t tmp;                       \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define MAC_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - (COL);  \
+  int32_t nbElems;                     \
+  f32x4_t vec = vdupq_n_f32(v);        \
+                                       \
+  nbElems = nb >> 2;                   \
+                                       \
+  dataA += i*numCols + (COL);          \
+  dataB += j*numCols + (COL);          \
+                                       \
+  while(nbElems>0)                     \
+  {                                    \
+       f32x4_t tmpa,tmpb;              \
+       tmpa = vld1q_f32(dataA,p0);     \
+       tmpb = vld1q_f32(dataB,p0);     \
+       tmpa = vmlaq_f32(tmpa,tmpb,vec);\
+       vst1q_f32(dataA, tmpa, p0);     \
+       nbElems--;                      \
+       dataA += 4;                     \
+       dataB += 4;                     \
+  }                                    \
+                                       \
+  nbElems = nb & 3;                    \
+  while(nbElems > 0)                   \
+  {                                    \
+     *dataA++ += v* *dataB++;          \
+     nbElems--;                        \
+  }                                    \
+}
+
+#define MAS_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - (COL);  \
+  int32_t nbElems;                     \
+  f32x4_t vec = vdupq_n_f32(v);        \
+                                       \
+  nbElems = nb >> 2;                   \
+                                       \
+  dataA += i*numCols + (COL);          \
+  dataB += j*numCols + (COL);          \
+                                       \
+  while(nbElems>0)                     \
+  {                                    \
+       f32x4_t tmpa,tmpb;              \
+       tmpa = vld1q_f32(dataA);        \
+       tmpb = vld1q_f32(dataB);        \
+       tmpa = vmlsq_f32(tmpa,tmpb,vec);\
+       vst1q_f32(dataA, tmpa);         \
+       nbElems--;                      \
+       dataA += 4;                     \
+       dataB += 4;                     \
+  }                                    \
+                                       \
+  nbElems = nb & 3;                    \
+  while(nbElems > 0)                   \
+  {                                    \
+     *dataA++ -= v* *dataB++;          \
+     nbElems--;                        \
+  }                                    \
+}
+
+#define SCALE_ROW_F32(A,COL,v,i)        \
+{                                       \
+  float32_t *data = (A)->pData;         \
+  const int32_t numCols = (A)->numCols; \
+  const int32_t nb = numCols - (COL);   \
+  int32_t nbElems;                      \
+  f32x4_t vec = vdupq_n_f32(v);         \
+                                        \
+  nbElems = nb >> 2;                    \
+                                        \
+  data += i*numCols + (COL);            \
+  while(nbElems>0)                      \
+  {                                     \
+       f32x4_t tmpa;                    \
+       tmpa = vld1q_f32(data);          \
+       tmpa = vmulq_f32(tmpa,vec);      \
+       vst1q_f32(data, tmpa);           \
+       data += 4;                       \
+       nbElems --;                      \
+  }                                     \
+                                        \
+  nbElems = nb & 3;                     \
+  while(nbElems > 0)                    \
+  {                                     \
+     *data++ *= v;                      \
+     nbElems--;                         \
+  }                                     \
+                                        \
+}
+
+#else
+
+#define SWAP_ROWS_F32(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float32_t tmp;                       \
+  float32_t *dataI = (A)->pData;       \
+  float32_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - COL;    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define SCALE_ROW_F32(A,COL,v,i)       \
+{                                      \
+  int32_t w;                           \
+  float32_t *data = (A)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols - COL;    \
+                                       \
+  data += i*numCols + (COL);           \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *data++ *= v;                     \
+  }                                    \
+}
+
+
+#define MAC_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  int32_t w;                           \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataA = dataA + i*numCols + (COL);   \
+  dataB = dataB + j*numCols + (COL);   \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *dataA++ += v* *dataB++;          \
+  }                                    \
+}
+
+#define MAS_ROW_F32(COL,A,i,v,B,j)     \
+{                                      \
+  int32_t w;                           \
+  float32_t *dataA = (A)->pData;       \
+  float32_t *dataB = (B)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataA = dataA + i*numCols + (COL);   \
+  dataB = dataB + j*numCols + (COL);   \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *dataA++ -= v* *dataB++;          \
+  }                                    \
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+#define SWAP_COLS_F32(A,COL,i,j)               \
+{                                              \
+  int32_t w;                                  \
+  float32_t *data = (A)->pData;                \
+  const int32_t numCols = (A)->numCols;       \
+  for(w=(COL);w < numCols; w++)                \
+  {                                            \
+     float32_t tmp;                            \
+     tmp = data[w*numCols + i];                \
+     data[w*numCols + i] = data[w*numCols + j];\
+     data[w*numCols + j] = tmp;                \
+  }                                            \
+}
+
+#define SCALE_COL_F32(A,ROW,v,i)        \
+  SCALE_COL_T(float32_t,,A,ROW,v,i)
+
+#define SWAP_ROWS_F64(A,COL,i,j)       \
+{                                      \
+  int32_t w;                           \
+  float64_t *dataI = (A)->pData;       \
+  float64_t *dataJ = (A)->pData;       \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  dataI += i*numCols + (COL);          \
+  dataJ += j*numCols + (COL);          \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     float64_t tmp;                    \
+     tmp = *dataI;                     \
+     *dataI++ = *dataJ;                \
+     *dataJ++ = tmp;                   \
+  }                                    \
+}
+
+#define SWAP_COLS_F64(A,COL,i,j)               \
+{                                              \
+  int32_t w;                                  \
+  float64_t *data = (A)->pData;                \
+  const int32_t numCols = (A)->numCols;       \
+  for(w=(COL);w < numCols; w++)                \
+  {                                            \
+     float64_t tmp;                            \
+     tmp = data[w*numCols + i];                \
+     data[w*numCols + i] = data[w*numCols + j];\
+     data[w*numCols + j] = tmp;                \
+  }                                            \
+}
+
+#define SCALE_ROW_F64(A,COL,v,i)       \
+{                                      \
+  int32_t w;                           \
+  float64_t *data = (A)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);    \
+                                       \
+  data += i*numCols + (COL);           \
+                                       \
+  for(w=0;w < nb; w++)                 \
+  {                                    \
+     *data++ *= v;                     \
+  }                                    \
+}
+
+#define SCALE_COL_F64(A,ROW,v,i)        \
+  SCALE_COL_T(float64_t,,A,ROW,v,i)
+
+#define MAC_ROW_F64(COL,A,i,v,B,j)      \
+{                                       \
+  int32_t w;                           \
+  float64_t *dataA = (A)->pData;        \
+  float64_t *dataB = (B)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);     \
+                                        \
+  dataA += i*numCols + (COL);           \
+  dataB += j*numCols + (COL);           \
+                                        \
+  for(w=0;w < nb; w++)                  \
+  {                                     \
+     *dataA++ += v* *dataB++;           \
+  }                                     \
+}
+
+#define MAS_ROW_F64(COL,A,i,v,B,j)      \
+{                                       \
+  int32_t w;                           \
+  float64_t *dataA = (A)->pData;        \
+  float64_t *dataB = (B)->pData;        \
+  const int32_t numCols = (A)->numCols;\
+  const int32_t nb = numCols-(COL);     \
+                                        \
+  dataA += i*numCols + (COL);           \
+  dataB += j*numCols + (COL);           \
+                                        \
+  for(w=0;w < nb; w++)                  \
+  {                                     \
+     *dataA++ -= v* *dataB++;           \
+  }                                     \
+}
+
+#ifdef   __cplusplus
+}
+#endif
+
+#endif /* ifndef _MATRIX_UTILS_H_ */
diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f16.c b/Source/MatrixFunctions/arm_mat_cholesky_f16.c
index 9c8acef6..939eae75 100755
--- a/Source/MatrixFunctions/arm_mat_cholesky_f16.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f16.c
@@ -27,6 +27,7 @@
  */
 
 #include "dsp/matrix_functions_f16.h"
+#include "dsp/matrix_utils.h"
 
 #if defined(ARM_FLOAT16_SUPPORTED)
 
@@ -50,7 +51,7 @@
                    - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
    * @par
    * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
-   * The decomposition of A is returning a lower triangular matrix U such that A = U U^t
+   * The decomposition of A is returning a lower triangular matrix U such that A = L L^t
    */
 
 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
@@ -164,10 +165,7 @@ arm_status arm_mat_cholesky_f16(
        }
 
        invSqrtVj = 1.0f16/(_Float16)sqrtf((float32_t)pG[i * n + i]);
-       for(j=i; j < n ; j++)
-       {
-         pG[j * n + i] = (_Float16)pG[j * n + i] * (_Float16)invSqrtVj ;
-       }
+       SCALE_COL_F16(pDst,i,invSqrtVj,i);
     }
 
     status = ARM_MATH_SUCCESS;
@@ -233,10 +231,8 @@ arm_status arm_mat_cholesky_f16(
        because doing it in f16 would not have any impact on the performances.
        */
        invSqrtVj = 1.0f/sqrtf((float32_t)pG[i * n + i]);
-       for(j=i ; j < n ; j++)
-       {
-         pG[j * n + i] = (_Float16)pG[j * n + i] * (_Float16)invSqrtVj ;
-       }
+       SCALE_COL_F16(pDst,i,invSqrtVj,i);
+
     }
 
     status = ARM_MATH_SUCCESS;
diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f32.c b/Source/MatrixFunctions/arm_mat_cholesky_f32.c
index 94b96895..213666e2 100755
--- a/Source/MatrixFunctions/arm_mat_cholesky_f32.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f32.c
@@ -27,6 +27,7 @@
  */
 
 #include "dsp/matrix_functions.h"
+#include "dsp/matrix_utils.h"
 
 /**
   @ingroup groupMatrix
@@ -35,7 +36,7 @@
 /**
   @defgroup MatrixChol Cholesky and LDLT decompositions
 
-  Computes the Cholesky or LDL^t decomposition of a matrix.
+  Computes the Cholesky or LL^t decomposition of a matrix.
 
 
   If the input matrix does not have a decomposition, then the 
@@ -58,7 +59,7 @@
                    - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
    * @par
    * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
-   * The decomposition of A is returning a lower triangular matrix U such that A = U U^t
+   * The decomposition of A is returning a lower triangular matrix L such that A = L L^t
    */
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
@@ -170,10 +171,7 @@ arm_status arm_mat_cholesky_f32(
        }
 
        invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
-       for(j=i; j < n ; j++)
-       {
-         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
-       }
+       SCALE_COL_F32(pDst,i,invSqrtVj,i);
     }
 
     status = ARM_MATH_SUCCESS;
@@ -350,10 +348,7 @@ arm_status arm_mat_cholesky_f32(
        }
 
        invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
-       for(j=i; j < n ; j++)
-       {
-         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
-       }
+       SCALE_COL_F32(pDst,i,invSqrtVj,i);
     }
 
     status = ARM_MATH_SUCCESS;
@@ -416,10 +411,8 @@ arm_status arm_mat_cholesky_f32(
        }
 
        invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
-       for(j=i ; j < n ; j++)
-       {
-         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
-       }
+       SCALE_COL_F32(pDst,i,invSqrtVj,i);
+      
     }
 
     status = ARM_MATH_SUCCESS;
diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
index 4e095d06..7669582d 100755
--- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -27,6 +27,7 @@
  */
 
 #include "dsp/matrix_functions.h"
+#include "dsp/matrix_utils.h"
 
 /**
   @ingroup groupMatrix
@@ -48,7 +49,7 @@
                    - \ref ARM_MATH_DECOMPOSITION_FAILURE      : Input matrix cannot be decomposed
    * @par
    * If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
-   * The decomposition of A is returning a lower triangular matrix U such that A = U U^t
+   * The decomposition of A is returning a lower triangular matrix L such that A = L L^t
    */
 
 
@@ -102,10 +103,8 @@ arm_status arm_mat_cholesky_f64(
        }
 
        invSqrtVj = 1.0/sqrt(pG[i * n + i]);
-       for(j=i ; j < n ; j++)
-       {
-         pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
-       }
+       SCALE_COL_F64(pDst,i,invSqrtVj,i);
+
     }
 
     status = ARM_MATH_SUCCESS;
diff --git a/Source/MatrixFunctions/arm_mat_inverse_f16.c b/Source/MatrixFunctions/arm_mat_inverse_f16.c
index 90bc06d0..67aeb227 100755
--- a/Source/MatrixFunctions/arm_mat_inverse_f16.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f16.c
@@ -27,6 +27,7 @@
  */
 
 #include "dsp/matrix_functions_f16.h"
+#include "dsp/matrix_utils.h"
 
 #if defined(ARM_FLOAT16_SUPPORTED)
 
@@ -50,520 +51,20 @@
                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
                    - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
  */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-arm_status arm_mat_inverse_f16(
-  const arm_matrix_instance_f16 * pSrc,
-  arm_matrix_instance_f16 * pDst)
-{
-    float16_t *pIn = pSrc->pData;   /* input data matrix pointer */
-    float16_t *pOut = pDst->pData;  /* output data matrix pointer */
-    float16_t *pInT1, *pInT2;   /* Temporary input data matrix pointer */
-    float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
-    float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* Temporary input and output data matrix pointer */
-
-    uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
-    uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
-    float16_t *pTmpA, *pTmpB;
-
-    _Float16 in = 0.0f16;        /* Temporary input values  */
-    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
-    arm_status status;          /* status of matrix inverse */
-    uint32_t  blkCnt;
-
-#ifdef ARM_MATH_MATRIX_CHECK
-   /* Check for matrix mismatch condition */
-  if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
-     || (pSrc->numRows != pDst->numRows))
-  {
-    /* Set status as ARM_MATH_SIZE_MISMATCH */
-    status = ARM_MATH_SIZE_MISMATCH;
-  }
-  else
-#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
-  {
-
-    /*--------------------------------------------------------------------------------------------------------------
-     * Matrix Inverse can be solved using elementary row operations.
-     *
-     *  Gauss-Jordan Method:
-     *
-     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
-     *        augmented matrix as follows:
-     *                      _  _          _     _      _   _         _         _
-     *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
-     *                     |  |            | | |        |   |   =   |           |
-     *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
-     *
-     *      2. In our implementation, pDst Matrix is used as identity matrix.
-     *
-     *      3. Begin with the first row. Let i = 1.
-     *
-     *      4. Check to see if the pivot for row i is zero.
-     *         The pivot is the element of the main diagonal that is on the current row.
-     *         For instance, if working with row i, then the pivot element is aii.
-     *         If the pivot is zero, exchange that row with a row below it that does not
-     *         contain a zero in column i. If this is not possible, then an inverse
-     *         to that matrix does not exist.
-     *
-     *      5. Divide every element of row i by the pivot.
-     *
-     *      6. For every row below and  row i, replace that row with the sum of that row and
-     *         a multiple of row i so that each new element in column i below row i is zero.
-     *
-     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
-     *         for every element below and above the main diagonal.
-     *
-     *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
-     *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
-     *----------------------------------------------------------------------------------------------------------------*/
-
-        /*
-         * Working pointer for destination matrix
-         */
-        pOutT1 = pOut;
-        /*
-         * Loop over the number of rows
-         */
-        rowCnt = numRows;
-        /*
-         * Making the destination matrix as identity matrix
-         */
-        while (rowCnt > 0U)
-        {
-            /*
-             * Writing all zeroes in lower triangle of the destination matrix
-             */
-            j = numRows - rowCnt;
-            while (j > 0U)
-            {
-                *pOutT1++ = 0.0f16;
-                j--;
-            }
-            /*
-             * Writing all ones in the diagonal of the destination matrix
-             */
-            *pOutT1++ = 1.0f16;
-            /*
-             * Writing all zeroes in upper triangle of the destination matrix
-             */
-            j = rowCnt - 1U;
-            while (j > 0U)
-            {
-                *pOutT1++ = 0.0f16;
-                j--;
-            }
-            /*
-             * Decrement the loop counter
-             */
-            rowCnt--;
-        }
-
-        /*
-         * Loop over the number of columns of the input matrix.
-         * All the elements in each column are processed by the row operations
-         */
-        loopCnt = numCols;
-        /*
-         * Index modifier to navigate through the columns
-         */
-        l = 0U;
-        while (loopCnt > 0U)
-        {
-            /*
-             * Check if the pivot element is zero..
-             * If it is zero then interchange the row with non zero row below.
-             * If there is no non zero element to replace in the rows below,
-             * then the matrix is Singular.
-             */
-
-            /*
-             * Working pointer for the input matrix that points
-             * * to the pivot element of the particular row
-             */
-            pInT1 = pIn + (l * numCols);
-            /*
-             * Working pointer for the destination matrix that points
-             * * to the pivot element of the particular row
-             */
-            pOutT1 = pOut + (l * numCols);
-            /*
-             * Temporary variable to hold the pivot value
-             */
-            in = *pInT1;
-            
-
-            /*
-             * Check if the pivot element is zero
-             */
-            if ((_Float16)*pInT1 == 0.0f16)
-            {
-                /*
-                 * Loop over the number rows present below
-                 */
-                for (i = 1U; i < numRows-l; i++)
-                {
-                    /*
-                     * Update the input and destination pointers
-                     */
-                    pInT2 = pInT1 + (numCols * i);
-                    pOutT2 = pOutT1 + (numCols * i);
-                    /*
-                     * Check if there is a non zero pivot element to
-                     * * replace in the rows below
-                     */
-                    if ((_Float16)*pInT2 != 0.0f16)
-                    {
-                        f16x8_t vecA, vecB;
-                        /*
-                         * Loop over number of columns
-                         * * to the right of the pilot element
-                         */
-                        pTmpA = pInT1;
-                        pTmpB = pInT2;
-                        blkCnt = (numCols - l) >> 3;
-                        while (blkCnt > 0U)
-                        {
-                            
-                            vecA = vldrhq_f16(pTmpA);
-                            vecB = vldrhq_f16(pTmpB);
-                            vstrhq_f16(pTmpB, vecA);
-                            vstrhq_f16(pTmpA, vecB);
-
-                            pTmpA += 8;
-                            pTmpB += 8;
-                            /*
-                             * Decrement the blockSize loop counter
-                             */
-                            blkCnt--;
-                        }
-                        /*
-                         * tail
-                         * (will be merged thru tail predication)
-                         */
-                        blkCnt = (numCols - l) & 7;
-                        if (blkCnt > 0U)
-                        {
-                            mve_pred16_t p0 = vctp16q(blkCnt);
-
-                            vecA = vldrhq_f16(pTmpA);
-                            vecB = vldrhq_f16(pTmpB);
-                            vstrhq_p_f16(pTmpB, vecA, p0);
-                            vstrhq_p_f16(pTmpA, vecB, p0);
-                        }
-
-                        pInT1 += numCols - l;
-                        pInT2 += numCols - l;
-                        pTmpA = pOutT1;
-                        pTmpB = pOutT2;
-                        blkCnt = numCols >> 3;
-                        while (blkCnt > 0U)
-                        {
-
-                            vecA = vldrhq_f16(pTmpA);
-                            vecB = vldrhq_f16(pTmpB);
-                            vstrhq_f16(pTmpB, vecA);
-                            vstrhq_f16(pTmpA, vecB);
-                            pTmpA += 8;
-                            pTmpB += 8;
-                            /*
-                             * Decrement the blockSize loop counter
-                             */
-                            blkCnt--;
-                        }
-                        /*
-                         * tail
-                         */
-                        blkCnt = numCols & 7;
-                        if (blkCnt > 0U)
-                        {
-                            mve_pred16_t p0 = vctp16q(blkCnt);
-
-                            vecA = vldrhq_f16(pTmpA);
-                            vecB = vldrhq_f16(pTmpB);
-                            vstrhq_p_f16(pTmpB, vecA, p0);
-                            vstrhq_p_f16(pTmpA, vecB, p0);
-                        }
-
-                        pOutT1 += numCols;
-                        pOutT2 += numCols;
-                        /*
-                         * Flag to indicate whether exchange is done or not
-                         */
-                        flag = 1U;
-
-                        /*
-                         * Break after exchange is done
-                         */
-                        break;
-                    }
-              
-                }
-            }
-
-            /*
-             * Update the status if the matrix is singular
-             */
-            if ((flag != 1U) && (in == 0.0f16))
-            {
-                return ARM_MATH_SINGULAR;
-            }
-
-            /*
-             * Points to the pivot row of input and destination matrices
-             */
-            pPivotRowIn = pIn + (l * numCols);
-            pPivotRowDst = pOut + (l * numCols);
-
-            /*
-             * Temporary pointers to the pivot row pointers
-             */
-            pInT1 = pPivotRowIn;
-            pOutT1 = pPivotRowDst;
-
-            /*
-             * Pivot element of the row
-             */
-            in = *(pIn + (l * numCols));
-
-            pTmpA = pInT1;
-
-            f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
-
-            blkCnt = (numCols - l) >> 3;
-            f16x8_t vecA;
-            while (blkCnt > 0U)
-            {
-                *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
-                pTmpA += 8;
-                /*
-                 * Decrement the blockSize loop counter
-                 */
-                blkCnt--;
-            }
-            /*
-             * tail
-             */
-            blkCnt = (numCols - l) & 7;
-            if (blkCnt > 0U)
-            {
-                mve_pred16_t p0 = vctp16q(blkCnt);
-                
-
-                vecA = vldrhq_f16(pTmpA);
-                vecA = vecA * invIn;
-                vstrhq_p_f16(pTmpA, vecA, p0);
-            }
-
-            pInT1 += numCols - l;
-            /*
-             * Loop over number of columns
-             * * to the right of the pilot element
-             */
-
-            pTmpA = pOutT1;
-            blkCnt = numCols >> 3;
-            while (blkCnt > 0U)
-            {
-                *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
-                pTmpA += 8;
-                /*
-                 * Decrement the blockSize loop counter
-                 */
-                blkCnt--;
-            }
-            /*
-             * tail
-             * (will be merged thru tail predication)
-             */
-            blkCnt = numCols & 7;
-            if (blkCnt > 0U)
-            {
-                mve_pred16_t p0 = vctp16q(blkCnt);
-
-                vecA = vldrhq_f16(pTmpA);
-                vecA = vecA * invIn;
-                vstrhq_p_f16(pTmpA, vecA, p0);
-            }
-
-            pOutT1 += numCols;
-
-            /*
-             * Replace the rows with the sum of that row and a multiple of row i
-             * * so that each new element in column i above row i is zero.
-             */
-
-            /*
-             * Temporary pointers for input and destination matrices
-             */
-            pInT1 = pIn;
-            pOutT1 = pOut;
-
-            for (i = 0U; i < numRows; i++)
-            {
-                /*
-                 * Check for the pivot element
-                 */
-                if (i == l)
-                {
-                    /*
-                     * If the processing element is the pivot element,
-                     * only the columns to the right are to be processed
-                     */
-                    pInT1 += numCols - l;
-                    pOutT1 += numCols;
-                }
-                else
-                {
-                    /*
-                     * Element of the reference row
-                     */
-
-                    /*
-                     * Working pointers for input and destination pivot rows
-                     */
-                    pPRT_in = pPivotRowIn;
-                    pPRT_pDst = pPivotRowDst;
-                    /*
-                     * Loop over the number of columns to the right of the pivot element,
-                     * to replace the elements in the input matrix
-                     */
-
-                    in = *pInT1;
-                    f16x8_t tmpV = vdupq_n_f16(in);
-
-                    blkCnt = (numCols - l) >> 3;
-                    while (blkCnt > 0U)
-                    {
-                        f16x8_t vec1, vec2;
-                        /*
-                         * Replace the element by the sum of that row
-                         * and a multiple of the reference row
-                         */
-                        vec1 = vldrhq_f16(pInT1);
-                        vec2 = vldrhq_f16(pPRT_in);
-                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
-                        vstrhq_f16(pInT1, vec1);
-                        pPRT_in += 8;
-                        pInT1 += 8;
-                        /*
-                         * Decrement the blockSize loop counter
-                         */
-                        blkCnt--;
-                    }
-                    /*
-                     * tail
-                     * (will be merged thru tail predication)
-                     */
-                    blkCnt = (numCols - l) & 7;
-                    if (blkCnt > 0U)
-                    {
-                        f16x8_t vec1, vec2;
-                        mve_pred16_t p0 = vctp16q(blkCnt);
-
-                        vec1 = vldrhq_f16(pInT1);
-                        vec2 = vldrhq_f16(pPRT_in);
-                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
-                        vstrhq_p_f16(pInT1, vec1, p0);
-                        pInT1 += blkCnt;
-                    }
-
-                    blkCnt = numCols >> 3;
-                    while (blkCnt > 0U)
-                    {
-                        f16x8_t vec1, vec2;
-
-                        /*
-                         * Replace the element by the sum of that row
-                         * and a multiple of the reference row
-                         */
-                        vec1 = vldrhq_f16(pOutT1);
-                        vec2 = vldrhq_f16(pPRT_pDst);
-                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
-                        vstrhq_f16(pOutT1, vec1);
-                        pPRT_pDst += 8;
-                        pOutT1 += 8;
-                        /*
-                         * Decrement the blockSize loop counter
-                         */
-                        blkCnt--;
-                    }
-                    /*
-                     * tail
-                     * (will be merged thru tail predication)
-                     */
-                    blkCnt = numCols & 7;
-                    if (blkCnt > 0U)
-                    {
-                        f16x8_t vec1, vec2;
-                        mve_pred16_t p0 = vctp16q(blkCnt);
-
-                        vec1 = vldrhq_f16(pOutT1);
-                        vec2 = vldrhq_f16(pPRT_pDst);
-                        vec1 = vfmsq_f16(vec1, tmpV, vec2);
-                        vstrhq_p_f16(pOutT1, vec1, p0);
-
-                        pInT2 += blkCnt;
-                        pOutT1 += blkCnt;
-                    }
-                }
-                /*
-                 * Increment the temporary input pointer
-                 */
-                pInT1 = pInT1 + l;
-            }
-            /*
-             * Increment the input pointer
-             */
-            pIn++;
-            /*
-             * Decrement the loop counter
-             */
-            loopCnt--;
-            /*
-             * Increment the index modifier
-             */
-            l++;
-        }
-
-        /*
-         * Set status as ARM_MATH_SUCCESS
-         */
-        status = ARM_MATH_SUCCESS;
-
-        if ((flag != 1U) && (in == 0.0f16))
-        {
-            pIn = pSrc->pData;
-            for (i = 0; i < numRows * numCols; i++)
-            {
-                if ((_Float16)pIn[i] != 0.0f16)
-                    break;
-            }
-
-            if (i == numRows * numCols)
-                status = ARM_MATH_SINGULAR;
-        }
-  }
-  /* Return to application */
-  return (status);
-}
-
-#else
-
 arm_status arm_mat_inverse_f16(
   const arm_matrix_instance_f16 * pSrc,
         arm_matrix_instance_f16 * pDst)
 {
   float16_t *pIn = pSrc->pData;                  /* input data matrix pointer */
   float16_t *pOut = pDst->pData;                 /* output data matrix pointer */
-  float16_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
-  float16_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
-  float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
+  
+  float16_t *pTmp;
   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
 
-  _Float16 Xchg, in = 0.0f16, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
+
+  float16_t pivot = 0.0f16, newPivot=0.0f16;                /* Temporary input values  */
+  uint32_t selectedRow,pivotRow,i, rowNb, rowCnt, flag = 0U, j,column;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -581,7 +82,6 @@ arm_status arm_mat_inverse_f16(
 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
 
   {
-
     /*--------------------------------------------------------------------------------------------------------------
      * Matrix Inverse can be solved using elementary row operations.
      *
@@ -618,7 +118,7 @@ arm_status arm_mat_inverse_f16(
      *----------------------------------------------------------------------------------------------------------------*/
 
     /* Working pointer for destination matrix */
-    pOutT1 = pOut;
+    pTmp = pOut;
 
     /* Loop over the number of rows */
     rowCnt = numRows;
@@ -630,18 +130,18 @@ arm_status arm_mat_inverse_f16(
       j = numRows - rowCnt;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0f16;
+        *pTmp++ = 0.0f16;
         j--;
       }
 
       /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0f16;
+      *pTmp++ = 1.0f16;
 
       /* Writing all zeroes in upper triangle of the destination matrix */
       j = rowCnt - 1U;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0f16;
+        *pTmp++ = 0.0f16;
         j--;
       }
 
@@ -651,220 +151,100 @@ arm_status arm_mat_inverse_f16(
 
     /* Loop over the number of columns of the input matrix.
        All the elements in each column are processed by the row operations */
-    loopCnt = numCols;
 
     /* Index modifier to navigate through the columns */
-    l = 0U;
-
-    while (loopCnt > 0U)
+    for(column = 0U; column < numCols; column++)
     {
       /* Check if the pivot element is zero..
        * If it is zero then interchange the row with non zero row below.
        * If there is no non zero element to replace in the rows below,
        * then the matrix is Singular. */
 
-      /* Working pointer for the input matrix that points
-       * to the pivot element of the particular row  */
-      pInT1 = pIn + (l * numCols);
-
-      /* Working pointer for the destination matrix that points
-       * to the pivot element of the particular row  */
-      pOutT1 = pOut + (l * numCols);
+      pivotRow = column;
 
       /* Temporary variable to hold the pivot value */
-      in = *pInT1;
-
+      pTmp = ELEM(pSrc,column,column) ;
+      pivot = *pTmp;
+      selectedRow = column;
 
-      /* Check if the pivot element is zero */
-      if ((_Float16)*pInT1 == 0.0f16)
-      {
+     
         /* Loop over the number rows present below */
 
-        for (i = 1U; i < numRows-l; i++)
-        {
+      for (rowNb = column+1; rowNb < numRows; rowNb++)
+      {
           /* Update the input and destination pointers */
-          pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * i);
+          pTmp = ELEM(pSrc,rowNb,column);
+          newPivot = *pTmp;
+          if (fabsf((float32_t)newPivot) > fabsf((float32_t)pivot))
+          {
+            selectedRow = rowNb; 
+            pivot = newPivot;
+          }
+
+      }
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
-          if ((_Float16)*pInT2 != 0.0f16)
-          {
+      if (((_Float16)pivot != 0.0f16) && (selectedRow != column))
+      {
             /* Loop over number of columns
              * to the right of the pilot element */
-            j = numCols - l;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the input matrix */
-              Xchg = *pInT2;
-              *pInT2++ = *pInT1;
-              *pInT1++ = Xchg;
 
-              /* Decrement the loop counter */
-              j--;
-            }
-
-            /* Loop over number of columns of the destination matrix */
-            j = numCols;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the destination matrix */
-              Xchg = *pOutT2;
-              *pOutT2++ = *pOutT1;
-              *pOutT1++ = Xchg;
-
-              /* Decrement loop counter */
-              j--;
-            }
+            SWAP_ROWS_F16(pSrc,column, pivotRow,selectedRow);
+            SWAP_ROWS_F16(pDst,0, pivotRow,selectedRow);
 
+    
             /* Flag to indicate whether exchange is done or not */
             flag = 1U;
 
-            /* Break after exchange is done */
-            break;
-          }
-
-        }
       }
 
+
       /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0f16))
+      if ((flag != 1U) && ((_Float16)pivot == 0.0f16))
       {
         return ARM_MATH_SINGULAR;
       }
 
-      /* Points to the pivot row of input and destination matrices */
-      pPivotRowIn = pIn + (l * numCols);
-      pPivotRowDst = pOut + (l * numCols);
-
-      /* Temporary pointers to the pivot row pointers */
-      pInT1 = pPivotRowIn;
-      pInT2 = pPivotRowDst;
-
+     
       /* Pivot element of the row */
-      in = *pPivotRowIn;
+      pivot = 1.0f16 / (_Float16)pivot;
 
-      /* Loop over number of columns
-       * to the right of the pilot element */
-      j = (numCols - l);
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        in1 = *pInT1;
-        *pInT1++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Loop over number of columns of the destination matrix */
-      j = numCols;
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        in1 = *pInT2;
-        *pInT2++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
+      SCALE_ROW_F16(pSrc,column,pivot,pivotRow);
+      SCALE_ROW_F16(pDst,0,pivot,pivotRow);
 
+      
       /* Replace the rows with the sum of that row and a multiple of row i
        * so that each new element in column i above row i is zero.*/
 
-      /* Temporary pointers for input and destination matrices */
-      pInT1 = pIn;
-      pInT2 = pOut;
-
-      /* index used to check for pivot element */
-      i = 0U;
-
-      /* Loop over number of rows */
-      /*  to be replaced by the sum of that row and a multiple of row i */
-      k = numRows;
-
-      while (k > 0U)
+      rowNb = 0;
+      for (;rowNb < pivotRow; rowNb++)
       {
-        /* Check for the pivot element */
-        if (i == l)
-        {
-          /* If the processing element is the pivot element,
-             only the columns to the right are to be processed */
-          pInT1 += numCols - l;
-
-          pInT2 += numCols;
-        }
-        else
-        {
-          /* Element of the reference row */
-          in = *pInT1;
-
-          /* Working pointers for input and destination pivot rows */
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-
-          /* Loop over the number of columns to the right of the pivot element,
-             to replace the elements in the input matrix */
-          j = (numCols - l);
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT1;
-            *pInT1++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_in++);
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-          /* Loop over the number of columns to
-             replace the elements in the destination matrix */
-          j = numCols;
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
 
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT2;
-            *pInT2++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_pDst++);
+           MAS_ROW_F16(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F16(0     ,pDst,rowNb,pivot,pDst,pivotRow);
 
-            /* Decrement loop counter */
-            j--;
-          }
 
-        }
+      }
 
-        /* Increment temporary input pointer */
-        pInT1 = pInT1 + l;
+      for (rowNb = pivotRow + 1; rowNb < numRows; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
 
-        /* Decrement loop counter */
-        k--;
+           MAS_ROW_F16(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F16(0     ,pDst,rowNb,pivot,pDst,pivotRow);
 
-        /* Increment pivot index */
-        i++;
       }
 
-      /* Increment the input pointer */
-      pIn++;
-
-      /* Decrement the loop counter */
-      loopCnt--;
-
-      /* Increment the index modifier */
-      l++;
     }
 
     /* Set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
 
-    if ((flag != 1U) && ((_Float16)in == 0.0f16))
+    if ((flag != 1U) && ((_Float16)pivot == 0.0f16))
     {
       pIn = pSrc->pData;
       for (i = 0; i < numRows * numCols; i++)
@@ -881,8 +261,6 @@ arm_status arm_mat_inverse_f16(
   /* Return to application */
   return (status);
 }
-#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
-
 /**
   @} end of MatrixInv group
  */
diff --git a/Source/MatrixFunctions/arm_mat_inverse_f32.c b/Source/MatrixFunctions/arm_mat_inverse_f32.c
index b0ef7608..e901f165 100644
--- a/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -27,6 +27,7 @@
  */
 
 #include "dsp/matrix_functions.h"
+#include "dsp/matrix_utils.h"
 
 
 /**
@@ -68,926 +69,20 @@
                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
                    - \ref ARM_MATH_SINGULAR      : Input matrix is found to be singular (non-invertible)
  */
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-arm_status arm_mat_inverse_f32(
-  const arm_matrix_instance_f32 * pSrc,
-  arm_matrix_instance_f32 * pDst)
-{
-    float32_t *pIn = pSrc->pData;   /* input data matrix pointer */
-    float32_t *pOut = pDst->pData;  /* output data matrix pointer */
-    float32_t *pInT1, *pInT2;   /* Temporary input data matrix pointer */
-    float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
-    float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;    /* Temporary input and output data matrix pointer */
-
-    uint32_t  numRows = pSrc->numRows;  /* Number of rows in the matrix  */
-    uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
-    float32_t *pTmpA, *pTmpB;
-
-    float32_t in = 0.0f;        /* Temporary input values  */
-    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, l;   /* loop counters */
-    arm_status status;          /* status of matrix inverse */
-    uint32_t  blkCnt;
-
-#ifdef ARM_MATH_MATRIX_CHECK
-   /* Check for matrix mismatch condition */
-  if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
-     || (pSrc->numRows != pDst->numRows))
-  {
-    /* Set status as ARM_MATH_SIZE_MISMATCH */
-    status = ARM_MATH_SIZE_MISMATCH;
-  }
-  else
-#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
-  {
-
-    /*--------------------------------------------------------------------------------------------------------------
-     * Matrix Inverse can be solved using elementary row operations.
-     *
-     *  Gauss-Jordan Method:
-     *
-     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
-     *        augmented matrix as follows:
-     *                      _  _          _     _      _   _         _         _
-     *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
-     *                     |  |            | | |        |   |   =   |           |
-     *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
-     *
-     *      2. In our implementation, pDst Matrix is used as identity matrix.
-     *
-     *      3. Begin with the first row. Let i = 1.
-     *
-     *      4. Check to see if the pivot for row i is zero.
-     *         The pivot is the element of the main diagonal that is on the current row.
-     *         For instance, if working with row i, then the pivot element is aii.
-     *         If the pivot is zero, exchange that row with a row below it that does not
-     *         contain a zero in column i. If this is not possible, then an inverse
-     *         to that matrix does not exist.
-     *
-     *      5. Divide every element of row i by the pivot.
-     *
-     *      6. For every row below and  row i, replace that row with the sum of that row and
-     *         a multiple of row i so that each new element in column i below row i is zero.
-     *
-     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
-     *         for every element below and above the main diagonal.
-     *
-     *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
-     *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
-     *----------------------------------------------------------------------------------------------------------------*/
-
-        /*
-         * Working pointer for destination matrix
-         */
-        pOutT1 = pOut;
-        /*
-         * Loop over the number of rows
-         */
-        rowCnt = numRows;
-        /*
-         * Making the destination matrix as identity matrix
-         */
-        while (rowCnt > 0U)
-        {
-            /*
-             * Writing all zeroes in lower triangle of the destination matrix
-             */
-            j = numRows - rowCnt;
-            while (j > 0U)
-            {
-                *pOutT1++ = 0.0f;
-                j--;
-            }
-            /*
-             * Writing all ones in the diagonal of the destination matrix
-             */
-            *pOutT1++ = 1.0f;
-            /*
-             * Writing all zeroes in upper triangle of the destination matrix
-             */
-            j = rowCnt - 1U;
-            while (j > 0U)
-            {
-                *pOutT1++ = 0.0f;
-                j--;
-            }
-            /*
-             * Decrement the loop counter
-             */
-            rowCnt--;
-        }
-
-        /*
-         * Loop over the number of columns of the input matrix.
-         * All the elements in each column are processed by the row operations
-         */
-        loopCnt = numCols;
-        /*
-         * Index modifier to navigate through the columns
-         */
-        l = 0U;
-        while (loopCnt > 0U)
-        {
-            /*
-             * Check if the pivot element is zero..
-             * If it is zero then interchange the row with non zero row below.
-             * If there is no non zero element to replace in the rows below,
-             * then the matrix is Singular.
-             */
-
-            /*
-             * Working pointer for the input matrix that points
-             * * to the pivot element of the particular row
-             */
-            pInT1 = pIn + (l * numCols);
-            /*
-             * Working pointer for the destination matrix that points
-             * * to the pivot element of the particular row
-             */
-            pOutT1 = pOut + (l * numCols);
-            /*
-             * Temporary variable to hold the pivot value
-             */
-            in = *pInT1;
-           
-
-            /*
-             * Check if the pivot element is zero
-             */
-            if (*pInT1 == 0.0f)
-            {
-                /*
-                 * Loop over the number rows present below
-                 */
-                for (i = 1U; i < numRows-l; i++)
-                {
-                    /*
-                     * Update the input and destination pointers
-                     */
-                    pInT2 = pInT1 + (numCols * i);
-                    pOutT2 = pOutT1 + (numCols * i);
-                    /*
-                     * Check if there is a non zero pivot element to
-                     * * replace in the rows below
-                     */
-                    if (*pInT2 != 0.0f)
-                    {
-                        f32x4_t vecA, vecB;
-                        /*
-                         * Loop over number of columns
-                         * * to the right of the pilot element
-                         */
-                        pTmpA = pInT1;
-                        pTmpB = pInT2;
-                        blkCnt = (numCols - l) >> 2;
-                        while (blkCnt > 0U)
-                        {
-                            
-                            vecA = vldrwq_f32(pTmpA);
-                            vecB = vldrwq_f32(pTmpB);
-                            vstrwq_f32(pTmpB, vecA);
-                            vstrwq_f32(pTmpA, vecB);
-
-                            pTmpA += 4;
-                            pTmpB += 4;
-                            /*
-                             * Decrement the blockSize loop counter
-                             */
-                            blkCnt--;
-                        }
-                        /*
-                         * tail
-                         * (will be merged thru tail predication)
-                         */
-                        blkCnt = (numCols - l) & 3;
-                        if (blkCnt > 0U)
-                        {
-                            mve_pred16_t p0 = vctp32q(blkCnt);
-
-                            vecA = vldrwq_f32(pTmpA);
-                            vecB = vldrwq_f32(pTmpB);
-                            vstrwq_p_f32(pTmpB, vecA, p0);
-                            vstrwq_p_f32(pTmpA, vecB, p0);
-                        }
-
-                        pInT1 += numCols - l;
-                        pInT2 += numCols - l;
-                        pTmpA = pOutT1;
-                        pTmpB = pOutT2;
-                        blkCnt = numCols >> 2;
-                        while (blkCnt > 0U)
-                        {
-
-                            vecA = vldrwq_f32(pTmpA);
-                            vecB = vldrwq_f32(pTmpB);
-                            vstrwq_f32(pTmpB, vecA);
-                            vstrwq_f32(pTmpA, vecB);
-                            pTmpA += 4;
-                            pTmpB += 4;
-                            /*
-                             * Decrement the blockSize loop counter
-                             */
-                            blkCnt--;
-                        }
-                        /*
-                         * tail
-                         */
-                        blkCnt = numCols & 3;
-                        if (blkCnt > 0U)
-                        {
-                            mve_pred16_t p0 = vctp32q(blkCnt);
-
-                            vecA = vldrwq_f32(pTmpA);
-                            vecB = vldrwq_f32(pTmpB);
-                            vstrwq_p_f32(pTmpB, vecA, p0);
-                            vstrwq_p_f32(pTmpA, vecB, p0);
-                        }
-
-                        pOutT1 += numCols;
-                        pOutT2 += numCols;
-                        /*
-                         * Flag to indicate whether exchange is done or not
-                         */
-                        flag = 1U;
-
-                        /*
-                         * Break after exchange is done
-                         */
-                        break;
-                    }
-                    
-                }
-            }
-
-            /*
-             * Update the status if the matrix is singular
-             */
-            if ((flag != 1U) && (in == 0.0f))
-            {
-                return ARM_MATH_SINGULAR;
-            }
-
-            /*
-             * Points to the pivot row of input and destination matrices
-             */
-            pPivotRowIn = pIn + (l * numCols);
-            pPivotRowDst = pOut + (l * numCols);
-
-            /*
-             * Temporary pointers to the pivot row pointers
-             */
-            pInT1 = pPivotRowIn;
-            pOutT1 = pPivotRowDst;
-
-            /*
-             * Pivot element of the row
-             */
-            in = *(pIn + (l * numCols));
-
-            pTmpA = pInT1;
-
-            f32x4_t invIn = vdupq_n_f32(1.0f / in);
-
-            blkCnt = (numCols - l) >> 2;
-            f32x4_t vecA;
-            while (blkCnt > 0U)
-            {
-                *(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA * invIn;
-                pTmpA += 4;
-                /*
-                 * Decrement the blockSize loop counter
-                 */
-                blkCnt--;
-            }
-            /*
-             * tail
-             */
-            blkCnt = (numCols - l) & 3;
-            if (blkCnt > 0U)
-            {
-                mve_pred16_t p0 = vctp32q(blkCnt);
-                
-
-                vecA = vldrwq_f32(pTmpA);
-                vecA = vecA * invIn;
-                vstrwq_p_f32(pTmpA, vecA, p0);
-            }
-
-            pInT1 += numCols - l;
-            /*
-             * Loop over number of columns
-             * * to the right of the pilot element
-             */
-
-            pTmpA = pOutT1;
-            blkCnt = numCols >> 2;
-            while (blkCnt > 0U)
-            {
-                *(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA *invIn;
-                pTmpA += 4;
-                /*
-                 * Decrement the blockSize loop counter
-                 */
-                blkCnt--;
-            }
-            /*
-             * tail
-             * (will be merged thru tail predication)
-             */
-            blkCnt = numCols & 3;
-            if (blkCnt > 0U)
-            {
-                mve_pred16_t p0 = vctp32q(blkCnt);
-
-                vecA = vldrwq_f32(pTmpA);
-                vecA = vecA * invIn;
-                vstrwq_p_f32(pTmpA, vecA, p0);
-            }
-
-            pOutT1 += numCols;
-
-            /*
-             * Replace the rows with the sum of that row and a multiple of row i
-             * * so that each new element in column i above row i is zero.
-             */
-
-            /*
-             * Temporary pointers for input and destination matrices
-             */
-            pInT1 = pIn;
-            pOutT1 = pOut;
-
-            for (i = 0U; i < numRows; i++)
-            {
-                /*
-                 * Check for the pivot element
-                 */
-                if (i == l)
-                {
-                    /*
-                     * If the processing element is the pivot element,
-                     * only the columns to the right are to be processed
-                     */
-                    pInT1 += numCols - l;
-                    pOutT1 += numCols;
-                }
-                else
-                {
-                    /*
-                     * Element of the reference row
-                     */
-
-                    /*
-                     * Working pointers for input and destination pivot rows
-                     */
-                    pPRT_in = pPivotRowIn;
-                    pPRT_pDst = pPivotRowDst;
-                    /*
-                     * Loop over the number of columns to the right of the pivot element,
-                     * to replace the elements in the input matrix
-                     */
-
-                    in = *pInT1;
-                    f32x4_t tmpV = vdupq_n_f32(in);
-
-                    blkCnt = (numCols - l) >> 2;
-                    while (blkCnt > 0U)
-                    {
-                        f32x4_t vec1, vec2;
-                        /*
-                         * Replace the element by the sum of that row
-                         * and a multiple of the reference row
-                         */
-                        vec1 = vldrwq_f32(pInT1);
-                        vec2 = vldrwq_f32(pPRT_in);
-                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
-                        vstrwq_f32(pInT1, vec1);
-                        pPRT_in += 4;
-                        pInT1 += 4;
-                        /*
-                         * Decrement the blockSize loop counter
-                         */
-                        blkCnt--;
-                    }
-                    /*
-                     * tail
-                     * (will be merged thru tail predication)
-                     */
-                    blkCnt = (numCols - l) & 3;
-                    if (blkCnt > 0U)
-                    {
-                        f32x4_t vec1, vec2;
-                        mve_pred16_t p0 = vctp32q(blkCnt);
-
-                        vec1 = vldrwq_f32(pInT1);
-                        vec2 = vldrwq_f32(pPRT_in);
-                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
-                        vstrwq_p_f32(pInT1, vec1, p0);
-                        pInT1 += blkCnt;
-                    }
-
-                    blkCnt = numCols >> 2;
-                    while (blkCnt > 0U)
-                    {
-                        f32x4_t vec1, vec2;
-
-                        /*
-                         * Replace the element by the sum of that row
-                         * and a multiple of the reference row
-                         */
-                        vec1 = vldrwq_f32(pOutT1);
-                        vec2 = vldrwq_f32(pPRT_pDst);
-                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
-                        vstrwq_f32(pOutT1, vec1);
-                        pPRT_pDst += 4;
-                        pOutT1 += 4;
-                        /*
-                         * Decrement the blockSize loop counter
-                         */
-                        blkCnt--;
-                    }
-                    /*
-                     * tail
-                     * (will be merged thru tail predication)
-                     */
-                    blkCnt = numCols & 3;
-                    if (blkCnt > 0U)
-                    {
-                        f32x4_t vec1, vec2;
-                        mve_pred16_t p0 = vctp32q(blkCnt);
-
-                        vec1 = vldrwq_f32(pOutT1);
-                        vec2 = vldrwq_f32(pPRT_pDst);
-                        vec1 = vfmsq_f32(vec1, tmpV, vec2);
-                        vstrwq_p_f32(pOutT1, vec1, p0);
-
-                        pInT2 += blkCnt;
-                        pOutT1 += blkCnt;
-                    }
-                }
-                /*
-                 * Increment the temporary input pointer
-                 */
-                pInT1 = pInT1 + l;
-            }
-            /*
-             * Increment the input pointer
-             */
-            pIn++;
-            /*
-             * Decrement the loop counter
-             */
-            loopCnt--;
-            /*
-             * Increment the index modifier
-             */
-            l++;
-        }
-
-        /*
-         * Set status as ARM_MATH_SUCCESS
-         */
-        status = ARM_MATH_SUCCESS;
-
-        if ((flag != 1U) && (in == 0.0f))
-        {
-            pIn = pSrc->pData;
-            for (i = 0; i < numRows * numCols; i++)
-            {
-                if (pIn[i] != 0.0f)
-                    break;
-            }
-
-            if (i == numRows * numCols)
-                status = ARM_MATH_SINGULAR;
-        }
-  }
-  /* Return to application */
-  return (status);
-}
-
-#else
-#if defined(ARM_MATH_NEON)
-arm_status arm_mat_inverse_f32(
-  const arm_matrix_instance_f32 * pSrc,
-  arm_matrix_instance_f32 * pDst)
-{
-  float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
-  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
-  float32_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
-  float32_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
-  float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
-  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
-  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
-
-
-  float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
-  arm_status status;                             /* status of matrix inverse */
-  float32x4_t vec1;
-  float32x4_t vec2;
-  float32x4_t tmpV;
-
-#ifdef ARM_MATH_MATRIX_CHECK
-
-  /* Check for matrix mismatch condition */
-  if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
-     || (pSrc->numRows != pDst->numRows))
-  {
-    /* Set status as ARM_MATH_SIZE_MISMATCH */
-    status = ARM_MATH_SIZE_MISMATCH;
-  }
-  else
-#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
-
-  {
-   /*--------------------------------------------------------------------------------------------------------------
-   * Matrix Inverse can be solved using elementary row operations.
-   *
-   *  Gauss-Jordan Method:
-   *
-   *     1. First combine the identity matrix and the input matrix separated by a bar to form an
-   *        augmented matrix as follows:
-   *              _                  _         _         _
-   *             |  a11  a12 | 1   0  |       |  X11 X12  |
-   *             |           |        |   =   |           |
-   *             |_ a21  a22 | 0   1 _|       |_ X21 X21 _|
-   *
-   *    2. In our implementation, pDst Matrix is used as identity matrix.
-   *
-   *    3. Begin with the first row. Let i = 1.
-   *
-   *    4. Check to see if the pivot for row i is zero.
-   *       The pivot is the element of the main diagonal that is on the current row.
-   *       For instance, if working with row i, then the pivot element is aii.
-   *       If the pivot is zero, exchange that row with a row below it that does not
-   *       contain a zero in column i. If this is not possible, then an inverse
-   *       to that matrix does not exist.
-   *
-   *      5. Divide every element of row i by the pivot.
-   *
-   *      6. For every row below and  row i, replace that row with the sum of that row and
-   *       a multiple of row i so that each new element in column i below row i is zero.
-   *
-   *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
-   *       for every element below and above the main diagonal.
-   *
-   *    8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
-   *       Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
-   *----------------------------------------------------------------------------------------------------------------*/
-
-    /* Working pointer for destination matrix */
-    pOutT1 = pOut;
-
-    /* Loop over the number of rows */
-    rowCnt = numRows;
-
-    /* Making the destination matrix as identity matrix */
-    while (rowCnt > 0U)
-    {
-      /* Writing all zeroes in lower triangle of the destination matrix */
-      j = numRows - rowCnt;
-      while (j > 0U)
-      {
-        *pOutT1++ = 0.0f;
-        j--;
-      }
-
-      /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0f;
-
-      /* Writing all zeroes in upper triangle of the destination matrix */
-      j = rowCnt - 1U;
-
-      while (j > 0U)
-      {
-        *pOutT1++ = 0.0f;
-        j--;
-      }
-
-      /* Decrement the loop counter */
-      rowCnt--;
-    }
-
-    /* Loop over the number of columns of the input matrix.
-       All the elements in each column are processed by the row operations */
-    loopCnt = numCols;
-
-    /* Index modifier to navigate through the columns */
-    l = 0U;
-
-    while (loopCnt > 0U)
-    {
-      /* Check if the pivot element is zero..
-       * If it is zero then interchange the row with non zero row below.
-       * If there is no non zero element to replace in the rows below,
-       * then the matrix is Singular. */
-
-      /* Working pointer for the input matrix that points
-       * to the pivot element of the particular row  */
-      pInT1 = pIn + (l * numCols);
-
-      /* Working pointer for the destination matrix that points
-       * to the pivot element of the particular row  */
-      pOutT1 = pOut + (l * numCols);
-
-      /* Temporary variable to hold the pivot value */
-      in = *pInT1;
-
-      /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0f)
-      {
-        /* Loop over the number rows present below */
-        for (i = 1U; i < numRows - l; i++)
-        {
-          /* Update the input and destination pointers */
-          pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * i);
-
-          /* Check if there is a non zero pivot element to
-           * replace in the rows below */
-          if (*pInT2 != 0.0f)
-          {
-            /* Loop over number of columns
-             * to the right of the pilot element */
-            j = numCols - l;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the input matrix */
-              Xchg = *pInT2;
-              *pInT2++ = *pInT1;
-              *pInT1++ = Xchg;
-
-              /* Decrement the loop counter */
-              j--;
-            }
-
-            /* Loop over number of columns of the destination matrix */
-            j = numCols;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the destination matrix */
-              Xchg = *pOutT2;
-              *pOutT2++ = *pOutT1;
-              *pOutT1++ = Xchg;
-
-              /* Decrement the loop counter */
-              j--;
-            }
-
-            /* Flag to indicate whether exchange is done or not */
-            flag = 1U;
-
-            /* Break after exchange is done */
-            break;
-          }
-
-         
-        }
-      }
-
-      /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0f))
-      {
-        return ARM_MATH_SINGULAR;
-      }
-
-      /* Points to the pivot row of input and destination matrices */
-      pPivotRowIn = pIn + (l * numCols);
-      pPivotRowDst = pOut + (l * numCols);
-
-      /* Temporary pointers to the pivot row pointers */
-      pInT1 = pPivotRowIn;
-      pInT2 = pPivotRowDst;
-
-      /* Pivot element of the row */
-      in = *pPivotRowIn;
-      tmpV = vdupq_n_f32(1.0f/in);
-
-      /* Loop over number of columns
-       * to the right of the pilot element */
-      j = (numCols - l) >> 2;
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        vec1 = vld1q_f32(pInT1);
-
-        vec1 = vmulq_f32(vec1, tmpV);
-        vst1q_f32(pInT1, vec1);
-        pInT1 += 4;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Tail */
-      j = (numCols - l) & 3;
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        in1 = *pInT1;
-        *pInT1++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Loop over number of columns of the destination matrix */
-      j = numCols >> 2;
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        vec1 = vld1q_f32(pInT2);
-
-        vec1 = vmulq_f32(vec1, tmpV);
-        vst1q_f32(pInT2, vec1);
-        pInT2 += 4;
-      
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Tail */
-      j = numCols & 3;
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        in1 = *pInT2;
-        *pInT2++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Replace the rows with the sum of that row and a multiple of row i
-       * so that each new element in column i above row i is zero.*/
-
-      /* Temporary pointers for input and destination matrices */
-      pInT1 = pIn;
-      pInT2 = pOut;
-
-      /* index used to check for pivot element */
-      i = 0U;
-
-      /* Loop over number of rows */
-      /*  to be replaced by the sum of that row and a multiple of row i */
-      k = numRows;
-
-      while (k > 0U)
-      {
-        /* Check for the pivot element */
-        if (i == l)
-        {
-          /* If the processing element is the pivot element,
-             only the columns to the right are to be processed */
-          pInT1 += numCols - l;
-
-          pInT2 += numCols;
-        }
-        else
-        {
-          /* Element of the reference row */
-          in = *pInT1;
-          tmpV = vdupq_n_f32(in);
-
-          /* Working pointers for input and destination pivot rows */
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-
-          /* Loop over the number of columns to the right of the pivot element,
-             to replace the elements in the input matrix */
-          j = (numCols - l) >> 2;
-	  
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            vec1 = vld1q_f32(pInT1);
-            vec2 = vld1q_f32(pPRT_in);
-            vec1 = vmlsq_f32(vec1, tmpV, vec2);
-            vst1q_f32(pInT1, vec1);
-            pPRT_in += 4;
-            pInT1 += 4;
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-	  /* Tail */
-          j = (numCols - l) & 3;
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT1;
-            *pInT1++ = in1 - (in * *pPRT_in++);
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-          /* Loop over the number of columns to
-             replace the elements in the destination matrix */
-          j = numCols >> 2;
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            vec1 = vld1q_f32(pInT2);
-            vec2 = vld1q_f32(pPRT_pDst);
-            vec1 = vmlsq_f32(vec1, tmpV, vec2);
-            vst1q_f32(pInT2, vec1);
-            pPRT_pDst += 4;
-            pInT2 += 4;
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-	  /* Tail */
-          j = numCols & 3;
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT2;
-            *pInT2++ = in1 - (in * *pPRT_pDst++);
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-        }
-
-        /* Increment the temporary input pointer */
-        pInT1 = pInT1 + l;
-
-        /* Decrement the loop counter */
-        k--;
-
-        /* Increment the pivot index */
-        i++;
-      }
-
-      /* Increment the input pointer */
-      pIn++;
-
-      /* Decrement the loop counter */
-      loopCnt--;
-
-      /* Increment the index modifier */
-      l++;
-    }
-
-    /* Set status as ARM_MATH_SUCCESS */
-    status = ARM_MATH_SUCCESS;
-
-    if ((flag != 1U) && (in == 0.0f))
-    {
-      pIn = pSrc->pData;
-      for (i = 0; i < numRows * numCols; i++)
-      {
-        if (pIn[i] != 0.0f)
-            break;
-      }
-
-      if (i == numRows * numCols)
-        status = ARM_MATH_SINGULAR;
-    }
-  }
-  /* Return to application */
-  return (status);
-}
-#else
 arm_status arm_mat_inverse_f32(
   const arm_matrix_instance_f32 * pSrc,
         arm_matrix_instance_f32 * pDst)
 {
   float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
-  float32_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
-  float32_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
-  float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
+  
+  float32_t *pTmp;
   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
 
-#if defined (ARM_MATH_DSP)
 
-  float32_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
+  float32_t pivot = 0.0f, newPivot=0.0f;                /* Temporary input values  */
+  uint32_t selectedRow,pivotRow,i, rowNb, rowCnt, flag = 0U, j,column;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -1005,7 +100,6 @@ arm_status arm_mat_inverse_f32(
 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
 
   {
-
     /*--------------------------------------------------------------------------------------------------------------
      * Matrix Inverse can be solved using elementary row operations.
      *
@@ -1042,7 +136,7 @@ arm_status arm_mat_inverse_f32(
      *----------------------------------------------------------------------------------------------------------------*/
 
     /* Working pointer for destination matrix */
-    pOutT1 = pOut;
+    pTmp = pOut;
 
     /* Loop over the number of rows */
     rowCnt = numRows;
@@ -1054,18 +148,18 @@ arm_status arm_mat_inverse_f32(
       j = numRows - rowCnt;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0f;
+        *pTmp++ = 0.0f;
         j--;
       }
 
       /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0f;
+      *pTmp++ = 1.0f;
 
       /* Writing all zeroes in upper triangle of the destination matrix */
       j = rowCnt - 1U;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0f;
+        *pTmp++ = 0.0f;
         j--;
       }
 
@@ -1075,477 +169,100 @@ arm_status arm_mat_inverse_f32(
 
     /* Loop over the number of columns of the input matrix.
        All the elements in each column are processed by the row operations */
-    loopCnt = numCols;
 
     /* Index modifier to navigate through the columns */
-    l = 0U;
-
-    while (loopCnt > 0U)
+    for(column = 0U; column < numCols; column++)
     {
       /* Check if the pivot element is zero..
        * If it is zero then interchange the row with non zero row below.
        * If there is no non zero element to replace in the rows below,
        * then the matrix is Singular. */
 
-      /* Working pointer for the input matrix that points
-       * to the pivot element of the particular row  */
-      pInT1 = pIn + (l * numCols);
-
-      /* Working pointer for the destination matrix that points
-       * to the pivot element of the particular row  */
-      pOutT1 = pOut + (l * numCols);
+      pivotRow = column;
 
       /* Temporary variable to hold the pivot value */
-      in = *pInT1;
+      pTmp = ELEM(pSrc,column,column) ;
+      pivot = *pTmp;
+      selectedRow = column;
 
-    
-
-      /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0f)
-      {
+      /* Find maximum pivot in column */
+      
         /* Loop over the number rows present below */
 
-        for (i = 1U; i < numRows - l; i++)
-        {
+      for (rowNb = column+1; rowNb < numRows; rowNb++)
+      {
           /* Update the input and destination pointers */
-          pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * i);
-
-          /* Check if there is a non zero pivot element to
-           * replace in the rows below */
-          if (*pInT2 != 0.0f)
+          pTmp = ELEM(pSrc,rowNb,column);
+          newPivot = *pTmp;
+          if (fabsf(newPivot) > fabsf(pivot))
           {
-            /* Loop over number of columns
-             * to the right of the pilot element */
-            j = numCols - l;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the input matrix */
-              Xchg = *pInT2;
-              *pInT2++ = *pInT1;
-              *pInT1++ = Xchg;
-
-              /* Decrement the loop counter */
-              j--;
-            }
-
-            /* Loop over number of columns of the destination matrix */
-            j = numCols;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the destination matrix */
-              Xchg = *pOutT2;
-              *pOutT2++ = *pOutT1;
-              *pOutT1++ = Xchg;
-
-              /* Decrement loop counter */
-              j--;
-            }
-
-            /* Flag to indicate whether exchange is done or not */
-            flag = 1U;
-
-            /* Break after exchange is done */
-            break;
+            selectedRow = rowNb; 
+            pivot = newPivot;
           }
-
-
-          /* Decrement loop counter */
-        }
       }
-
-      /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0f))
+        
+      /* Check if there is a non zero pivot element to
+       * replace in the rows below */
+      if ((pivot != 0.0f) && (selectedRow != column))
       {
-        return ARM_MATH_SINGULAR;
-      }
-
-      /* Points to the pivot row of input and destination matrices */
-      pPivotRowIn = pIn + (l * numCols);
-      pPivotRowDst = pOut + (l * numCols);
-
-      /* Temporary pointers to the pivot row pointers */
-      pInT1 = pPivotRowIn;
-      pInT2 = pPivotRowDst;
-
-      /* Pivot element of the row */
-      in = *pPivotRowIn;
-
-      /* Loop over number of columns
-       * to the right of the pilot element */
-      j = (numCols - l);
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        in1 = *pInT1;
-        *pInT1++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Loop over number of columns of the destination matrix */
-      j = numCols;
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        in1 = *pInT2;
-        *pInT2++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Replace the rows with the sum of that row and a multiple of row i
-       * so that each new element in column i above row i is zero.*/
-
-      /* Temporary pointers for input and destination matrices */
-      pInT1 = pIn;
-      pInT2 = pOut;
-
-      /* index used to check for pivot element */
-      i = 0U;
-
-      /* Loop over number of rows */
-      /*  to be replaced by the sum of that row and a multiple of row i */
-      k = numRows;
-
-      while (k > 0U)
-      {
-        /* Check for the pivot element */
-        if (i == l)
-        {
-          /* If the processing element is the pivot element,
-             only the columns to the right are to be processed */
-          pInT1 += numCols - l;
-
-          pInT2 += numCols;
-        }
-        else
-        {
-          /* Element of the reference row */
-          in = *pInT1;
-
-          /* Working pointers for input and destination pivot rows */
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-
-          /* Loop over the number of columns to the right of the pivot element,
-             to replace the elements in the input matrix */
-          j = (numCols - l);
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT1;
-            *pInT1++ = in1 - (in * *pPRT_in++);
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-          /* Loop over the number of columns to
-             replace the elements in the destination matrix */
-          j = numCols;
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT2;
-            *pInT2++ = in1 - (in * *pPRT_pDst++);
-
-            /* Decrement loop counter */
-            j--;
-          }
-
-        }
-
-        /* Increment temporary input pointer */
-        pInT1 = pInT1 + l;
-
-        /* Decrement loop counter */
-        k--;
-
-        /* Increment pivot index */
-        i++;
-      }
-
-      /* Increment the input pointer */
-      pIn++;
-
-      /* Decrement the loop counter */
-      loopCnt--;
-
-      /* Increment the index modifier */
-      l++;
-    }
-
-
-#else
-
-  float32_t Xchg, in = 0.0f;                     /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;      /* loop counters */
-  arm_status status;                             /* status of matrix inverse */
-
-#ifdef ARM_MATH_MATRIX_CHECK
-
-  /* Check for matrix mismatch condition */
-  if ((pSrc->numRows != pSrc->numCols) ||
-      (pDst->numRows != pDst->numCols) ||
-      (pSrc->numRows != pDst->numRows)   )
-  {
-    /* Set status as ARM_MATH_SIZE_MISMATCH */
-    status = ARM_MATH_SIZE_MISMATCH;
-  }
-  else
-
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
-  {
-
-    /*--------------------------------------------------------------------------------------------------------------
-     * Matrix Inverse can be solved using elementary row operations.
-     *
-     *  Gauss-Jordan Method:
-     *
-     *      1. First combine the identity matrix and the input matrix separated by a bar to form an
-     *        augmented matrix as follows:
-     *                      _  _          _     _      _   _         _         _
-     *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
-     *                     |  |            | | |        |   |   =   |           |
-     *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
-     *
-     *      2. In our implementation, pDst Matrix is used as identity matrix.
-     *
-     *      3. Begin with the first row. Let i = 1.
-     *
-     *      4. Check to see if the pivot for row i is zero.
-     *         The pivot is the element of the main diagonal that is on the current row.
-     *         For instance, if working with row i, then the pivot element is aii.
-     *         If the pivot is zero, exchange that row with a row below it that does not
-     *         contain a zero in column i. If this is not possible, then an inverse
-     *         to that matrix does not exist.
-     *
-     *      5. Divide every element of row i by the pivot.
-     *
-     *      6. For every row below and  row i, replace that row with the sum of that row and
-     *         a multiple of row i so that each new element in column i below row i is zero.
-     *
-     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
-     *         for every element below and above the main diagonal.
-     *
-     *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
-     *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
-     *----------------------------------------------------------------------------------------------------------------*/
-
-    /* Working pointer for destination matrix */
-    pOutT1 = pOut;
-
-    /* Loop over the number of rows */
-    rowCnt = numRows;
-
-    /* Making the destination matrix as identity matrix */
-    while (rowCnt > 0U)
-    {
-      /* Writing all zeroes in lower triangle of the destination matrix */
-      j = numRows - rowCnt;
-      while (j > 0U)
-      {
-        *pOutT1++ = 0.0f;
-        j--;
-      }
-
-      /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0f;
-
-      /* Writing all zeroes in upper triangle of the destination matrix */
-      j = rowCnt - 1U;
-      while (j > 0U)
-      {
-        *pOutT1++ = 0.0f;
-        j--;
-      }
-
-      /* Decrement loop counter */
-      rowCnt--;
-    }
-
-    /* Loop over the number of columns of the input matrix.
-       All the elements in each column are processed by the row operations */
-    loopCnt = numCols;
-
-    /* Index modifier to navigate through the columns */
-    l = 0U;
-
-    while (loopCnt > 0U)
-    {
-      /* Check if the pivot element is zero..
-       * If it is zero then interchange the row with non zero row below.
-       * If there is no non zero element to replace in the rows below,
-       * then the matrix is Singular. */
-
-      /* Working pointer for the input matrix that points
-       * to the pivot element of the particular row  */
-      pInT1 = pIn + (l * numCols);
-
-      /* Working pointer for the destination matrix that points
-       * to the pivot element of the particular row  */
-      pOutT1 = pOut + (l * numCols);
-
-      /* Temporary variable to hold the pivot value */
-      in = *pInT1;
-
-      /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0f)
-      {
-        /* Loop over the number rows present below */
-        for (i = 1U; i < numRows-l; i++)
-        {
-          /* Update the input and destination pointers */
-          pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * i);
-
-          /* Check if there is a non zero pivot element to
-           * replace in the rows below */
-          if (*pInT2 != 0.0f)
-          {
-            /* Loop over number of columns
-             * to the right of the pilot element */
-            for (j = 0U; j < (numCols - l); j++)
-            {
-              /* Exchange the row elements of the input matrix */
-              Xchg = *pInT2;
-              *pInT2++ = *pInT1;
-              *pInT1++ = Xchg;
-            }
-
-            for (j = 0U; j < numCols; j++)
-            {
-              Xchg = *pOutT2;
-              *pOutT2++ = *pOutT1;
-              *pOutT1++ = Xchg;
-            }
+            
+            SWAP_ROWS_F32(pSrc,column, pivotRow,selectedRow);
+            SWAP_ROWS_F32(pDst,0, pivotRow,selectedRow);
 
+    
             /* Flag to indicate whether exchange is done or not */
             flag = 1U;
+       }
 
-            /* Break after exchange is done */
-            break;
-          }
-        }
-      }
 
+      
+      
 
       /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0f))
+      if ((flag != 1U) && (pivot == 0.0f))
       {
         return ARM_MATH_SINGULAR;
       }
 
-      /* Points to the pivot row of input and destination matrices */
-      pPivotRowIn = pIn + (l * numCols);
-      pPivotRowDst = pOut + (l * numCols);
-
-      /* Temporary pointers to the pivot row pointers */
-      pInT1 = pPivotRowIn;
-      pOutT1 = pPivotRowDst;
-
+     
       /* Pivot element of the row */
-      in = *(pIn + (l * numCols));
+      pivot = 1.0f / pivot;
 
-      /* Loop over number of columns
-       * to the right of the pilot element */
-      for (j = 0U; j < (numCols - l); j++)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        *pInT1 = *pInT1 / in;
-        pInT1++;
-      }
-      for (j = 0U; j < numCols; j++)
-      {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        *pOutT1 = *pOutT1 / in;
-        pOutT1++;
-      }
+      SCALE_ROW_F32(pSrc,column,pivot,pivotRow);
+      SCALE_ROW_F32(pDst,0,pivot,pivotRow);
 
+      
       /* Replace the rows with the sum of that row and a multiple of row i
        * so that each new element in column i above row i is zero.*/
 
-      /* Temporary pointers for input and destination matrices */
-      pInT1 = pIn;
-      pOutT1 = pOut;
-
-      for (i = 0U; i < numRows; i++)
+      rowNb = 0;
+      for (;rowNb < pivotRow; rowNb++)
       {
-        /* Check for the pivot element */
-        if (i == l)
-        {
-          /* If the processing element is the pivot element,
-             only the columns to the right are to be processed */
-          pInT1 += numCols - l;
-          pOutT1 += numCols;
-        }
-        else
-        {
-          /* Element of the reference row */
-          in = *pInT1;
-
-          /* Working pointers for input and destination pivot rows */
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-
-          /* Loop over the number of columns to the right of the pivot element,
-             to replace the elements in the input matrix */
-          for (j = 0U; j < (numCols - l); j++)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            *pInT1 = *pInT1 - (in * *pPRT_in++);
-            pInT1++;
-          }
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
 
-          /* Loop over the number of columns to
-             replace the elements in the destination matrix */
-          for (j = 0U; j < numCols; j++)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            *pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
-            pOutT1++;
-          }
+           MAS_ROW_F32(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F32(0     ,pDst,rowNb,pivot,pDst,pivotRow);
 
-        }
 
-        /* Increment temporary input pointer */
-        pInT1 = pInT1 + l;
       }
 
-      /* Increment the input pointer */
-      pIn++;
+      for (rowNb = pivotRow + 1; rowNb < numRows; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
 
-      /* Decrement the loop counter */
-      loopCnt--;
+           MAS_ROW_F32(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F32(0     ,pDst,rowNb,pivot,pDst,pivotRow);
 
-      /* Increment the index modifier */
-      l++;
-    }
+      }
 
-#endif /* #if defined (ARM_MATH_DSP) */
+    }
 
     /* Set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
 
-    if ((flag != 1U) && (in == 0.0f))
+    if ((flag != 1U) && (pivot == 0.0f))
     {
       pIn = pSrc->pData;
       for (i = 0; i < numRows * numCols; i++)
@@ -1562,9 +279,6 @@ arm_status arm_mat_inverse_f32(
   /* Return to application */
   return (status);
 }
-#endif /* #if defined(ARM_MATH_NEON) */
-#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
-
 /**
   @} end of MatrixInv group
  */
diff --git a/Source/MatrixFunctions/arm_mat_inverse_f64.c b/Source/MatrixFunctions/arm_mat_inverse_f64.c
index bf99a792..f41356f2 100644
--- a/Source/MatrixFunctions/arm_mat_inverse_f64.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f64.c
@@ -27,6 +27,7 @@
  */
 
 #include "dsp/matrix_functions.h"
+#include "dsp/matrix_utils.h"
 
 /**
   @ingroup groupMatrix
@@ -54,16 +55,14 @@ arm_status arm_mat_inverse_f64(
 {
   float64_t *pIn = pSrc->pData;                  /* input data matrix pointer */
   float64_t *pOut = pDst->pData;                 /* output data matrix pointer */
-  float64_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
-  float64_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
-  float64_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
+  
+  float64_t *pTmp;
   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
 
-#if defined (ARM_MATH_DSP)
 
-  float64_t Xchg, in = 0.0, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
+  float64_t pivot = 0.0, newPivot=0.0;                /* Temporary input values  */
+  uint32_t selectedRow,pivotRow,i, rowNb, rowCnt, flag = 0U, j,column;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -81,7 +80,6 @@ arm_status arm_mat_inverse_f64(
 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
 
   {
-
     /*--------------------------------------------------------------------------------------------------------------
      * Matrix Inverse can be solved using elementary row operations.
      *
@@ -118,7 +116,7 @@ arm_status arm_mat_inverse_f64(
      *----------------------------------------------------------------------------------------------------------------*/
 
     /* Working pointer for destination matrix */
-    pOutT1 = pOut;
+    pTmp = pOut;
 
     /* Loop over the number of rows */
     rowCnt = numRows;
@@ -130,18 +128,18 @@ arm_status arm_mat_inverse_f64(
       j = numRows - rowCnt;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0;
+        *pTmp++ = 0.0;
         j--;
       }
 
       /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0;
+      *pTmp++ = 1.0;
 
       /* Writing all zeroes in upper triangle of the destination matrix */
       j = rowCnt - 1U;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0;
+        *pTmp++ = 0.0;
         j--;
       }
 
@@ -151,477 +149,99 @@ arm_status arm_mat_inverse_f64(
 
     /* Loop over the number of columns of the input matrix.
        All the elements in each column are processed by the row operations */
-    loopCnt = numCols;
 
     /* Index modifier to navigate through the columns */
-    l = 0U;
-
-    while (loopCnt > 0U)
+    for(column = 0U; column < numCols; column++)
     {
       /* Check if the pivot element is zero..
        * If it is zero then interchange the row with non zero row below.
        * If there is no non zero element to replace in the rows below,
        * then the matrix is Singular. */
 
-      /* Working pointer for the input matrix that points
-       * to the pivot element of the particular row  */
-      pInT1 = pIn + (l * numCols);
-
-      /* Working pointer for the destination matrix that points
-       * to the pivot element of the particular row  */
-      pOutT1 = pOut + (l * numCols);
+      pivotRow = column;
 
       /* Temporary variable to hold the pivot value */
-      in = *pInT1;
+      pTmp = ELEM(pSrc,column,column) ;
+      pivot = *pTmp;
+      selectedRow = column;
 
-    
-
-      /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0)
-      {
+      
         /* Loop over the number rows present below */
 
-        for (i = 1U; i < numRows - l; i++)
-        {
-          /* Update the input and destination pointers */
-          pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * i);
-
-          /* Check if there is a non zero pivot element to
-           * replace in the rows below */
-          if (*pInT2 != 0.0)
-          {
-            /* Loop over number of columns
-             * to the right of the pilot element */
-            j = numCols - l;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the input matrix */
-              Xchg = *pInT2;
-              *pInT2++ = *pInT1;
-              *pInT1++ = Xchg;
-
-              /* Decrement the loop counter */
-              j--;
-            }
-
-            /* Loop over number of columns of the destination matrix */
-            j = numCols;
-
-            while (j > 0U)
-            {
-              /* Exchange the row elements of the destination matrix */
-              Xchg = *pOutT2;
-              *pOutT2++ = *pOutT1;
-              *pOutT1++ = Xchg;
-
-              /* Decrement loop counter */
-              j--;
-            }
-
-            /* Flag to indicate whether exchange is done or not */
-            flag = 1U;
-
-            /* Break after exchange is done */
-            break;
-          }
-
-
-          /* Decrement loop counter */
-        }
-      }
-
-      /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0))
-      {
-        return ARM_MATH_SINGULAR;
-      }
-
-      /* Points to the pivot row of input and destination matrices */
-      pPivotRowIn = pIn + (l * numCols);
-      pPivotRowDst = pOut + (l * numCols);
-
-      /* Temporary pointers to the pivot row pointers */
-      pInT1 = pPivotRowIn;
-      pInT2 = pPivotRowDst;
-
-      /* Pivot element of the row */
-      in = *pPivotRowIn;
-
-      /* Loop over number of columns
-       * to the right of the pilot element */
-      j = (numCols - l);
-
-      while (j > 0U)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        in1 = *pInT1;
-        *pInT1++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Loop over number of columns of the destination matrix */
-      j = numCols;
-
-      while (j > 0U)
+      for (rowNb = column+1; rowNb < numRows; rowNb++)
       {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        in1 = *pInT2;
-        *pInT2++ = in1 / in;
-
-        /* Decrement the loop counter */
-        j--;
-      }
-
-      /* Replace the rows with the sum of that row and a multiple of row i
-       * so that each new element in column i above row i is zero.*/
-
-      /* Temporary pointers for input and destination matrices */
-      pInT1 = pIn;
-      pInT2 = pOut;
-
-      /* index used to check for pivot element */
-      i = 0U;
-
-      /* Loop over number of rows */
-      /*  to be replaced by the sum of that row and a multiple of row i */
-      k = numRows;
-
-      while (k > 0U)
-      {
-        /* Check for the pivot element */
-        if (i == l)
-        {
-          /* If the processing element is the pivot element,
-             only the columns to the right are to be processed */
-          pInT1 += numCols - l;
-
-          pInT2 += numCols;
-        }
-        else
-        {
-          /* Element of the reference row */
-          in = *pInT1;
-
-          /* Working pointers for input and destination pivot rows */
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-
-          /* Loop over the number of columns to the right of the pivot element,
-             to replace the elements in the input matrix */
-          j = (numCols - l);
-
-          while (j > 0U)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT1;
-            *pInT1++ = in1 - (in * *pPRT_in++);
-
-            /* Decrement the loop counter */
-            j--;
-          }
-
-          /* Loop over the number of columns to
-             replace the elements in the destination matrix */
-          j = numCols;
-
-          while (j > 0U)
+          /* Update the input and destination pointers */
+          pTmp = ELEM(pSrc,rowNb,column);
+          newPivot = *pTmp;
+          if (fabs(newPivot) > fabs(pivot))
           {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            in1 = *pInT2;
-            *pInT2++ = in1 - (in * *pPRT_pDst++);
-
-            /* Decrement loop counter */
-            j--;
+            selectedRow = rowNb; 
+            pivot = newPivot;
           }
-
-        }
-
-        /* Increment temporary input pointer */
-        pInT1 = pInT1 + l;
-
-        /* Decrement loop counter */
-        k--;
-
-        /* Increment pivot index */
-        i++;
-      }
-
-      /* Increment the input pointer */
-      pIn++;
-
-      /* Decrement the loop counter */
-      loopCnt--;
-
-      /* Increment the index modifier */
-      l++;
-    }
-
-
-#else
-
-  float64_t Xchg, in = 0.0;                     /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;      /* loop counters */
-  arm_status status;                             /* status of matrix inverse */
-
-#ifdef ARM_MATH_MATRIX_CHECK
-
-  /* Check for matrix mismatch condition */
-  if ((pSrc->numRows != pSrc->numCols) ||
-      (pDst->numRows != pDst->numCols) ||
-      (pSrc->numRows != pDst->numRows)   )
-  {
-    /* Set status as ARM_MATH_SIZE_MISMATCH */
-    status = ARM_MATH_SIZE_MISMATCH;
-  }
-  else
-
-#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
-
-  {
-
-    /*--------------------------------------------------------------------------------------------------------------
-     * Matrix Inverse can be solved using elementary row operations.
-     *
-     *  Gauss-Jordan Method:
-     *
-     *      1. First combine the identity matrix and the input matrix separated by a bar to form an
-     *        augmented matrix as follows:
-     *                      _  _          _     _      _   _         _         _
-     *                     |  |  a11  a12  | | | 1   0  |   |       |  X11 X12  |
-     *                     |  |            | | |        |   |   =   |           |
-     *                     |_ |_ a21  a22 _| | |_0   1 _|  _|       |_ X21 X21 _|
-     *
-     *      2. In our implementation, pDst Matrix is used as identity matrix.
-     *
-     *      3. Begin with the first row. Let i = 1.
-     *
-     *      4. Check to see if the pivot for row i is zero.
-     *         The pivot is the element of the main diagonal that is on the current row.
-     *         For instance, if working with row i, then the pivot element is aii.
-     *         If the pivot is zero, exchange that row with a row below it that does not
-     *         contain a zero in column i. If this is not possible, then an inverse
-     *         to that matrix does not exist.
-     *
-     *      5. Divide every element of row i by the pivot.
-     *
-     *      6. For every row below and  row i, replace that row with the sum of that row and
-     *         a multiple of row i so that each new element in column i below row i is zero.
-     *
-     *      7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
-     *         for every element below and above the main diagonal.
-     *
-     *      8. Now an identical matrix is formed to the left of the bar(input matrix, src).
-     *         Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
-     *----------------------------------------------------------------------------------------------------------------*/
-
-    /* Working pointer for destination matrix */
-    pOutT1 = pOut;
-
-    /* Loop over the number of rows */
-    rowCnt = numRows;
-
-    /* Making the destination matrix as identity matrix */
-    while (rowCnt > 0U)
-    {
-      /* Writing all zeroes in lower triangle of the destination matrix */
-      j = numRows - rowCnt;
-      while (j > 0U)
-      {
-        *pOutT1++ = 0.0;
-        j--;
       }
 
-      /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0;
-
-      /* Writing all zeroes in upper triangle of the destination matrix */
-      j = rowCnt - 1U;
-      while (j > 0U)
-      {
-        *pOutT1++ = 0.0;
-        j--;
-      }
-
-      /* Decrement loop counter */
-      rowCnt--;
-    }
-
-    /* Loop over the number of columns of the input matrix.
-       All the elements in each column are processed by the row operations */
-    loopCnt = numCols;
-
-    /* Index modifier to navigate through the columns */
-    l = 0U;
-
-    while (loopCnt > 0U)
-    {
-      /* Check if the pivot element is zero..
-       * If it is zero then interchange the row with non zero row below.
-       * If there is no non zero element to replace in the rows below,
-       * then the matrix is Singular. */
-
-      /* Working pointer for the input matrix that points
-       * to the pivot element of the particular row  */
-      pInT1 = pIn + (l * numCols);
-
-      /* Working pointer for the destination matrix that points
-       * to the pivot element of the particular row  */
-      pOutT1 = pOut + (l * numCols);
-
-      /* Temporary variable to hold the pivot value */
-      in = *pInT1;
-
-      /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0)
-      {
-        /* Loop over the number rows present below */
-        for (i = 1U; i < numRows-l; i++)
-        {
-          /* Update the input and destination pointers */
-          pInT2 = pInT1 + (numCols * i);
-          pOutT2 = pOutT1 + (numCols * i);
-
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
-          if (*pInT2 != 0.0)
-          {
+      if ((pivot != 0.0) && (selectedRow != column))
+      {
             /* Loop over number of columns
              * to the right of the pilot element */
-            for (j = 0U; j < (numCols - l); j++)
-            {
-              /* Exchange the row elements of the input matrix */
-              Xchg = *pInT2;
-              *pInT2++ = *pInT1;
-              *pInT1++ = Xchg;
-            }
-
-            for (j = 0U; j < numCols; j++)
-            {
-              Xchg = *pOutT2;
-              *pOutT2++ = *pOutT1;
-              *pOutT1++ = Xchg;
-            }
 
+            SWAP_ROWS_F64(pSrc,column, pivotRow,selectedRow);
+            SWAP_ROWS_F64(pDst,0, pivotRow,selectedRow);
+
+    
             /* Flag to indicate whether exchange is done or not */
             flag = 1U;
 
-            /* Break after exchange is done */
-            break;
-          }
-        }
       }
 
 
       /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0))
+      if ((flag != 1U) && (pivot == 0.0))
       {
         return ARM_MATH_SINGULAR;
       }
 
-      /* Points to the pivot row of input and destination matrices */
-      pPivotRowIn = pIn + (l * numCols);
-      pPivotRowDst = pOut + (l * numCols);
-
-      /* Temporary pointers to the pivot row pointers */
-      pInT1 = pPivotRowIn;
-      pOutT1 = pPivotRowDst;
-
+     
       /* Pivot element of the row */
-      in = *(pIn + (l * numCols));
+      pivot = 1.0 / pivot;
 
-      /* Loop over number of columns
-       * to the right of the pilot element */
-      for (j = 0U; j < (numCols - l); j++)
-      {
-        /* Divide each element of the row of the input matrix
-         * by the pivot element */
-        *pInT1 = *pInT1 / in;
-        pInT1++;
-      }
-      for (j = 0U; j < numCols; j++)
-      {
-        /* Divide each element of the row of the destination matrix
-         * by the pivot element */
-        *pOutT1 = *pOutT1 / in;
-        pOutT1++;
-      }
+      SCALE_ROW_F64(pSrc,column,pivot,pivotRow);
+      SCALE_ROW_F64(pDst,0,pivot,pivotRow);
 
+      
       /* Replace the rows with the sum of that row and a multiple of row i
        * so that each new element in column i above row i is zero.*/
 
-      /* Temporary pointers for input and destination matrices */
-      pInT1 = pIn;
-      pOutT1 = pOut;
-
-      for (i = 0U; i < numRows; i++)
+      rowNb = 0;
+      for (;rowNb < pivotRow; rowNb++)
       {
-        /* Check for the pivot element */
-        if (i == l)
-        {
-          /* If the processing element is the pivot element,
-             only the columns to the right are to be processed */
-          pInT1 += numCols - l;
-          pOutT1 += numCols;
-        }
-        else
-        {
-          /* Element of the reference row */
-          in = *pInT1;
-
-          /* Working pointers for input and destination pivot rows */
-          pPRT_in = pPivotRowIn;
-          pPRT_pDst = pPivotRowDst;
-
-          /* Loop over the number of columns to the right of the pivot element,
-             to replace the elements in the input matrix */
-          for (j = 0U; j < (numCols - l); j++)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            *pInT1 = *pInT1 - (in * *pPRT_in++);
-            pInT1++;
-          }
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
 
-          /* Loop over the number of columns to
-             replace the elements in the destination matrix */
-          for (j = 0U; j < numCols; j++)
-          {
-            /* Replace the element by the sum of that row
-               and a multiple of the reference row  */
-            *pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
-            pOutT1++;
-          }
+           MAS_ROW_F64(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F64(0     ,pDst,rowNb,pivot,pDst,pivotRow);
 
-        }
 
-        /* Increment temporary input pointer */
-        pInT1 = pInT1 + l;
       }
 
-      /* Increment the input pointer */
-      pIn++;
+      for (rowNb = pivotRow + 1; rowNb < numRows; rowNb++)
+      {
+           pTmp = ELEM(pSrc,rowNb,column) ;
+           pivot = *pTmp;
 
-      /* Decrement the loop counter */
-      loopCnt--;
+           MAS_ROW_F64(column,pSrc,rowNb,pivot,pSrc,pivotRow);
+           MAS_ROW_F64(0     ,pDst,rowNb,pivot,pDst,pivotRow);
 
-      /* Increment the index modifier */
-      l++;
-    }
+      }
 
-#endif /* #if defined (ARM_MATH_DSP) */
+    }
 
     /* Set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
 
-    if ((flag != 1U) && (in == 0.0))
+    if ((flag != 1U) && (pivot == 0.0))
     {
       pIn = pSrc->pData;
       for (i = 0; i < numRows * numCols; i++)
@@ -638,7 +258,6 @@ arm_status arm_mat_inverse_f64(
   /* Return to application */
   return (status);
 }
-
 /**
   @} end of MatrixInv group
  */
diff --git a/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
index 1d3586c6..dae2ee0b 100755
--- a/Source/MatrixFunctions/arm_mat_ldlt_f32.c
+++ b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -27,44 +27,12 @@
  */
 
 #include "dsp/matrix_functions.h"
-
-
+#include "dsp/matrix_utils.h"
 
 
 
 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
-
-/// @private
-#define SWAP_ROWS_F32(A,i,j)                 \
-  {                                      \
-    int cnt = n;                         \
-                                         \
-    for(int w=0;w < n; w+=4)             \
-    {                                    \
-       f32x4_t tmpa,tmpb;                \
-       mve_pred16_t p0 = vctp32q(cnt);   \
-                                         \
-       tmpa=vldrwq_z_f32(&A[i*n + w],p0);\
-       tmpb=vldrwq_z_f32(&A[j*n + w],p0);\
-                                         \
-       vstrwq_p(&A[i*n + w], tmpb, p0);  \
-       vstrwq_p(&A[j*n + w], tmpa, p0);  \
-                                         \
-       cnt -= 4;                         \
-    }                                    \
-  }
-
-/// @private
-#define SWAP_COLS_F32(A,i,j)     \
-  for(int w=0;w < n; w++)    \
-  {                          \
-     float32_t tmp;          \
-     tmp = A[w*n + i];       \
-     A[w*n + i] = A[w*n + j];\
-     A[w*n + j] = tmp;       \
-  }
-
 /**
   @ingroup groupMatrix
  */
@@ -157,8 +125,8 @@ arm_status arm_mat_ldlt_f32(
 
         if(j != k)
         {
-          SWAP_ROWS_F32(pA,k,j);
-          SWAP_COLS_F32(pA,k,j);
+          SWAP_ROWS_F32(pl,0,k,j);
+          SWAP_COLS_F32(pl,0,k,j);
         }
 
 
@@ -323,25 +291,6 @@ arm_status arm_mat_ldlt_f32(
 }
 #else
 
-/// @private
-#define SWAP_ROWS_F32(A,i,j)     \
-  for(w=0;w < n; w++)    \
-  {                          \
-     float32_t tmp;          \
-     tmp = A[i*n + w];       \
-     A[i*n + w] = A[j*n + w];\
-     A[j*n + w] = tmp;       \
-  }
-
-/// @private
-#define SWAP_COLS_F32(A,i,j)     \
-  for(w=0;w < n; w++)    \
-  {                          \
-     float32_t tmp;          \
-     tmp = A[w*n + i];       \
-     A[w*n + i] = A[w*n + j];\
-     A[w*n + j] = tmp;       \
-  }
 
 /**
   @ingroup groupMatrix
@@ -429,8 +378,8 @@ arm_status arm_mat_ldlt_f32(
 
         if(j != k)
         {
-          SWAP_ROWS_F32(pA,k,j);
-          SWAP_COLS_F32(pA,k,j);
+          SWAP_ROWS_F32(pl,0,k,j);
+          SWAP_COLS_F32(pl,0,k,j);
         }
 
 
diff --git a/Source/MatrixFunctions/arm_mat_ldlt_f64.c b/Source/MatrixFunctions/arm_mat_ldlt_f64.c
index 8f7331dc..46432718 100755
--- a/Source/MatrixFunctions/arm_mat_ldlt_f64.c
+++ b/Source/MatrixFunctions/arm_mat_ldlt_f64.c
@@ -27,35 +27,10 @@
  */
 
 #include "dsp/matrix_functions.h"
-#include <math.h>
-
-
+#include "dsp/matrix_utils.h"
 
-/// @private
-#define SWAP_ROWS_F64(A,i,j) \
-{                            \
-  int w;                     \
-  for(w=0;w < n; w++)        \
-  {                          \
-     float64_t tmp;          \
-     tmp = A[i*n + w];       \
-     A[i*n + w] = A[j*n + w];\
-     A[j*n + w] = tmp;       \
-  }                          \
-}
+#include <math.h>
 
-/// @private
-#define SWAP_COLS_F64(A,i,j) \
-{                            \
-  int w;                     \
-  for(w=0;w < n; w++)        \
-  {                          \
-     float64_t tmp;          \
-     tmp = A[w*n + i];       \
-     A[w*n + i] = A[w*n + j];\
-     A[w*n + j] = tmp;       \
-  }                          \
-}
 
 /**
   @ingroup groupMatrix
@@ -141,8 +116,8 @@ arm_status arm_mat_ldlt_f64(
 
         if(j != k)
         {
-          SWAP_ROWS_F64(pA,k,j);
-          SWAP_COLS_F64(pA,k,j);
+          SWAP_ROWS_F64(pl,0,k,j);
+          SWAP_COLS_F64(pl,0,k,j);
         }
 
 
diff --git a/Source/MatrixFunctions/arm_mat_vec_mult_f32.c b/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
index b112d349..a7c7deff 100755
--- a/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
@@ -165,7 +165,7 @@ void arm_mat_vec_mult_f32(
     }
 
     /*
-     * compute 2 rows in parrallel
+     * compute 2 rows in parallel
      */
     if (row >= 2)
     {
diff --git a/Testing/Source/Tests/UnaryTestsF16.cpp b/Testing/Source/Tests/UnaryTestsF16.cpp
index 34b36730..406b67d0 100755
--- a/Testing/Source/Tests/UnaryTestsF16.cpp
+++ b/Testing/Source/Tests/UnaryTestsF16.cpp
@@ -22,9 +22,9 @@ Comparisons for inverse
 /* Not very accurate for big matrix.
 But big matrix needed for checking the vectorized code */
 
-#define SNR_THRESHOLD_INV 45
-#define REL_ERROR_INV (3.0e-2)
-#define ABS_ERROR_INV (3.0e-2)
+#define SNR_THRESHOLD_INV 52
+#define REL_ERROR_INV (3.0e-3)
+#define ABS_ERROR_INV (2.0e-2)
 
 #define REL_ERROR_SOLVE (6.0e-2)
 #define ABS_ERROR_SOLVE (2.0e-2)
diff --git a/Testing/Source/Tests/UnaryTestsF32.cpp b/Testing/Source/Tests/UnaryTestsF32.cpp
index 8aee65c1..f66bba28 100755
--- a/Testing/Source/Tests/UnaryTestsF32.cpp
+++ b/Testing/Source/Tests/UnaryTestsF32.cpp
@@ -22,9 +22,9 @@ Comparisons for inverse
 /* Not very accurate for big matrix.
 But big matrix needed for checking the vectorized code */
 
-#define SNR_THRESHOLD_INV 67
-#define REL_ERROR_INV (1.0e-3)
-#define ABS_ERROR_INV (1.0e-3)
+#define SNR_THRESHOLD_INV 100
+#define REL_ERROR_INV (3.0e-5)
+#define ABS_ERROR_INV (1.0e-5)
 
 /*