Added functions optimised for 64 bit float

4 years ago · e5753198af
parent a973e9ed37
commit e5753198af
22 changed files with 1822 additions and 268 deletions
--- a/Source/BasicMathFunctions/arm_dot_prod_f64.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_dot_prod_f64.c
 * Description:  Floating-point dot product
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        03 June 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -45,7 +45,62 @@
  @param[out]    result     output result returned here.
  @return        none
 */
+#if defined(ARM_MATH_NEON)
+void arm_dot_prod_f64(
+    const float64_t * pSrcA,
+    const float64_t * pSrcB,
+        uint32_t blockSize,
+        float64_t * result)
+    {
+    uint32_t blkCnt;                               /* Loop counter */
+    float64_t sum = 0.;                            /* Temporary return variable */
+    float64x2_t sumV ;                              /* Neon buffer for sum variable */
+    
+    /* Neon Buffer Initialisation */
+    sumV = vsetq_lane_f64(0.0f, sumV, 0);
+    sumV = vsetq_lane_f64(0.0f, sumV, 1);
+    
+    /* Neon Buffer for sources */
+    float64x2_t pSrcAV;
+    float64x2_t pSrcBV;
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize >> 1U;
+
+    while (blkCnt > 0U)
+    {
+    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+        
+    /* Load source value in Neon Buffer */
+      pSrcAV = vld1q_f64(pSrcA);
+      pSrcBV = vld1q_f64(pSrcB);
+    /* Calculate dot product and store result in a temporary buffer. */
+      sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV);
+      
+      pSrcA+=2;
+      pSrcB+=2;
+    /* Decrement loop counter */
+      blkCnt--;
+    }
+    /* Sum both 64 bits part in the float64x2 */
+    sum = vaddvq_f64(sumV);
+
+    
+    /* Tail */
+    blkCnt = blockSize & 1 ;
+
+    while(blkCnt > 0U)
+    {
+        sum += (*pSrcA++) * (*pSrcB++);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }

+    /* Store result in destination buffer */
+    *result = sum;
+    }
+#else
 void arm_dot_prod_f64(
  const float64_t * pSrcA,
  const float64_t * pSrcB,
@ -72,6 +127,7 @@ void arm_dot_prod_f64(
  /* Store result in destination buffer */
  *result = sum;
 }
+#endif

 /**
  @} end of BasicDotProd group
--- a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
+++ b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
@ -4,8 +4,8 @@
 * Title:        arm_chebyshev_distance_f64.c
 * Description:  Chebyshev distance between two vectors
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -48,15 +48,37 @@
 */
 float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
 {
-   float64_t diff=0.,  maxVal,tmpA, tmpB;

-   tmpA = *pA++;
-   tmpB = *pB++;
-   diff = fabs(tmpA - tmpB);
-   maxVal = diff;
-   blockSize--;
+   float64_t diff=0.,  maxVal,tmpA, tmpB;
+  uint32_t blkCnt;
+  maxVal = F64_MIN;
+#if defined(ARM_MATH_NEON)
+  float64x2_t diffV , tmpAV , tmpBV , maxValV ;
+  maxValV = vdupq_n_f64(maxVal);
+  blkCnt = blockSize >> 1U ;
+  while(blkCnt > 0U)
+  {
+    tmpAV = vld1q_f64(pA);
+    tmpBV = vld1q_f64(pB);
+    diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV)));
+    maxValV = vmaxq_f64(maxValV, diffV);
+    pA+=2;
+    pB+=2;
+    blkCnt--;
+  }
+  maxVal =vgetq_lane_f64(maxValV, 0);
+  if(maxVal < vgetq_lane_f64(maxValV, 1))
+  {
+      maxVal = vgetq_lane_f64(maxValV, 1);
+  }
+  blkCnt = blockSize & 1;
+  
+  
+#else
+  blkCnt = blockSize;
+#endif

-   while(blockSize > 0)
+   while(blkCnt > 0)
   {
      tmpA = *pA++;
      tmpB = *pB++;
@ -65,7 +87,7 @@ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, ui
      {
        maxVal = diff;
      }
-      blockSize --;
+      blkCnt --;
   }
  
   return(maxVal);
--- a/Source/DistanceFunctions/arm_cityblock_distance_f64.c
+++ b/Source/DistanceFunctions/arm_cityblock_distance_f64.c
@ -4,8 +4,8 @@
 * Title:        arm_cityblock_distance_f64.c
 * Description:  Cityblock (Manhattan) distance between two vectors
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -48,15 +48,37 @@
 float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
 {
   float64_t accum,tmpA, tmpB;
-
+  uint32_t blkCnt;
   accum = 0.;
-   while(blockSize > 0)
+  
+#if defined(ARM_MATH_NEON)
+  float64x2_t tmpAV, tmpBV,accumV , subV;
+  accumV = vdupq_n_f64(0.0f);
+  blkCnt = blockSize >> 1U;
+  
+  while(blkCnt > 0U)
+  {
+    tmpAV = vld1q_f64(pA);
+    tmpBV = vld1q_f64(pB);
+    subV = vabdq_f64(tmpAV, tmpBV);
+    accumV = vaddq_f64(accumV, subV);
+    pA+=2;
+    pB+=2;
+    blkCnt--;
+  }
+  accum = vaddvq_f64(accumV);
+  blkCnt = blockSize & 1 ; 
+  
+#else
+  blkCnt = blockSize;
+#endif
+   while(blkCnt > 0)
   {
      tmpA = *pA++;
      tmpB = *pB++;
      accum  += fabs(tmpA - tmpB);
      
-      blockSize --;
+      blkCnt--;
   }
  
   return(accum);
--- a/Source/DistanceFunctions/arm_euclidean_distance_f64.c
+++ b/Source/DistanceFunctions/arm_euclidean_distance_f64.c
@ -4,8 +4,8 @@
 * Title:        arm_euclidean_distance_f64.c
 * Description:  Euclidean distance between two vectors
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -50,12 +50,31 @@
 float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
 {
   float64_t accum=0.,tmp;
-
-   while(blockSize > 0)
+  uint32_t blkCnt;
+#if defined(ARM_MATH_NEON)
+  float64x2_t accumV,tmpV , pAV ,pBV;
+  accumV = vdupq_n_f64(0.0f);
+  blkCnt = blockSize >> 1U;
+  while(blkCnt > 0U)
+  {
+    pAV = vld1q_f64(pA);
+    pBV = vld1q_f64(pB);
+    tmpV = vsubq_f64(pAV, pBV);
+    accumV = vmlaq_f64(accumV, tmpV, tmpV);
+    pA+=2;
+    pB+=2;
+    blkCnt--;
+  }
+  accum = vaddvq_f64(accumV);
+  blkCnt = blockSize & 1;
+#else
+  blkCnt = blockSize;
+#endif
+   while(blkCnt > 0)
   {
      tmp = *pA++ - *pB++;
      accum += SQ(tmp);
-      blockSize --;
+      blkCnt --;
   }
   tmp = sqrt(accum);
   return(tmp);
--- a/Source/FastMathFunctions/arm_vexp_f64.c
+++ b/Source/FastMathFunctions/arm_vexp_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_vlog_f64.c
 * Description:  Fast vectorized log
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -28,31 +28,44 @@

 #include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"
+#if defined(ARM_MATH_NEON)
+#include "arm_vec_math.h"
+#endif

-/**
-  @addtogroup vexp
-  @{
- */
-
-/**
-  @brief         Floating-point vector of exp values.
-  @param[in]     pSrc       points to the input vector
-  @param[out]    pDst       points to the output vector
-  @param[in]     blockSize  number of samples in each vector
-  @return        none
- */
 void arm_vexp_f64(
  const float64_t * pSrc,
        float64_t * pDst,
        uint32_t blockSize)
 {
-   uint32_t blkCnt; 
+   uint32_t blkCnt;
+#if defined(ARM_MATH_NEON)
+ 
+  
+  float64x2_t src;
+  float64x2_t dst;

-   blkCnt = blockSize;
+  blkCnt = blockSize >> 1U;

+  while (blkCnt > 0U)
+  {
+     src = vld1q_f64(pSrc);
+     dst = vexpq_f64(src);
+     vst1q_f64(pDst, dst);
+
+     pSrc += 2;
+     pDst += 2;
+
+     blkCnt--;
+  }
+  
+  blkCnt = blockSize & 1;
+#else
+   blkCnt = blockSize;
+#endif
   while (blkCnt > 0U)
   {
      /* C = log(A) */
+     
  
      /* Calculate log and store result in destination buffer. */
      *pDst++ = exp(*pSrc++);
@ -61,7 +74,3 @@ void arm_vexp_f64(
      blkCnt--;
   }
 }
-
-/**
-  @} end of vexp group
- */
--- a/Source/FastMathFunctions/arm_vlog_f64.c
+++ b/Source/FastMathFunctions/arm_vlog_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_vlog_f64.c
 * Description:  Fast vectorized log
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -29,14 +29,39 @@
 #include "dsp/fast_math_functions.h"
 #include "arm_common_tables.h"

+#if defined(ARM_MATH_NEON)
+#include "arm_vec_math.h"
+#endif
+
 void arm_vlog_f64(
  const float64_t * pSrc,
        float64_t * pDst,
        uint32_t blockSize)
 {
   uint32_t blkCnt; 
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+   float64x2_t src;
+   float64x2_t dst;
+
+   blkCnt = blockSize >> 1U;
+
+   while (blkCnt > 0U)
+   {
+      src = vld1q_f64(pSrc);
+      dst = vlogq_f64(src);
+      vst1q_f64(pDst, dst);

+      pSrc += 2;
+      pDst += 2;
+      /* Decrement loop counter */
+      blkCnt--;
+   }
+
+   blkCnt = blockSize & 1;
+#else
   blkCnt = blockSize;
+#endif
+

   while (blkCnt > 0U)
   {
--- a/Source/FilteringFunctions/arm_correlate_f64.c
+++ b/Source/FilteringFunctions/arm_correlate_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_correlate_f64.c
 * Description:  Correlation of floating-point sequences
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -65,6 +65,9 @@ void arm_correlate_f64(
        uint32_t j, k, count, blkCnt;                  /* Loop counters */
        uint32_t outBlockSize;                         /* Loop counter */
        int32_t inc = 1;                               /* Destination address modifier */
+#if defined(ARM_MATH_NEON)
+  float64x2_t sumV,pxV,pyV ;
+#endif

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
@ -164,10 +167,25 @@ void arm_correlate_f64(
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.;
-
+#if defined(ARM_MATH_NEON)
+    sumV = vdupq_n_f64(0.0f);
+    k = count >> 1U ;
+    
+    while(k > 0U)
+    {
+      pxV = vld1q_f64(px);
+      pyV = vld1q_f64(py);
+      sumV = vmlaq_f64(sumV, pxV, pyV);
+      px+=2;
+      py+=2;
+      k--;
+    }
+    sum =vaddvq_f64(sumV);
+    k = count & 1 ;
+#else
    /* Initialize k with number of samples */
    k = count;
-
+#endif
    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
@ -179,6 +197,7 @@ void arm_correlate_f64(
    }

    /* Store the result in the accumulator in the destination buffer. */
+
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;
@ -192,6 +211,7 @@ void arm_correlate_f64(

    /* Decrement loop counter */
    blockSize1--;
+
  }

  /* --------------------------
@ -229,10 +249,26 @@ void arm_correlate_f64(
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.;
+#if defined(ARM_MATH_NEON)
+      sumV = vdupq_n_f64(0.0f);
+      k = srcBLen >> 1U ;
+      while(k > 0U)
+      {
+        pxV = vld1q_f64(px);
+        pyV = vld1q_f64(py);
+        sumV = vmlaq_f64(sumV, pxV, pyV);
+        px+=2;
+        py+=2;
+        k--;
+      }
+      sum =vaddvq_f64(sumV);
+      k = srcBLen & 1 ;
+      
+#else

      /* Initialize blkCnt with number of samples */
      k = srcBLen;
-
+#endif
      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
@ -269,10 +305,26 @@ void arm_correlate_f64(
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.;
+#if defined(ARM_MATH_NEON)
+      sumV = vdupq_n_f64(0.0f);
+      k = srcBLen >> 1U ;
+      while(k > 0U)
+      {
+        pxV = vld1q_f64(px);
+        pyV = vld1q_f64(py);
+        sumV = vmlaq_f64(sumV, pxV, pyV);
+        px+=2;
+        py+=2;
+        k--;
+      }
+      sum =vaddvq_f64(sumV);
+      k = srcBLen & 1 ;
+      
+#else

      /* Loop over srcBLen */
      k = srcBLen;
-
+#endif
      while (k > 0U)
      {
        /* Perform the multiply-accumulate */
@ -330,10 +382,26 @@ void arm_correlate_f64(
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.;
+#if defined(ARM_MATH_NEON)
+    sumV = vdupq_n_f64(0.0f);
+    k = count >> 1U ;
+    
+    while(k > 0U)
+    {
+      pxV = vld1q_f64(px);
+      pyV = vld1q_f64(py);
+      sumV = vmlaq_f64(sumV, pxV, pyV);
+      px+=2;
+      py+=2;
+      k--;
+    }
+    sum =vaddvq_f64(sumV);
+    k = count & 1 ;
+#else

    /* Initialize blkCnt with number of samples */
    k = count;
-
+#endif
    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
--- a/Source/FilteringFunctions/arm_fir_f64.c
+++ b/Source/FilteringFunctions/arm_fir_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_fir_f64.c
 * Description:  Floating-point FIR filter processing function
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        03 June 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -45,7 +45,103 @@
  @param[in]     blockSize  number of samples to process
  @return        none
 */
+#if defined(ARM_MATH_NEON)
+void arm_fir_f64(
+  const arm_fir_instance_f64 * S,
+  const float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize)
+{
+        float64_t *pState = S->pState;                 /* State pointer */
+  const float64_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float64_t *pStateCurnt;                        /* Points to the current sample of the state */
+        float64_t *px;                                 /* Temporary pointer for state buffer */
+  const float64_t *pb;                                 /* Temporary pointer for coefficient buffer */
+    float64x2_t pxV;
+    float64x2_t pbV;
+    float64x2_t acc0V;
+        float64_t acc0;                                /* Accumulator */
+        uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+        uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+  /* Initialize blkCnt with number of taps */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy one sample at a time into state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set the accumulator to zero */
+    acc0 = 0.;
+    acc0V = vdupq_n_f64(0.0f);
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize Coefficient pointer */
+    pb = pCoeffs;
+
+    i = numTaps >> 1U;
+
+    /* Perform the multiply-accumulates */
+    while (i > 0U)
+    {
+      /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+        pxV = vld1q_f64(px);
+        pbV = vld1q_f64(pb);
+        acc0V = vmlaq_f64(acc0V, pxV, pbV);
+        px+=2;
+        pb+=2;
+
+      i--;
+    }
+      
+      acc0 = vaddvq_f64(acc0V);
+      i = numTaps%2 ;
+    while(i >0U)
+    {
+        acc0+= *px++ * *pb++ ;
+        i--;
+    }
+
+    /* Store result in destination buffer. */
+
+    *pDst++ = acc0;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState = pState + 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+  /* Initialize tapCnt with number of taps */
+  tapCnt = (numTaps - 1U);
+
+  /* Copy remaining data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+}

+#else
 void arm_fir_f64(
  const arm_fir_instance_f64 * S,
  const float64_t * pSrc,
@ -123,6 +219,7 @@ void arm_fir_f64(
  }

 }
+#endif

 /**
 * @} end of FIR group
--- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mat_cholesky_f64.c
 * Description:  Floating-point Cholesky decomposition
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -52,6 +52,7 @@
   * The decomposition of A is returning a lower triangular matrix L such that A = L L^t
   */

+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)

 arm_status arm_mat_cholesky_f64(
  const arm_matrix_instance_f64 * pSrc,
@ -61,6 +62,163 @@ arm_status arm_mat_cholesky_f64(
  arm_status status;                             /* status of matrix inverse */


+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    int i,j,k;
+    int n = pSrc->numRows;
+    float64_t invSqrtVj;
+    float64_t *pA,*pG;
+    int kCnt;
+
+
+    float64x2_t acc, acc0, acc1, acc2, acc3;
+    float64x2_t vecGi;
+    float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+    float64x2_t tmp = vdupq_n_f64(0.0f);
+
+    float64_t sum=0.0f;
+    float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
+
+
+    pA = pSrc->pData;
+    pG = pDst->pData;
+    
+    for(i=0 ;i < n ; i++)
+    {
+       for(j=i ; j+3 < n ; j+=4)
+       {
+          pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+          pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+          pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+          pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+          acc0 = vdupq_n_f64(0.0f);
+          acc1 = vdupq_n_f64(0.0f);
+          acc2 = vdupq_n_f64(0.0f);
+          acc3 = vdupq_n_f64(0.0f);
+
+          kCnt = i >> 1U;
+          k=0;
+          while(kCnt > 0)
+          {
+
+             vecGi=vld1q_f64(&pG[i * n + k]);
+             
+             vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
+             vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
+             vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
+             vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
+
+             acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
+             acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
+             acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
+             acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
+
+             kCnt--;
+             k+=2;
+          }
+
+
+         sum0 = vaddvq_f64(acc0);
+         sum1 = vaddvq_f64(acc1);
+         sum2 = vaddvq_f64(acc2);
+         sum3 = vaddvq_f64(acc3);
+
+
+          kCnt = i & 1;
+          while(kCnt > 0)
+          {
+
+             sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
+             sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
+             sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
+             sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
+             kCnt--;
+             k++;
+          }
+
+          pG[(j + 0) * n + i] -= sum0;
+          pG[(j + 1) * n + i] -= sum1;
+          pG[(j + 2) * n + i] -= sum2;
+          pG[(j + 3) * n + i] -= sum3;
+       }
+
+       for(; j < n ; j++)
+       {
+          pG[j * n + i] = pA[j * n + i];
+
+          acc = vdupq_n_f64(0.0f);
+
+          kCnt = i >> 1U;
+          k=0;
+          while(kCnt > 0)
+          {
+
+             vecGi=vld1q_f64(&pG[i * n + k]);
+             vecGj=vld1q_f64(&pG[j * n + k]);
+
+             acc = vfmaq_f64(acc, vecGi, vecGj);
+
+             kCnt--;
+             k+=2;
+          }
+
+
+         sum = vaddvq_f64(acc);
+
+          kCnt = i & 1;
+          while(kCnt > 0)
+          {
+             sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
+
+            
+             kCnt--;
+             k++;
+          }
+
+          pG[j * n + i] -= sum;
+       }
+
+       if (pG[i * n + i] <= 0.0f)
+       {
+         return(ARM_MATH_DECOMPOSITION_FAILURE);
+       }
+
+       invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+       SCALE_COL_F64(pDst,i,invSqrtVj,i);
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+#else
+arm_status arm_mat_cholesky_f64(
+  const arm_matrix_instance_f64 * pSrc,
+        arm_matrix_instance_f64 * pDst)
+{
+
+  arm_status status;                             /* status of matrix inverse */
+
+
 #ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition */
@ -115,6 +273,7 @@ arm_status arm_mat_cholesky_f64(
  /* Return to application */
  return (status);
 }
+#endif

 /**
  @} end of MatrixChol group
--- a/Source/MatrixFunctions/arm_mat_mult_f64.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mat_mult_f64.c
 * Description:  Floating-point matrix multiplication
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -27,11 +27,31 @@
 */

 #include "dsp/matrix_functions.h"
+#if defined(ARM_MATH_NEON)
+#define GROUPOFROWS 8
+#endif

 /**
 * @ingroup groupMatrix
 */

+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
+ * in an <code>M x P</code> matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
+ */
+
+
 /**
 * @addtogroup MatrixMult
 * @{
@ -46,7 +66,322 @@
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 */

+#if defined(ARM_MATH_NEON)
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in]       *pSrcA points to the first input matrix structure
+ * @param[in]       *pSrcB points to the second input matrix structure
+ * @param[out]      *pDst points to output matrix structure
+ * @return          The function returns either
+ * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
+ */
+arm_status arm_mat_mult_f64(
+  const arm_matrix_instance_f64 * pSrcA,
+  const arm_matrix_instance_f64 * pSrcB,
+  arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A */
+  float64_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B */
+  float64_t *pInA = pSrcA->pData;                /* input data matrix pointer A  */
+  float64_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float64_t *px;                                 /* Temporary output data matrix pointer */
+  float64_t sum;                                 /* Accumulator */
+  uint32_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
+  uint32_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
+  uint32_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
+
+
+  uint32_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt;      /* loop counters */
+  arm_status status;                             /* status of matrix multiplication */
+
+  float64x2_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
+  float64x2_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
+  float64x2_t accum = vdupq_n_f64(0);
+  float64_t *pIn1B = pSrcA->pData;
+  float64_t *pIn1C = pSrcA->pData;
+  float64_t *pIn1D = pSrcA->pData;
+  float64_t *pIn1E = pSrcA->pData;
+  float64_t *pIn1F = pSrcA->pData;
+  float64_t *pIn1G = pSrcA->pData;
+  float64_t *pIn1H = pSrcA->pData;
+
+  float64_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH;                                 /* Temporary output data matrix pointer */
+  float64_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* Row loop */
+    rowCnt = row >> 3;
+
+    while(rowCnt > 0)
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + GROUPOFROWS*i;
+      pxB = px + numColsB;
+      pxC = px + 2*numColsB;
+      pxD = px + 3*numColsB;
+      pxE = px + 4*numColsB;
+      pxF = px + 5*numColsB;
+      pxG = px + 6*numColsB;
+      pxH = px + 7*numColsB;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* Column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum0 = 0.0f;
+        sum1 = 0.0f;
+        sum2 = 0.0f;
+        sum3 = 0.0f;
+        sum4 = 0.0f;
+        sum5 = 0.0f;
+        sum6 = 0.0f;
+        sum7 = 0.0f;
+
+        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        pIn1 = pInA;
+        pIn1B = pIn1 + numColsA;
+        pIn1C = pIn1 + 2*numColsA;
+        pIn1D = pIn1 + 3*numColsA;
+        pIn1E = pIn1 + 4*numColsA;
+        pIn1F = pIn1 + 5*numColsA;
+        pIn1G = pIn1 + 6*numColsA;
+        pIn1H = pIn1 + 7*numColsA;
+
+        acc0 = vdupq_n_f64(0.0);
+        acc1 = vdupq_n_f64(0.0);
+        acc2 = vdupq_n_f64(0.0);
+        acc3 = vdupq_n_f64(0.0);
+        acc4 = vdupq_n_f64(0.0);
+        acc5 = vdupq_n_f64(0.0);
+        acc6 = vdupq_n_f64(0.0);
+        acc7 = vdupq_n_f64(0.0);
+
+        /* Compute 2 MACs simultaneously. */
+        colCnt = numColsA >> 1U;
+
+        /* Matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          a0V = vld1q_f64(pIn1);
+          a1V = vld1q_f64(pIn1B);
+          a2V = vld1q_f64(pIn1C);
+          a3V = vld1q_f64(pIn1D);
+          a4V = vld1q_f64(pIn1E);
+          a5V = vld1q_f64(pIn1F);
+          a6V = vld1q_f64(pIn1G);
+          a7V = vld1q_f64(pIn1H);
+
+          pIn1 += 2;
+          pIn1B += 2;
+          pIn1C += 2;
+          pIn1D += 2;
+          pIn1E += 2;
+          pIn1F += 2;
+          pIn1G += 2;
+          pIn1H += 2;
+          
+          temp = vsetq_lane_f64(*pIn2,temp,0);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f64(*pIn2,temp,1);
+          pIn2 += numColsB;
+ 
+
+          acc0 = vmlaq_f64(acc0,a0V,temp);
+          acc1 = vmlaq_f64(acc1,a1V,temp);
+          acc2 = vmlaq_f64(acc2,a2V,temp);
+          acc3 = vmlaq_f64(acc3,a3V,temp);
+          acc4 = vmlaq_f64(acc4,a4V,temp);
+          acc5 = vmlaq_f64(acc5,a5V,temp);
+          acc6 = vmlaq_f64(acc6,a6V,temp);
+          acc7 = vmlaq_f64(acc7,a7V,temp);
+
+          /* Decrement the loop count */
+          colCnt--;
+        }
+
+        sum0 += vaddvq_f64(acc0);
+        sum1 += vaddvq_f64(acc1);
+        sum2 += vaddvq_f64(acc2);
+        sum3 += vaddvq_f64(acc3);
+        sum4 += vaddvq_f64(acc4);
+        sum5 += vaddvq_f64(acc5);
+        sum6 += vaddvq_f64(acc6);
+        sum7 += vaddvq_f64(acc7);
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+         ** No loop unrolling is used. */
+        colCnt = numColsA & 1;
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          sum0 += *pIn1++ * (*pIn2);
+          sum1 += *pIn1B++ * (*pIn2);
+          sum2 += *pIn1C++ * (*pIn2);
+          sum3 += *pIn1D++ * (*pIn2);
+          sum4 += *pIn1E++ * (*pIn2);
+          sum5 += *pIn1F++ * (*pIn2);
+          sum6 += *pIn1G++ * (*pIn2);
+          sum7 += *pIn1H++ * (*pIn2);
+          pIn2 += numColsB;
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        /* Store the result in the destination buffer */
+        *px++ = sum0;
+        *pxB++ = sum1;
+        *pxC++ = sum2;
+        *pxD++ = sum3;
+        *pxE++ = sum4;
+        *pxF++ = sum5;
+        *pxG++ = sum6;
+        *pxH++ = sum7;
+
+        /* Update the pointer pIn2 to point to the  starting address of the next column */
+        j++;
+        pIn2 = pSrcB->pData + j;
+
+        /* Decrement the column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+      /* Update the pointer pInA to point to the  starting address of the next row */
+      i = i + numColsB;
+      pInA = pInA + GROUPOFROWS*numColsA;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+    }
+
+    /*
+
+    i was the index of a group of rows computed by previous loop.
+    Now i is the index of a row since below code is computing row per row
+    and no more group of row per group of rows.
+
+    */
+
+    i = GROUPOFROWS*i;
+    rowCnt = row & 7;
+
+    while(rowCnt > 0)
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + i;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* Column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum = 0.0f;
+
+        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        pIn1 = pInA;
+
+        acc0 = vdupq_n_f64(0.0);
+
+        /* Compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 1U;
+
+        /* Matrix multiplication   */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          a0V = vld1q_f64(pIn1);  // load & separate real/imag pSrcA (de-interleave 2)
+          pIn1 += 2;
+          
+          temp = vsetq_lane_f64(*pIn2,temp,0);
+          pIn2 += numColsB;
+          temp = vsetq_lane_f64(*pIn2,temp,1);
+          pIn2 += numColsB;
+
+
+          acc0 = vmlaq_f64(acc0,a0V,temp);
+
+          /* Decrement the loop count */
+          colCnt--;
+        }
+
+        //accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+        sum += vaddvq_f64(acc0);
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+         ** No loop unrolling is used. */
+        colCnt = numColsA % 0x2U;
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          sum += *pIn1++ * (*pIn2);
+          pIn2 += numColsB;
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        /* Store the result in the destination buffer */
+        *px++ = sum;
+
+        /* Update the pointer pIn2 to point to the  starting address of the next column */
+        j++;
+        pIn2 = pSrcB->pData + j;
+
+        /* Decrement the column loop counter */
+        col--;

+      } while (col > 0U);
+
+
+      /* Update the pointer pInA to point to the  starting address of the next row */
+      i = i + numColsB;
+      pInA = pInA + numColsA;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+
+    }
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
 arm_status arm_mat_mult_f64(
  const arm_matrix_instance_f64 * pSrcA,
  const arm_matrix_instance_f64 * pSrcB,
@ -178,7 +513,7 @@ arm_status arm_mat_mult_f64(
  /* Return to application */
  return (status);
 }
-
+#endif

 /**
 * @} end of MatrixMult group
--- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mat_solve_lower_triangular_f64.c
 * Description:  Solve linear system LT X = A with LT lower triangular matrix
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -46,6 +46,109 @@
   * @param[out] dst The solution X of LT . X = A
   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
   */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+  arm_status arm_mat_solve_lower_triangular_f64(
+  const arm_matrix_instance_f64 * lt,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst)
+  {
+  arm_status status;                             /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((lt->numRows != lt->numCols) ||
+      (lt->numRows != a->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* a1 b1 c1   x1 = a1
+          b2 c2   x2   a2
+             c3   x3   a3
+
+    x3 = a3 / c3
+    x2 = (a2 - c2 x3) / b2
+
+    */
+    int i,j,k,n,cols;
+
+    n = dst->numRows;
+    cols = dst->numCols;
+
+    float64_t *pX = dst->pData;
+    float64_t *pLT = lt->pData;
+    float64_t *pA = a->pData;
+
+    float64_t *lt_row;
+    float64_t *a_col;
+
+    float64_t invLT;
+
+    float64x2_t vecA;
+    float64x2_t vecX;
+
+    for(i=0; i < n ; i++)
+    {
+
+      for(j=0; j+1 < cols; j += 2)
+      {
+            vecA = vld1q_f64(&pA[i * cols + j]);
+
+            for(k=0; k < i; k++)
+            {
+                vecX = vld1q_f64(&pX[cols*k+j]);
+                vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
+            }
+
+            if (pLT[n*i + i]==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+
+            invLT = 1.0f / pLT[n*i + i];
+            vecA = vmulq_f64(vecA,vdupq_n_f64(invLT));
+            vst1q_f64(&pX[i*cols+j],vecA);
+
+       }
+
+       for(; j < cols; j ++)
+       {
+            a_col = &pA[j];
+            lt_row = &pLT[n*i];
+
+            float64_t tmp=a_col[i * cols];
+            
+            for(k=0; k < i; k++)
+            {
+                tmp -= lt_row[k] * pX[cols*k+j];
+            }
+
+            if (lt_row[i]==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            tmp = tmp / lt_row[i];
+            pX[i*cols+j] = tmp;
+        }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
  arm_status arm_mat_solve_lower_triangular_f64(
  const arm_matrix_instance_f64 * lt,
  const arm_matrix_instance_f64 * a,
@ -119,6 +222,7 @@
  /* Return to application */
  return (status);
 }
+#endif
 /**
  @} end of MatrixInv group
 */
--- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mat_solve_upper_triangular_f64.c
 * Description:  Solve linear system UT X = A with UT upper triangular matrix
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -46,6 +46,104 @@
   * @param[out] dst The solution X of UT . X = A
   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
  */
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+  arm_status arm_mat_solve_upper_triangular_f64(
+  const arm_matrix_instance_f64 * ut,
+  const arm_matrix_instance_f64 * a,
+  arm_matrix_instance_f64 * dst)
+  {
+arm_status status;                             /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((ut->numRows != ut->numCols) ||
+      (ut->numRows != a->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    int i,j,k,n,cols;
+
+    n = dst->numRows;
+    cols = dst->numCols;
+
+    float64_t *pX = dst->pData;
+    float64_t *pUT = ut->pData;
+    float64_t *pA = a->pData;
+
+    float64_t *ut_row;
+    float64_t *a_col;
+
+    float64_t invUT;
+
+    float64x2_t vecA;
+    float64x2_t vecX;
+    
+    for(i=n-1; i >= 0 ; i--)
+    {
+      for(j=0; j+1 < cols; j +=2)
+      {
+            vecA = vld1q_f64(&pA[i * cols + j]);
+            
+            for(k=n-1; k > i; k--)
+            {
+                vecX = vld1q_f64(&pX[cols*k+j]);
+                vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
+            }
+
+            if (pUT[n*i + i]==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+
+            invUT = 1.0f / pUT[n*i + i];
+            vecA = vmulq_f64(vecA,vdupq_n_f64(invUT));
+           
+
+            vst1q_f64(&pX[i*cols+j],vecA);
+      }
+
+      for(; j < cols; j ++)
+      {
+            a_col = &pA[j];
+
+            ut_row = &pUT[n*i];
+
+            float64_t tmp=a_col[i * cols];
+            
+            for(k=n-1; k > i; k--)
+            {
+                tmp -= ut_row[k] * pX[cols*k+j];
+            }
+
+            if (ut_row[i]==0.0f)
+            {
+              return(ARM_MATH_SINGULAR);
+            }
+            tmp = tmp / ut_row[i];
+            pX[i*cols+j] = tmp;
+       }
+
+    }
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+  
+  /* Return to application */
+  return (status);
+}
+
+#else
  arm_status arm_mat_solve_upper_triangular_f64(
  const arm_matrix_instance_f64 * ut,
  const arm_matrix_instance_f64 * a,
@ -113,7 +211,7 @@ arm_status status;                             /* status of matrix inverse */
  /* Return to application */
  return (status);
 }
-
+#endif

 /**
  @} end of MatrixInv group
--- a/Source/MatrixFunctions/arm_mat_trans_f64.c
+++ b/Source/MatrixFunctions/arm_mat_trans_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mat_trans_f64.c
 * Description:  Floating-point matrix transpose
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -32,7 +32,14 @@
  @ingroup groupMatrix
 */

+/**
+  @defgroup MatrixTrans Matrix Transpose
+
+  Tranposes a matrix.

+  Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
+  \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */

 /**
  @addtogroup MatrixTrans
@ -47,7 +54,126 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
+#if defined(ARM_MATH_NEON)

+arm_status arm_mat_trans_f64(
+  const arm_matrix_instance_f64 * pSrc,
+  arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float64_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float64_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t nRows = pSrc->numRows;                /* number of rows */
+  uint16_t nColumns = pSrc->numCols;             /* number of columns */
+
+  uint16_t blkCnt, rowCnt, i = 0U, row = nRows;          /* loop counters */
+  arm_status status;                             /* status of matrix transpose  */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /*    #ifdef ARM_MATH_MATRIX_CHECK    */
+
+  {
+    /* Matrix transpose by exchanging the rows with columns */
+    /* Row loop */
+    rowCnt = row >> 1;
+    while (rowCnt > 0U)
+    {
+      float64_t *row0,*row1;
+      float64x2x4_t raV;
+
+      blkCnt = nColumns >> 2;
+
+      /* The pointer px is set to starting address of the column being processed */
+      px = pOut + i;
+
+      /* Compute 4 outputs at a time.
+       ** a second loop below computes the remaining 1 to 3 samples. */
+      while (blkCnt > 0U)        /* Column loop */
+      {
+        row0 = pIn;
+        row1 = pIn+nColumns;
+        pIn+=4;
+        raV = vld4q_lane_f64(row0, raV, 0);
+        raV = vld4q_lane_f64(row1, raV, 1);
+        
+        vst1q_f64(px,raV.val[0]);
+        px += nRows;
+
+        vst1q_f64(px,raV.val[1]);
+        px += nRows;
+
+        vst1q_f64(px,raV.val[2]);
+        px += nRows;
+
+        vst1q_f64(px,raV.val[3]);
+        px += nRows;
+
+        /* Decrement the column loop counter */
+        blkCnt--;
+      }
+
+      /* Perform matrix transpose for last 3 samples here. */
+      blkCnt = nColumns % 0x4U;
+
+      while (blkCnt > 0U)
+      {
+        /* Read and store the input element in the destination */
+        *px++ = *pIn;
+        *px++ = *(pIn + 1 * nColumns);
+        
+        px += (nRows - 2);
+        pIn++;
+
+        /* Decrement the column loop counter */
+        blkCnt--;
+      }
+
+      i += 2;
+      pIn += 1 * nColumns;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+
+    }         /* Row loop end  */
+
+    rowCnt = row & 1;
+    while (rowCnt > 0U)
+    {
+      blkCnt = nColumns ;
+      /* The pointer px is set to starting address of the column being processed */
+      px = pOut + i;
+
+      while (blkCnt > 0U)
+      {
+        /* Read and store the input element in the destination */
+        *px = *pIn++;
+
+        /* Update the pointer px to point to the next row of the transposed matrix */
+        px += nRows;
+
+        /* Decrement the column loop counter */
+        blkCnt--;
+      }
+      i++;
+      rowCnt -- ;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
 arm_status arm_mat_trans_f64(
  const arm_matrix_instance_f64 * pSrc,
        arm_matrix_instance_f64 * pDst)
@ -142,7 +268,7 @@ arm_status arm_mat_trans_f64(
  /* Return to application */
  return (status);
 }
-
+#endif
 /**
 * @} end of MatrixTrans group
 */
--- a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_absmax_no_idx_f64.c
 * Description:  Maximum value of absolute values of a floating-point vector
 *
- * $Date:        16 November 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -28,60 +28,128 @@

 #include "dsp/statistics_functions.h"

+
+
 /**
-  @ingroup groupStats
+ @ingroup groupStats
 */

 /**
-  @addtogroup AbsMax
-  @{
+ @addtogroup AbsMax
+ @{
 */

 /**
-  @brief         Maximum value of absolute values of a floating-point vector.
-  @param[in]     pSrc       points to the input vector
-  @param[in]     blockSize  number of samples in input vector
-  @param[out]    pResult    maximum value returned here
-  @return        none
+ @brief         Maximum value of absolute values of a floating-point vector.
+ @param[in]     pSrc       points to the input vector
+ @param[in]     blockSize  number of samples in input vector
+ @param[out]    pResult    maximum value returned here
+ @return        none
 */
+
+#if defined(ARM_MATH_NEON)
 void arm_absmax_no_idx_f64(
-  const float64_t * pSrc,
-        uint32_t blockSize,
-        float64_t * pResult)
+                           const float64_t * pSrc,
+                           uint32_t blockSize,
+                           float64_t * pResult)
 {
-        float64_t maxVal, out;                         /* Temporary variables to store the output value. */
-        uint32_t blkCnt;                     /* Loop counter */
-
-
+    float64_t maxVal , in;                         /* Temporary variables to store the output value. */
+    uint32_t blkCnt;                     /* Loop counter */
+    
+    float64x2_t maxV;
+    float64x2_t pSrcV ;
+    pSrcV = vld1q_f64(pSrc);
+    pSrc += 2 ;
+    maxV = vabsq_f64(pSrcV);

+    
+    
+    
+    /* Load first input value that act as reference value for comparision */

-
-  /* Load first input value that act as reference value for comparision */
-  out = fabs(*pSrc++);
-
-  /* Initialize blkCnt with number of samples */
-  blkCnt = (blockSize - 1U);
-
-  while (blkCnt > 0U)
-  {
-    /* Initialize maxVal to the next consecutive values one by one */
-    maxVal = fabs(*pSrc++);
-
-    /* compare for the maximum value */
-    if (out < maxVal)
+    
+    /* Initialize blkCnt with number of samples */
+    blkCnt = (blockSize - 2U) >> 1U;
+    
+    while (blkCnt > 0U)
    {
-      /* Update the maximum value and it's index */
-      out = maxVal;
+        /* Initialize maxVal to the next consecutive values one by one */
+        pSrcV = vld1q_f64(pSrc);
+        maxV = vmaxq_f64(maxV, vabsq_f64(pSrcV));
+        
+        pSrc += 2 ;
+        
+        /* Decrement loop counter */
+        blkCnt--;
    }
+    maxVal =vgetq_lane_f64(maxV, 0);
+    if(maxVal < vgetq_lane_f64(maxV, 1))
+    {
+        maxVal = vgetq_lane_f64(maxV, 1);
+    }
+    blkCnt = (blockSize - 2U) & 1;
+    
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        in = fabs(*pSrc++);
+        
+        /* compare for the maximum value */
+        if (maxVal < in)
+        {
+            /* Update the maximum value and it's index */
+            maxVal = in;
+        }
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    *pResult = maxVal;
+    
+    
+    /* Store the maximum value and it's index into destination pointers */

-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-  /* Store the maximum value and it's index into destination pointers */
-  *pResult = out;
 }
+#else
+void arm_absmax_no_idx_f64(
+                           const float64_t * pSrc,
+                           uint32_t blockSize,
+                           float64_t * pResult)
+{
+    float64_t maxVal, out;                         /* Temporary variables to store the output value. */
+    uint32_t blkCnt;                     /* Loop counter */
+    
+    
+    
+    
+    
+    /* Load first input value that act as reference value for comparision */
+    out = fabs(*pSrc++);
+    
+    /* Initialize blkCnt with number of samples */
+    blkCnt = (blockSize - 1U);
+    
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        maxVal = fabs(*pSrc++);
+        
+        /* compare for the maximum value */
+        if (out < maxVal)
+        {
+            /* Update the maximum value and it's index */
+            out = maxVal;
+        }
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    
+    /* Store the maximum value and it's index into destination pointers */
+    *pResult = out;
+}
+#endif

 /**
-  @} end of AbsMax group
+ @} end of AbsMax group
 */
--- a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_absmin_no_idx_f64.c
 * Description:  Minimum value of absolute values of a floating-point vector
 *
- * $Date:        16 November 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -44,6 +44,71 @@
  @param[out]    pResult    minimum value returned here
  @return        none
 */
+
+#if defined(ARM_MATH_NEON)
+void arm_absmin_no_idx_f64(
+                           const float64_t * pSrc,
+                           uint32_t blockSize,
+                           float64_t * pResult)
+{
+    float64_t minVal , in;                         /* Temporary variables to store the output value. */
+    uint32_t blkCnt;                     /* Loop counter */
+    
+    float64x2_t minV;
+    float64x2_t pSrcV ;
+    pSrcV = vld1q_f64(pSrc);
+    pSrc += 2 ;
+    minV = vabsq_f64(pSrcV);
+
+    
+    
+    
+    /* Load first input value that act as reference value for comparision */
+
+    
+    /* Initialize blkCnt with number of samples */
+    blkCnt = (blockSize - 2U) >> 1U;
+    
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        pSrcV = vld1q_f64(pSrc);
+        minV = vminq_f64(minV, vabsq_f64(pSrcV));
+        
+        pSrc += 2 ;
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    minVal =vgetq_lane_f64(minV, 0);
+    if(minVal > vgetq_lane_f64(minV, 1))
+    {
+        minVal = vgetq_lane_f64(minV, 1);
+    }
+    blkCnt = (blockSize - 2U) & 1;
+    
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        in = fabs(*pSrc++);
+        
+        /* compare for the maximum value */
+        if (minVal > in)
+        {
+            /* Update the maximum value and it's index */
+            minVal = in;
+        }
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    *pResult = minVal;
+    
+    
+    /* Store the maximum value and it's index into destination pointers */
+
+}
+#else
 void arm_absmin_no_idx_f64(
  const float64_t * pSrc,
        uint32_t blockSize,
@ -78,7 +143,7 @@ void arm_absmin_no_idx_f64(
  /* Store the minimum value and it's index into destination pointers */
  *pResult = out;
 }
-
+#endif
 /**
  @} end of AbsMin group
 */
--- a/Source/StatisticsFunctions/arm_entropy_f64.c
+++ b/Source/StatisticsFunctions/arm_entropy_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_logsumexp_f64.c
 * Description:  LogSumExp
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -29,6 +29,9 @@
 #include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
+#if defined(ARM_MATH_NEON)
+#include "arm_vec_math.h"
+#endif

 /**
 * @addtogroup Entropy
@ -51,9 +54,26 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
    float64_t accum, p;
 
    pIn = pSrcA;
-    blkCnt = blockSize;
+

    accum = 0.0;
+#if defined(ARM_MATH_NEON)
+  float64x2_t sumV ,pInV ;
+  sumV = vdupq_n_f64(0.0f);
+  blkCnt = blockSize >> 1U ;
+  
+  while(blkCnt > 0){
+    pInV = vld1q_f64(pIn);
+    sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) );
+    pIn += 2 ;
+    blkCnt--;
+    
+  }
+  accum = vaddvq_f64(sumV);
+  blkCnt = blockSize & 1 ;
+#else
+  blkCnt = blockSize;
+#endif

    while(blkCnt > 0)
    {
--- a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
+++ b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_logsumexp_f64.c
 * Description:  LogSumExp
 *
- * $Date:        23 April 2021
- * $Revision:    V1.9.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.9.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -44,6 +44,61 @@
 * @return Kullback-Leibler divergence D(A || B)
 *
 */
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+
+float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,const float64_t * pSrcB,uint32_t blockSize)
+{
+    const float64_t *pInA, *pInB;
+    uint32_t blkCnt;
+    float64_t accum, pA,pB;
+
+    float64x2_t accumV;
+    float64x2_t tmpVA, tmpVB,tmpV;
+ 
+    pInA = pSrcA;
+    pInB = pSrcB;
+
+    accum = 0.0f;
+    accumV = vdupq_n_f64(0.0f);
+
+    blkCnt = blockSize >> 1;
+    while(blkCnt > 0)
+    {
+      tmpVA = vld1q_f64(pInA);
+      pInA += 2;
+
+      tmpVB = vld1q_f64(pInB);
+      pInB += 2;
+
+      tmpV = vinvq_f64(tmpVA);
+      tmpVB = vmulq_f64(tmpVB, tmpV);
+      tmpVB = vlogq_f64(tmpVB);
+
+      accumV = vmlaq_f64(accumV, tmpVA, tmpVB);
+       
+      blkCnt--;
+    
+    }
+
+
+  accum = vaddvq_f64(accumV);
+
+    blkCnt = blockSize & 1;
+    while(blkCnt > 0)
+    {
+       pA = *pInA++;
+       pB = *pInB++;
+       accum += pA * logf(pB/pA);
+       
+       blkCnt--;
+    
+    }
+
+    return(-accum);
+}
+#else

 float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pSrcB, uint32_t blockSize)
 {
@ -69,7 +124,7 @@ float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pS

    return(-accum);
 }
-
+#endif
 /**
 * @} end of Kullback-Leibler group
 */
--- a/Source/StatisticsFunctions/arm_max_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_max_no_idx_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_max_no_idx_f64.c
 * Description:  Maximum value of a floating-point vector without returning the index
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -29,47 +29,112 @@
 #include "dsp/statistics_functions.h"

 /**
-  @ingroup groupStats
+ @ingroup groupStats
 */


 /**
-  @addtogroup Max
-  @{
+ @addtogroup Max
+ @{
 */

 /**
-  @brief         Maximum value of a floating-point vector.
-  @param[in]     pSrc       points to the input vector
-  @param[in]     blockSize  number of samples in input vector
-  @param[out]    pResult    maximum value returned here
-  @return        none
+ @brief         Maximum value of a floating-point vector.
+ @param[in]     pSrc       points to the input vector
+ @param[in]     blockSize  number of samples in input vector
+ @param[out]    pResult    maximum value returned here
+ @return        none
 */
+#if defined(ARM_MATH_NEON)
 void arm_max_no_idx_f64(
-    const float64_t *pSrc,
-    uint32_t   blockSize,
-    float64_t *pResult)
+                           const float64_t * pSrc,
+                           uint32_t blockSize,
+                           float64_t * pResult)
 {
-   float64_t   maxValue = F64_MIN;
-   float64_t   newVal;
-
-   while (blockSize > 0U)
-   {
-       newVal = *pSrc++;
-   
-       /* compare for the maximum value */
-       if (maxValue < newVal)
-       {
-           /* Update the maximum value and it's index */
-           maxValue = newVal;
-       }
-   
-       blockSize --;
-   }
+    float64_t maxVal , in;                         /* Temporary variables to store the output value. */
+    uint32_t blkCnt;                     /* Loop counter */
+    
+    float64x2_t maxV;
+    float64x2_t pSrcV ;
+    pSrcV = vld1q_f64(pSrc);
+    pSrc += 2 ;
+    maxV = pSrcV;
+    
+    
+    
+    
+    /* Load first input value that act as reference value for comparision */
+    
+    
+    /* Initialize blkCnt with number of samples */
+    blkCnt = (blockSize - 2U) >> 1U;
+    
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        pSrcV = vld1q_f64(pSrc);
+        maxV = vmaxq_f64(maxV, pSrcV);
+        
+        pSrc += 2 ;
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    maxVal =vgetq_lane_f64(maxV, 0);
+    if(maxVal < vgetq_lane_f64(maxV, 1))
+    {
+        maxVal = vgetq_lane_f64(maxV, 1);
+    }
+    blkCnt = (blockSize - 2U) & 1;
+    
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        in = *pSrc++;
+        
+        /* compare for the maximum value */
+        if (maxVal < in)
+        {
+            /* Update the maximum value and it's index */
+            maxVal = in;
+        }
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    *pResult = maxVal;
+    
+    
+    /* Store the maximum value and it's index into destination pointers */
+    
+}
+#else
+void arm_max_no_idx_f64(
+                        const float64_t *pSrc,
+                        uint32_t   blockSize,
+                        float64_t *pResult)
+{
+    float64_t   maxValue = F64_MIN;
+    float64_t   newVal;
+    
+    while (blockSize > 0U)
+    {
+        newVal = *pSrc++;
+        
+        /* compare for the maximum value */
+        if (maxValue < newVal)
+        {
+            /* Update the maximum value and it's index */
+            maxValue = newVal;
+        }
+        
+        blockSize --;
+    }
    
-   *pResult = maxValue;
+    *pResult = maxValue;
 }
+#endif

 /**
-  @} end of Max group
+ @} end of Max group
 */
--- a/Source/StatisticsFunctions/arm_mean_f64.c
+++ b/Source/StatisticsFunctions/arm_mean_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mean_f64.c
 * Description:  Mean value of a floating-point vector
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        03 June 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -29,47 +29,90 @@
 #include "dsp/statistics_functions.h"

 /**
-  @ingroup groupStats
+ @ingroup groupStats
 */


 /**
-  @addtogroup mean
-  @{
+ @addtogroup mean
+ @{
 */

 /**
-  @brief         Mean value of a floating-point vector.
-  @param[in]     pSrc       points to the input vector.
-  @param[in]     blockSize  number of samples in input vector.
-  @param[out]    pResult    mean value returned here.
-  @return        none
+ @brief         Mean value of a floating-point vector.
+ @param[in]     pSrc       points to the input vector.
+ @param[in]     blockSize  number of samples in input vector.
+ @param[out]    pResult    mean value returned here.
+ @return        none
 */
+
+#if defined(ARM_MATH_NEON)
+
 void arm_mean_f64(
-  const float64_t * pSrc,
-        uint32_t blockSize,
-        float64_t * pResult)
+                  const float64_t * pSrc,
+                  uint32_t blockSize,
+                  float64_t * pResult)
 {
-        uint32_t blkCnt;                               /* Loop counter */
-        float64_t sum = 0.;                            /* Temporary result storage */
+  uint32_t blkCnt;                               /* Loop counter */
+  float64x2_t vSum = vdupq_n_f64(0.0f);
+  float64_t sum = 0.;                            /* Temporary result storage */
+  float64x2_t afterLoad ;
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize >> 1U;
+  
+  
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    
+    afterLoad = vld1q_f64(pSrc);
+    vSum = vaddq_f64(vSum, afterLoad);
+    pSrc += 2;
+    /* Decrement loop counter */
+    blkCnt--;
+    

+  }
+  sum = vaddvq_f64(vSum);
+  
+  blkCnt = blockSize & 1;
+  
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+    
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+  *pResult = (sum/blockSize);
+}
+#else
+void arm_mean_f64(
+                  const float64_t * pSrc,
+                  uint32_t blockSize,
+                  float64_t * pResult)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+  float64_t sum = 0.;                            /* Temporary result storage */
+  
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
-
+  
  while (blkCnt > 0U)
  {
    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
    sum += *pSrc++;
-
+    
    /* Decrement loop counter */
    blkCnt--;
  }
-
+  
  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
  /* Store result to destination */
  *pResult = (sum / blockSize);
 }
-
+#endif
 /**
-  @} end of mean group
+ @} end of mean group
 */
--- a/Source/StatisticsFunctions/arm_min_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_min_no_idx_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_min_no_idx_f64.c
 * Description:  Maximum value of a floating-point vector without returning the index
 *
- * $Date:        16 November 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -52,8 +52,31 @@ void arm_min_no_idx_f64(
 {
   float64_t   minValue = F64_MAX;
   float64_t   newVal;
+  uint32_t blkCnt ;
+#if defined(ARM_MATH_NEON)
+  float64x2_t minValueV , newValV ;
+  minValueV = vdupq_n_f64(F64_MAX);
+  blkCnt = blockSize >> 1U;
+  while(blkCnt > 0)
+  {
+    newValV = vld1q_f64(pSrc);
+    minValueV = vminq_f64(minValueV, newValV);
+    pSrc += 2 ;
+    blkCnt--;
+    
+  }
+  minValue =vgetq_lane_f64(minValueV, 0);
+  if(minValue > vgetq_lane_f64(minValueV, 1))
+  {
+      minValue = vgetq_lane_f64(minValueV, 1);
+  }
+  
+  blkCnt = blockSize & 1 ;
+#else
+  blkCnt = blockSize;
+#endif

-   while (blockSize > 0U)
+   while (blkCnt > 0U)
   {
       newVal = *pSrc++;
   
@ -64,7 +87,7 @@ void arm_min_no_idx_f64(
           minValue = newVal;
       }
   
-       blockSize --;
+       blkCnt --;
   }
    
   *pResult = minValue;
--- a/Source/StatisticsFunctions/arm_mse_f64.c
+++ b/Source/StatisticsFunctions/arm_mse_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_mse_f64.c
 * Description:  Double floating point mean square error
 *
- * $Date:        05 April 2022
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -29,82 +29,106 @@
 #include "dsp/statistics_functions.h"

 /**
-  @ingroup groupStats
+ @ingroup groupStats
 */

 /**
-  @addtogroup MSE
-  @{
+ @addtogroup MSE
+ @{
 */

 /**
-  @brief         Mean square error between two double floating point vectors.
-  @param[in]     pSrcA       points to the first input vector
-  @param[in]     pSrcB       points to the second input vector
-  @param[in]     blockSize   number of samples in input vector
-  @param[out]    result      mean square error
-  @return        none
+ @brief         Mean square error between two double floating point vectors.
+ @param[in]     pSrcA       points to the first input vector
+ @param[in]     pSrcB       points to the second input vector
+ @param[in]     blockSize   number of samples in input vector
+ @param[out]    result      mean square error
+ @return        none
 */

-
-
-
-
 void arm_mse_f64(
-    const float64_t * pSrcA,
-    const float64_t * pSrcB,
-    uint32_t    blockSize,
-    float64_t * result)
+                 const float64_t * pSrcA,
+                 const float64_t * pSrcB,
+                 uint32_t    blockSize,
+                 float64_t * result)

 {
-  uint32_t blkCnt;                               /* Loop counter */
-  float64_t inA, inB;
-  float64_t sum = 0.0;                          /* Temporary return variable */
+    
+    uint32_t blkCnt;                               /* Loop counter */
+    float64_t inA, inB;
+    float64_t sum = 0.0;
+#if defined (ARM_MATH_NEON)
+    
+    float64x2_t inAV , inBV , subV, sumV;
+    sumV = vdupq_n_f64(0.0f);
+    
+    blkCnt = blockSize >> 1U ;
+    
+    while (blkCnt > 0U)
+    {
+        inAV = vld1q_f64(pSrcA);
+        pSrcA+=2;
+        inBV = vld1q_f64(pSrcB);
+        pSrcB+=2;
+        subV = vsubq_f64(inAV, inBV);
+        sumV = vmlaq_f64(sumV, subV, subV);
+        
+        blkCnt--;
+        
+    }
+    sum = vaddvq_f64(sumV);
+    blkCnt = (blockSize) & 1;
+    
+#else
+                            /* Temporary return variable */
 #if defined (ARM_MATH_LOOPUNROLL)
-  blkCnt = (blockSize) >> 1;
-
- 
-  while (blkCnt > 0U)
-  {
-
-
-    inA = *pSrcA++; 
-    inB = *pSrcB++;
-    inA = inA - inB;
-    sum += inA * inA;
-
-    inA = *pSrcA++; 
-    inB = *pSrcB++;
-    inA = inA - inB;
-    sum += inA * inA;
-
-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-  
-  /* Loop unrolling: Compute remaining outputs */
-  blkCnt = (blockSize) & 1;
+    blkCnt = (blockSize) >> 1;
+    
+#pragma clang loop vectorize(enable)
+    while (blkCnt > 0U)
+    {
+        
+        
+        inA = *pSrcA++;
+        inB = *pSrcB++;
+        inA = inA - inB;
+        sum += inA * inA;
+        
+        inA = *pSrcA++;
+        inB = *pSrcB++;
+        inA = inA - inB;
+        sum += inA * inA;
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    
+    
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = (blockSize) & 1;
 #else
-  /* Initialize blkCnt with number of samples */
-  blkCnt = blockSize;
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
 #endif
-  while (blkCnt > 0U)
-  {
-    inA = *pSrcA++; 
-    inB = *pSrcB++;
-    inA = inA - inB;
-    sum += inA * inA;
-
-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-  /* Store result in destination buffer */
-  *result = sum / blockSize;
+#endif
+//#pragma clang loop vectorize(enable) unroll(disable)
+    while (blkCnt > 0U)
+    {
+        inA = *pSrcA++;
+        inB = *pSrcB++;
+        inA = inA - inB;
+        sum += inA * inA;
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    
+    /* Store result in destination buffer */
+    *result = sum / blockSize;
 }


+
 /**
-  @} end of MSE group
+ @} end of MSE group
 */
--- a/Source/StatisticsFunctions/arm_power_f64.c
+++ b/Source/StatisticsFunctions/arm_power_f64.c
@ -3,8 +3,8 @@
 * Title:        arm_power_f64.c
 * Description:  Sum of the squares of the elements of a floating-point vector
 *
- * $Date:        13 September 2021
- * $Revision:    V1.10.0
+ * $Date:        10 August 2022
+ * $Revision:    V1.10.1
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
@ -29,49 +29,100 @@
 #include "dsp/statistics_functions.h"

 /**
-  @ingroup groupStats
+ @ingroup groupStats
 */

 /**
-  @addtogroup power
-  @{
+ @addtogroup power
+ @{
 */

 /**
-  @brief         Sum of the squares of the elements of a floating-point vector.
-  @param[in]     pSrc       points to the input vector
-  @param[in]     blockSize  number of samples in input vector
-  @param[out]    pResult    sum of the squares value returned here
-  @return        none
+ @brief         Sum of the squares of the elements of a floating-point vector.
+ @param[in]     pSrc       points to the input vector
+ @param[in]     blockSize  number of samples in input vector
+ @param[out]    pResult    sum of the squares value returned here
+ @return        none
 */
+#if defined(ARM_MATH_NEON)
 void arm_power_f64(
-  const float64_t * pSrc,
-        uint32_t blockSize,
-        float64_t * pResult)
+                   const float64_t * pSrc,
+                   uint32_t blockSize,
+                   float64_t * pResult)
 {
-        uint32_t blkCnt;                               /* Loop counter */
-        float64_t sum = 0.;                          /* Temporary result storage */
-        float64_t in;                                  /* Temporary variable to store input value */
-
-  /* Initialize blkCnt with number of samples */
-  blkCnt = blockSize;
-
-  while (blkCnt > 0U)
-  {
-    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-
-    /* Compute Power and store result in a temporary variable, sum. */
-    in = *pSrc++;
-    sum += in * in;
-
-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-  /* Store result to destination */
-  *pResult = sum;
+    uint32_t blkCnt;                               /* Loop counter */
+    float64_t sum = 0.;                          /* Temporary result storage */
+    float64x2_t sumV ;                                 /* Temporary variable to store input value */
+    sumV = vdupq_n_f64(0.0f);
+    float64x2_t pSrcV;
+    
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize >> 1U;
+    
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+        
+        /* Compute Power and store result in a temporary variable, sum. */
+        pSrcV = vld1q_f64(pSrc);
+        sumV = vmlaq_f64(sumV, pSrcV, pSrcV);
+        pSrc+= 2 ;
+        
+        
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    sum = vaddvq_f64(sumV);
+    
+    
+    float64_t in;
+    blkCnt = blockSize & 1;
+    
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+        
+        /* Compute Power and store result in a temporary variable, sum. */
+        in = *pSrc++;
+        sum += in * in;
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    
+    /* Store result to destination */
+    *pResult = sum;
 }
-
+#else
+void arm_power_f64(
+                   const float64_t * pSrc,
+                   uint32_t blockSize,
+                   float64_t * pResult)
+{
+    uint32_t blkCnt;                               /* Loop counter */
+    float64_t sum = 0.;                          /* Temporary result storage */
+    float64_t in;                                  /* Temporary variable to store input value */
+    
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize;
+    
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+        
+        /* Compute Power and store result in a temporary variable, sum. */
+        in = *pSrc++;
+        sum += in * in;
+        
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+    
+    /* Store result to destination */
+    *pResult = sum;
+}
+#endif
 /**
-  @} end of power group
+ @} end of power group
 */