diff --git a/Source/BasicMathFunctions/arm_dot_prod_f64.c b/Source/BasicMathFunctions/arm_dot_prod_f64.c
index a4cd07cc..4e300331 100644
--- a/Source/BasicMathFunctions/arm_dot_prod_f64.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f64.c
@@ -3,8 +3,8 @@
* Title: arm_dot_prod_f64.c
* Description: Floating-point dot product
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 03 June 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -45,7 +45,62 @@
@param[out] result output result returned here.
@return none
*/
+#if defined(ARM_MATH_NEON)
+void arm_dot_prod_f64(
+ const float64_t * pSrcA,
+ const float64_t * pSrcB,
+ uint32_t blockSize,
+ float64_t * result)
+ {
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary return variable */
+ float64x2_t sumV ; /* Neon buffer for sum variable */
+
+ /* Neon Buffer Initialisation */
+ sumV = vsetq_lane_f64(0.0f, sumV, 0);
+ sumV = vsetq_lane_f64(0.0f, sumV, 1);
+
+ /* Neon Buffer for sources */
+ float64x2_t pSrcAV;
+ float64x2_t pSrcBV;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
+
+ /* Load source value in Neon Buffer */
+ pSrcAV = vld1q_f64(pSrcA);
+ pSrcBV = vld1q_f64(pSrcB);
+ /* Calculate dot product and store result in a temporary buffer. */
+ sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV);
+
+ pSrcA+=2;
+ pSrcB+=2;
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ /* Sum both 64 bits part in the float64x2 */
+ sum = vaddvq_f64(sumV);
+
+
+ /* Tail */
+ blkCnt = blockSize & 1 ;
+
+ while(blkCnt > 0U)
+ {
+ sum += (*pSrcA++) * (*pSrcB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ /* Store result in destination buffer */
+ *result = sum;
+ }
+#else
void arm_dot_prod_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
@@ -72,6 +127,7 @@ void arm_dot_prod_f64(
/* Store result in destination buffer */
*result = sum;
}
+#endif
/**
@} end of BasicDotProd group
diff --git a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
index 899159dd..63090948 100644
--- a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
+++ b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
@@ -4,8 +4,8 @@
* Title: arm_chebyshev_distance_f64.c
* Description: Chebyshev distance between two vectors
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -48,15 +48,37 @@
*/
float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
- float64_t diff=0., maxVal,tmpA, tmpB;
- tmpA = *pA++;
- tmpB = *pB++;
- diff = fabs(tmpA - tmpB);
- maxVal = diff;
- blockSize--;
+ float64_t diff=0., maxVal,tmpA, tmpB;
+ uint32_t blkCnt;
+ maxVal = F64_MIN;
+#if defined(ARM_MATH_NEON)
+ float64x2_t diffV , tmpAV , tmpBV , maxValV ;
+ maxValV = vdupq_n_f64(maxVal);
+ blkCnt = blockSize >> 1U ;
+ while(blkCnt > 0U)
+ {
+ tmpAV = vld1q_f64(pA);
+ tmpBV = vld1q_f64(pB);
+ diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV)));
+ maxValV = vmaxq_f64(maxValV, diffV);
+ pA+=2;
+ pB+=2;
+ blkCnt--;
+ }
+ maxVal =vgetq_lane_f64(maxValV, 0);
+ if(maxVal < vgetq_lane_f64(maxValV, 1))
+ {
+ maxVal = vgetq_lane_f64(maxValV, 1);
+ }
+ blkCnt = blockSize & 1;
+
+
+#else
+ blkCnt = blockSize;
+#endif
- while(blockSize > 0)
+ while(blkCnt > 0)
{
tmpA = *pA++;
tmpB = *pB++;
@@ -65,7 +87,7 @@ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, ui
{
maxVal = diff;
}
- blockSize --;
+ blkCnt --;
}
return(maxVal);
diff --git a/Source/DistanceFunctions/arm_cityblock_distance_f64.c b/Source/DistanceFunctions/arm_cityblock_distance_f64.c
index 46c0a6df..2f63ffae 100644
--- a/Source/DistanceFunctions/arm_cityblock_distance_f64.c
+++ b/Source/DistanceFunctions/arm_cityblock_distance_f64.c
@@ -4,8 +4,8 @@
* Title: arm_cityblock_distance_f64.c
* Description: Cityblock (Manhattan) distance between two vectors
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -48,15 +48,37 @@
float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
float64_t accum,tmpA, tmpB;
-
+ uint32_t blkCnt;
accum = 0.;
- while(blockSize > 0)
+
+#if defined(ARM_MATH_NEON)
+ float64x2_t tmpAV, tmpBV,accumV , subV;
+ accumV = vdupq_n_f64(0.0f);
+ blkCnt = blockSize >> 1U;
+
+ while(blkCnt > 0U)
+ {
+ tmpAV = vld1q_f64(pA);
+ tmpBV = vld1q_f64(pB);
+ subV = vabdq_f64(tmpAV, tmpBV);
+ accumV = vaddq_f64(accumV, subV);
+ pA+=2;
+ pB+=2;
+ blkCnt--;
+ }
+ accum = vaddvq_f64(accumV);
+ blkCnt = blockSize & 1 ;
+
+#else
+ blkCnt = blockSize;
+#endif
+ while(blkCnt > 0)
{
tmpA = *pA++;
tmpB = *pB++;
accum += fabs(tmpA - tmpB);
- blockSize --;
+ blkCnt--;
}
return(accum);
diff --git a/Source/DistanceFunctions/arm_euclidean_distance_f64.c b/Source/DistanceFunctions/arm_euclidean_distance_f64.c
index 8c8dfde5..794bf93d 100644
--- a/Source/DistanceFunctions/arm_euclidean_distance_f64.c
+++ b/Source/DistanceFunctions/arm_euclidean_distance_f64.c
@@ -4,8 +4,8 @@
* Title: arm_euclidean_distance_f64.c
* Description: Euclidean distance between two vectors
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -50,12 +50,31 @@
float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
float64_t accum=0.,tmp;
-
- while(blockSize > 0)
+ uint32_t blkCnt;
+#if defined(ARM_MATH_NEON)
+ float64x2_t accumV,tmpV , pAV ,pBV;
+ accumV = vdupq_n_f64(0.0f);
+ blkCnt = blockSize >> 1U;
+ while(blkCnt > 0U)
+ {
+ pAV = vld1q_f64(pA);
+ pBV = vld1q_f64(pB);
+ tmpV = vsubq_f64(pAV, pBV);
+ accumV = vmlaq_f64(accumV, tmpV, tmpV);
+ pA+=2;
+ pB+=2;
+ blkCnt--;
+ }
+ accum = vaddvq_f64(accumV);
+ blkCnt = blockSize & 1;
+#else
+ blkCnt = blockSize;
+#endif
+ while(blkCnt > 0)
{
tmp = *pA++ - *pB++;
accum += SQ(tmp);
- blockSize --;
+ blkCnt --;
}
tmp = sqrt(accum);
return(tmp);
diff --git a/Source/FastMathFunctions/arm_vexp_f64.c b/Source/FastMathFunctions/arm_vexp_f64.c
index 8f27ad83..e6c8f359 100644
--- a/Source/FastMathFunctions/arm_vexp_f64.c
+++ b/Source/FastMathFunctions/arm_vexp_f64.c
@@ -3,8 +3,8 @@
* Title: arm_vlog_f64.c
* Description: Fast vectorized log
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -28,31 +28,44 @@
#include "dsp/fast_math_functions.h"
#include "arm_common_tables.h"
+#if defined(ARM_MATH_NEON)
+#include "arm_vec_math.h"
+#endif
-/**
- @addtogroup vexp
- @{
- */
-
-/**
- @brief Floating-point vector of exp values.
- @param[in] pSrc points to the input vector
- @param[out] pDst points to the output vector
- @param[in] blockSize number of samples in each vector
- @return none
- */
void arm_vexp_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
- uint32_t blkCnt;
+ uint32_t blkCnt;
+#if defined(ARM_MATH_NEON)
+
+
+ float64x2_t src;
+ float64x2_t dst;
- blkCnt = blockSize;
+ blkCnt = blockSize >> 1U;
+ while (blkCnt > 0U)
+ {
+ src = vld1q_f64(pSrc);
+ dst = vexpq_f64(src);
+ vst1q_f64(pDst, dst);
+
+ pSrc += 2;
+ pDst += 2;
+
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 1;
+#else
+ blkCnt = blockSize;
+#endif
while (blkCnt > 0U)
{
/* C = log(A) */
+
/* Calculate log and store result in destination buffer. */
*pDst++ = exp(*pSrc++);
@@ -61,7 +74,3 @@ void arm_vexp_f64(
blkCnt--;
}
}
-
-/**
- @} end of vexp group
- */
\ No newline at end of file
diff --git a/Source/FastMathFunctions/arm_vlog_f64.c b/Source/FastMathFunctions/arm_vlog_f64.c
index a60fff69..b9ab25ec 100644
--- a/Source/FastMathFunctions/arm_vlog_f64.c
+++ b/Source/FastMathFunctions/arm_vlog_f64.c
@@ -3,8 +3,8 @@
* Title: arm_vlog_f64.c
* Description: Fast vectorized log
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -29,14 +29,39 @@
#include "dsp/fast_math_functions.h"
#include "arm_common_tables.h"
+#if defined(ARM_MATH_NEON)
+#include "arm_vec_math.h"
+#endif
+
void arm_vlog_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt;
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+ float64x2_t src;
+ float64x2_t dst;
+
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ src = vld1q_f64(pSrc);
+ dst = vlogq_f64(src);
+ vst1q_f64(pDst, dst);
+ pSrc += 2;
+ pDst += 2;
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 1;
+#else
blkCnt = blockSize;
+#endif
+
while (blkCnt > 0U)
{
diff --git a/Source/FilteringFunctions/arm_correlate_f64.c b/Source/FilteringFunctions/arm_correlate_f64.c
index 95edd40f..8d52bcf6 100644
--- a/Source/FilteringFunctions/arm_correlate_f64.c
+++ b/Source/FilteringFunctions/arm_correlate_f64.c
@@ -3,8 +3,8 @@
* Title: arm_correlate_f64.c
* Description: Correlation of floating-point sequences
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -65,6 +65,9 @@ void arm_correlate_f64(
uint32_t j, k, count, blkCnt; /* Loop counters */
uint32_t outBlockSize; /* Loop counter */
int32_t inc = 1; /* Destination address modifier */
+#if defined(ARM_MATH_NEON)
+ float64x2_t sumV,pxV,pyV ;
+#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
@@ -164,10 +167,25 @@ void arm_correlate_f64(
{
/* Accumulator is made zero for every iteration */
sum = 0.;
-
+#if defined(ARM_MATH_NEON)
+ sumV = vdupq_n_f64(0.0f);
+ k = count >> 1U ;
+
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = count & 1 ;
+#else
/* Initialize k with number of samples */
k = count;
-
+#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
@@ -179,6 +197,7 @@ void arm_correlate_f64(
}
/* Store the result in the accumulator in the destination buffer. */
+
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
@@ -192,6 +211,7 @@ void arm_correlate_f64(
/* Decrement loop counter */
blockSize1--;
+
}
/* --------------------------
@@ -229,10 +249,26 @@ void arm_correlate_f64(
{
/* Accumulator is made zero for every iteration */
sum = 0.;
+#if defined(ARM_MATH_NEON)
+ sumV = vdupq_n_f64(0.0f);
+ k = srcBLen >> 1U ;
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = srcBLen & 1 ;
+
+#else
/* Initialize blkCnt with number of samples */
k = srcBLen;
-
+#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
@@ -269,10 +305,26 @@ void arm_correlate_f64(
{
/* Accumulator is made zero for every iteration */
sum = 0.;
+#if defined(ARM_MATH_NEON)
+ sumV = vdupq_n_f64(0.0f);
+ k = srcBLen >> 1U ;
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = srcBLen & 1 ;
+
+#else
/* Loop over srcBLen */
k = srcBLen;
-
+#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
@@ -330,10 +382,26 @@ void arm_correlate_f64(
{
/* Accumulator is made zero for every iteration */
sum = 0.;
+#if defined(ARM_MATH_NEON)
+ sumV = vdupq_n_f64(0.0f);
+ k = count >> 1U ;
+
+ while(k > 0U)
+ {
+ pxV = vld1q_f64(px);
+ pyV = vld1q_f64(py);
+ sumV = vmlaq_f64(sumV, pxV, pyV);
+ px+=2;
+ py+=2;
+ k--;
+ }
+ sum =vaddvq_f64(sumV);
+ k = count & 1 ;
+#else
/* Initialize blkCnt with number of samples */
k = count;
-
+#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
diff --git a/Source/FilteringFunctions/arm_fir_f64.c b/Source/FilteringFunctions/arm_fir_f64.c
index 62e7e998..51fc8ece 100644
--- a/Source/FilteringFunctions/arm_fir_f64.c
+++ b/Source/FilteringFunctions/arm_fir_f64.c
@@ -3,8 +3,8 @@
* Title: arm_fir_f64.c
* Description: Floating-point FIR filter processing function
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 03 June 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -45,7 +45,103 @@
@param[in] blockSize number of samples to process
@return none
*/
+#if defined(ARM_MATH_NEON)
+void arm_fir_f64(
+ const arm_fir_instance_f64 * S,
+ const float64_t * pSrc,
+ float64_t * pDst,
+ uint32_t blockSize)
+{
+ float64_t *pState = S->pState; /* State pointer */
+ const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float64_t *pStateCurnt; /* Points to the current sample of the state */
+ float64_t *px; /* Temporary pointer for state buffer */
+ const float64_t *pb; /* Temporary pointer for coefficient buffer */
+ float64x2_t pxV;
+ float64x2_t pbV;
+ float64x2_t acc0V;
+ float64_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0.;
+ acc0V = vdupq_n_f64(0.0f);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps >> 1U;
+
+ /* Perform the multiply-accumulates */
+ while (i > 0U)
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ pxV = vld1q_f64(px);
+ pbV = vld1q_f64(pb);
+ acc0V = vmlaq_f64(acc0V, pxV, pbV);
+ px+=2;
+ pb+=2;
+
+ i--;
+ }
+
+ acc0 = vaddvq_f64(acc0V);
+ i = numTaps%2 ;
+ while(i >0U)
+ {
+ acc0+= *px++ * *pb++ ;
+ i--;
+ }
+
+ /* Store result in destination buffer. */
+
+ *pDst++ = acc0;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+#else
void arm_fir_f64(
const arm_fir_instance_f64 * S,
const float64_t * pSrc,
@@ -123,6 +219,7 @@ void arm_fir_f64(
}
}
+#endif
/**
* @} end of FIR group
diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
index 7669582d..b1605116 100755
--- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mat_cholesky_f64.c
* Description: Floating-point Cholesky decomposition
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -52,6 +52,7 @@
* The decomposition of A is returning a lower triangular matrix L such that A = L L^t
*/
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_cholesky_f64(
const arm_matrix_instance_f64 * pSrc,
@@ -61,6 +62,163 @@ arm_status arm_mat_cholesky_f64(
arm_status status; /* status of matrix inverse */
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float64_t invSqrtVj;
+ float64_t *pA,*pG;
+ int kCnt;
+
+
+ float64x2_t acc, acc0, acc1, acc2, acc3;
+ float64x2_t vecGi;
+ float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+ float64x2_t tmp = vdupq_n_f64(0.0f);
+
+ float64_t sum=0.0f;
+ float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
+
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+ for(i=0 ;i < n ; i++)
+ {
+ for(j=i ; j+3 < n ; j+=4)
+ {
+ pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+ pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+ pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+ pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+ acc0 = vdupq_n_f64(0.0f);
+ acc1 = vdupq_n_f64(0.0f);
+ acc2 = vdupq_n_f64(0.0f);
+ acc3 = vdupq_n_f64(0.0f);
+
+ kCnt = i >> 1U;
+ k=0;
+ while(kCnt > 0)
+ {
+
+ vecGi=vld1q_f64(&pG[i * n + k]);
+
+ vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
+ vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
+ vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
+ vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
+
+ acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
+ acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
+ acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
+ acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
+
+ kCnt--;
+ k+=2;
+ }
+
+
+ sum0 = vaddvq_f64(acc0);
+ sum1 = vaddvq_f64(acc1);
+ sum2 = vaddvq_f64(acc2);
+ sum3 = vaddvq_f64(acc3);
+
+
+ kCnt = i & 1;
+ while(kCnt > 0)
+ {
+
+ sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
+ sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
+ sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
+ sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
+ kCnt--;
+ k++;
+ }
+
+ pG[(j + 0) * n + i] -= sum0;
+ pG[(j + 1) * n + i] -= sum1;
+ pG[(j + 2) * n + i] -= sum2;
+ pG[(j + 3) * n + i] -= sum3;
+ }
+
+ for(; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ acc = vdupq_n_f64(0.0f);
+
+ kCnt = i >> 1U;
+ k=0;
+ while(kCnt > 0)
+ {
+
+ vecGi=vld1q_f64(&pG[i * n + k]);
+ vecGj=vld1q_f64(&pG[j * n + k]);
+
+ acc = vfmaq_f64(acc, vecGi, vecGj);
+
+ kCnt--;
+ k+=2;
+ }
+
+
+ sum = vaddvq_f64(acc);
+
+ kCnt = i & 1;
+ while(kCnt > 0)
+ {
+ sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
+
+
+ kCnt--;
+ k++;
+ }
+
+ pG[j * n + i] -= sum;
+ }
+
+ if (pG[i * n + i] <= 0.0f)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+ SCALE_COL_F64(pDst,i,invSqrtVj,i);
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+#else
+arm_status arm_mat_cholesky_f64(
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
@@ -115,6 +273,7 @@ arm_status arm_mat_cholesky_f64(
/* Return to application */
return (status);
}
+#endif
/**
@} end of MatrixChol group
diff --git a/Source/MatrixFunctions/arm_mat_mult_f64.c b/Source/MatrixFunctions/arm_mat_mult_f64.c
index ddc60101..1a778e18 100755
--- a/Source/MatrixFunctions/arm_mat_mult_f64.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mat_mult_f64.c
* Description: Floating-point matrix multiplication
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -27,11 +27,31 @@
*/
#include "dsp/matrix_functions.h"
+#if defined(ARM_MATH_NEON)
+#define GROUPOFROWS 8
+#endif
/**
* @ingroup groupMatrix
*/
+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an M x N matrix with an N x P matrix results
+ * in an M x P matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * pSrcA and pSrcB are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of pSrcA and pSrcB.
+ */
+
+
/**
* @addtogroup MatrixMult
* @{
@@ -46,7 +66,322 @@
* ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
*/
+#if defined(ARM_MATH_NEON)
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+arm_status arm_mat_mult_f64(
+ const arm_matrix_instance_f64 * pSrcA,
+ const arm_matrix_instance_f64 * pSrcB,
+ arm_matrix_instance_f64 * pDst)
+{
+ float64_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ float64_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ float64_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float64_t *pOut = pDst->pData; /* output data matrix pointer */
+ float64_t *px; /* Temporary output data matrix pointer */
+ float64_t sum; /* Accumulator */
+ uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+
+
+ uint32_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+ float64x2_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
+ float64x2_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
+ float64x2_t accum = vdupq_n_f64(0);
+ float64_t *pIn1B = pSrcA->pData;
+ float64_t *pIn1C = pSrcA->pData;
+ float64_t *pIn1D = pSrcA->pData;
+ float64_t *pIn1E = pSrcA->pData;
+ float64_t *pIn1F = pSrcA->pData;
+ float64_t *pIn1G = pSrcA->pData;
+ float64_t *pIn1H = pSrcA->pData;
+
+ float64_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */
+ float64_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* Row loop */
+ rowCnt = row >> 3;
+
+ while(rowCnt > 0)
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + GROUPOFROWS*i;
+ pxB = px + numColsB;
+ pxC = px + 2*numColsB;
+ pxD = px + 3*numColsB;
+ pxE = px + 4*numColsB;
+ pxF = px + 5*numColsB;
+ pxG = px + 6*numColsB;
+ pxH = px + 7*numColsB;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0U;
+
+ /* Column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum0 = 0.0f;
+ sum1 = 0.0f;
+ sum2 = 0.0f;
+ sum3 = 0.0f;
+ sum4 = 0.0f;
+ sum5 = 0.0f;
+ sum6 = 0.0f;
+ sum7 = 0.0f;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pIn1 = pInA;
+ pIn1B = pIn1 + numColsA;
+ pIn1C = pIn1 + 2*numColsA;
+ pIn1D = pIn1 + 3*numColsA;
+ pIn1E = pIn1 + 4*numColsA;
+ pIn1F = pIn1 + 5*numColsA;
+ pIn1G = pIn1 + 6*numColsA;
+ pIn1H = pIn1 + 7*numColsA;
+
+ acc0 = vdupq_n_f64(0.0);
+ acc1 = vdupq_n_f64(0.0);
+ acc2 = vdupq_n_f64(0.0);
+ acc3 = vdupq_n_f64(0.0);
+ acc4 = vdupq_n_f64(0.0);
+ acc5 = vdupq_n_f64(0.0);
+ acc6 = vdupq_n_f64(0.0);
+ acc7 = vdupq_n_f64(0.0);
+
+ /* Compute 2 MACs simultaneously. */
+ colCnt = numColsA >> 1U;
+
+ /* Matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ a0V = vld1q_f64(pIn1);
+ a1V = vld1q_f64(pIn1B);
+ a2V = vld1q_f64(pIn1C);
+ a3V = vld1q_f64(pIn1D);
+ a4V = vld1q_f64(pIn1E);
+ a5V = vld1q_f64(pIn1F);
+ a6V = vld1q_f64(pIn1G);
+ a7V = vld1q_f64(pIn1H);
+
+ pIn1 += 2;
+ pIn1B += 2;
+ pIn1C += 2;
+ pIn1D += 2;
+ pIn1E += 2;
+ pIn1F += 2;
+ pIn1G += 2;
+ pIn1H += 2;
+
+ temp = vsetq_lane_f64(*pIn2,temp,0);
+ pIn2 += numColsB;
+ temp = vsetq_lane_f64(*pIn2,temp,1);
+ pIn2 += numColsB;
+
+
+ acc0 = vmlaq_f64(acc0,a0V,temp);
+ acc1 = vmlaq_f64(acc1,a1V,temp);
+ acc2 = vmlaq_f64(acc2,a2V,temp);
+ acc3 = vmlaq_f64(acc3,a3V,temp);
+ acc4 = vmlaq_f64(acc4,a4V,temp);
+ acc5 = vmlaq_f64(acc5,a5V,temp);
+ acc6 = vmlaq_f64(acc6,a6V,temp);
+ acc7 = vmlaq_f64(acc7,a7V,temp);
+
+ /* Decrement the loop count */
+ colCnt--;
+ }
+
+ sum0 += vaddvq_f64(acc0);
+ sum1 += vaddvq_f64(acc1);
+ sum2 += vaddvq_f64(acc2);
+ sum3 += vaddvq_f64(acc3);
+ sum4 += vaddvq_f64(acc4);
+ sum5 += vaddvq_f64(acc5);
+ sum6 += vaddvq_f64(acc6);
+ sum7 += vaddvq_f64(acc7);
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA & 1;
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ sum0 += *pIn1++ * (*pIn2);
+ sum1 += *pIn1B++ * (*pIn2);
+ sum2 += *pIn1C++ * (*pIn2);
+ sum3 += *pIn1D++ * (*pIn2);
+ sum4 += *pIn1E++ * (*pIn2);
+ sum5 += *pIn1F++ * (*pIn2);
+ sum6 += *pIn1G++ * (*pIn2);
+ sum7 += *pIn1H++ * (*pIn2);
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer */
+ *px++ = sum0;
+ *pxB++ = sum1;
+ *pxC++ = sum2;
+ *pxD++ = sum3;
+ *pxE++ = sum4;
+ *pxF++ = sum5;
+ *pxG++ = sum6;
+ *pxH++ = sum7;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = pSrcB->pData + j;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while (col > 0U);
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + GROUPOFROWS*numColsA;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+ }
+
+ /*
+
+ i was the index of a group of rows computed by previous loop.
+ Now i is the index of a row since below code is computing row per row
+ and no more group of row per group of rows.
+
+ */
+
+ i = GROUPOFROWS*i;
+ rowCnt = row & 7;
+
+ while(rowCnt > 0)
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0U;
+
+ /* Column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0.0f;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pIn1 = pInA;
+
+ acc0 = vdupq_n_f64(0.0);
+
+ /* Compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 1U;
+
+ /* Matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ a0V = vld1q_f64(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
+ pIn1 += 2;
+
+ temp = vsetq_lane_f64(*pIn2,temp,0);
+ pIn2 += numColsB;
+ temp = vsetq_lane_f64(*pIn2,temp,1);
+ pIn2 += numColsB;
+
+
+ acc0 = vmlaq_f64(acc0,a0V,temp);
+
+ /* Decrement the loop count */
+ colCnt--;
+ }
+
+ //accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+ sum += vaddvq_f64(acc0);
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA % 0x2U;
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+ sum += *pIn1++ * (*pIn2);
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer */
+ *px++ = sum;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = pSrcB->pData + j;
+
+ /* Decrement the column loop counter */
+ col--;
+ } while (col > 0U);
+
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+
+ }
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
arm_status arm_mat_mult_f64(
const arm_matrix_instance_f64 * pSrcA,
const arm_matrix_instance_f64 * pSrcB,
@@ -178,7 +513,7 @@ arm_status arm_mat_mult_f64(
/* Return to application */
return (status);
}
-
+#endif
/**
* @} end of MatrixMult group
diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
index d54ff679..49d3ac28 100755
--- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mat_solve_lower_triangular_f64.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -46,6 +46,109 @@
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+ arm_status arm_mat_solve_lower_triangular_f64(
+ const arm_matrix_instance_f64 * lt,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+ {
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* a1 b1 c1 x1 = a1
+ b2 c2 x2 a2
+ c3 x3 a3
+
+ x3 = a3 / c3
+ x2 = (a2 - c2 x3) / b2
+
+ */
+ int i,j,k,n,cols;
+
+ n = dst->numRows;
+ cols = dst->numCols;
+
+ float64_t *pX = dst->pData;
+ float64_t *pLT = lt->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *lt_row;
+ float64_t *a_col;
+
+ float64_t invLT;
+
+ float64x2_t vecA;
+ float64x2_t vecX;
+
+ for(i=0; i < n ; i++)
+ {
+
+ for(j=0; j+1 < cols; j += 2)
+ {
+ vecA = vld1q_f64(&pA[i * cols + j]);
+
+ for(k=0; k < i; k++)
+ {
+ vecX = vld1q_f64(&pX[cols*k+j]);
+ vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
+ }
+
+ if (pLT[n*i + i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invLT = 1.0f / pLT[n*i + i];
+ vecA = vmulq_f64(vecA,vdupq_n_f64(invLT));
+ vst1q_f64(&pX[i*cols+j],vecA);
+
+ }
+
+ for(; j < cols; j ++)
+ {
+ a_col = &pA[j];
+ lt_row = &pLT[n*i];
+
+ float64_t tmp=a_col[i * cols];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= lt_row[k] * pX[cols*k+j];
+ }
+
+ if (lt_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*cols+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a,
@@ -119,6 +222,7 @@
/* Return to application */
return (status);
}
+#endif
/**
@} end of MatrixInv group
*/
diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
index 248273d4..68c10194 100755
--- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mat_solve_upper_triangular_f64.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -46,6 +46,104 @@
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+ arm_status arm_mat_solve_upper_triangular_f64(
+ const arm_matrix_instance_f64 * ut,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+ {
+arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n,cols;
+
+ n = dst->numRows;
+ cols = dst->numCols;
+
+ float64_t *pX = dst->pData;
+ float64_t *pUT = ut->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *ut_row;
+ float64_t *a_col;
+
+ float64_t invUT;
+
+ float64x2_t vecA;
+ float64x2_t vecX;
+
+ for(i=n-1; i >= 0 ; i--)
+ {
+ for(j=0; j+1 < cols; j +=2)
+ {
+ vecA = vld1q_f64(&pA[i * cols + j]);
+
+ for(k=n-1; k > i; k--)
+ {
+ vecX = vld1q_f64(&pX[cols*k+j]);
+ vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
+ }
+
+ if (pUT[n*i + i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invUT = 1.0f / pUT[n*i + i];
+ vecA = vmulq_f64(vecA,vdupq_n_f64(invUT));
+
+
+ vst1q_f64(&pX[i*cols+j],vecA);
+ }
+
+ for(; j < cols; j ++)
+ {
+ a_col = &pA[j];
+
+ ut_row = &pUT[n*i];
+
+ float64_t tmp=a_col[i * cols];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= ut_row[k] * pX[cols*k+j];
+ }
+
+ if (ut_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*cols+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a,
@@ -113,7 +211,7 @@ arm_status status; /* status of matrix inverse */
/* Return to application */
return (status);
}
-
+#endif
/**
@} end of MatrixInv group
diff --git a/Source/MatrixFunctions/arm_mat_trans_f64.c b/Source/MatrixFunctions/arm_mat_trans_f64.c
index fee5a074..08184731 100755
--- a/Source/MatrixFunctions/arm_mat_trans_f64.c
+++ b/Source/MatrixFunctions/arm_mat_trans_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mat_trans_f64.c
* Description: Floating-point matrix transpose
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -32,7 +32,14 @@
@ingroup groupMatrix
*/
+/**
+ @defgroup MatrixTrans Matrix Transpose
+
+ Tranposes a matrix.
+ Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
+ \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
/**
@addtogroup MatrixTrans
@@ -47,7 +54,126 @@
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
+#if defined(ARM_MATH_NEON)
+arm_status arm_mat_trans_f64(
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
+{
+ float64_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float64_t *pOut = pDst->pData; /* output data matrix pointer */
+ float64_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nColumns = pSrc->numCols; /* number of columns */
+
+ uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* Row loop */
+ rowCnt = row >> 1;
+ while (rowCnt > 0U)
+ {
+ float64_t *row0,*row1;
+ float64x2x4_t raV;
+
+ blkCnt = nColumns >> 2;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U) /* Column loop */
+ {
+ row0 = pIn;
+ row1 = pIn+nColumns;
+ pIn+=4;
+ raV = vld4q_lane_f64(row0, raV, 0);
+ raV = vld4q_lane_f64(row1, raV, 1);
+
+ vst1q_f64(px,raV.val[0]);
+ px += nRows;
+
+ vst1q_f64(px,raV.val[1]);
+ px += nRows;
+
+ vst1q_f64(px,raV.val[2]);
+ px += nRows;
+
+ vst1q_f64(px,raV.val[3]);
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ /* Perform matrix transpose for last 3 samples here. */
+ blkCnt = nColumns % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Read and store the input element in the destination */
+ *px++ = *pIn;
+ *px++ = *(pIn + 1 * nColumns);
+
+ px += (nRows - 2);
+ pIn++;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ i += 2;
+ pIn += 1 * nColumns;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+
+ } /* Row loop end */
+
+ rowCnt = row & 1;
+ while (rowCnt > 0U)
+ {
+ blkCnt = nColumns ;
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ while (blkCnt > 0U)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+ i++;
+ rowCnt -- ;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
arm_status arm_mat_trans_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
@@ -142,7 +268,7 @@ arm_status arm_mat_trans_f64(
/* Return to application */
return (status);
}
-
+#endif
/**
* @} end of MatrixTrans group
*/
diff --git a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
index 23e34005..38cf538f 100755
--- a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
@@ -3,8 +3,8 @@
* Title: arm_absmax_no_idx_f64.c
* Description: Maximum value of absolute values of a floating-point vector
*
- * $Date: 16 November 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -28,60 +28,128 @@
#include "dsp/statistics_functions.h"
+
+
/**
- @ingroup groupStats
+ @ingroup groupStats
*/
/**
- @addtogroup AbsMax
- @{
+ @addtogroup AbsMax
+ @{
*/
/**
- @brief Maximum value of absolute values of a floating-point vector.
- @param[in] pSrc points to the input vector
- @param[in] blockSize number of samples in input vector
- @param[out] pResult maximum value returned here
- @return none
+ @brief Maximum value of absolute values of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @return none
*/
+
+#if defined(ARM_MATH_NEON)
void arm_absmax_no_idx_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- float64_t maxVal, out; /* Temporary variables to store the output value. */
- uint32_t blkCnt; /* Loop counter */
-
-
+ float64_t maxVal , in; /* Temporary variables to store the output value. */
+ uint32_t blkCnt; /* Loop counter */
+
+ float64x2_t maxV;
+ float64x2_t pSrcV ;
+ pSrcV = vld1q_f64(pSrc);
+ pSrc += 2 ;
+ maxV = vabsq_f64(pSrcV);
+
+
+
+ /* Load first input value that act as reference value for comparision */
-
- /* Load first input value that act as reference value for comparision */
- out = fabs(*pSrc++);
-
- /* Initialize blkCnt with number of samples */
- blkCnt = (blockSize - 1U);
-
- while (blkCnt > 0U)
- {
- /* Initialize maxVal to the next consecutive values one by one */
- maxVal = fabs(*pSrc++);
-
- /* compare for the maximum value */
- if (out < maxVal)
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 2U) >> 1U;
+
+ while (blkCnt > 0U)
{
- /* Update the maximum value and it's index */
- out = maxVal;
+ /* Initialize maxVal to the next consecutive values one by one */
+ pSrcV = vld1q_f64(pSrc);
+ maxV = vmaxq_f64(maxV, vabsq_f64(pSrcV));
+
+ pSrc += 2 ;
+
+ /* Decrement loop counter */
+ blkCnt--;
}
+ maxVal =vgetq_lane_f64(maxV, 0);
+ if(maxVal < vgetq_lane_f64(maxV, 1))
+ {
+ maxVal = vgetq_lane_f64(maxV, 1);
+ }
+ blkCnt = (blockSize - 2U) & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ in = fabs(*pSrc++);
+
+ /* compare for the maximum value */
+ if (maxVal < in)
+ {
+ /* Update the maximum value and it's index */
+ maxVal = in;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ *pResult = maxVal;
+
+
+ /* Store the maximum value and it's index into destination pointers */
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store the maximum value and it's index into destination pointers */
- *pResult = out;
}
+#else
+void arm_absmax_no_idx_f64(
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
+{
+ float64_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt; /* Loop counter */
+
+
+
+
+
+ /* Load first input value that act as reference value for comparision */
+ out = fabs(*pSrc++);
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal = fabs(*pSrc++);
+
+ /* compare for the maximum value */
+ if (out < maxVal)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+}
+#endif
/**
- @} end of AbsMax group
+ @} end of AbsMax group
*/
diff --git a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
index ac1e9263..ca521085 100755
--- a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
@@ -3,8 +3,8 @@
* Title: arm_absmin_no_idx_f64.c
* Description: Minimum value of absolute values of a floating-point vector
*
- * $Date: 16 November 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -44,6 +44,71 @@
@param[out] pResult minimum value returned here
@return none
*/
+
+#if defined(ARM_MATH_NEON)
+void arm_absmin_no_idx_f64(
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
+{
+ float64_t minVal , in; /* Temporary variables to store the output value. */
+ uint32_t blkCnt; /* Loop counter */
+
+ float64x2_t minV;
+ float64x2_t pSrcV ;
+ pSrcV = vld1q_f64(pSrc);
+ pSrc += 2 ;
+ minV = vabsq_f64(pSrcV);
+
+
+
+
+ /* Load first input value that act as reference value for comparision */
+
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 2U) >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ pSrcV = vld1q_f64(pSrc);
+ minV = vminq_f64(minV, vabsq_f64(pSrcV));
+
+ pSrc += 2 ;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ minVal =vgetq_lane_f64(minV, 0);
+ if(minVal > vgetq_lane_f64(minV, 1))
+ {
+ minVal = vgetq_lane_f64(minV, 1);
+ }
+ blkCnt = (blockSize - 2U) & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ in = fabs(*pSrc++);
+
+ /* compare for the maximum value */
+ if (minVal > in)
+ {
+ /* Update the maximum value and it's index */
+ minVal = in;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ *pResult = minVal;
+
+
+ /* Store the maximum value and it's index into destination pointers */
+
+}
+#else
void arm_absmin_no_idx_f64(
const float64_t * pSrc,
uint32_t blockSize,
@@ -78,7 +143,7 @@ void arm_absmin_no_idx_f64(
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
}
-
+#endif
/**
@} end of AbsMin group
*/
diff --git a/Source/StatisticsFunctions/arm_entropy_f64.c b/Source/StatisticsFunctions/arm_entropy_f64.c
index d671791c..a8fc7ed0 100755
--- a/Source/StatisticsFunctions/arm_entropy_f64.c
+++ b/Source/StatisticsFunctions/arm_entropy_f64.c
@@ -3,8 +3,8 @@
* Title: arm_logsumexp_f64.c
* Description: LogSumExp
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -29,6 +29,9 @@
#include "dsp/statistics_functions.h"
#include
#include
+#if defined(ARM_MATH_NEON)
+#include "arm_vec_math.h"
+#endif
/**
* @addtogroup Entropy
@@ -51,9 +54,26 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
float64_t accum, p;
pIn = pSrcA;
- blkCnt = blockSize;
+
accum = 0.0;
+#if defined(ARM_MATH_NEON)
+ float64x2_t sumV ,pInV ;
+ sumV = vdupq_n_f64(0.0f);
+ blkCnt = blockSize >> 1U ;
+
+ while(blkCnt > 0){
+ pInV = vld1q_f64(pIn);
+ sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) );
+ pIn += 2 ;
+ blkCnt--;
+
+ }
+ accum = vaddvq_f64(sumV);
+ blkCnt = blockSize & 1 ;
+#else
+ blkCnt = blockSize;
+#endif
while(blkCnt > 0)
{
diff --git a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
index b43b2186..a52059ed 100755
--- a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
+++ b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
@@ -3,8 +3,8 @@
* Title: arm_logsumexp_f64.c
* Description: LogSumExp
*
- * $Date: 23 April 2021
- * $Revision: V1.9.0
+ * $Date: 10 August 2022
+ * $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -44,6 +44,61 @@
* @return Kullback-Leibler divergence D(A || B)
*
*/
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+
+float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,const float64_t * pSrcB,uint32_t blockSize)
+{
+ const float64_t *pInA, *pInB;
+ uint32_t blkCnt;
+ float64_t accum, pA,pB;
+
+ float64x2_t accumV;
+ float64x2_t tmpVA, tmpVB,tmpV;
+
+ pInA = pSrcA;
+ pInB = pSrcB;
+
+ accum = 0.0f;
+ accumV = vdupq_n_f64(0.0f);
+
+ blkCnt = blockSize >> 1;
+ while(blkCnt > 0)
+ {
+ tmpVA = vld1q_f64(pInA);
+ pInA += 2;
+
+ tmpVB = vld1q_f64(pInB);
+ pInB += 2;
+
+ tmpV = vinvq_f64(tmpVA);
+ tmpVB = vmulq_f64(tmpVB, tmpV);
+ tmpVB = vlogq_f64(tmpVB);
+
+ accumV = vmlaq_f64(accumV, tmpVA, tmpVB);
+
+ blkCnt--;
+
+ }
+
+
+ accum = vaddvq_f64(accumV);
+
+ blkCnt = blockSize & 1;
+ while(blkCnt > 0)
+ {
+ pA = *pInA++;
+ pB = *pInB++;
+ accum += pA * logf(pB/pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+#else
float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pSrcB, uint32_t blockSize)
{
@@ -69,7 +124,7 @@ float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pS
return(-accum);
}
-
+#endif
/**
* @} end of Kullback-Leibler group
*/
diff --git a/Source/StatisticsFunctions/arm_max_no_idx_f64.c b/Source/StatisticsFunctions/arm_max_no_idx_f64.c
index 7eca31c5..f6404c01 100644
--- a/Source/StatisticsFunctions/arm_max_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_max_no_idx_f64.c
@@ -3,8 +3,8 @@
* Title: arm_max_no_idx_f64.c
* Description: Maximum value of a floating-point vector without returning the index
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -29,47 +29,112 @@
#include "dsp/statistics_functions.h"
/**
- @ingroup groupStats
+ @ingroup groupStats
*/
/**
- @addtogroup Max
- @{
+ @addtogroup Max
+ @{
*/
/**
- @brief Maximum value of a floating-point vector.
- @param[in] pSrc points to the input vector
- @param[in] blockSize number of samples in input vector
- @param[out] pResult maximum value returned here
- @return none
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @return none
*/
+#if defined(ARM_MATH_NEON)
void arm_max_no_idx_f64(
- const float64_t *pSrc,
- uint32_t blockSize,
- float64_t *pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- float64_t maxValue = F64_MIN;
- float64_t newVal;
-
- while (blockSize > 0U)
- {
- newVal = *pSrc++;
-
- /* compare for the maximum value */
- if (maxValue < newVal)
- {
- /* Update the maximum value and it's index */
- maxValue = newVal;
- }
-
- blockSize --;
- }
+ float64_t maxVal , in; /* Temporary variables to store the output value. */
+ uint32_t blkCnt; /* Loop counter */
+
+ float64x2_t maxV;
+ float64x2_t pSrcV ;
+ pSrcV = vld1q_f64(pSrc);
+ pSrc += 2 ;
+ maxV = pSrcV;
+
+
+
+
+ /* Load first input value that act as reference value for comparision */
+
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 2U) >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ pSrcV = vld1q_f64(pSrc);
+ maxV = vmaxq_f64(maxV, pSrcV);
+
+ pSrc += 2 ;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ maxVal =vgetq_lane_f64(maxV, 0);
+ if(maxVal < vgetq_lane_f64(maxV, 1))
+ {
+ maxVal = vgetq_lane_f64(maxV, 1);
+ }
+ blkCnt = (blockSize - 2U) & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ in = *pSrc++;
+
+ /* compare for the maximum value */
+ if (maxVal < in)
+ {
+ /* Update the maximum value and it's index */
+ maxVal = in;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ *pResult = maxVal;
+
+
+ /* Store the maximum value and it's index into destination pointers */
+
+}
+#else
+void arm_max_no_idx_f64(
+ const float64_t *pSrc,
+ uint32_t blockSize,
+ float64_t *pResult)
+{
+ float64_t maxValue = F64_MIN;
+ float64_t newVal;
+
+ while (blockSize > 0U)
+ {
+ newVal = *pSrc++;
+
+ /* compare for the maximum value */
+ if (maxValue < newVal)
+ {
+ /* Update the maximum value and it's index */
+ maxValue = newVal;
+ }
+
+ blockSize --;
+ }
- *pResult = maxValue;
+ *pResult = maxValue;
}
+#endif
/**
- @} end of Max group
+ @} end of Max group
*/
diff --git a/Source/StatisticsFunctions/arm_mean_f64.c b/Source/StatisticsFunctions/arm_mean_f64.c
index 72e6f351..577ecf13 100644
--- a/Source/StatisticsFunctions/arm_mean_f64.c
+++ b/Source/StatisticsFunctions/arm_mean_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mean_f64.c
* Description: Mean value of a floating-point vector
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 03 June 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -29,47 +29,90 @@
#include "dsp/statistics_functions.h"
/**
- @ingroup groupStats
+ @ingroup groupStats
*/
/**
- @addtogroup mean
- @{
+ @addtogroup mean
+ @{
*/
/**
- @brief Mean value of a floating-point vector.
- @param[in] pSrc points to the input vector.
- @param[in] blockSize number of samples in input vector.
- @param[out] pResult mean value returned here.
- @return none
+ @brief Mean value of a floating-point vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult mean value returned here.
+ @return none
*/
+
+#if defined(ARM_MATH_NEON)
+
void arm_mean_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary result storage */
+ uint32_t blkCnt; /* Loop counter */
+ float64x2_t vSum = vdupq_n_f64(0.0f);
+ float64_t sum = 0.; /* Temporary result storage */
+ float64x2_t afterLoad ;
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize >> 1U;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ afterLoad = vld1q_f64(pSrc);
+ vSum = vaddq_f64(vSum, afterLoad);
+ pSrc += 2;
+ /* Decrement loop counter */
+ blkCnt--;
+
+ }
+ sum = vaddvq_f64(vSum);
+
+ blkCnt = blockSize & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ *pResult = (sum/blockSize);
+}
+#else
+void arm_mean_f64(
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
-
+
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
-
+
/* Decrement loop counter */
blkCnt--;
}
-
+
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (sum / blockSize);
}
-
+#endif
/**
- @} end of mean group
+ @} end of mean group
*/
diff --git a/Source/StatisticsFunctions/arm_min_no_idx_f64.c b/Source/StatisticsFunctions/arm_min_no_idx_f64.c
index 6e69572d..b9696ff8 100755
--- a/Source/StatisticsFunctions/arm_min_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_min_no_idx_f64.c
@@ -3,8 +3,8 @@
* Title: arm_min_no_idx_f64.c
* Description: Maximum value of a floating-point vector without returning the index
*
- * $Date: 16 November 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -52,8 +52,31 @@ void arm_min_no_idx_f64(
{
float64_t minValue = F64_MAX;
float64_t newVal;
+ uint32_t blkCnt ;
+#if defined(ARM_MATH_NEON)
+ float64x2_t minValueV , newValV ;
+ minValueV = vdupq_n_f64(F64_MAX);
+ blkCnt = blockSize >> 1U;
+ while(blkCnt > 0)
+ {
+ newValV = vld1q_f64(pSrc);
+ minValueV = vminq_f64(minValueV, newValV);
+ pSrc += 2 ;
+ blkCnt--;
+
+ }
+ minValue =vgetq_lane_f64(minValueV, 0);
+ if(minValue > vgetq_lane_f64(minValueV, 1))
+ {
+ minValue = vgetq_lane_f64(minValueV, 1);
+ }
+
+ blkCnt = blockSize & 1 ;
+#else
+ blkCnt = blockSize;
+#endif
- while (blockSize > 0U)
+ while (blkCnt > 0U)
{
newVal = *pSrc++;
@@ -64,7 +87,7 @@ void arm_min_no_idx_f64(
minValue = newVal;
}
- blockSize --;
+ blkCnt --;
}
*pResult = minValue;
diff --git a/Source/StatisticsFunctions/arm_mse_f64.c b/Source/StatisticsFunctions/arm_mse_f64.c
index b785bf82..13daa2cb 100755
--- a/Source/StatisticsFunctions/arm_mse_f64.c
+++ b/Source/StatisticsFunctions/arm_mse_f64.c
@@ -3,8 +3,8 @@
* Title: arm_mse_f64.c
* Description: Double floating point mean square error
*
- * $Date: 05 April 2022
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -29,82 +29,106 @@
#include "dsp/statistics_functions.h"
/**
- @ingroup groupStats
+ @ingroup groupStats
*/
/**
- @addtogroup MSE
- @{
+ @addtogroup MSE
+ @{
*/
/**
- @brief Mean square error between two double floating point vectors.
- @param[in] pSrcA points to the first input vector
- @param[in] pSrcB points to the second input vector
- @param[in] blockSize number of samples in input vector
- @param[out] result mean square error
- @return none
+ @brief Mean square error between two double floating point vectors.
+ @param[in] pSrcA points to the first input vector
+ @param[in] pSrcB points to the second input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] result mean square error
+ @return none
*/
-
-
-
-
void arm_mse_f64(
- const float64_t * pSrcA,
- const float64_t * pSrcB,
- uint32_t blockSize,
- float64_t * result)
+ const float64_t * pSrcA,
+ const float64_t * pSrcB,
+ uint32_t blockSize,
+ float64_t * result)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t inA, inB;
- float64_t sum = 0.0; /* Temporary return variable */
+
+ uint32_t blkCnt; /* Loop counter */
+ float64_t inA, inB;
+ float64_t sum = 0.0;
+#if defined (ARM_MATH_NEON)
+
+ float64x2_t inAV , inBV , subV, sumV;
+ sumV = vdupq_n_f64(0.0f);
+
+ blkCnt = blockSize >> 1U ;
+
+ while (blkCnt > 0U)
+ {
+ inAV = vld1q_f64(pSrcA);
+ pSrcA+=2;
+ inBV = vld1q_f64(pSrcB);
+ pSrcB+=2;
+ subV = vsubq_f64(inAV, inBV);
+ sumV = vmlaq_f64(sumV, subV, subV);
+
+ blkCnt--;
+
+ }
+ sum = vaddvq_f64(sumV);
+ blkCnt = (blockSize) & 1;
+
+#else
+ /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
- blkCnt = (blockSize) >> 1;
-
-
- while (blkCnt > 0U)
- {
-
-
- inA = *pSrcA++;
- inB = *pSrcB++;
- inA = inA - inB;
- sum += inA * inA;
-
- inA = *pSrcA++;
- inB = *pSrcB++;
- inA = inA - inB;
- sum += inA * inA;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
-
- /* Loop unrolling: Compute remaining outputs */
- blkCnt = (blockSize) & 1;
+ blkCnt = (blockSize) >> 1;
+
+#pragma clang loop vectorize(enable)
+ while (blkCnt > 0U)
+ {
+
+
+ inA = *pSrcA++;
+ inB = *pSrcB++;
+ inA = inA - inB;
+ sum += inA * inA;
+
+ inA = *pSrcA++;
+ inB = *pSrcB++;
+ inA = inA - inB;
+ sum += inA * inA;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize) & 1;
#else
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
#endif
- while (blkCnt > 0U)
- {
- inA = *pSrcA++;
- inB = *pSrcB++;
- inA = inA - inB;
- sum += inA * inA;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result in destination buffer */
- *result = sum / blockSize;
+#endif
+//#pragma clang loop vectorize(enable) unroll(disable)
+ while (blkCnt > 0U)
+ {
+ inA = *pSrcA++;
+ inB = *pSrcB++;
+ inA = inA - inB;
+ sum += inA * inA;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *result = sum / blockSize;
}
+
/**
- @} end of MSE group
+ @} end of MSE group
*/
diff --git a/Source/StatisticsFunctions/arm_power_f64.c b/Source/StatisticsFunctions/arm_power_f64.c
index 057e9f7c..710dc834 100644
--- a/Source/StatisticsFunctions/arm_power_f64.c
+++ b/Source/StatisticsFunctions/arm_power_f64.c
@@ -3,8 +3,8 @@
* Title: arm_power_f64.c
* Description: Sum of the squares of the elements of a floating-point vector
*
- * $Date: 13 September 2021
- * $Revision: V1.10.0
+ * $Date: 10 August 2022
+ * $Revision: V1.10.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
@@ -29,49 +29,100 @@
#include "dsp/statistics_functions.h"
/**
- @ingroup groupStats
+ @ingroup groupStats
*/
/**
- @addtogroup power
- @{
+ @addtogroup power
+ @{
*/
/**
- @brief Sum of the squares of the elements of a floating-point vector.
- @param[in] pSrc points to the input vector
- @param[in] blockSize number of samples in input vector
- @param[out] pResult sum of the squares value returned here
- @return none
+ @brief Sum of the squares of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
*/
+#if defined(ARM_MATH_NEON)
void arm_power_f64(
- const float64_t * pSrc,
- uint32_t blockSize,
- float64_t * pResult)
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
{
- uint32_t blkCnt; /* Loop counter */
- float64_t sum = 0.; /* Temporary result storage */
- float64_t in; /* Temporary variable to store input value */
-
- /* Initialize blkCnt with number of samples */
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-
- /* Compute Power and store result in a temporary variable, sum. */
- in = *pSrc++;
- sum += in * in;
-
- /* Decrement loop counter */
- blkCnt--;
- }
-
- /* Store result to destination */
- *pResult = sum;
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+ float64x2_t sumV ; /* Temporary variable to store input value */
+ sumV = vdupq_n_f64(0.0f);
+ float64x2_t pSrcV;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize >> 1U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ pSrcV = vld1q_f64(pSrc);
+ sumV = vmlaq_f64(sumV, pSrcV, pSrcV);
+ pSrc+= 2 ;
+
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ sum = vaddvq_f64(sumV);
+
+
+ float64_t in;
+ blkCnt = blockSize & 1;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result to destination */
+ *pResult = sum;
}
-
+#else
+void arm_power_f64(
+ const float64_t * pSrc,
+ uint32_t blockSize,
+ float64_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float64_t sum = 0.; /* Temporary result storage */
+ float64_t in; /* Temporary variable to store input value */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute Power and store result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store result to destination */
+ *pResult = sum;
+}
+#endif
/**
- @} end of power group
+ @} end of power group
*/