From e5753198af8d81e10058e0045628f44b786e3fff Mon Sep 17 00:00:00 2001 From: Silfurion Date: Wed, 10 Aug 2022 16:40:59 +0200 Subject: [PATCH] Added functions optimised for 64 bit float --- Source/BasicMathFunctions/arm_dot_prod_f64.c | 60 ++- .../arm_chebyshev_distance_f64.c | 42 ++- .../arm_cityblock_distance_f64.c | 32 +- .../arm_euclidean_distance_f64.c | 29 +- Source/FastMathFunctions/arm_vexp_f64.c | 49 ++- Source/FastMathFunctions/arm_vlog_f64.c | 29 +- Source/FilteringFunctions/arm_correlate_f64.c | 82 ++++- Source/FilteringFunctions/arm_fir_f64.c | 101 +++++- Source/MatrixFunctions/arm_mat_cholesky_f64.c | 163 ++++++++- Source/MatrixFunctions/arm_mat_mult_f64.c | 341 +++++++++++++++++- .../arm_mat_solve_lower_triangular_f64.c | 108 +++++- .../arm_mat_solve_upper_triangular_f64.c | 104 +++++- Source/MatrixFunctions/arm_mat_trans_f64.c | 132 ++++++- .../arm_absmax_no_idx_f64.c | 148 ++++++-- .../arm_absmin_no_idx_f64.c | 71 +++- Source/StatisticsFunctions/arm_entropy_f64.c | 26 +- .../arm_kullback_leibler_f64.c | 61 +++- .../StatisticsFunctions/arm_max_no_idx_f64.c | 127 +++++-- Source/StatisticsFunctions/arm_mean_f64.c | 83 ++++- .../StatisticsFunctions/arm_min_no_idx_f64.c | 31 +- Source/StatisticsFunctions/arm_mse_f64.c | 148 ++++---- Source/StatisticsFunctions/arm_power_f64.c | 123 +++++-- 22 files changed, 1822 insertions(+), 268 deletions(-) diff --git a/Source/BasicMathFunctions/arm_dot_prod_f64.c b/Source/BasicMathFunctions/arm_dot_prod_f64.c index a4cd07cc..4e300331 100644 --- a/Source/BasicMathFunctions/arm_dot_prod_f64.c +++ b/Source/BasicMathFunctions/arm_dot_prod_f64.c @@ -3,8 +3,8 @@ * Title: arm_dot_prod_f64.c * Description: Floating-point dot product * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 03 June 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -45,7 +45,62 @@ @param[out] result output result returned here. @return none */ +#if defined(ARM_MATH_NEON) +void arm_dot_prod_f64( + const float64_t * pSrcA, + const float64_t * pSrcB, + uint32_t blockSize, + float64_t * result) + { + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary return variable */ + float64x2_t sumV ; /* Neon buffer for sum variable */ + + /* Neon Buffer Initialisation */ + sumV = vsetq_lane_f64(0.0f, sumV, 0); + sumV = vsetq_lane_f64(0.0f, sumV, 1); + + /* Neon Buffer for sources */ + float64x2_t pSrcAV; + float64x2_t pSrcBV; + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ + + /* Load source value in Neon Buffer */ + pSrcAV = vld1q_f64(pSrcA); + pSrcBV = vld1q_f64(pSrcB); + /* Calculate dot product and store result in a temporary buffer. */ + sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV); + + pSrcA+=2; + pSrcB+=2; + /* Decrement loop counter */ + blkCnt--; + } + /* Sum both 64 bits part in the float64x2 */ + sum = vaddvq_f64(sumV); + + + /* Tail */ + blkCnt = blockSize & 1 ; + + while(blkCnt > 0U) + { + sum += (*pSrcA++) * (*pSrcB++); + + /* Decrement loop counter */ + blkCnt--; + } + /* Store result in destination buffer */ + *result = sum; + } +#else void arm_dot_prod_f64( const float64_t * pSrcA, const float64_t * pSrcB, @@ -72,6 +127,7 @@ void arm_dot_prod_f64( /* Store result in destination buffer */ *result = sum; } +#endif /** @} end of BasicDotProd group diff --git a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c index 899159dd..63090948 100644 --- a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c +++ b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c @@ -4,8 +4,8 @@ * Title: arm_chebyshev_distance_f64.c * Description: Chebyshev distance between two vectors * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -48,15 +48,37 @@ */ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize) { - float64_t diff=0., maxVal,tmpA, tmpB; - tmpA = *pA++; - tmpB = *pB++; - diff = fabs(tmpA - tmpB); - maxVal = diff; - blockSize--; + float64_t diff=0., maxVal,tmpA, tmpB; + uint32_t blkCnt; + maxVal = F64_MIN; +#if defined(ARM_MATH_NEON) + float64x2_t diffV , tmpAV , tmpBV , maxValV ; + maxValV = vdupq_n_f64(maxVal); + blkCnt = blockSize >> 1U ; + while(blkCnt > 0U) + { + tmpAV = vld1q_f64(pA); + tmpBV = vld1q_f64(pB); + diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV))); + maxValV = vmaxq_f64(maxValV, diffV); + pA+=2; + pB+=2; + blkCnt--; + } + maxVal =vgetq_lane_f64(maxValV, 0); + if(maxVal < vgetq_lane_f64(maxValV, 1)) + { + maxVal = vgetq_lane_f64(maxValV, 1); + } + blkCnt = blockSize & 1; + + +#else + blkCnt = blockSize; +#endif - while(blockSize > 0) + while(blkCnt > 0) { tmpA = *pA++; tmpB = *pB++; @@ -65,7 +87,7 @@ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, ui { maxVal = diff; } - blockSize --; + blkCnt --; } return(maxVal); diff --git a/Source/DistanceFunctions/arm_cityblock_distance_f64.c b/Source/DistanceFunctions/arm_cityblock_distance_f64.c index 46c0a6df..2f63ffae 100644 --- a/Source/DistanceFunctions/arm_cityblock_distance_f64.c +++ b/Source/DistanceFunctions/arm_cityblock_distance_f64.c @@ -4,8 +4,8 @@ * Title: arm_cityblock_distance_f64.c * Description: Cityblock (Manhattan) distance between two vectors * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -48,15 +48,37 @@ float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize) { float64_t accum,tmpA, tmpB; - + uint32_t blkCnt; accum = 0.; - while(blockSize > 0) + +#if defined(ARM_MATH_NEON) + float64x2_t tmpAV, tmpBV,accumV , subV; + accumV = vdupq_n_f64(0.0f); + blkCnt = blockSize >> 1U; + + while(blkCnt > 0U) + { + tmpAV = vld1q_f64(pA); + tmpBV = vld1q_f64(pB); + subV = vabdq_f64(tmpAV, tmpBV); + accumV = vaddq_f64(accumV, subV); + pA+=2; + pB+=2; + blkCnt--; + } + accum = vaddvq_f64(accumV); + blkCnt = blockSize & 1 ; + +#else + blkCnt = blockSize; +#endif + while(blkCnt > 0) { tmpA = *pA++; tmpB = *pB++; accum += fabs(tmpA - tmpB); - blockSize --; + blkCnt--; } return(accum); diff --git a/Source/DistanceFunctions/arm_euclidean_distance_f64.c b/Source/DistanceFunctions/arm_euclidean_distance_f64.c index 8c8dfde5..794bf93d 100644 --- a/Source/DistanceFunctions/arm_euclidean_distance_f64.c +++ b/Source/DistanceFunctions/arm_euclidean_distance_f64.c @@ -4,8 +4,8 @@ * Title: arm_euclidean_distance_f64.c * Description: Euclidean distance between two vectors * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -50,12 +50,31 @@ float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize) { float64_t accum=0.,tmp; - - while(blockSize > 0) + uint32_t blkCnt; +#if defined(ARM_MATH_NEON) + float64x2_t accumV,tmpV , pAV ,pBV; + accumV = vdupq_n_f64(0.0f); + blkCnt = blockSize >> 1U; + while(blkCnt > 0U) + { + pAV = vld1q_f64(pA); + pBV = vld1q_f64(pB); + tmpV = vsubq_f64(pAV, pBV); + accumV = vmlaq_f64(accumV, tmpV, tmpV); + pA+=2; + pB+=2; + blkCnt--; + } + accum = vaddvq_f64(accumV); + blkCnt = blockSize & 1; +#else + blkCnt = blockSize; +#endif + while(blkCnt > 0) { tmp = *pA++ - *pB++; accum += SQ(tmp); - blockSize --; + blkCnt --; } tmp = sqrt(accum); return(tmp); diff --git a/Source/FastMathFunctions/arm_vexp_f64.c b/Source/FastMathFunctions/arm_vexp_f64.c index 8f27ad83..e6c8f359 100644 --- a/Source/FastMathFunctions/arm_vexp_f64.c +++ b/Source/FastMathFunctions/arm_vexp_f64.c @@ -3,8 +3,8 @@ * Title: arm_vlog_f64.c * Description: Fast vectorized log * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -28,31 +28,44 @@ #include "dsp/fast_math_functions.h" #include "arm_common_tables.h" +#if defined(ARM_MATH_NEON) +#include "arm_vec_math.h" +#endif -/** - @addtogroup vexp - @{ - */ - -/** - @brief Floating-point vector of exp values. - @param[in] pSrc points to the input vector - @param[out] pDst points to the output vector - @param[in] blockSize number of samples in each vector - @return none - */ void arm_vexp_f64( const float64_t * pSrc, float64_t * pDst, uint32_t blockSize) { - uint32_t blkCnt; + uint32_t blkCnt; +#if defined(ARM_MATH_NEON) + + + float64x2_t src; + float64x2_t dst; - blkCnt = blockSize; + blkCnt = blockSize >> 1U; + while (blkCnt > 0U) + { + src = vld1q_f64(pSrc); + dst = vexpq_f64(src); + vst1q_f64(pDst, dst); + + pSrc += 2; + pDst += 2; + + blkCnt--; + } + + blkCnt = blockSize & 1; +#else + blkCnt = blockSize; +#endif while (blkCnt > 0U) { /* C = log(A) */ + /* Calculate log and store result in destination buffer. */ *pDst++ = exp(*pSrc++); @@ -61,7 +74,3 @@ void arm_vexp_f64( blkCnt--; } } - -/** - @} end of vexp group - */ \ No newline at end of file diff --git a/Source/FastMathFunctions/arm_vlog_f64.c b/Source/FastMathFunctions/arm_vlog_f64.c index a60fff69..b9ab25ec 100644 --- a/Source/FastMathFunctions/arm_vlog_f64.c +++ b/Source/FastMathFunctions/arm_vlog_f64.c @@ -3,8 +3,8 @@ * Title: arm_vlog_f64.c * Description: Fast vectorized log * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -29,14 +29,39 @@ #include "dsp/fast_math_functions.h" #include "arm_common_tables.h" +#if defined(ARM_MATH_NEON) +#include "arm_vec_math.h" +#endif + void arm_vlog_f64( const float64_t * pSrc, float64_t * pDst, uint32_t blockSize) { uint32_t blkCnt; +#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) + float64x2_t src; + float64x2_t dst; + + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + src = vld1q_f64(pSrc); + dst = vlogq_f64(src); + vst1q_f64(pDst, dst); + pSrc += 2; + pDst += 2; + /* Decrement loop counter */ + blkCnt--; + } + + blkCnt = blockSize & 1; +#else blkCnt = blockSize; +#endif + while (blkCnt > 0U) { diff --git a/Source/FilteringFunctions/arm_correlate_f64.c b/Source/FilteringFunctions/arm_correlate_f64.c index 95edd40f..8d52bcf6 100644 --- a/Source/FilteringFunctions/arm_correlate_f64.c +++ b/Source/FilteringFunctions/arm_correlate_f64.c @@ -3,8 +3,8 @@ * Title: arm_correlate_f64.c * Description: Correlation of floating-point sequences * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -65,6 +65,9 @@ void arm_correlate_f64( uint32_t j, k, count, blkCnt; /* Loop counters */ uint32_t outBlockSize; /* Loop counter */ int32_t inc = 1; /* Destination address modifier */ +#if defined(ARM_MATH_NEON) + float64x2_t sumV,pxV,pyV ; +#endif /* The algorithm implementation is based on the lengths of the inputs. */ /* srcB is always made to slide across srcA. */ @@ -164,10 +167,25 @@ void arm_correlate_f64( { /* Accumulator is made zero for every iteration */ sum = 0.; - +#if defined(ARM_MATH_NEON) + sumV = vdupq_n_f64(0.0f); + k = count >> 1U ; + + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = count & 1 ; +#else /* Initialize k with number of samples */ k = count; - +#endif while (k > 0U) { /* Perform the multiply-accumulate */ @@ -179,6 +197,7 @@ void arm_correlate_f64( } /* Store the result in the accumulator in the destination buffer. */ + *pOut = sum; /* Destination pointer is updated according to the address modifier, inc */ pOut += inc; @@ -192,6 +211,7 @@ void arm_correlate_f64( /* Decrement loop counter */ blockSize1--; + } /* -------------------------- @@ -229,10 +249,26 @@ void arm_correlate_f64( { /* Accumulator is made zero for every iteration */ sum = 0.; +#if defined(ARM_MATH_NEON) + sumV = vdupq_n_f64(0.0f); + k = srcBLen >> 1U ; + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = srcBLen & 1 ; + +#else /* Initialize blkCnt with number of samples */ k = srcBLen; - +#endif while (k > 0U) { /* Perform the multiply-accumulate */ @@ -269,10 +305,26 @@ void arm_correlate_f64( { /* Accumulator is made zero for every iteration */ sum = 0.; +#if defined(ARM_MATH_NEON) + sumV = vdupq_n_f64(0.0f); + k = srcBLen >> 1U ; + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = srcBLen & 1 ; + +#else /* Loop over srcBLen */ k = srcBLen; - +#endif while (k > 0U) { /* Perform the multiply-accumulate */ @@ -330,10 +382,26 @@ void arm_correlate_f64( { /* Accumulator is made zero for every iteration */ sum = 0.; +#if defined(ARM_MATH_NEON) + sumV = vdupq_n_f64(0.0f); + k = count >> 1U ; + + while(k > 0U) + { + pxV = vld1q_f64(px); + pyV = vld1q_f64(py); + sumV = vmlaq_f64(sumV, pxV, pyV); + px+=2; + py+=2; + k--; + } + sum =vaddvq_f64(sumV); + k = count & 1 ; +#else /* Initialize blkCnt with number of samples */ k = count; - +#endif while (k > 0U) { /* Perform the multiply-accumulate */ diff --git a/Source/FilteringFunctions/arm_fir_f64.c b/Source/FilteringFunctions/arm_fir_f64.c index 62e7e998..51fc8ece 100644 --- a/Source/FilteringFunctions/arm_fir_f64.c +++ b/Source/FilteringFunctions/arm_fir_f64.c @@ -3,8 +3,8 @@ * Title: arm_fir_f64.c * Description: Floating-point FIR filter processing function * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 03 June 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -45,7 +45,103 @@ @param[in] blockSize number of samples to process @return none */ +#if defined(ARM_MATH_NEON) +void arm_fir_f64( + const arm_fir_instance_f64 * S, + const float64_t * pSrc, + float64_t * pDst, + uint32_t blockSize) +{ + float64_t *pState = S->pState; /* State pointer */ + const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float64_t *pStateCurnt; /* Points to the current sample of the state */ + float64_t *px; /* Temporary pointer for state buffer */ + const float64_t *pb; /* Temporary pointer for coefficient buffer */ + float64x2_t pxV; + float64x2_t pbV; + float64x2_t acc0V; + float64_t acc0; /* Accumulator */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t i, tapCnt, blkCnt; /* Loop counters */ + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* Initialize blkCnt with number of taps */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy one sample at a time into state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Set the accumulator to zero */ + acc0 = 0.; + acc0V = vdupq_n_f64(0.0f); + + /* Initialize state pointer */ + px = pState; + + /* Initialize Coefficient pointer */ + pb = pCoeffs; + + i = numTaps >> 1U; + + /* Perform the multiply-accumulates */ + while (i > 0U) + { + /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ + pxV = vld1q_f64(px); + pbV = vld1q_f64(pb); + acc0V = vmlaq_f64(acc0V, pxV, pbV); + px+=2; + pb+=2; + + i--; + } + + acc0 = vaddvq_f64(acc0V); + i = numTaps%2 ; + while(i >0U) + { + acc0+= *px++ * *pb++ ; + i--; + } + + /* Store result in destination buffer. */ + + *pDst++ = acc0; + + /* Advance state pointer by 1 for the next sample */ + pState = pState + 1U; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Processing is complete. + Now copy the last numTaps - 1 samples to the start of the state buffer. + This prepares the state buffer for the next function call. */ + + /* Points to the start of the state buffer */ + pStateCurnt = S->pState; + + /* Initialize tapCnt with number of taps */ + tapCnt = (numTaps - 1U); + + /* Copy remaining data */ + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement loop counter */ + tapCnt--; + } + +} +#else void arm_fir_f64( const arm_fir_instance_f64 * S, const float64_t * pSrc, @@ -123,6 +219,7 @@ void arm_fir_f64( } } +#endif /** * @} end of FIR group diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/Source/MatrixFunctions/arm_mat_cholesky_f64.c index 7669582d..b1605116 100755 --- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c +++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c @@ -3,8 +3,8 @@ * Title: arm_mat_cholesky_f64.c * Description: Floating-point Cholesky decomposition * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -52,6 +52,7 @@ * The decomposition of A is returning a lower triangular matrix L such that A = L L^t */ +#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) arm_status arm_mat_cholesky_f64( const arm_matrix_instance_f64 * pSrc, @@ -61,6 +62,163 @@ arm_status arm_mat_cholesky_f64( arm_status status; /* status of matrix inverse */ +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((pSrc->numRows != pSrc->numCols) || + (pDst->numRows != pDst->numCols) || + (pSrc->numRows != pDst->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + + { + int i,j,k; + int n = pSrc->numRows; + float64_t invSqrtVj; + float64_t *pA,*pG; + int kCnt; + + + float64x2_t acc, acc0, acc1, acc2, acc3; + float64x2_t vecGi; + float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3; + + float64x2_t tmp = vdupq_n_f64(0.0f); + + float64_t sum=0.0f; + float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f; + + + pA = pSrc->pData; + pG = pDst->pData; + + for(i=0 ;i < n ; i++) + { + for(j=i ; j+3 < n ; j+=4) + { + pG[(j + 0) * n + i] = pA[(j + 0) * n + i]; + pG[(j + 1) * n + i] = pA[(j + 1) * n + i]; + pG[(j + 2) * n + i] = pA[(j + 2) * n + i]; + pG[(j + 3) * n + i] = pA[(j + 3) * n + i]; + + acc0 = vdupq_n_f64(0.0f); + acc1 = vdupq_n_f64(0.0f); + acc2 = vdupq_n_f64(0.0f); + acc3 = vdupq_n_f64(0.0f); + + kCnt = i >> 1U; + k=0; + while(kCnt > 0) + { + + vecGi=vld1q_f64(&pG[i * n + k]); + + vecGj0=vld1q_f64(&pG[(j + 0) * n + k]); + vecGj1=vld1q_f64(&pG[(j + 1) * n + k]); + vecGj2=vld1q_f64(&pG[(j + 2) * n + k]); + vecGj3=vld1q_f64(&pG[(j + 3) * n + k]); + + acc0 = vfmaq_f64(acc0, vecGi, vecGj0); + acc1 = vfmaq_f64(acc1, vecGi, vecGj1); + acc2 = vfmaq_f64(acc2, vecGi, vecGj2); + acc3 = vfmaq_f64(acc3, vecGi, vecGj3); + + kCnt--; + k+=2; + } + + + sum0 = vaddvq_f64(acc0); + sum1 = vaddvq_f64(acc1); + sum2 = vaddvq_f64(acc2); + sum3 = vaddvq_f64(acc3); + + + kCnt = i & 1; + while(kCnt > 0) + { + + sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k]; + sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k]; + sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k]; + sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k]; + kCnt--; + k++; + } + + pG[(j + 0) * n + i] -= sum0; + pG[(j + 1) * n + i] -= sum1; + pG[(j + 2) * n + i] -= sum2; + pG[(j + 3) * n + i] -= sum3; + } + + for(; j < n ; j++) + { + pG[j * n + i] = pA[j * n + i]; + + acc = vdupq_n_f64(0.0f); + + kCnt = i >> 1U; + k=0; + while(kCnt > 0) + { + + vecGi=vld1q_f64(&pG[i * n + k]); + vecGj=vld1q_f64(&pG[j * n + k]); + + acc = vfmaq_f64(acc, vecGi, vecGj); + + kCnt--; + k+=2; + } + + + sum = vaddvq_f64(acc); + + kCnt = i & 1; + while(kCnt > 0) + { + sum = sum + pG[i * n + k] * pG[(j + 0) * n + k]; + + + kCnt--; + k++; + } + + pG[j * n + i] -= sum; + } + + if (pG[i * n + i] <= 0.0f) + { + return(ARM_MATH_DECOMPOSITION_FAILURE); + } + + invSqrtVj = 1.0f/sqrtf(pG[i * n + i]); + SCALE_COL_F64(pDst,i,invSqrtVj,i); + } + + status = ARM_MATH_SUCCESS; + + } + + + /* Return to application */ + return (status); +} +#else +arm_status arm_mat_cholesky_f64( + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst) +{ + + arm_status status; /* status of matrix inverse */ + + #ifdef ARM_MATH_MATRIX_CHECK /* Check for matrix mismatch condition */ @@ -115,6 +273,7 @@ arm_status arm_mat_cholesky_f64( /* Return to application */ return (status); } +#endif /** @} end of MatrixChol group diff --git a/Source/MatrixFunctions/arm_mat_mult_f64.c b/Source/MatrixFunctions/arm_mat_mult_f64.c index ddc60101..1a778e18 100755 --- a/Source/MatrixFunctions/arm_mat_mult_f64.c +++ b/Source/MatrixFunctions/arm_mat_mult_f64.c @@ -3,8 +3,8 @@ * Title: arm_mat_mult_f64.c * Description: Floating-point matrix multiplication * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -27,11 +27,31 @@ */ #include "dsp/matrix_functions.h" +#if defined(ARM_MATH_NEON) +#define GROUPOFROWS 8 +#endif /** * @ingroup groupMatrix */ +/** + * @defgroup MatrixMult Matrix Multiplication + * + * Multiplies two matrices. + * + * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices" + + * Matrix multiplication is only defined if the number of columns of the + * first matrix equals the number of rows of the second matrix. + * Multiplying an M x N matrix with an N x P matrix results + * in an M x P matrix. + * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of + * pSrcA and pSrcB are equal; and (2) that the size of the output + * matrix equals the outer dimensions of pSrcA and pSrcB. + */ + + /** * @addtogroup MatrixMult * @{ @@ -46,7 +66,322 @@ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. */ +#if defined(ARM_MATH_NEON) +/** + * @brief Floating-point matrix multiplication. + * @param[in] *pSrcA points to the first input matrix structure + * @param[in] *pSrcB points to the second input matrix structure + * @param[out] *pDst points to output matrix structure + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_f64( + const arm_matrix_instance_f64 * pSrcA, + const arm_matrix_instance_f64 * pSrcB, + arm_matrix_instance_f64 * pDst) +{ + float64_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */ + float64_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */ + float64_t *pInA = pSrcA->pData; /* input data matrix pointer A */ + float64_t *pOut = pDst->pData; /* output data matrix pointer */ + float64_t *px; /* Temporary output data matrix pointer */ + float64_t sum; /* Accumulator */ + uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ + uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ + uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ + + + uint32_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */ + arm_status status; /* status of matrix multiplication */ + + float64x2_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V; + float64x2_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp; + float64x2_t accum = vdupq_n_f64(0); + float64_t *pIn1B = pSrcA->pData; + float64_t *pIn1C = pSrcA->pData; + float64_t *pIn1D = pSrcA->pData; + float64_t *pIn1E = pSrcA->pData; + float64_t *pIn1F = pSrcA->pData; + float64_t *pIn1G = pSrcA->pData; + float64_t *pIn1H = pSrcA->pData; + + float64_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */ + float64_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7; + +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((pSrcA->numCols != pSrcB->numRows) || + (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + { + /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ + /* Row loop */ + rowCnt = row >> 3; + + while(rowCnt > 0) + { + /* Output pointer is set to starting address of the row being processed */ + px = pOut + GROUPOFROWS*i; + pxB = px + numColsB; + pxC = px + 2*numColsB; + pxD = px + 3*numColsB; + pxE = px + 4*numColsB; + pxF = px + 5*numColsB; + pxG = px + 6*numColsB; + pxH = px + 7*numColsB; + + /* For every row wise process, the column loop counter is to be initiated */ + col = numColsB; + + /* For every row wise process, the pIn2 pointer is set + ** to the starting address of the pSrcB data */ + pIn2 = pSrcB->pData; + + j = 0U; + + /* Column loop */ + do + { + /* Set the variable sum, that acts as accumulator, to zero */ + sum0 = 0.0f; + sum1 = 0.0f; + sum2 = 0.0f; + sum3 = 0.0f; + sum4 = 0.0f; + sum5 = 0.0f; + sum6 = 0.0f; + sum7 = 0.0f; + + /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ + pIn1 = pInA; + pIn1B = pIn1 + numColsA; + pIn1C = pIn1 + 2*numColsA; + pIn1D = pIn1 + 3*numColsA; + pIn1E = pIn1 + 4*numColsA; + pIn1F = pIn1 + 5*numColsA; + pIn1G = pIn1 + 6*numColsA; + pIn1H = pIn1 + 7*numColsA; + + acc0 = vdupq_n_f64(0.0); + acc1 = vdupq_n_f64(0.0); + acc2 = vdupq_n_f64(0.0); + acc3 = vdupq_n_f64(0.0); + acc4 = vdupq_n_f64(0.0); + acc5 = vdupq_n_f64(0.0); + acc6 = vdupq_n_f64(0.0); + acc7 = vdupq_n_f64(0.0); + + /* Compute 2 MACs simultaneously. */ + colCnt = numColsA >> 1U; + + /* Matrix multiplication */ + while (colCnt > 0U) + { + /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */ + a0V = vld1q_f64(pIn1); + a1V = vld1q_f64(pIn1B); + a2V = vld1q_f64(pIn1C); + a3V = vld1q_f64(pIn1D); + a4V = vld1q_f64(pIn1E); + a5V = vld1q_f64(pIn1F); + a6V = vld1q_f64(pIn1G); + a7V = vld1q_f64(pIn1H); + + pIn1 += 2; + pIn1B += 2; + pIn1C += 2; + pIn1D += 2; + pIn1E += 2; + pIn1F += 2; + pIn1G += 2; + pIn1H += 2; + + temp = vsetq_lane_f64(*pIn2,temp,0); + pIn2 += numColsB; + temp = vsetq_lane_f64(*pIn2,temp,1); + pIn2 += numColsB; + + + acc0 = vmlaq_f64(acc0,a0V,temp); + acc1 = vmlaq_f64(acc1,a1V,temp); + acc2 = vmlaq_f64(acc2,a2V,temp); + acc3 = vmlaq_f64(acc3,a3V,temp); + acc4 = vmlaq_f64(acc4,a4V,temp); + acc5 = vmlaq_f64(acc5,a5V,temp); + acc6 = vmlaq_f64(acc6,a6V,temp); + acc7 = vmlaq_f64(acc7,a7V,temp); + + /* Decrement the loop count */ + colCnt--; + } + + sum0 += vaddvq_f64(acc0); + sum1 += vaddvq_f64(acc1); + sum2 += vaddvq_f64(acc2); + sum3 += vaddvq_f64(acc3); + sum4 += vaddvq_f64(acc4); + sum5 += vaddvq_f64(acc5); + sum6 += vaddvq_f64(acc6); + sum7 += vaddvq_f64(acc7); + + /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + colCnt = numColsA & 1; + + while (colCnt > 0U) + { + /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */ + sum0 += *pIn1++ * (*pIn2); + sum1 += *pIn1B++ * (*pIn2); + sum2 += *pIn1C++ * (*pIn2); + sum3 += *pIn1D++ * (*pIn2); + sum4 += *pIn1E++ * (*pIn2); + sum5 += *pIn1F++ * (*pIn2); + sum6 += *pIn1G++ * (*pIn2); + sum7 += *pIn1H++ * (*pIn2); + pIn2 += numColsB; + + /* Decrement the loop counter */ + colCnt--; + } + + /* Store the result in the destination buffer */ + *px++ = sum0; + *pxB++ = sum1; + *pxC++ = sum2; + *pxD++ = sum3; + *pxE++ = sum4; + *pxF++ = sum5; + *pxG++ = sum6; + *pxH++ = sum7; + + /* Update the pointer pIn2 to point to the starting address of the next column */ + j++; + pIn2 = pSrcB->pData + j; + + /* Decrement the column loop counter */ + col--; + + } while (col > 0U); + + /* Update the pointer pInA to point to the starting address of the next row */ + i = i + numColsB; + pInA = pInA + GROUPOFROWS*numColsA; + + /* Decrement the row loop counter */ + rowCnt--; + } + + /* + + i was the index of a group of rows computed by previous loop. + Now i is the index of a row since below code is computing row per row + and no more group of row per group of rows. + + */ + + i = GROUPOFROWS*i; + rowCnt = row & 7; + + while(rowCnt > 0) + { + /* Output pointer is set to starting address of the row being processed */ + px = pOut + i; + + /* For every row wise process, the column loop counter is to be initiated */ + col = numColsB; + + /* For every row wise process, the pIn2 pointer is set + ** to the starting address of the pSrcB data */ + pIn2 = pSrcB->pData; + + j = 0U; + + /* Column loop */ + do + { + /* Set the variable sum, that acts as accumulator, to zero */ + sum = 0.0f; + + /* Initiate the pointer pIn1 to point to the starting address of the column being processed */ + pIn1 = pInA; + + acc0 = vdupq_n_f64(0.0); + + /* Compute 4 MACs simultaneously. */ + colCnt = numColsA >> 1U; + + /* Matrix multiplication */ + while (colCnt > 0U) + { + /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */ + a0V = vld1q_f64(pIn1); // load & separate real/imag pSrcA (de-interleave 2) + pIn1 += 2; + + temp = vsetq_lane_f64(*pIn2,temp,0); + pIn2 += numColsB; + temp = vsetq_lane_f64(*pIn2,temp,1); + pIn2 += numColsB; + + + acc0 = vmlaq_f64(acc0,a0V,temp); + + /* Decrement the loop count */ + colCnt--; + } + + //accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + sum += vaddvq_f64(acc0); + + /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. + ** No loop unrolling is used. */ + colCnt = numColsA % 0x2U; + + while (colCnt > 0U) + { + /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */ + sum += *pIn1++ * (*pIn2); + pIn2 += numColsB; + + /* Decrement the loop counter */ + colCnt--; + } + + /* Store the result in the destination buffer */ + *px++ = sum; + + /* Update the pointer pIn2 to point to the starting address of the next column */ + j++; + pIn2 = pSrcB->pData + j; + + /* Decrement the column loop counter */ + col--; + } while (col > 0U); + + + /* Update the pointer pInA to point to the starting address of the next row */ + i = i + numColsB; + pInA = pInA + numColsA; + + /* Decrement the row loop counter */ + rowCnt--; + + } + /* Set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); +} +#else arm_status arm_mat_mult_f64( const arm_matrix_instance_f64 * pSrcA, const arm_matrix_instance_f64 * pSrcB, @@ -178,7 +513,7 @@ arm_status arm_mat_mult_f64( /* Return to application */ return (status); } - +#endif /** * @} end of MatrixMult group diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c index d54ff679..49d3ac28 100755 --- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c +++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c @@ -3,8 +3,8 @@ * Title: arm_mat_solve_lower_triangular_f64.c * Description: Solve linear system LT X = A with LT lower triangular matrix * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -46,6 +46,109 @@ * @param[out] dst The solution X of LT . X = A * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. */ + +#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) + arm_status arm_mat_solve_lower_triangular_f64( + const arm_matrix_instance_f64 * lt, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst) + { + arm_status status; /* status of matrix inverse */ + + +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((lt->numRows != lt->numCols) || + (lt->numRows != a->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + + { + /* a1 b1 c1 x1 = a1 + b2 c2 x2 a2 + c3 x3 a3 + + x3 = a3 / c3 + x2 = (a2 - c2 x3) / b2 + + */ + int i,j,k,n,cols; + + n = dst->numRows; + cols = dst->numCols; + + float64_t *pX = dst->pData; + float64_t *pLT = lt->pData; + float64_t *pA = a->pData; + + float64_t *lt_row; + float64_t *a_col; + + float64_t invLT; + + float64x2_t vecA; + float64x2_t vecX; + + for(i=0; i < n ; i++) + { + + for(j=0; j+1 < cols; j += 2) + { + vecA = vld1q_f64(&pA[i * cols + j]); + + for(k=0; k < i; k++) + { + vecX = vld1q_f64(&pX[cols*k+j]); + vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX); + } + + if (pLT[n*i + i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + + invLT = 1.0f / pLT[n*i + i]; + vecA = vmulq_f64(vecA,vdupq_n_f64(invLT)); + vst1q_f64(&pX[i*cols+j],vecA); + + } + + for(; j < cols; j ++) + { + a_col = &pA[j]; + lt_row = &pLT[n*i]; + + float64_t tmp=a_col[i * cols]; + + for(k=0; k < i; k++) + { + tmp -= lt_row[k] * pX[cols*k+j]; + } + + if (lt_row[i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + tmp = tmp / lt_row[i]; + pX[i*cols+j] = tmp; + } + + } + status = ARM_MATH_SUCCESS; + + } + + /* Return to application */ + return (status); +} + +#else arm_status arm_mat_solve_lower_triangular_f64( const arm_matrix_instance_f64 * lt, const arm_matrix_instance_f64 * a, @@ -119,6 +222,7 @@ /* Return to application */ return (status); } +#endif /** @} end of MatrixInv group */ diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c index 248273d4..68c10194 100755 --- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c +++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c @@ -3,8 +3,8 @@ * Title: arm_mat_solve_upper_triangular_f64.c * Description: Solve linear system UT X = A with UT upper triangular matrix * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -46,6 +46,104 @@ * @param[out] dst The solution X of UT . X = A * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. */ + +#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) + arm_status arm_mat_solve_upper_triangular_f64( + const arm_matrix_instance_f64 * ut, + const arm_matrix_instance_f64 * a, + arm_matrix_instance_f64 * dst) + { +arm_status status; /* status of matrix inverse */ + + +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((ut->numRows != ut->numCols) || + (ut->numRows != a->numRows) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + + { + + int i,j,k,n,cols; + + n = dst->numRows; + cols = dst->numCols; + + float64_t *pX = dst->pData; + float64_t *pUT = ut->pData; + float64_t *pA = a->pData; + + float64_t *ut_row; + float64_t *a_col; + + float64_t invUT; + + float64x2_t vecA; + float64x2_t vecX; + + for(i=n-1; i >= 0 ; i--) + { + for(j=0; j+1 < cols; j +=2) + { + vecA = vld1q_f64(&pA[i * cols + j]); + + for(k=n-1; k > i; k--) + { + vecX = vld1q_f64(&pX[cols*k+j]); + vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX); + } + + if (pUT[n*i + i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + + invUT = 1.0f / pUT[n*i + i]; + vecA = vmulq_f64(vecA,vdupq_n_f64(invUT)); + + + vst1q_f64(&pX[i*cols+j],vecA); + } + + for(; j < cols; j ++) + { + a_col = &pA[j]; + + ut_row = &pUT[n*i]; + + float64_t tmp=a_col[i * cols]; + + for(k=n-1; k > i; k--) + { + tmp -= ut_row[k] * pX[cols*k+j]; + } + + if (ut_row[i]==0.0f) + { + return(ARM_MATH_SINGULAR); + } + tmp = tmp / ut_row[i]; + pX[i*cols+j] = tmp; + } + + } + status = ARM_MATH_SUCCESS; + + } + + + /* Return to application */ + return (status); +} + +#else arm_status arm_mat_solve_upper_triangular_f64( const arm_matrix_instance_f64 * ut, const arm_matrix_instance_f64 * a, @@ -113,7 +211,7 @@ arm_status status; /* status of matrix inverse */ /* Return to application */ return (status); } - +#endif /** @} end of MatrixInv group diff --git a/Source/MatrixFunctions/arm_mat_trans_f64.c b/Source/MatrixFunctions/arm_mat_trans_f64.c index fee5a074..08184731 100755 --- a/Source/MatrixFunctions/arm_mat_trans_f64.c +++ b/Source/MatrixFunctions/arm_mat_trans_f64.c @@ -3,8 +3,8 @@ * Title: arm_mat_trans_f64.c * Description: Floating-point matrix transpose * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -32,7 +32,14 @@ @ingroup groupMatrix */ +/** + @defgroup MatrixTrans Matrix Transpose + + Tranposes a matrix. + Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix. + \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix" + */ /** @addtogroup MatrixTrans @@ -47,7 +54,126 @@ - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ +#if defined(ARM_MATH_NEON) +arm_status arm_mat_trans_f64( + const arm_matrix_instance_f64 * pSrc, + arm_matrix_instance_f64 * pDst) +{ + float64_t *pIn = pSrc->pData; /* input data matrix pointer */ + float64_t *pOut = pDst->pData; /* output data matrix pointer */ + float64_t *px; /* Temporary output data matrix pointer */ + uint16_t nRows = pSrc->numRows; /* number of rows */ + uint16_t nColumns = pSrc->numCols; /* number of columns */ + + uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */ + arm_status status; /* status of matrix transpose */ + +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + + { + /* Matrix transpose by exchanging the rows with columns */ + /* Row loop */ + rowCnt = row >> 1; + while (rowCnt > 0U) + { + float64_t *row0,*row1; + float64x2x4_t raV; + + blkCnt = nColumns >> 2; + + /* The pointer px is set to starting address of the column being processed */ + px = pOut + i; + + /* Compute 4 outputs at a time. + ** a second loop below computes the remaining 1 to 3 samples. */ + while (blkCnt > 0U) /* Column loop */ + { + row0 = pIn; + row1 = pIn+nColumns; + pIn+=4; + raV = vld4q_lane_f64(row0, raV, 0); + raV = vld4q_lane_f64(row1, raV, 1); + + vst1q_f64(px,raV.val[0]); + px += nRows; + + vst1q_f64(px,raV.val[1]); + px += nRows; + + vst1q_f64(px,raV.val[2]); + px += nRows; + + vst1q_f64(px,raV.val[3]); + px += nRows; + + /* Decrement the column loop counter */ + blkCnt--; + } + + /* Perform matrix transpose for last 3 samples here. */ + blkCnt = nColumns % 0x4U; + + while (blkCnt > 0U) + { + /* Read and store the input element in the destination */ + *px++ = *pIn; + *px++ = *(pIn + 1 * nColumns); + + px += (nRows - 2); + pIn++; + + /* Decrement the column loop counter */ + blkCnt--; + } + + i += 2; + pIn += 1 * nColumns; + + /* Decrement the row loop counter */ + rowCnt--; + + } /* Row loop end */ + + rowCnt = row & 1; + while (rowCnt > 0U) + { + blkCnt = nColumns ; + /* The pointer px is set to starting address of the column being processed */ + px = pOut + i; + + while (blkCnt > 0U) + { + /* Read and store the input element in the destination */ + *px = *pIn++; + + /* Update the pointer px to point to the next row of the transposed matrix */ + px += nRows; + + /* Decrement the column loop counter */ + blkCnt--; + } + i++; + rowCnt -- ; + } + + /* Set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); +} +#else arm_status arm_mat_trans_f64( const arm_matrix_instance_f64 * pSrc, arm_matrix_instance_f64 * pDst) @@ -142,7 +268,7 @@ arm_status arm_mat_trans_f64( /* Return to application */ return (status); } - +#endif /** * @} end of MatrixTrans group */ diff --git a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c index 23e34005..38cf538f 100755 --- a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c @@ -3,8 +3,8 @@ * Title: arm_absmax_no_idx_f64.c * Description: Maximum value of absolute values of a floating-point vector * - * $Date: 16 November 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -28,60 +28,128 @@ #include "dsp/statistics_functions.h" + + /** - @ingroup groupStats + @ingroup groupStats */ /** - @addtogroup AbsMax - @{ + @addtogroup AbsMax + @{ */ /** - @brief Maximum value of absolute values of a floating-point vector. - @param[in] pSrc points to the input vector - @param[in] blockSize number of samples in input vector - @param[out] pResult maximum value returned here - @return none + @brief Maximum value of absolute values of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @return none */ + +#if defined(ARM_MATH_NEON) void arm_absmax_no_idx_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - float64_t maxVal, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt; /* Loop counter */ - - + float64_t maxVal , in; /* Temporary variables to store the output value. */ + uint32_t blkCnt; /* Loop counter */ + + float64x2_t maxV; + float64x2_t pSrcV ; + pSrcV = vld1q_f64(pSrc); + pSrc += 2 ; + maxV = vabsq_f64(pSrcV); + + + + /* Load first input value that act as reference value for comparision */ - - /* Load first input value that act as reference value for comparision */ - out = fabs(*pSrc++); - - /* Initialize blkCnt with number of samples */ - blkCnt = (blockSize - 1U); - - while (blkCnt > 0U) - { - /* Initialize maxVal to the next consecutive values one by one */ - maxVal = fabs(*pSrc++); - - /* compare for the maximum value */ - if (out < maxVal) + + /* Initialize blkCnt with number of samples */ + blkCnt = (blockSize - 2U) >> 1U; + + while (blkCnt > 0U) { - /* Update the maximum value and it's index */ - out = maxVal; + /* Initialize maxVal to the next consecutive values one by one */ + pSrcV = vld1q_f64(pSrc); + maxV = vmaxq_f64(maxV, vabsq_f64(pSrcV)); + + pSrc += 2 ; + + /* Decrement loop counter */ + blkCnt--; } + maxVal =vgetq_lane_f64(maxV, 0); + if(maxVal < vgetq_lane_f64(maxV, 1)) + { + maxVal = vgetq_lane_f64(maxV, 1); + } + blkCnt = (blockSize - 2U) & 1; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + in = fabs(*pSrc++); + + /* compare for the maximum value */ + if (maxVal < in) + { + /* Update the maximum value and it's index */ + maxVal = in; + } + + /* Decrement loop counter */ + blkCnt--; + } + *pResult = maxVal; + + + /* Store the maximum value and it's index into destination pointers */ - /* Decrement loop counter */ - blkCnt--; - } - - /* Store the maximum value and it's index into destination pointers */ - *pResult = out; } +#else +void arm_absmax_no_idx_f64( + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) +{ + float64_t maxVal, out; /* Temporary variables to store the output value. */ + uint32_t blkCnt; /* Loop counter */ + + + + + + /* Load first input value that act as reference value for comparision */ + out = fabs(*pSrc++); + + /* Initialize blkCnt with number of samples */ + blkCnt = (blockSize - 1U); + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + maxVal = fabs(*pSrc++); + + /* compare for the maximum value */ + if (out < maxVal) + { + /* Update the maximum value and it's index */ + out = maxVal; + } + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store the maximum value and it's index into destination pointers */ + *pResult = out; +} +#endif /** - @} end of AbsMax group + @} end of AbsMax group */ diff --git a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c index ac1e9263..ca521085 100755 --- a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c @@ -3,8 +3,8 @@ * Title: arm_absmin_no_idx_f64.c * Description: Minimum value of absolute values of a floating-point vector * - * $Date: 16 November 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -44,6 +44,71 @@ @param[out] pResult minimum value returned here @return none */ + +#if defined(ARM_MATH_NEON) +void arm_absmin_no_idx_f64( + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) +{ + float64_t minVal , in; /* Temporary variables to store the output value. */ + uint32_t blkCnt; /* Loop counter */ + + float64x2_t minV; + float64x2_t pSrcV ; + pSrcV = vld1q_f64(pSrc); + pSrc += 2 ; + minV = vabsq_f64(pSrcV); + + + + + /* Load first input value that act as reference value for comparision */ + + + /* Initialize blkCnt with number of samples */ + blkCnt = (blockSize - 2U) >> 1U; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + pSrcV = vld1q_f64(pSrc); + minV = vminq_f64(minV, vabsq_f64(pSrcV)); + + pSrc += 2 ; + + /* Decrement loop counter */ + blkCnt--; + } + minVal =vgetq_lane_f64(minV, 0); + if(minVal > vgetq_lane_f64(minV, 1)) + { + minVal = vgetq_lane_f64(minV, 1); + } + blkCnt = (blockSize - 2U) & 1; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + in = fabs(*pSrc++); + + /* compare for the maximum value */ + if (minVal > in) + { + /* Update the maximum value and it's index */ + minVal = in; + } + + /* Decrement loop counter */ + blkCnt--; + } + *pResult = minVal; + + + /* Store the maximum value and it's index into destination pointers */ + +} +#else void arm_absmin_no_idx_f64( const float64_t * pSrc, uint32_t blockSize, @@ -78,7 +143,7 @@ void arm_absmin_no_idx_f64( /* Store the minimum value and it's index into destination pointers */ *pResult = out; } - +#endif /** @} end of AbsMin group */ diff --git a/Source/StatisticsFunctions/arm_entropy_f64.c b/Source/StatisticsFunctions/arm_entropy_f64.c index d671791c..a8fc7ed0 100755 --- a/Source/StatisticsFunctions/arm_entropy_f64.c +++ b/Source/StatisticsFunctions/arm_entropy_f64.c @@ -3,8 +3,8 @@ * Title: arm_logsumexp_f64.c * Description: LogSumExp * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -29,6 +29,9 @@ #include "dsp/statistics_functions.h" #include #include +#if defined(ARM_MATH_NEON) +#include "arm_vec_math.h" +#endif /** * @addtogroup Entropy @@ -51,9 +54,26 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize) float64_t accum, p; pIn = pSrcA; - blkCnt = blockSize; + accum = 0.0; +#if defined(ARM_MATH_NEON) + float64x2_t sumV ,pInV ; + sumV = vdupq_n_f64(0.0f); + blkCnt = blockSize >> 1U ; + + while(blkCnt > 0){ + pInV = vld1q_f64(pIn); + sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) ); + pIn += 2 ; + blkCnt--; + + } + accum = vaddvq_f64(sumV); + blkCnt = blockSize & 1 ; +#else + blkCnt = blockSize; +#endif while(blkCnt > 0) { diff --git a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c index b43b2186..a52059ed 100755 --- a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c +++ b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c @@ -3,8 +3,8 @@ * Title: arm_logsumexp_f64.c * Description: LogSumExp * - * $Date: 23 April 2021 - * $Revision: V1.9.0 + * $Date: 10 August 2022 + * $Revision: V1.9.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -44,6 +44,61 @@ * @return Kullback-Leibler divergence D(A || B) * */ +#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "NEMath.h" + +float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,const float64_t * pSrcB,uint32_t blockSize) +{ + const float64_t *pInA, *pInB; + uint32_t blkCnt; + float64_t accum, pA,pB; + + float64x2_t accumV; + float64x2_t tmpVA, tmpVB,tmpV; + + pInA = pSrcA; + pInB = pSrcB; + + accum = 0.0f; + accumV = vdupq_n_f64(0.0f); + + blkCnt = blockSize >> 1; + while(blkCnt > 0) + { + tmpVA = vld1q_f64(pInA); + pInA += 2; + + tmpVB = vld1q_f64(pInB); + pInB += 2; + + tmpV = vinvq_f64(tmpVA); + tmpVB = vmulq_f64(tmpVB, tmpV); + tmpVB = vlogq_f64(tmpVB); + + accumV = vmlaq_f64(accumV, tmpVA, tmpVB); + + blkCnt--; + + } + + + accum = vaddvq_f64(accumV); + + blkCnt = blockSize & 1; + while(blkCnt > 0) + { + pA = *pInA++; + pB = *pInB++; + accum += pA * logf(pB/pA); + + blkCnt--; + + } + + return(-accum); +} +#else float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pSrcB, uint32_t blockSize) { @@ -69,7 +124,7 @@ float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pS return(-accum); } - +#endif /** * @} end of Kullback-Leibler group */ diff --git a/Source/StatisticsFunctions/arm_max_no_idx_f64.c b/Source/StatisticsFunctions/arm_max_no_idx_f64.c index 7eca31c5..f6404c01 100644 --- a/Source/StatisticsFunctions/arm_max_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_max_no_idx_f64.c @@ -3,8 +3,8 @@ * Title: arm_max_no_idx_f64.c * Description: Maximum value of a floating-point vector without returning the index * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -29,47 +29,112 @@ #include "dsp/statistics_functions.h" /** - @ingroup groupStats + @ingroup groupStats */ /** - @addtogroup Max - @{ + @addtogroup Max + @{ */ /** - @brief Maximum value of a floating-point vector. - @param[in] pSrc points to the input vector - @param[in] blockSize number of samples in input vector - @param[out] pResult maximum value returned here - @return none + @brief Maximum value of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult maximum value returned here + @return none */ +#if defined(ARM_MATH_NEON) void arm_max_no_idx_f64( - const float64_t *pSrc, - uint32_t blockSize, - float64_t *pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - float64_t maxValue = F64_MIN; - float64_t newVal; - - while (blockSize > 0U) - { - newVal = *pSrc++; - - /* compare for the maximum value */ - if (maxValue < newVal) - { - /* Update the maximum value and it's index */ - maxValue = newVal; - } - - blockSize --; - } + float64_t maxVal , in; /* Temporary variables to store the output value. */ + uint32_t blkCnt; /* Loop counter */ + + float64x2_t maxV; + float64x2_t pSrcV ; + pSrcV = vld1q_f64(pSrc); + pSrc += 2 ; + maxV = pSrcV; + + + + + /* Load first input value that act as reference value for comparision */ + + + /* Initialize blkCnt with number of samples */ + blkCnt = (blockSize - 2U) >> 1U; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + pSrcV = vld1q_f64(pSrc); + maxV = vmaxq_f64(maxV, pSrcV); + + pSrc += 2 ; + + /* Decrement loop counter */ + blkCnt--; + } + maxVal =vgetq_lane_f64(maxV, 0); + if(maxVal < vgetq_lane_f64(maxV, 1)) + { + maxVal = vgetq_lane_f64(maxV, 1); + } + blkCnt = (blockSize - 2U) & 1; + + while (blkCnt > 0U) + { + /* Initialize maxVal to the next consecutive values one by one */ + in = *pSrc++; + + /* compare for the maximum value */ + if (maxVal < in) + { + /* Update the maximum value and it's index */ + maxVal = in; + } + + /* Decrement loop counter */ + blkCnt--; + } + *pResult = maxVal; + + + /* Store the maximum value and it's index into destination pointers */ + +} +#else +void arm_max_no_idx_f64( + const float64_t *pSrc, + uint32_t blockSize, + float64_t *pResult) +{ + float64_t maxValue = F64_MIN; + float64_t newVal; + + while (blockSize > 0U) + { + newVal = *pSrc++; + + /* compare for the maximum value */ + if (maxValue < newVal) + { + /* Update the maximum value and it's index */ + maxValue = newVal; + } + + blockSize --; + } - *pResult = maxValue; + *pResult = maxValue; } +#endif /** - @} end of Max group + @} end of Max group */ diff --git a/Source/StatisticsFunctions/arm_mean_f64.c b/Source/StatisticsFunctions/arm_mean_f64.c index 72e6f351..577ecf13 100644 --- a/Source/StatisticsFunctions/arm_mean_f64.c +++ b/Source/StatisticsFunctions/arm_mean_f64.c @@ -3,8 +3,8 @@ * Title: arm_mean_f64.c * Description: Mean value of a floating-point vector * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 03 June 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -29,47 +29,90 @@ #include "dsp/statistics_functions.h" /** - @ingroup groupStats + @ingroup groupStats */ /** - @addtogroup mean - @{ + @addtogroup mean + @{ */ /** - @brief Mean value of a floating-point vector. - @param[in] pSrc points to the input vector. - @param[in] blockSize number of samples in input vector. - @param[out] pResult mean value returned here. - @return none + @brief Mean value of a floating-point vector. + @param[in] pSrc points to the input vector. + @param[in] blockSize number of samples in input vector. + @param[out] pResult mean value returned here. + @return none */ + +#if defined(ARM_MATH_NEON) + void arm_mean_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary result storage */ + uint32_t blkCnt; /* Loop counter */ + float64x2_t vSum = vdupq_n_f64(0.0f); + float64_t sum = 0.; /* Temporary result storage */ + float64x2_t afterLoad ; + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize >> 1U; + + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + + afterLoad = vld1q_f64(pSrc); + vSum = vaddq_f64(vSum, afterLoad); + pSrc += 2; + /* Decrement loop counter */ + blkCnt--; + + } + sum = vaddvq_f64(vSum); + + blkCnt = blockSize & 1; + + while (blkCnt > 0U) + { + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ + sum += *pSrc++; + + /* Decrement loop counter */ + blkCnt--; + } + *pResult = (sum/blockSize); +} +#else +void arm_mean_f64( + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + /* Initialize blkCnt with number of samples */ blkCnt = blockSize; - + while (blkCnt > 0U) { /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */ sum += *pSrc++; - + /* Decrement loop counter */ blkCnt--; } - + /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */ /* Store result to destination */ *pResult = (sum / blockSize); } - +#endif /** - @} end of mean group + @} end of mean group */ diff --git a/Source/StatisticsFunctions/arm_min_no_idx_f64.c b/Source/StatisticsFunctions/arm_min_no_idx_f64.c index 6e69572d..b9696ff8 100755 --- a/Source/StatisticsFunctions/arm_min_no_idx_f64.c +++ b/Source/StatisticsFunctions/arm_min_no_idx_f64.c @@ -3,8 +3,8 @@ * Title: arm_min_no_idx_f64.c * Description: Maximum value of a floating-point vector without returning the index * - * $Date: 16 November 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -52,8 +52,31 @@ void arm_min_no_idx_f64( { float64_t minValue = F64_MAX; float64_t newVal; + uint32_t blkCnt ; +#if defined(ARM_MATH_NEON) + float64x2_t minValueV , newValV ; + minValueV = vdupq_n_f64(F64_MAX); + blkCnt = blockSize >> 1U; + while(blkCnt > 0) + { + newValV = vld1q_f64(pSrc); + minValueV = vminq_f64(minValueV, newValV); + pSrc += 2 ; + blkCnt--; + + } + minValue =vgetq_lane_f64(minValueV, 0); + if(minValue > vgetq_lane_f64(minValueV, 1)) + { + minValue = vgetq_lane_f64(minValueV, 1); + } + + blkCnt = blockSize & 1 ; +#else + blkCnt = blockSize; +#endif - while (blockSize > 0U) + while (blkCnt > 0U) { newVal = *pSrc++; @@ -64,7 +87,7 @@ void arm_min_no_idx_f64( minValue = newVal; } - blockSize --; + blkCnt --; } *pResult = minValue; diff --git a/Source/StatisticsFunctions/arm_mse_f64.c b/Source/StatisticsFunctions/arm_mse_f64.c index b785bf82..13daa2cb 100755 --- a/Source/StatisticsFunctions/arm_mse_f64.c +++ b/Source/StatisticsFunctions/arm_mse_f64.c @@ -3,8 +3,8 @@ * Title: arm_mse_f64.c * Description: Double floating point mean square error * - * $Date: 05 April 2022 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -29,82 +29,106 @@ #include "dsp/statistics_functions.h" /** - @ingroup groupStats + @ingroup groupStats */ /** - @addtogroup MSE - @{ + @addtogroup MSE + @{ */ /** - @brief Mean square error between two double floating point vectors. - @param[in] pSrcA points to the first input vector - @param[in] pSrcB points to the second input vector - @param[in] blockSize number of samples in input vector - @param[out] result mean square error - @return none + @brief Mean square error between two double floating point vectors. + @param[in] pSrcA points to the first input vector + @param[in] pSrcB points to the second input vector + @param[in] blockSize number of samples in input vector + @param[out] result mean square error + @return none */ - - - - void arm_mse_f64( - const float64_t * pSrcA, - const float64_t * pSrcB, - uint32_t blockSize, - float64_t * result) + const float64_t * pSrcA, + const float64_t * pSrcB, + uint32_t blockSize, + float64_t * result) { - uint32_t blkCnt; /* Loop counter */ - float64_t inA, inB; - float64_t sum = 0.0; /* Temporary return variable */ + + uint32_t blkCnt; /* Loop counter */ + float64_t inA, inB; + float64_t sum = 0.0; +#if defined (ARM_MATH_NEON) + + float64x2_t inAV , inBV , subV, sumV; + sumV = vdupq_n_f64(0.0f); + + blkCnt = blockSize >> 1U ; + + while (blkCnt > 0U) + { + inAV = vld1q_f64(pSrcA); + pSrcA+=2; + inBV = vld1q_f64(pSrcB); + pSrcB+=2; + subV = vsubq_f64(inAV, inBV); + sumV = vmlaq_f64(sumV, subV, subV); + + blkCnt--; + + } + sum = vaddvq_f64(sumV); + blkCnt = (blockSize) & 1; + +#else + /* Temporary return variable */ #if defined (ARM_MATH_LOOPUNROLL) - blkCnt = (blockSize) >> 1; - - - while (blkCnt > 0U) - { - - - inA = *pSrcA++; - inB = *pSrcB++; - inA = inA - inB; - sum += inA * inA; - - inA = *pSrcA++; - inB = *pSrcB++; - inA = inA - inB; - sum += inA * inA; - - /* Decrement loop counter */ - blkCnt--; - } - - - /* Loop unrolling: Compute remaining outputs */ - blkCnt = (blockSize) & 1; + blkCnt = (blockSize) >> 1; + +#pragma clang loop vectorize(enable) + while (blkCnt > 0U) + { + + + inA = *pSrcA++; + inB = *pSrcB++; + inA = inA - inB; + sum += inA * inA; + + inA = *pSrcA++; + inB = *pSrcB++; + inA = inA - inB; + sum += inA * inA; + + /* Decrement loop counter */ + blkCnt--; + } + + + /* Loop unrolling: Compute remaining outputs */ + blkCnt = (blockSize) & 1; #else - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; #endif - while (blkCnt > 0U) - { - inA = *pSrcA++; - inB = *pSrcB++; - inA = inA - inB; - sum += inA * inA; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result in destination buffer */ - *result = sum / blockSize; +#endif +//#pragma clang loop vectorize(enable) unroll(disable) + while (blkCnt > 0U) + { + inA = *pSrcA++; + inB = *pSrcB++; + inA = inA - inB; + sum += inA * inA; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result in destination buffer */ + *result = sum / blockSize; } + /** - @} end of MSE group + @} end of MSE group */ diff --git a/Source/StatisticsFunctions/arm_power_f64.c b/Source/StatisticsFunctions/arm_power_f64.c index 057e9f7c..710dc834 100644 --- a/Source/StatisticsFunctions/arm_power_f64.c +++ b/Source/StatisticsFunctions/arm_power_f64.c @@ -3,8 +3,8 @@ * Title: arm_power_f64.c * Description: Sum of the squares of the elements of a floating-point vector * - * $Date: 13 September 2021 - * $Revision: V1.10.0 + * $Date: 10 August 2022 + * $Revision: V1.10.1 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -29,49 +29,100 @@ #include "dsp/statistics_functions.h" /** - @ingroup groupStats + @ingroup groupStats */ /** - @addtogroup power - @{ + @addtogroup power + @{ */ /** - @brief Sum of the squares of the elements of a floating-point vector. - @param[in] pSrc points to the input vector - @param[in] blockSize number of samples in input vector - @param[out] pResult sum of the squares value returned here - @return none + @brief Sum of the squares of the elements of a floating-point vector. + @param[in] pSrc points to the input vector + @param[in] blockSize number of samples in input vector + @param[out] pResult sum of the squares value returned here + @return none */ +#if defined(ARM_MATH_NEON) void arm_power_f64( - const float64_t * pSrc, - uint32_t blockSize, - float64_t * pResult) + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) { - uint32_t blkCnt; /* Loop counter */ - float64_t sum = 0.; /* Temporary result storage */ - float64_t in; /* Temporary variable to store input value */ - - /* Initialize blkCnt with number of samples */ - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ - - /* Compute Power and store result in a temporary variable, sum. */ - in = *pSrc++; - sum += in * in; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store result to destination */ - *pResult = sum; + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + float64x2_t sumV ; /* Temporary variable to store input value */ + sumV = vdupq_n_f64(0.0f); + float64x2_t pSrcV; + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize >> 1U; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ + pSrcV = vld1q_f64(pSrc); + sumV = vmlaq_f64(sumV, pSrcV, pSrcV); + pSrc+= 2 ; + + + + /* Decrement loop counter */ + blkCnt--; + } + sum = vaddvq_f64(sumV); + + + float64_t in; + blkCnt = blockSize & 1; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ + in = *pSrc++; + sum += in * in; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result to destination */ + *pResult = sum; } - +#else +void arm_power_f64( + const float64_t * pSrc, + uint32_t blockSize, + float64_t * pResult) +{ + uint32_t blkCnt; /* Loop counter */ + float64_t sum = 0.; /* Temporary result storage */ + float64_t in; /* Temporary variable to store input value */ + + /* Initialize blkCnt with number of samples */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */ + + /* Compute Power and store result in a temporary variable, sum. */ + in = *pSrc++; + sum += in * in; + + /* Decrement loop counter */ + blkCnt--; + } + + /* Store result to destination */ + *pResult = sum; +} +#endif /** - @} end of power group + @} end of power group */