|
|
|
|
@ -45,50 +45,8 @@
|
|
|
|
|
@param[out] pResult sum value returned here.
|
|
|
|
|
@return none
|
|
|
|
|
*/
|
|
|
|
|
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|
|
|
|
|
|
|
|
|
#include "arm_helium_utils.h"
|
|
|
|
|
|
|
|
|
|
void arm_accumulate_f32(
|
|
|
|
|
const float32_t * pSrc,
|
|
|
|
|
uint32_t blockSize,
|
|
|
|
|
float32_t * pResult)
|
|
|
|
|
{
|
|
|
|
|
uint32_t blkCnt; /* loop counters */
|
|
|
|
|
f32x4_t vecSrc;
|
|
|
|
|
f32x4_t sumVec = vdupq_n_f32(0.0f);
|
|
|
|
|
float32_t sum = 0.0f;
|
|
|
|
|
|
|
|
|
|
/* Compute 4 outputs at a time */
|
|
|
|
|
blkCnt = blockSize >> 2U;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
vecSrc = vldrwq_f32(pSrc);
|
|
|
|
|
sumVec = vaddq_f32(sumVec, vecSrc);
|
|
|
|
|
|
|
|
|
|
blkCnt --;
|
|
|
|
|
pSrc += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sum = vecAddAcrossF32Mve(sumVec);
|
|
|
|
|
|
|
|
|
|
/* Tail */
|
|
|
|
|
blkCnt = blockSize & 0x3;
|
|
|
|
|
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
|
|
|
|
|
sum += *pSrc++;
|
|
|
|
|
|
|
|
|
|
/* Decrement loop counter */
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*pResult = sum;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|
|
|
|
void arm_accumulate_f32(
|
|
|
|
|
const float32_t * pSrc,
|
|
|
|
|
@ -191,7 +149,7 @@ void arm_accumulate_f32(
|
|
|
|
|
*pResult = sum ;
|
|
|
|
|
}
|
|
|
|
|
#endif /* defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
|
|
|
|
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@} end of Accumulation group
|
|
|
|
|
|