From dc5477f872cd18e96ecadec3df833db1ecb7ce36 Mon Sep 17 00:00:00 2001 From: Christophe Favergeon Date: Wed, 10 Mar 2021 10:51:58 +0100 Subject: [PATCH] CMSIS-DSP: Added mve code for Levinson Durbin f32. Some correction to Levinson Durbin Q31. --- .../arm_levinson_durbin_f32.c | 166 +++++++++++++++++- .../arm_levinson_durbin_q31.c | 28 --- 2 files changed, 165 insertions(+), 29 deletions(-) diff --git a/Source/FilteringFunctions/arm_levinson_durbin_f32.c b/Source/FilteringFunctions/arm_levinson_durbin_f32.c index 34a20a11..c644b3f0 100755 --- a/Source/FilteringFunctions/arm_levinson_durbin_f32.c +++ b/Source/FilteringFunctions/arm_levinson_durbin_f32.c @@ -47,6 +47,169 @@ @param[in] nbCoefs number of autoregressive coefficients @return none */ + +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" + +#define LANE23_MASK 0xFF00 + +void arm_levinson_durbin_f32(const float32_t *phi, + float32_t *a, + float32_t *err, + int nbCoefs) +{ + float32_t e; + static const uint32_t revOffsetArray[4] = {3,2,1,0}; + + a[0] = phi[1] / phi[0]; + + e = phi[0] - phi[1] * a[0]; + for(int p=1; p < nbCoefs; p++) + { + float32_t suma = 0.0f; + float32_t sumb = 0.0f; + f32x4_t vecA,vecRevPhi,vecPhi,vecSumA, vecSumB; + float32_t k; + uint32_t blkCnt; + const float32_t *pPhi,*pRevPhi,*pA; + uint32x4_t revOffset; + + int nb,j,i; + + revOffset = vld1q(revOffsetArray); + vecSumA = vdupq_n_f32(0.0f); + vecSumB = vdupq_n_f32(0.0f); + + pRevPhi = &phi[p-3]; + pPhi = &phi[1]; + pA = a; + + i = 0; + blkCnt = p >> 2; + while(blkCnt > 0) + { + vecA = vld1q(pA); + pA += 4; + + vecPhi = vld1q(pPhi); + pPhi += 4; + + vecRevPhi = vldrwq_gather_shifted_offset_f32(pRevPhi,revOffset); + pRevPhi -= 4; + + vecSumA = vfmaq(vecSumA,vecA,vecRevPhi); + vecSumB = vfmaq(vecSumB,vecA,vecPhi); + + i += 4; + blkCnt--; + + } + + suma = vecAddAcrossF32Mve(vecSumA); + sumb = vecAddAcrossF32Mve(vecSumB); + + blkCnt = p & 3; + while(blkCnt > 0) + { + suma += a[i] * phi[p - i]; + sumb += a[i] * phi[i + 1]; + + i++; + blkCnt--; + } + + k = (phi[p+1] - suma)/(phi[0] - sumb); + + f32x4_t vecRevA,tmp; + static uint32_t orgOffsetArray[4]={0,1,-1,-2}; + static const uint32_t offsetIncArray[4]={2,2,-2,-2}; + + uint32x4_t offset,offsetInc,vecTmp; + + + offset = vld1q(orgOffsetArray); + vecTmp = vdupq_n_u32(p); + + offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK); + offsetInc = vld1q(offsetIncArray); + + nb = p >> 2; + j=0; + for(int i = 0; i < nb ; i++) + { + + /* + x0=a[j] - k * a[p-1-j]; + x1=a[j+1] - k * a[p-2-j]; + x3=a[p-1-j] - k * a[j]; + x4=a[p-2-j] - k * a[j+1]; + + a[j] = x0; + a[j+1] = x1; + a[p-1-j] = x2; + a[p-2-j] = x3; + */ + + uint64_t tmpa,tmpb; + vecA = vldrwq_gather_shifted_offset_f32(a,offset); + + + tmpa = vgetq_lane_u64((uint64x2_t)vecA,0); + tmpb = vgetq_lane_u64((uint64x2_t)vecA,1); + vecRevA = (f32x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0); + vecRevA = (f32x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1); + + + tmp = vsubq(vecA,vmulq_n_f32(vecRevA,k)); + vstrwq_scatter_shifted_offset_f32(a, offset, tmp); + + offset = vaddq(offset,offsetInc); + + j+=2; + + } + + switch(p & 3) + { + case 3: + { + float32_t x,y; + x = a[j] - k * a[p-1-j]; + y = a[p-1-j] - k * a[j]; + + a[j] = x; + a[p-1-j] = y; + + a[j+1] = a[j+1] - k * a[p-1-(j+1)]; + } + break; + + case 2: + { + float32_t x,y; + x = a[j] - k * a[p-1-j]; + y = a[p-1-j] - k * a[j]; + + a[j] = x; + a[p-1-j] = y; + } + break; + + case 1: + a[j] = a[j]- k * a[p-1-j]; + break; + } + + a[p] = k; + e = e * (1 - k*k); + + + } + *err = e; +} + +#else void arm_levinson_durbin_f32(const float32_t *phi, float32_t *a, float32_t *err, @@ -75,7 +238,7 @@ void arm_levinson_durbin_f32(const float32_t *phi, nb = p >> 1; j=0; - for(int i =0;i < nb ; i++) + for(int i =0; i < nb ; i++) { float32_t x,y; @@ -101,6 +264,7 @@ void arm_levinson_durbin_f32(const float32_t *phi, } *err = e; } +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of LD group diff --git a/Source/FilteringFunctions/arm_levinson_durbin_q31.c b/Source/FilteringFunctions/arm_levinson_durbin_q31.c index 0bb5e46c..3bececff 100755 --- a/Source/FilteringFunctions/arm_levinson_durbin_q31.c +++ b/Source/FilteringFunctions/arm_levinson_durbin_q31.c @@ -33,31 +33,6 @@ #define HALF_Q15 0x3FFF #define LOWPART_MASK 0x07FFF -#include - -#define FQ31(X) (1.0*(X) / ONE_Q31) -#define FQ15(X) (1.0*(X) / ONE_Q15) - -#define PQ31(X) \ - if ((X)>=0) \ - { \ - printf("%08X (%f)",(X),FQ31(X)); \ - } \ - else \ - { \ - printf("-%08X (%f)",-(X),FQ31(X));\ - } - -#define PQ15(X) \ - if ((X)>=0) \ - { \ - printf("%04X (%f)",(X),FQ15(X)); \ - } \ - else \ - { \ - printf("-%04X (%f)",-(X),FQ15(X));\ - } - __STATIC_FORCEINLINE q31_t mul32x16(q31_t a, q15_t b) { @@ -144,8 +119,6 @@ void arm_levinson_durbin_q31(const q31_t *phi, //a[0] = phi[1] / phi[0]; a[0] = divide(phi[1], phi[0]); - //printf("%08X %08X\n",phi[1],phi[0]); - //printf("%f / %f = %f\n",FQ31(phi[1]),FQ31(phi[0]),FQ31(a[0])); //e = phi[0] - phi[1] * a[0]; @@ -167,7 +140,6 @@ void arm_levinson_durbin_q31(const q31_t *phi, suma = suma >> 31; sumb = sumb >> 31; - //printf("suma = %08X, sumb=%08X\n",(q31_t)(suma & 0x0FFFFFFFF),(q31_t)(sumb & 0x0FFFFFFFF)); //k = (phi[p+1]-suma)/(phi[0] - sumb);