From 4ba79c27c6814720d69a8f147ddfb23d819fcec7 Mon Sep 17 00:00:00 2001
From: Christophe Favergeon
Date: Wed, 10 Mar 2021 13:15:05 +0100
Subject: [PATCH] CMSIS-DSP: MVE version of Levinson Durbin f16

---
 .../arm_levinson_durbin_f16.c    | 189 ++++++++++++++++--
 .../arm_levinson_durbin_f32.c    |   4 +-
 Testing/Source/Tests/MISCF16.cpp |   2 +-
 3 files changed, 177 insertions(+), 18 deletions(-)

diff --git a/Source/FilteringFunctions/arm_levinson_durbin_f16.c b/Source/FilteringFunctions/arm_levinson_durbin_f16.c
index 85b6abc0..8d550a5e 100755
--- a/Source/FilteringFunctions/arm_levinson_durbin_f16.c
+++ b/Source/FilteringFunctions/arm_levinson_durbin_f16.c
@@ -48,6 +48,164 @@
   @return none
  */
 
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+#define LANE4567_MASK 0xFF00 /* predicate bits for f16 lanes 4..7 (one predicate bit per byte) */
+
+void arm_levinson_durbin_f16(const float16_t *phi,
+  float16_t *a,
+  float16_t *err,
+  int nbCoefs)
+{
+   _Float16 e;
+   static const uint16_t revOffsetArray[8] = {7,6,5,4,3,2,1,0}; /* reversed lane order for the phi[p-i] gather */
+
+   a[0] = (_Float16)phi[1] / (_Float16)phi[0];
+
+   e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
+   for(int p=1; p < nbCoefs; p++)
+   {
+      _Float16 suma = 0.0f16;
+      _Float16 sumb = 0.0f16;
+      f16x8_t vecA,vecRevPhi,vecPhi,vecSumA,vecSumB;
+      _Float16 k;
+      uint32_t blkCnt;
+      const float16_t *pPhi,*pRevPhi,*pA;
+      uint16x8_t revOffset;
+
+      int nb,j,i;
+
+      revOffset = vld1q(revOffsetArray);
+      vecSumA = vdupq_n_f16(0.0f16);
+      vecSumB = vdupq_n_f16(0.0f16);
+
+      pRevPhi = &phi[p-7];
+      pPhi = &phi[1];
+      pA = a;
+
+      i = 0;
+      blkCnt = p >> 3;
+      while(blkCnt > 0)
+      {
+         vecA = vld1q(pA);
+         pA += 8;
+
+         vecPhi = vld1q(pPhi);
+         pPhi += 8;
+
+         vecRevPhi = vldrhq_gather_shifted_offset_f16(pRevPhi,revOffset); /* phi[p-i] ... phi[p-i-7] */
+         pRevPhi -= 8;
+
+         vecSumA = vfmaq(vecSumA,vecA,vecRevPhi);
+         vecSumB = vfmaq(vecSumB,vecA,vecPhi);
+
+         i += 8;
+         blkCnt--;
+
+      }
+
+      suma = vecAddAcrossF16Mve(vecSumA);
+      sumb = vecAddAcrossF16Mve(vecSumB);
+
+      blkCnt = p & 7;
+      while(blkCnt > 0)
+      {
+         suma += (_Float16)a[i] * (_Float16)phi[p - i];
+         sumb += (_Float16)a[i] * (_Float16)phi[i + 1];
+
+         i++;
+         blkCnt--;
+      }
+
+      k = ((_Float16)phi[p+1] - suma)/((_Float16)phi[0] - sumb);
+
+      f16x8_t vecRevA = vdupq_n_f16(0.0f16), tmp;
+      static const uint16_t orgOffsetArray[8]={0,1,2,3,-1,-2,-3,-4};
+      static const uint16_t offsetIncArray[8]={4,4,4,4,-4,-4,-4,-4};
+
+      uint16x8_t offset,offsetInc,vecTmp;
+
+
+      offset = vld1q(orgOffsetArray);
+      vecTmp = vdupq_n_u16(p);
+
+      offset = vaddq_m_u16(offset,offset,vecTmp,LANE4567_MASK); /* lanes 4..7 become p-1,p-2,p-3,p-4 */
+      offsetInc = vld1q(offsetIncArray);
+
+      nb = p >> 3;
+      j=0;
+      for(int i = 0; i < nb ; i++)
+      {
+
+          /*
+          x0=a[j] - k * a[p-1-j];
+          x1=a[j+1] - k * a[p-2-j];
+          x2=a[p-1-j] - k * a[j];
+          x3=a[p-2-j] - k * a[j+1];
+
+          a[j] = x0;
+          a[j+1] = x1;
+          a[p-1-j] = x2;
+          a[p-2-j] = x3;
+          */
+
+          uint64_t tmpa,tmpb;
+          vecA = vldrhq_gather_shifted_offset_f16(a,offset);
+
+
+          tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
+          tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
+          vecRevA = (f16x8_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0); /* swap the two 64-bit halves of vecA */
+          vecRevA = (f16x8_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);
+
+
+          tmp = vsubq(vecA,vmulq_n_f16(vecRevA,k));
+          vstrhq_scatter_shifted_offset_f16(a, offset, tmp);
+
+          offset = vaddq(offset,offsetInc);
+
+          j+=4;
+
+      }
+
+      blkCnt = p & 7;
+
+      if (blkCnt)
+      {
+         nb = blkCnt >> 1;
+         for(int i =0;i < nb ; i++)
+         {
+            _Float16 x,y;
+
+            x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
+            y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];
+
+            a[j] = x;
+            a[p-1-j] = y;
+
+            j++;
+         }
+
+         nb = blkCnt & 1;
+         if (nb)
+         {
+            a[j]=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
+         }
+      }
+
+
+      a[p] = k;
+      e = e * (1.0f16 - k*k);
+
+
+   }
+   *err = e;
+}
+
+#else
+
 #if defined(ARM_FLOAT16_SUPPORTED)
 
 void arm_levinson_durbin_f16(const float16_t *phi,
@@ -55,35 +213,35 @@ void arm_levinson_durbin_f16(const float16_t *phi,
                              float16_t *err,
                              int nbCoefs)
 {
-    float16_t e;
+    _Float16 e;
 
-    a[0] = phi[1] / phi[0];
+    a[0] = (_Float16)phi[1] / (_Float16)phi[0];
 
-    e = phi[0] - phi[1] * a[0];
+    e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
     for(int p=1; p < nbCoefs; p++)
     {
-       float16_t suma=0.0f16;
-       float16_t sumb=0.0f16;
-       float16_t k;
+       _Float16 suma=0.0f16;
+       _Float16 sumb=0.0f16;
+       _Float16 k;
        int nb,j;
 
        for(int i=0; i < p; i++)
        {
-          suma += a[i] * phi[p - i];
-          sumb += a[i] * phi[i + 1];
+          suma += (_Float16)a[i] * (_Float16)phi[p - i];
+          sumb += (_Float16)a[i] * (_Float16)phi[i + 1];
        }
 
-       k = (phi[p+1]-suma)/(phi[0] - sumb);
+       k = ((_Float16)phi[p+1]-suma)/((_Float16)phi[0] - sumb);
 
        nb = p >> 1;
        j=0;
        for(int i =0;i < nb ; i++)
        {
-          float16_t x,y;
+          _Float16 x,y;
 
-          x=a[j] - k * a[p-1-j];
-          y=a[p-1-j] - k * a[j];
+          x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
+          y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];
 
           a[j] = x;
           a[p-1-j] = y;
@@ -94,17 +252,18 @@ void arm_levinson_durbin_f16(const float16_t *phi,
        nb = p & 1;
        if (nb)
        {
-          a[j]=a[j]- k * a[p-1-j];
+          a[j]=(_Float16)a[j]- (_Float16)k * (_Float16)a[p-1-j];
        }
 
        a[p] = k;
-       e = e * (1 - k*k);
+       e = e * (1.0f16 - k*k);
 
    }
    *err = e;
 }
-#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
 @} end of LD group */
diff --git a/Source/FilteringFunctions/arm_levinson_durbin_f32.c b/Source/FilteringFunctions/arm_levinson_durbin_f32.c
index c644b3f0..d8f4999d 100755
--- a/Source/FilteringFunctions/arm_levinson_durbin_f32.c
+++ b/Source/FilteringFunctions/arm_levinson_durbin_f32.c
@@ -202,7 +202,7 @@ void arm_levinson_durbin_f32(const float32_t *phi,
        }
 
        a[p] = k;
-       e = e * (1 - k*k);
+       e = e * (1.0f - k*k);
 
    }
@@ -258,7 +258,7 @@ void arm_levinson_durbin_f32(const float32_t *phi,
        }
 
        a[p] = k;
-       e = e * (1 - k*k);
+       e = e * (1.0f - k*k);
 
    }
diff --git a/Testing/Source/Tests/MISCF16.cpp b/Testing/Source/Tests/MISCF16.cpp
index 19bfc422..631be787 100755
--- a/Testing/Source/Tests/MISCF16.cpp
+++ b/Testing/Source/Tests/MISCF16.cpp
@@ -18,7 +18,7 @@ a double precision computation.
 For tests of the error value of the Levinson Durbin algorithm */
-#define SNR_LD_THRESHOLD 59
+#define SNR_LD_THRESHOLD 57
 #define REL_LD_ERROR (1.0e-3)
 #define ABS_LD_ERROR (1.0e-3)
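
Usage sketch (illustrative, not part of the patch): per the signature above, arm_levinson_durbin_f16 reads nbCoefs+1 autocorrelation values from phi and writes nbCoefs predictor coefficients to a and the final prediction error to err. The NB_COEFS size, the 0.9^k autocorrelation sequence, and the arm_math_f16.h include are assumptions for this sketch; it needs a CMSIS-DSP build with float16 support.

#include "arm_math_f16.h"  /* assumed CMSIS-DSP f16 header; adjust to your build */
#include <stdio.h>

#define NB_COEFS 16

int main(void)
{
    float16_t phi[NB_COEFS + 1];
    float16_t a[NB_COEFS];  /* predictor coefficients (output) */
    float16_t err;          /* final prediction error (output) */

    /* Illustrative autocorrelation of an AR(1)-style process:
       phi[0] = 1, phi[k] = 0.9^k. */
    float v = 1.0f;
    for (int k = 0; k <= NB_COEFS; k++)
    {
        phi[k] = (float16_t)v;
        v *= 0.9f;
    }

    arm_levinson_durbin_f16(phi, a, &err, NB_COEFS);

    for (int i = 0; i < NB_COEFS; i++)
    {
        printf("a[%d] = %f\n", i, (double)a[i]);
    }
    printf("err = %f\n", (double)err);

    return 0;
}

For this input the recursion should give a[0] close to 0.9, the remaining coefficients near zero, and err near 0.19 (within f16 precision). With nbCoefs = 16, iterations with p >= 8 take the vectorized loops of the MVE path (blkCnt = p >> 3 is nonzero) while smaller p fall through to the scalar tails, so both paths get exercised.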