From 266d88ba4a6aa9aeb5af1c297acce69accc9ad20 Mon Sep 17 00:00:00 2001 From: Christophe Favergeon Date: Wed, 15 Jan 2020 11:26:16 +0100 Subject: [PATCH] CMSIS-DSP: Added MVE version of stage_rfft_f32 --- Source/TransformFunctions/arm_cfft_q15.c | 2 +- Source/TransformFunctions/arm_rfft_fast_f32.c | 225 +++++++++++++++++- Source/TransformFunctions/arm_rfft_q15.c | 2 +- Source/TransformFunctions/arm_rfft_q31.c | 2 +- 4 files changed, 227 insertions(+), 4 deletions(-) diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c index 1fa9abb0..a25fc824 100644 --- a/Source/TransformFunctions/arm_cfft_q15.c +++ b/Source/TransformFunctions/arm_cfft_q15.c @@ -946,4 +946,4 @@ void arm_cfft_radix4by2_inverse_q15( } } -#endif /* defined(ARM_MATH_MVEI) */ \ No newline at end of file +#endif /* defined(ARM_MATH_MVEI) */ diff --git a/Source/TransformFunctions/arm_rfft_fast_f32.c b/Source/TransformFunctions/arm_rfft_fast_f32.c index e2be2a5d..f7e79f90 100644 --- a/Source/TransformFunctions/arm_rfft_fast_f32.c +++ b/Source/TransformFunctions/arm_rfft_fast_f32.c @@ -28,6 +28,7 @@ #include "arm_math.h" +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) void stage_rfft_f32( const arm_rfft_fast_instance_f32 * S, float32_t * p, @@ -42,6 +43,16 @@ void stage_rfft_f32( float32_t t1a, t1b; /* temporary variables */ float32_t p0, p1, p2, p3; /* temporary variables */ + float32x4x2_t tw,xA,xB; + float32x4x2_t tmp1, tmp2, res; + + uint32x4_t vecStridesFwd, vecStridesBkwd; + + vecStridesFwd = vidupq_u32(0, 2); + vecStridesBkwd = -vecStridesFwd; + + int blockCnt; + k = (S->Sint).fftLen - 1; @@ -55,6 +66,216 @@ void stage_rfft_f32( twR = *pCoeff++ ; twI = *pCoeff++ ; + // U1 = XA(1) + XB(1); % It is real + t1a = xBR + xAR ; + + // U2 = XB(1) - XA(1); % It is imaginary + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + *pOut++ = 0.5f * ( t1a + t1b ); + *pOut++ = 0.5f * ( t1a - t1b ); + + // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); + pB = p + 2*k; + pA += 2; + + blockCnt = k >> 2; + while (blockCnt > 0) + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + + xA = vld2q_f32(pA); + pA += 8; + + xB = vld2q_f32(pB); + + xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd); + xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd); + + xB.val[1] = vnegq_f32(xB.val[1]); + pB -= 8; + + + tw = vld2q_f32(pCoeff); + pCoeff += 8; + + + tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]); + tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]); + + tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]); + tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]); + + res.val[0] = vmulq(tw.val[0], tmp2.val[0]); + res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); + + res.val[1] = vmulq(tw.val[0], tmp2.val[1]); + res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); + + res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] ); + res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] ); + + res.val[0] = vmulq_n_f32(res.val[0], 0.5f); + res.val[1] = vmulq_n_f32(res.val[1], 0.5f); + + + vst2q_f32(pOut, res); + pOut += 8; + + + blockCnt--; + } + + blockCnt = k & 3; + while (blockCnt > 0) + { + /* + function X = my_split_rfft(X, ifftFlag) + % X is a series of real numbers + L = length(X); + XC = X(1:2:end) +i*X(2:2:end); + XA = fft(XC); + XB = conj(XA([1 end:-1:2])); + TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; + for l = 2:L/2 + XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); + end + XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); + X = XA; + */ + + xBI = pB[1]; + xBR = pB[0]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xBR - xAR ; + t1b = xBI + xAI ; + + // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); + // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); + p0 = twR * t1a; + p1 = twI * t1a; + p2 = twR * t1b; + p3 = twI * t1b; + + *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR + *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI + + pA += 2; + pB -= 2; + blockCnt--; + } +} + +/* Prepares data for inverse cfft */ +void merge_rfft_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut) +{ + uint32_t k; /* Loop Counter */ + float32_t twR, twI; /* RFFT Twiddle coefficients */ + const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float32_t *pA = p; /* increasing pointer */ + float32_t *pB = p; /* decreasing pointer */ + float32_t xAR, xAI, xBR, xBI; /* temporary variables */ + float32_t t1a, t1b, r, s, t, u; /* temporary variables */ + + + k = (S->Sint).fftLen - 1; + + xAR = pA[0]; + xAI = pA[1]; + + pCoeff += 2 ; + + *pOut++ = 0.5f * ( xAR + xAI ); + *pOut++ = 0.5f * ( xAR - xAI ); + + pB = p + 2*k ; + pA += 2 ; + + while (k > 0U) + { + /* G is half of the frequency complex spectrum */ + //for k = 2:N + // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); + xBI = pB[1] ; + xBR = pB[0] ; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++; + twI = *pCoeff++; + + t1a = xAR - xBR ; + t1b = xAI + xBI ; + + r = twR * t1a; + s = twI * t1b; + t = twI * t1a; + u = twR * t1b; + + // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); + // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); + *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR + *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI + + pA += 2; + pB -= 2; + k--; + } + +} +#else +void stage_rfft_f32( + const arm_rfft_fast_instance_f32 * S, + float32_t * p, + float32_t * pOut) +{ + uint32_t k; /* Loop Counter */ + float32_t twR, twI; /* RFFT Twiddle coefficients */ + const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ + float32_t *pA = p; /* increasing pointer */ + float32_t *pB = p; /* decreasing pointer */ + float32_t xAR, xAI, xBR, xBI; /* temporary variables */ + float32_t t1a, t1b; /* temporary variables */ + float32_t p0, p1, p2, p3; /* temporary variables */ + + + k = (S->Sint).fftLen - 1; + + /* Pack first and last sample of the frequency domain together */ + + xBR = pB[0]; + xBI = pB[1]; + xAR = pA[0]; + xAI = pA[1]; + + twR = *pCoeff++ ; + twI = *pCoeff++ ; + + // U1 = XA(1) + XB(1); % It is real t1a = xBR + xAR ; @@ -108,6 +329,7 @@ void stage_rfft_f32( *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI + pA += 2; pB -= 2; k--; @@ -174,6 +396,8 @@ void merge_rfft_f32( } +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ + /** @ingroup groupTransforms */ @@ -303,7 +527,6 @@ void arm_rfft_fast_f32( { /* Real FFT compression */ merge_rfft_f32(S, p, pOut); - /* Complex radix-4 IFFT process */ arm_cfft_f32( Sint, pOut, ifftFlag, 1); } diff --git a/Source/TransformFunctions/arm_rfft_q15.c b/Source/TransformFunctions/arm_rfft_q15.c index 8aa24355..ca4c9d45 100644 --- a/Source/TransformFunctions/arm_rfft_q15.c +++ b/Source/TransformFunctions/arm_rfft_q15.c @@ -586,4 +586,4 @@ void arm_split_rifft_q15( } } -#endif /* defined(ARM_MATH_MVEI) */ \ No newline at end of file +#endif /* defined(ARM_MATH_MVEI) */ diff --git a/Source/TransformFunctions/arm_rfft_q31.c b/Source/TransformFunctions/arm_rfft_q31.c index 0b8d7f9a..9ba0d521 100644 --- a/Source/TransformFunctions/arm_rfft_q31.c +++ b/Source/TransformFunctions/arm_rfft_q31.c @@ -494,4 +494,4 @@ void arm_split_rifft_q31( } -#endif /* defined(ARM_MATH_MVEI) */ \ No newline at end of file +#endif /* defined(ARM_MATH_MVEI) */