|
|
|
|
@ -28,6 +28,7 @@
|
|
|
|
|
|
|
|
|
|
#include "arm_math.h"
|
|
|
|
|
|
|
|
|
|
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|
|
|
|
void stage_rfft_f32(
|
|
|
|
|
const arm_rfft_fast_instance_f32 * S,
|
|
|
|
|
float32_t * p,
|
|
|
|
|
@ -42,6 +43,16 @@ void stage_rfft_f32(
|
|
|
|
|
float32_t t1a, t1b; /* temporary variables */
|
|
|
|
|
float32_t p0, p1, p2, p3; /* temporary variables */
|
|
|
|
|
|
|
|
|
|
float32x4x2_t tw,xA,xB;
|
|
|
|
|
float32x4x2_t tmp1, tmp2, res;
|
|
|
|
|
|
|
|
|
|
uint32x4_t vecStridesFwd, vecStridesBkwd;
|
|
|
|
|
|
|
|
|
|
vecStridesFwd = vidupq_u32(0, 2);
|
|
|
|
|
vecStridesBkwd = -vecStridesFwd;
|
|
|
|
|
|
|
|
|
|
int blockCnt;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
k = (S->Sint).fftLen - 1;
|
|
|
|
|
|
|
|
|
|
@ -55,6 +66,216 @@ void stage_rfft_f32(
|
|
|
|
|
twR = *pCoeff++ ;
|
|
|
|
|
twI = *pCoeff++ ;
|
|
|
|
|
|
|
|
|
|
// U1 = XA(1) + XB(1); % It is real
|
|
|
|
|
t1a = xBR + xAR ;
|
|
|
|
|
|
|
|
|
|
// U2 = XB(1) - XA(1); % It is imaginary
|
|
|
|
|
t1b = xBI + xAI ;
|
|
|
|
|
|
|
|
|
|
// real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
|
|
|
|
|
// imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
|
|
|
|
|
*pOut++ = 0.5f * ( t1a + t1b );
|
|
|
|
|
*pOut++ = 0.5f * ( t1a - t1b );
|
|
|
|
|
|
|
|
|
|
// XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
|
|
|
|
|
pB = p + 2*k;
|
|
|
|
|
pA += 2;
|
|
|
|
|
|
|
|
|
|
blockCnt = k >> 2;
|
|
|
|
|
while (blockCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
function X = my_split_rfft(X, ifftFlag)
|
|
|
|
|
% X is a series of real numbers
|
|
|
|
|
L = length(X);
|
|
|
|
|
XC = X(1:2:end) +i*X(2:2:end);
|
|
|
|
|
XA = fft(XC);
|
|
|
|
|
XB = conj(XA([1 end:-1:2]));
|
|
|
|
|
TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
|
|
|
|
|
for l = 2:L/2
|
|
|
|
|
XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
|
|
|
|
|
end
|
|
|
|
|
XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
|
|
|
|
|
X = XA;
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
xA = vld2q_f32(pA);
|
|
|
|
|
pA += 8;
|
|
|
|
|
|
|
|
|
|
xB = vld2q_f32(pB);
|
|
|
|
|
|
|
|
|
|
xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
|
|
|
|
|
xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
|
|
|
|
|
|
|
|
|
|
xB.val[1] = vnegq_f32(xB.val[1]);
|
|
|
|
|
pB -= 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tw = vld2q_f32(pCoeff);
|
|
|
|
|
pCoeff += 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
|
|
|
|
|
tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
|
|
|
|
|
|
|
|
|
|
tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
|
|
|
|
|
tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
|
|
|
|
|
|
|
|
|
|
res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
|
|
|
|
|
res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
|
|
|
|
|
|
|
|
|
|
res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
|
|
|
|
|
res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
|
|
|
|
|
|
|
|
|
|
res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
|
|
|
|
|
res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
|
|
|
|
|
|
|
|
|
|
res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
|
|
|
|
|
res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vst2q_f32(pOut, res);
|
|
|
|
|
pOut += 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
blockCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blockCnt = k & 3;
|
|
|
|
|
while (blockCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
function X = my_split_rfft(X, ifftFlag)
|
|
|
|
|
% X is a series of real numbers
|
|
|
|
|
L = length(X);
|
|
|
|
|
XC = X(1:2:end) +i*X(2:2:end);
|
|
|
|
|
XA = fft(XC);
|
|
|
|
|
XB = conj(XA([1 end:-1:2]));
|
|
|
|
|
TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
|
|
|
|
|
for l = 2:L/2
|
|
|
|
|
XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
|
|
|
|
|
end
|
|
|
|
|
XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
|
|
|
|
|
X = XA;
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
xBI = pB[1];
|
|
|
|
|
xBR = pB[0];
|
|
|
|
|
xAR = pA[0];
|
|
|
|
|
xAI = pA[1];
|
|
|
|
|
|
|
|
|
|
twR = *pCoeff++;
|
|
|
|
|
twI = *pCoeff++;
|
|
|
|
|
|
|
|
|
|
t1a = xBR - xAR ;
|
|
|
|
|
t1b = xBI + xAI ;
|
|
|
|
|
|
|
|
|
|
// real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
|
|
|
|
|
// imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
|
|
|
|
|
p0 = twR * t1a;
|
|
|
|
|
p1 = twI * t1a;
|
|
|
|
|
p2 = twR * t1b;
|
|
|
|
|
p3 = twI * t1b;
|
|
|
|
|
|
|
|
|
|
*pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
|
|
|
|
|
*pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
|
|
|
|
|
|
|
|
|
|
pA += 2;
|
|
|
|
|
pB -= 2;
|
|
|
|
|
blockCnt--;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Prepares data for inverse cfft */
|
|
|
|
|
void merge_rfft_f32(
|
|
|
|
|
const arm_rfft_fast_instance_f32 * S,
|
|
|
|
|
float32_t * p,
|
|
|
|
|
float32_t * pOut)
|
|
|
|
|
{
|
|
|
|
|
uint32_t k; /* Loop Counter */
|
|
|
|
|
float32_t twR, twI; /* RFFT Twiddle coefficients */
|
|
|
|
|
const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
|
|
|
|
|
float32_t *pA = p; /* increasing pointer */
|
|
|
|
|
float32_t *pB = p; /* decreasing pointer */
|
|
|
|
|
float32_t xAR, xAI, xBR, xBI; /* temporary variables */
|
|
|
|
|
float32_t t1a, t1b, r, s, t, u; /* temporary variables */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
k = (S->Sint).fftLen - 1;
|
|
|
|
|
|
|
|
|
|
xAR = pA[0];
|
|
|
|
|
xAI = pA[1];
|
|
|
|
|
|
|
|
|
|
pCoeff += 2 ;
|
|
|
|
|
|
|
|
|
|
*pOut++ = 0.5f * ( xAR + xAI );
|
|
|
|
|
*pOut++ = 0.5f * ( xAR - xAI );
|
|
|
|
|
|
|
|
|
|
pB = p + 2*k ;
|
|
|
|
|
pA += 2 ;
|
|
|
|
|
|
|
|
|
|
while (k > 0U)
|
|
|
|
|
{
|
|
|
|
|
/* G is half of the frequency complex spectrum */
|
|
|
|
|
//for k = 2:N
|
|
|
|
|
// Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
|
|
|
|
|
xBI = pB[1] ;
|
|
|
|
|
xBR = pB[0] ;
|
|
|
|
|
xAR = pA[0];
|
|
|
|
|
xAI = pA[1];
|
|
|
|
|
|
|
|
|
|
twR = *pCoeff++;
|
|
|
|
|
twI = *pCoeff++;
|
|
|
|
|
|
|
|
|
|
t1a = xAR - xBR ;
|
|
|
|
|
t1b = xAI + xBI ;
|
|
|
|
|
|
|
|
|
|
r = twR * t1a;
|
|
|
|
|
s = twI * t1b;
|
|
|
|
|
t = twI * t1a;
|
|
|
|
|
u = twR * t1b;
|
|
|
|
|
|
|
|
|
|
// real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
|
|
|
|
|
// imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
|
|
|
|
|
*pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
|
|
|
|
|
*pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
|
|
|
|
|
|
|
|
|
|
pA += 2;
|
|
|
|
|
pB -= 2;
|
|
|
|
|
k--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
void stage_rfft_f32(
|
|
|
|
|
const arm_rfft_fast_instance_f32 * S,
|
|
|
|
|
float32_t * p,
|
|
|
|
|
float32_t * pOut)
|
|
|
|
|
{
|
|
|
|
|
uint32_t k; /* Loop Counter */
|
|
|
|
|
float32_t twR, twI; /* RFFT Twiddle coefficients */
|
|
|
|
|
const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
|
|
|
|
|
float32_t *pA = p; /* increasing pointer */
|
|
|
|
|
float32_t *pB = p; /* decreasing pointer */
|
|
|
|
|
float32_t xAR, xAI, xBR, xBI; /* temporary variables */
|
|
|
|
|
float32_t t1a, t1b; /* temporary variables */
|
|
|
|
|
float32_t p0, p1, p2, p3; /* temporary variables */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
k = (S->Sint).fftLen - 1;
|
|
|
|
|
|
|
|
|
|
/* Pack first and last sample of the frequency domain together */
|
|
|
|
|
|
|
|
|
|
xBR = pB[0];
|
|
|
|
|
xBI = pB[1];
|
|
|
|
|
xAR = pA[0];
|
|
|
|
|
xAI = pA[1];
|
|
|
|
|
|
|
|
|
|
twR = *pCoeff++ ;
|
|
|
|
|
twI = *pCoeff++ ;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// U1 = XA(1) + XB(1); % It is real
|
|
|
|
|
t1a = xBR + xAR ;
|
|
|
|
|
|
|
|
|
|
@ -108,6 +329,7 @@ void stage_rfft_f32(
|
|
|
|
|
*pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
|
|
|
|
|
*pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pA += 2;
|
|
|
|
|
pB -= 2;
|
|
|
|
|
k--;
|
|
|
|
|
@ -174,6 +396,8 @@ void merge_rfft_f32(
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@ingroup groupTransforms
|
|
|
|
|
*/
|
|
|
|
|
@ -303,7 +527,6 @@ void arm_rfft_fast_f32(
|
|
|
|
|
{
|
|
|
|
|
/* Real FFT compression */
|
|
|
|
|
merge_rfft_f32(S, p, pOut);
|
|
|
|
|
|
|
|
|
|
/* Complex radix-4 IFFT process */
|
|
|
|
|
arm_cfft_f32( Sint, pOut, ifftFlag, 1);
|
|
|
|
|
}
|
|
|
|
|
|