CMSIS-DSP: FFT bit reversal unrolling

Issue index updates earlier, adding more distance between the index
           update and the subsequent gather loads.
           Added out-of-place variants.
pull/19/head
FabKlein 5 years ago
parent 0a5a96d904
commit a509fa39d4

@ -47,6 +47,273 @@ extern "C"
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)
/**
@brief In-place 32 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 32-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
*/
__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* Each complex (real, imag) 32-bit pair is exchanged as one 64-bit unit;
   table entries appear to be byte offsets of those pairs -- TODO confirm
   against the CMSIS bit-reversal table generator. */
uint64_t *src = (uint64_t *) pSrc;
int32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
uint64x2_t inLow, inHigh;
uint64x2_t bitRevOff1Low, bitRevOff0Low;
uint64x2_t bitRevOff1High, bitRevOff0High;
/* load scheduling to increase gather load idx update / gather load distance */
/* Software pipelining: the offsets used by an iteration are expanded in the
   previous one, so each gather does not wait on its own index computation. */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* vmullb/vmullt by 1 zero-extend the even/odd u32 lanes into u64 lanes:
   even table entries -> "Low" offsets, odd entries -> "High" offsets; an
   (even, odd) entry pair addresses the two elements to exchange. */
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
blkCnt = bitRevLen / 8; /* 8 table entries (4 swaps) per unrolled iteration */
while (blkCnt > 0) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* 64-bit index expansion */
bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
/* swap the pairs addressed by the offsets expanded in the previous round */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
/* unrolled */
/* NOTE(review): on the final iteration this pre-load reads 4 table entries
   past bitRevLen when bitRevLen is a multiple of 8 -- presumably harmless
   for the const CMSIS tables, but confirm the tables are padded. */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
if (bitRevLen & 7) {
/* FFT size = 16 */
/* Residual group of 4 entries: its offsets were already expanded above. */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
}
}
/**
@brief In-place 16 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 16-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
*/
__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
    uint16_t *pSrc,
    const uint16_t bitRevLen,
    const uint16_t *pBitRevTab)
{
    uint32_t *src = (uint32_t *) pSrc;  /* one complex (real, imag) 16-bit pair = one 32-bit unit */
    int32_t blkCnt;                     /* loop counter */
    uint16x8_t bitRevTabOff;            /* FIX: vldrhq_u16() yields uint16x8_t (was declared uint32x4_t) */
    uint16x8_t one = vdupq_n_u16(1);
    uint32x4_t bitRevOff1Low, bitRevOff0Low;
    uint32x4_t bitRevOff1High, bitRevOff0High;
    uint32x4_t inLow, inHigh;

    /* Software pipelining: the offsets used by an iteration are expanded in
       the previous one, increasing the distance between the gather-index
       update and the dependent gather load. */
    bitRevTabOff = vldrhq_u16(pBitRevTab);
    pBitRevTab += 8;
    /* vmullb/vmullt by 1 zero-extend the even/odd u16 lanes into u32 lanes:
       even table entries -> "Low" offsets, odd entries -> "High" offsets. */
    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
    /* Table entries are pre-scaled by 8; >> 3 yields 32-bit element indices.
       FIX: shift the widened u32 vectors with the _u32 intrinsic. The former
       code passed uint32x4_t operands to vshrq_n_u16(), which relies on lax
       vector conversions (bit-identical result here since values fit in
       16 bits, but rejected under strict intrinsic typing). */
    bitRevOff0Low = vshrq_n_u32(bitRevOff0Low, 3);
    bitRevOff0High = vshrq_n_u32(bitRevOff0High, 3);

    blkCnt = bitRevLen / 16; /* 16 table entries (8 swaps) per unrolled iteration */
    while (blkCnt > 0) {
        bitRevTabOff = vldrhq_u16(pBitRevTab);
        pBitRevTab += 8;
        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
        bitRevOff1Low = vshrq_n_u32(bitRevOff1Low, 3);
        bitRevOff1High = vshrq_n_u32(bitRevOff1High, 3);

        /* swap the complex pairs addressed by the previously expanded offsets */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

        /* loop unrolling: expand the next offsets, swap with the current ones.
           NOTE(review): on the final iteration this pre-load reads 8 table
           entries past bitRevLen when bitRevLen is a multiple of 16 --
           presumably harmless for the const CMSIS tables; confirm padding. */
        bitRevTabOff = vldrhq_u16(pBitRevTab);
        pBitRevTab += 8;
        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
        bitRevOff0Low = vshrq_n_u32(bitRevOff0Low, 3);
        bitRevOff0High = vshrq_n_u32(bitRevOff0High, 3);

        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);

        blkCnt--;
    }

    /* Tail: only the residues produced by the supported FFT sizes occur
       (8 entries, or 12 for the FFT-16 special case). */
    blkCnt = bitRevLen & 0xf;
    if (blkCnt == 8) {
        /* offsets for this group were already expanded above */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
    } else if (blkCnt == 12) {
        /* FFT 16 special case: 8 already-expanded entries + 4 predicated ones */
        mve_pred16_t p = vctp16q(4);

        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
        bitRevOff0Low = vshrq_n_u32(bitRevOff0Low, 3);
        bitRevOff0High = vshrq_n_u32(bitRevOff0High, 3);

        inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
        inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
    }
}
/**
@brief Out-of-place 32 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 32-bit data type
@param[in] pSrc points to input buffer of unknown 32-bit data type
@param[in] fftLen FFT length
@return none
*/
__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint32_t *pDst32 = (uint32_t *) pDst;
/* fwd indexes */
/* Lanes 1 and 3 stay 0 so that, reinterpreted as two 64-bit lanes, each
   vector holds two zero-extended 32-bit byte offsets (low word = offset,
   high word = 0). */
idxOffs0 = vdupq_n_u32(0);
idxOffs1 = vdupq_n_u32(0);
idxOffs0[0] = 0; idxOffs0[2] = 4;
idxOffs1[0] = 8; idxOffs1[2] = 12;
/* Forward offsets advance by 4 per complex element; bit-reversing the low
   log2(fftLen) + 5 bits of i*4 yields rev(i)*8, i.e. the byte offset of the
   i-th 64-bit complex (real, imag) 32-bit pair. */
bitRevPos = (31 - __CLZ(fftLen)) + 5;
blkCnt = fftLen >> 2; /* 4 complex elements (2 per gather) per iteration */
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0U) {
uint64x2_t vecIn;
/* gather 2 complex elements from bit-reversed source positions ... */
vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
idxOffs0 = idxOffs0 + 16;
/* ... and store them contiguously to the destination */
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
/* next reversed offsets computed early, well ahead of their gather */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
idxOffs1 = idxOffs1 + 16;
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
/**
@brief Out-of-place 16 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 16-bit data type
@param[in] pSrc points to input buffer of unknown 16-bit data type
@param[in] fftLen FFT length
@return none
*/
__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint16_t *pDst16 = (uint16_t *) pDst;
uint32_t incrIdx = 0;
/* fwd indexes */
idxOffs0 = vidupq_wb_u32(&incrIdx, 4); // {0, 4, 8, 12}
idxOffs1 = vidupq_wb_u32(&incrIdx, 4); // {16, 20, 24, 28}
/* Forward offsets advance by 4 per complex element; bit-reversing the low
   log2(fftLen) + 4 bits of i*4 yields rev(i)*4, i.e. the byte offset of the
   i-th 32-bit complex (real, imag) 16-bit pair. */
bitRevPos = (31 - __CLZ(fftLen)) + 4;
blkCnt = fftLen >> 3; /* 8 complex elements (4 per gather) per iteration */
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0U) {
uint32x4_t vecIn;
/* gather 4 complex elements from bit-reversed source positions ... */
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
idxOffs0 = idxOffs0 + 32;
/* ... and store them contiguously to the destination */
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
/* next reversed offsets computed early, well ahead of their gather */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
idxOffs1 = idxOffs1 + 32;
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/

@ -40,111 +40,51 @@
static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
{
    /* Map a supported power-of-2 FFT length (16..4096) to its reciprocal,
       used as the inverse-transform scale factor. Unsupported lengths fall
       through and return the neutral factor 1.0. */
    float16_t retValue=1.0;

    switch (fftLen)
    {
    case 4096U:
        retValue = (float16_t)0.000244140625f;
        break;
    case 2048U:
        retValue = (float16_t)0.00048828125f;
        break;
    case 1024U:
        retValue = (float16_t)0.0009765625f;
        break;
    case 512U:
        retValue = (float16_t)0.001953125f;
        break;
    case 256U:
        retValue = (float16_t)0.00390625f;
        break;
    case 128U:
        retValue = (float16_t)0.0078125f;
        break;
    case 64U:
        retValue = (float16_t)0.015625f;
        break;
    case 32U:
        retValue = (float16_t)0.03125f;
        break;
    case 16U:
        retValue = (float16_t)0.0625f;
        break;
    default:
        break;
    }
    return (retValue);
}
switch (fftLen)
{
static void arm_bitreversal_f16_inpl_mve( case 4096U:
uint16_t *pSrc, retValue = (float16_t)0.000244140625f;
const uint16_t bitRevLen, break;
const uint16_t *pBitRevTab)
{ case 2048U:
uint32_t *src = (uint32_t *)pSrc; retValue = (float16_t)0.00048828125f;
uint32_t blkCnt; /* loop counters */ break;
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
blkCnt = (bitRevLen / 2) / 4; case 1024U:
while (blkCnt > 0U) { retValue = (float16_t)0.0009765625f;
bitRevTabOff = vldrhq_u16(pBitRevTab); break;
pBitRevTab += 8;
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); case 512U:
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); retValue = (float16_t)0.001953125f;
break;
bitRevOff1 = bitRevOff1 >> 3; case 256U:
bitRevOff2 = bitRevOff2 >> 3; retValue = (float16_t)0.00390625f;
break;
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1); case 128U:
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2); retValue = (float16_t)0.0078125f;
break;
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2); case 64U:
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1); retValue = (float16_t)0.015625f;
break;
/* case 32U:
* Decrement the blockSize loop counter retValue = (float16_t)0.03125f;
*/ break;
blkCnt--;
}
case 16U:
retValue = (float16_t)0.0625f;
break;
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0); default:
break;
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); }
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); return(retValue);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
} }
@ -590,53 +530,53 @@ void arm_cfft_f16(
float16_t * pSrc, float16_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen)); _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_f16_mve(S, pSrc, fftLen); _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen); arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else
@ -666,7 +606,7 @@ extern void arm_radix4_butterfly_f16(
/** /**
@defgroup ComplexFFT Complex FFT Functions @defgroup ComplexFFT Complex FFT Functions
@par @par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@ -684,7 +624,7 @@ extern void arm_radix4_butterfly_f16(
<pre>{real[0], imag[0], real[1], imag[1], ...} </pre> <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
The FFT result will be contained in the same array and the frequency domain The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving. values will have the same interleaving.
@par Floating-point @par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed. stages are performed along with a single radix-2 or radix-4 stage, as needed.
@ -696,12 +636,12 @@ extern void arm_radix4_butterfly_f16(
inverse transform includes a scale of <code>1/fftLen</code> as part of the inverse transform includes a scale of <code>1/fftLen</code> as part of the
calculation and this matches the textbook definition of the inverse FFT. calculation and this matches the textbook definition of the inverse FFT.
@par @par
For the MVE version, the new arm_cfft_init_f32 initialization function is For the MVE version, the new arm_cfft_init_f32 initialization function is
<b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
needed FFTs.</b> Other FFT versions can continue to be initialized as needed FFTs.</b> Other FFT versions can continue to be initialized as
explained below. explained below.
@par @par
For not MVE versions, pre-initialized data structures containing twiddle factors For not MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
this header in your function and then pass one of the constant structures as this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example: an argument to arm_cfft_f32. For example:
@ -816,7 +756,7 @@ extern void arm_radix4_butterfly_f16(
break; break;
} }
@endcode @endcode
*/ */
@ -875,7 +815,7 @@ void arm_cfft_f16(
case 2048: case 2048:
arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle); arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
break; break;
} }
if ( bitReverseFlag ) if ( bitReverseFlag )

@ -39,87 +39,56 @@
static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
{
    /* Map a supported power-of-2 FFT length (16..4096) to its reciprocal,
       used as the inverse-transform scale factor. Unsupported lengths fall
       through and return the neutral factor 1.0f.
       Fix: all constants now carry the 'f' suffix; the previous mix of
       double and float literals forced implicit double->float conversions
       (identical values -- every constant is exactly representable -- but
       inconsistent and noisy under -Wconversion). */
    float32_t retValue = 1.0f;

    switch (fftLen)
    {
    case 4096U:
        retValue = 0.000244140625f;
        break;
    case 2048U:
        retValue = 0.00048828125f;
        break;
    case 1024U:
        retValue = 0.0009765625f;
        break;
    case 512U:
        retValue = 0.001953125f;
        break;
    case 256U:
        retValue = 0.00390625f;
        break;
    case 128U:
        retValue = 0.0078125f;
        break;
    case 64U:
        retValue = 0.015625f;
        break;
    case 32U:
        retValue = 0.03125f;
        break;
    case 16U:
        retValue = 0.0625f;
        break;
    default:
        break;
    }
    return (retValue);
}
switch (fftLen)
{
static void arm_bitreversal_f32_inpl_mve( case 4096U:
uint32_t *pSrc, retValue = 0.000244140625;
const uint16_t bitRevLen, break;
const uint16_t *pBitRevTab)
{ case 2048U:
uint64_t *src = (uint64_t *) pSrc; retValue = 0.00048828125;
uint32_t blkCnt; /* loop counters */ break;
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1); case 1024U:
retValue = 0.0009765625f;
break;
case 512U:
retValue = 0.001953125;
break;
blkCnt = (bitRevLen / 2) / 2; case 256U:
while (blkCnt > 0U) { retValue = 0.00390625f;
bitRevTabOff = vldrhq_u32(pBitRevTab); break;
pBitRevTab += 4;
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one); case 128U:
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one); retValue = 0.0078125;
break;
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1); case 64U:
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2); retValue = 0.015625f;
break;
vstrdq_scatter_offset_u64(src, bitRevOff1, in2); case 32U:
vstrdq_scatter_offset_u64(src, bitRevOff2, in1); retValue = 0.03125;
break;
/* case 16U:
* Decrement the blockSize loop counter retValue = 0.0625f;
*/ break;
blkCnt--;
}
default:
break;
}
return(retValue);
} }
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen) static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{ {
f32x4_t vecTmp0, vecTmp1; f32x4_t vecTmp0, vecTmp1;
@ -563,53 +532,53 @@ void arm_cfft_f32(
float32_t * pSrc, float32_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_f32_mve(S, pSrc, fftLen); _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen); arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
@ -631,7 +600,7 @@ extern void arm_bitreversal_32(
/** /**
@defgroup ComplexFFT Complex FFT Functions @defgroup ComplexFFT Complex FFT Functions
@par @par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
<pre>{real[0], imag[0], real[1], imag[1], ...} </pre> <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
The FFT result will be contained in the same array and the frequency domain The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving. values will have the same interleaving.
@par Floating-point @par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed. stages are performed along with a single radix-2 or radix-4 stage, as needed.
@ -661,12 +630,12 @@ extern void arm_bitreversal_32(
inverse transform includes a scale of <code>1/fftLen</code> as part of the inverse transform includes a scale of <code>1/fftLen</code> as part of the
calculation and this matches the textbook definition of the inverse FFT. calculation and this matches the textbook definition of the inverse FFT.
@par @par
For the MVE version, the new arm_cfft_init_f32 initialization function is For the MVE version, the new arm_cfft_init_f32 initialization function is
<b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
needed FFTs.</b> Other FFT versions can continue to be initialized as needed FFTs.</b> Other FFT versions can continue to be initialized as
explained below. explained below.
@par @par
For not MVE versions, pre-initialized data structures containing twiddle factors For not MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
this header in your function and then pass one of the constant structures as this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example: an argument to arm_cfft_f32. For example:
@ -781,7 +750,7 @@ extern void arm_bitreversal_32(
break; break;
} }
@endcode @endcode
*/ */
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1) void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)

@ -33,65 +33,6 @@
#include "arm_vec_fft.h" #include "arm_vec_fft.h"
static void arm_bitreversal_16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* In-place bit reversal of complex 16-bit data: each (real, imag) pair is
   exchanged as one 32-bit unit. */
uint32_t *src = (uint32_t *)pSrc;
uint32_t blkCnt; /* loop counters */
/* NOTE(review): vldrhq_u16() yields uint16x8_t; storing it in uint32x4_t
   relies on lax vector conversions -- confirm this builds under strict
   intrinsic typing. */
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
blkCnt = (bitRevLen / 2) / 4; /* 8 table entries (4 swaps) per iteration */
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
/* vmullb/vmullt by 1 zero-extend even/odd u16 lanes to u32: an (even, odd)
   entry pair addresses the two elements to exchange */
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
/* table entries are pre-scaled by 8; >> 3 gives 32-bit element indices */
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
/* predicated processing of the remaining (< 8) table entries */
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
}
static void _arm_radix4_butterfly_q15_mve( static void _arm_radix4_butterfly_q15_mve(
const arm_cfft_instance_q15 * S, const arm_cfft_instance_q15 * S,
q15_t *pSrc, q15_t *pSrc,
@ -592,53 +533,53 @@ void arm_cfft_q15(
q15_t * pSrc, q15_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_q15_mve(S, pSrc, fftLen); _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen); arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else

@ -34,37 +34,6 @@
#include "arm_vec_fft.h" #include "arm_vec_fft.h"
static void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* In-place bit reversal of complex 32-bit data: each (real, imag) pair is
   exchanged as one 64-bit unit. */
uint64_t *src = (uint64_t *) pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
/* 4 table entries (2 swaps) per iteration.
   NOTE(review): no tail handling -- any residual (bitRevLen & 3) entries
   are skipped; presumably the CMSIS table lengths make this unreachable,
   but confirm. */
blkCnt = (bitRevLen / 2) / 2;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* vmullb/vmullt by 1 zero-extend even/odd u32 lanes to u64: an (even, odd)
   entry pair addresses the two elements to exchange */
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
}
static void _arm_radix4_butterfly_q31_mve( static void _arm_radix4_butterfly_q31_mve(
const arm_cfft_instance_q31 * S, const arm_cfft_instance_q31 * S,
@ -598,55 +567,55 @@ void arm_cfft_q31(
q31_t * pSrc, q31_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_q31_mve(S, pSrc, fftLen); _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen); arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else
extern void arm_radix4_butterfly_q31( extern void arm_radix4_butterfly_q31(
q31_t * pSrc, q31_t * pSrc,

Loading…
Cancel
Save