CMSIS-DSP: FFT bit reversal unrolling

Issued the index updates earlier, adding more distance to the
subsequent gather loads.
Added out-of-place variants.
pull/19/head
FabKlein 5 years ago
parent 0a5a96d904
commit a509fa39d4

@ -47,6 +47,273 @@ extern "C"
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)
/**
@brief In-place 32 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 32-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
@par
Swaps the complex (2 x 32-bit) element pairs listed in the bit-reversal
table. The loop is unrolled x2 and software-pipelined: the offsets for
group N+1 are expanded before the gather loads of group N execute, to widen
the distance between an offset update and the gather load that depends on it.
*/
__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* one complex (2 x 32-bit) element = one 64-bit word */
uint64_t *src = (uint64_t *) pSrc;
int32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
uint64x2_t inLow, inHigh;
/* two alternating offset sets (0 and 1) used by the x2 unrolled loop */
uint64x2_t bitRevOff1Low, bitRevOff0Low;
uint64x2_t bitRevOff1High, bitRevOff0High;
/* load scheduling to increase gather load idx update / gather load distance */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* widen to 64-bit offsets: multiply-by-1 splits bottom lanes into Low
   and top lanes into High */
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
/* 8 table entries (4 swap pairs) consumed per unrolled iteration */
blkCnt = bitRevLen / 8;
while (blkCnt > 0) {
/* expand the next group's offsets ahead of the current group's gathers */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* 64-bit index expansion */
bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
/* swap the elements addressed by the Low offsets with those addressed
   by the High offsets */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
/* unrolled */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/* tail: the leftover group's offsets are already in bitRevOff0Low/High
   (pre-computed by the last loop round, or by the prologue when
   bitRevLen < 8) */
if (bitRevLen & 7) {
/* FFT size = 16 */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
}
}
/**
@brief In-place 16 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 16-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
@par
Swaps the complex (2 x 16-bit) element pairs listed in the bit-reversal
table. The loop is unrolled x2 and software-pipelined: offsets for the next
group are computed before the current group's gather loads, widening the
distance between an index update and the dependent gather load.
*/
__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* one complex (2 x 16-bit) element = one 32-bit word */
uint32_t *src = (uint32_t *) pSrc;
int32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
/* two alternating offset sets (0 and 1) used by the x2 unrolled loop */
uint32x4_t bitRevOff1Low, bitRevOff0Low;
uint32x4_t bitRevOff1High, bitRevOff0High;
uint32x4_t inLow, inHigh;
/* load scheduling to increase gather load idx update / gather load distance */
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
/* widen 16-bit table entries: bottom lanes -> Low, top lanes -> High */
bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
/* NOTE(review): table entries appear pre-scaled; >> 3 turns them into
   32-bit word indexes for the shifted (x4) gathers below — confirm
   against the bit-reversal table generator */
bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
/* 16 table entries (8 swap pairs) consumed per unrolled iteration */
blkCnt = (bitRevLen / 16);
while (blkCnt > 0U) {
/* expand the next group's offsets ahead of the current group's gathers */
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
/* swap the words addressed by Low offsets with those addressed by High */
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
/* loop unrolling */
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);
blkCnt--;
}
/* tail handling */
blkCnt = bitRevLen & 0xf;
if (blkCnt == 8) {
/* 8 leftover entries: their offsets were already pre-computed above */
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
} else if (blkCnt == 12) {
/* FFT 16 special case */
/* 8 pre-computed entries, plus 4 more handled under tail predication */
mve_pred16_t p = vctp16q(4);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
}
}
/**
@brief Out-of-place 32 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 32-bit data type
@param[in] pSrc points to input buffer of unknown 32-bit data type
@param[in] fftLen FFT length
@return none
@par
Reads complex (2 x 32-bit) elements from pSrc in bit-reversed order via
64-bit gather loads and stores them contiguously to pDst. No reversal table
is needed: forward offsets are bit-reversed on the fly with VBRSR.
*/
__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint32_t *pDst32 = (uint32_t *) pDst;
/* fwd indexes */
/* the uint32x4_t is reinterpreted as two 64-bit gather offsets below, so
   only lanes 0 and 2 (their low words) carry values; lanes 1 and 3 stay 0 */
idxOffs0 = vdupq_n_u32(0);
idxOffs1 = vdupq_n_u32(0);
idxOffs0[0] = 0; idxOffs0[2] = 4;
idxOffs1[0] = 8; idxOffs1[2] = 12;
/* NOTE(review): reversal width = log2(fftLen) + 5; the +5 presumably
   accounts for the byte scaling of the 64-bit element offsets — confirm
   before changing offset arithmetic */
bitRevPos = (31 - __CLZ(fftLen)) + 5;
/* 4 complex elements (2 per gather) produced per iteration */
blkCnt = fftLen >> 2;
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0U) {
uint64x2_t vecIn;
/* gather 2 complex elements at bit-reversed positions, store linearly */
vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
idxOffs0 = idxOffs0 + 16;
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
/* next bit-reversed offsets computed early, ahead of their gather */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
idxOffs1 = idxOffs1 + 16;
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
/**
@brief Out-of-place 16 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 16-bit data type
@param[in] pSrc points to input buffer of unknown 16-bit data type
@param[in] fftLen FFT length
@return none
@par
Reads complex (2 x 16-bit) elements from pSrc in bit-reversed order via
32-bit gather loads and stores them contiguously to pDst. No reversal table
is needed: forward offsets are bit-reversed on the fly with VBRSR.
*/
__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint16_t *pDst16 = (uint16_t *) pDst;
uint32_t incrIdx = 0;
/* fwd indexes */
idxOffs0 = vidupq_wb_u32(&incrIdx, 4); // {0, 4, 8, 12}
idxOffs1 = vidupq_wb_u32(&incrIdx, 4); // {16, 20, 24, 28}
/* NOTE(review): reversal width = log2(fftLen) + 4; the +4 presumably
   accounts for the byte scaling of the 32-bit element offsets — confirm
   before changing offset arithmetic */
bitRevPos = (31 - __CLZ(fftLen)) + 4;
/* 8 complex elements (4 per gather) produced per iteration */
blkCnt = fftLen >> 3;
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0U) {
uint32x4_t vecIn;
/* gather 4 complex elements at bit-reversed positions, store linearly */
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
idxOffs0 = idxOffs0 + 32;
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
/* next bit-reversed offsets computed early, ahead of their gather */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
idxOffs1 = idxOffs1 + 32;
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/

@ -40,111 +40,51 @@
/**
@brief Returns the 1/fftLen normalization factor for the f16 inverse FFT.
@param[in] fftLen FFT length (supported: powers of two from 16 to 4096)
@return 1/fftLen for a supported length, 1.0 otherwise (neutral scaling)
*/
static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
{
    switch (fftLen)
    {
        /* supported lengths, ascending: each returns exactly 1/fftLen */
        case 16U:   return (float16_t)0.0625f;
        case 32U:   return (float16_t)0.03125f;
        case 64U:   return (float16_t)0.015625f;
        case 128U:  return (float16_t)0.0078125f;
        case 256U:  return (float16_t)0.00390625f;
        case 512U:  return (float16_t)0.001953125f;
        case 1024U: return (float16_t)0.0009765625f;
        case 2048U: return (float16_t)0.00048828125f;
        case 4096U: return (float16_t)0.000244140625f;
        /* unsupported length: leave the data unscaled */
        default:    return (float16_t)1.0;
    }
}
switch (fftLen)
{
static void arm_bitreversal_f16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
case 4096U:
retValue = (float16_t)0.000244140625f;
break;
{
uint32_t *src = (uint32_t *)pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
case 2048U:
retValue = (float16_t)0.00048828125f;
break;
blkCnt = (bitRevLen / 2) / 4;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
case 1024U:
retValue = (float16_t)0.0009765625f;
break;
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
case 512U:
retValue = (float16_t)0.001953125f;
break;
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
case 256U:
retValue = (float16_t)0.00390625f;
break;
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
case 128U:
retValue = (float16_t)0.0078125f;
break;
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
case 64U:
retValue = (float16_t)0.015625f;
break;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
case 32U:
retValue = (float16_t)0.03125f;
break;
case 16U:
retValue = (float16_t)0.0625f;
break;
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
default:
break;
}
return(retValue);
}
@ -590,53 +530,53 @@ void arm_cfft_f16(
float16_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
}
#else
@ -666,7 +606,7 @@ extern void arm_radix4_butterfly_f16(
/**
@defgroup ComplexFFT Complex FFT Functions
@par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@ -684,7 +624,7 @@ extern void arm_radix4_butterfly_f16(
<pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving.
@par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed.
@ -696,12 +636,12 @@ extern void arm_radix4_butterfly_f16(
inverse transform includes a scale of <code>1/fftLen</code> as part of the
calculation and this matches the textbook definition of the inverse FFT.
@par
For the MVE version, the new arm_cfft_init_f32 initialization function is
For the MVE version, the new arm_cfft_init_f32 initialization function is
<b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
needed FFTs.</b> Other FFT versions can continue to be initialized as
needed FFTs.</b> Other FFT versions can continue to be initialized as
explained below.
@par
For not MVE versions, pre-initialized data structures containing twiddle factors
For not MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example:
@ -816,7 +756,7 @@ extern void arm_radix4_butterfly_f16(
break;
}
@endcode
*/
@ -875,7 +815,7 @@ void arm_cfft_f16(
case 2048:
arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
break;
}
if ( bitReverseFlag )

@ -39,87 +39,56 @@
/**
@brief Returns the 1/fftLen normalization factor for the f32 inverse FFT.
@param[in] fftLen FFT length (supported: powers of two from 16 to 4096)
@return 1/fftLen for a supported length, 1.0 otherwise (neutral scaling)

Fix: all constants now carry the 'f' suffix. The original mixed plain
double literals (0.000244140625, 0.03125, ...) with float ones, forcing
implicit double->float32_t narrowing conversions and diverging from the
f16 twin of this helper. Every value is a power of two, so the literals
are exact in both precisions and behavior is unchanged.
*/
static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
{
float32_t retValue = 1.0f;
switch (fftLen)
{
case 4096U:
retValue = 0.000244140625f;
break;
case 2048U:
retValue = 0.00048828125f;
break;
case 1024U:
retValue = 0.0009765625f;
break;
case 512U:
retValue = 0.001953125f;
break;
case 256U:
retValue = 0.00390625f;
break;
case 128U:
retValue = 0.0078125f;
break;
case 64U:
retValue = 0.015625f;
break;
case 32U:
retValue = 0.03125f;
break;
case 16U:
retValue = 0.0625f;
break;
default:
break;
}
return(retValue);
}
switch (fftLen)
{
static void arm_bitreversal_f32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
case 4096U:
retValue = 0.000244140625;
break;
{
uint64_t *src = (uint64_t *) pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
case 2048U:
retValue = 0.00048828125;
break;
case 1024U:
retValue = 0.0009765625f;
break;
case 512U:
retValue = 0.001953125;
break;
blkCnt = (bitRevLen / 2) / 2;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
case 256U:
retValue = 0.00390625f;
break;
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
case 128U:
retValue = 0.0078125;
break;
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
case 64U:
retValue = 0.015625f;
break;
vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
case 32U:
retValue = 0.03125;
break;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
case 16U:
retValue = 0.0625f;
break;
default:
break;
}
return(retValue);
}
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
f32x4_t vecTmp0, vecTmp1;
@ -563,53 +532,53 @@ void arm_cfft_f32(
float32_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
}
@ -631,7 +600,7 @@ extern void arm_bitreversal_32(
/**
@defgroup ComplexFFT Complex FFT Functions
@par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
<pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving.
@par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed.
@ -661,12 +630,12 @@ extern void arm_bitreversal_32(
inverse transform includes a scale of <code>1/fftLen</code> as part of the
calculation and this matches the textbook definition of the inverse FFT.
@par
For the MVE version, the new arm_cfft_init_f32 initialization function is
For the MVE version, the new arm_cfft_init_f32 initialization function is
<b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
needed FFTs.</b> Other FFT versions can continue to be initialized as
needed FFTs.</b> Other FFT versions can continue to be initialized as
explained below.
@par
For not MVE versions, pre-initialized data structures containing twiddle factors
For not MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example:
@ -781,7 +750,7 @@ extern void arm_bitreversal_32(
break;
}
@endcode
*/
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)

@ -33,65 +33,6 @@
#include "arm_vec_fft.h"
/**
@brief In-place 16 bit reversal for helium (non-unrolled variant).
@param[in,out] pSrc points to in-place buffer of 16-bit data
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
@par
Swaps complex (2 x 16-bit) element pairs listed in the bit-reversal table,
8 table entries (4 swap pairs) per iteration, with a predicated tail for
the remaining entries.
*/
static void arm_bitreversal_16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* one complex (2 x 16-bit) element = one 32-bit word */
uint32_t *src = (uint32_t *)pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
/* 8 table entries (4 swap pairs) consumed per iteration */
blkCnt = (bitRevLen / 2) / 4;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
/* widen 16-bit entries: bottom lanes -> bitRevOff1, top lanes -> bitRevOff2 */
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
/* NOTE(review): entries appear pre-scaled; >> 3 yields 32-bit word
   indexes for the shifted (x4) gathers — confirm against the table
   generator */
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
/* swap the two addressed words */
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
/* predicate off the lanes beyond the remaining table entries */
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
}
static void _arm_radix4_butterfly_q15_mve(
const arm_cfft_instance_q15 * S,
q15_t *pSrc,
@ -592,53 +533,53 @@ void arm_cfft_q15(
q15_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
}
}
#else

@ -34,37 +34,6 @@
#include "arm_vec_fft.h"
/**
@brief In-place 32 bit reversal for helium (non-unrolled variant).
@param[in,out] pSrc points to in-place buffer of 32-bit data
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
@par
Swaps complex (2 x 32-bit) element pairs listed in the bit-reversal table,
4 table entries (2 swap pairs) per iteration.
NOTE(review): there is no tail — entries beyond a multiple of 4 are
skipped; presumably bitRevLen is always a multiple of 4 here. Confirm.
*/
static void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* one complex (2 x 32-bit) element = one 64-bit word */
uint64_t *src = (uint64_t *) pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
/* 4 table entries (2 swap pairs) consumed per iteration */
blkCnt = (bitRevLen / 2) / 2;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* widen to 64-bit offsets: bottom lanes -> bitRevOff1, top -> bitRevOff2 */
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
/* swap the two addressed 64-bit elements */
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
}
static void _arm_radix4_butterfly_q31_mve(
const arm_cfft_instance_q31 * S,
@ -598,55 +567,55 @@ void arm_cfft_q31(
q31_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
{
uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
break;
}
} else {
switch (fftLen) {
case 16:
case 64:
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
break;
}
}
if (bitReverseFlag)
{
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
}
}
#else
#else
extern void arm_radix4_butterfly_q31(
q31_t * pSrc,

Loading…
Cancel
Save