From a509fa39d49417699769ddaf037a0a2e57a504b3 Mon Sep 17 00:00:00 2001 From: FabKlein Date: Tue, 30 Mar 2021 12:09:40 +0200 Subject: [PATCH] CMSIS-DSP: FFT bit reversal unrolling Issue index update earlier, adding more distance with subsequent gather loads Added Out-of-place variants --- PrivateInclude/arm_vec_fft.h | 267 +++++++++++++++++++++++ Source/TransformFunctions/arm_cfft_f16.c | 234 ++++++++------------ Source/TransformFunctions/arm_cfft_f32.c | 215 ++++++++---------- Source/TransformFunctions/arm_cfft_q15.c | 151 ++++--------- Source/TransformFunctions/arm_cfft_q31.c | 125 ++++------- 5 files changed, 539 insertions(+), 453 deletions(-) mode change 100644 => 100755 Source/TransformFunctions/arm_cfft_f32.c diff --git a/PrivateInclude/arm_vec_fft.h b/PrivateInclude/arm_vec_fft.h index b2f21049..a500eb30 100755 --- a/PrivateInclude/arm_vec_fft.h +++ b/PrivateInclude/arm_vec_fft.h @@ -47,6 +47,273 @@ extern "C" #define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B) +/** + @brief In-place 32 bit reversal function for helium (exchanges pairs of 64-bit elements of pSrc as directed by the table) + @param[in,out] pSrc points to in-place buffer of unknown 32-bit data type + @param[in] bitRevLen bit reversal table length (number of uint16_t entries in pBitRevTab) + @param[in] pBitRevTab points to bit reversal table (entries 2k and 2k+1 are used as the byte offsets of the two 64-bit elements to exchange) + @return none +*/ + +__STATIC_INLINE void arm_bitreversal_32_inpl_mve( + uint32_t *pSrc, + const uint16_t bitRevLen, + const uint16_t *pBitRevTab) + +{ + uint64_t *src = (uint64_t *) pSrc; /* data is moved as 64-bit elements, i.e. two 32-bit words at a time */ + int32_t blkCnt; /* loop counters */ + uint32x4_t bitRevTabOff; /* 4 table entries widened from 16 to 32 bits by vldrhq_u32 */ + uint32x4_t one = vdupq_n_u32(1); /* multiplier used only to widen offsets through vmullb/vmullt */ + uint64x2_t inLow, inHigh; + uint64x2_t bitRevOff1Low, bitRevOff0Low; + uint64x2_t bitRevOff1High, bitRevOff0High; + + /* load scheduling to increase gather load idx update / gather load distance: the first group of 4 table entries is fetched before the loop */ + bitRevTabOff = vldrhq_u32(pBitRevTab); + pBitRevTab += 4; + + bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one); /* even lanes (entries 0 and 2) as 64-bit offsets */ + bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one); /* odd lanes (entries 1 and 3) as 64-bit offsets */ + + + blkCnt = bitRevLen / 8; /* each iteration consumes 8 table entries: two groups of 4 */ + while (blkCnt > 0) { + bitRevTabOff = vldrhq_u32(pBitRevTab); + pBitRevTab += 4; + 
+ /* 64-bit index expansion: multiplying by 1 widens the even (b) and odd (t) 32-bit lanes to 64 bits */ + bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one); + bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one); + + inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low); + inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High); + + vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh); /* exchange: data read at the High offsets is written at the Low offsets */ + vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow); /* ... and data read at the Low offsets is written at the High offsets */ + + + /* unrolled: preload the group used by the next iteration or by the tail. NOTE(review): when bitRevLen is a multiple of 8 the last pass loads 4 entries beyond bitRevLen that are never used - confirm this over-read is acceptable for every table */ + bitRevTabOff = vldrhq_u32(pBitRevTab); + pBitRevTab += 4; + + bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one); + bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one); + + inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low); + inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High); + + vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh); + vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow); + + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + if (bitRevLen & 7) { /* table length is not a multiple of 8: one already-preloaded group of 4 entries is still pending */ + /* FFT size = 16 (NOTE(review): presumably the only table whose length is not a multiple of 8 - confirm; only a remainder of 4 is fully handled) */ + inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low); + inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High); + + vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh); + vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow); + } +} + + + +/** + @brief In-place 16 bit reversal function for helium (exchanges pairs of 32-bit elements of pSrc as directed by the table) + @param[in,out] pSrc points to in-place buffer of unknown 16-bit data type + @param[in] bitRevLen bit reversal table length (number of uint16_t entries in pBitRevTab) + @param[in] pBitRevTab points to bit reversal table (entries 2k and 2k+1, each shifted right by 3, are used as 32-bit word indexes of the two elements to exchange) + @return none +*/ + +__STATIC_INLINE void arm_bitreversal_16_inpl_mve( + uint16_t *pSrc, + const uint16_t bitRevLen, + const uint16_t *pBitRevTab) + +{ + uint32_t *src = (uint32_t *) pSrc; /* data is moved as 32-bit elements, i.e. two 16-bit values at a time */ + int32_t blkCnt; /* loop counters */ + uint32x4_t bitRevTabOff; /* NOTE(review): receives the uint16x8_t result of vldrhq_u16 - relies on implicit vector type conversion; confirm with the target toolchain */ + uint16x8_t one = vdupq_n_u16(1); /* multiplier used only to widen offsets through vmullb/vmullt */ + uint32x4_t bitRevOff1Low, bitRevOff0Low; + uint32x4_t bitRevOff1High, bitRevOff0High; + uint32x4_t inLow, inHigh; + + /* load scheduling to increase gather load idx update / gather load distance: the first group of 8 table entries is fetched before the loop */ + bitRevTabOff = vldrhq_u16(pBitRevTab); + pBitRevTab += 8; + + bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one); /* even lanes (entries 0,2,4,6) widened to 32 bits */ + 
+ bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one); /* odd lanes (entries 1,3,5,7) widened to 32 bits */ + bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3); /* per-halfword shift; the widened values fit in the low halfword, so this equals a 32-bit shift by 3 */ + bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3); + + blkCnt = (bitRevLen / 16); /* each iteration consumes 16 table entries: two groups of 8 */ + while (blkCnt > 0U) { /* NOTE(review): blkCnt is int32_t but is compared against an unsigned literal */ + bitRevTabOff = vldrhq_u16(pBitRevTab); + pBitRevTab += 8; + + bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one); + bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one); + bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3); + bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3); + + inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low); /* shifted-offset form: the offsets are 32-bit word indexes */ + inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High); + + vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh); /* exchange: data read at the High indexes is written at the Low indexes */ + vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow); /* ... and data read at the Low indexes is written at the High indexes */ + + /* loop unrolling: preload the group used by the next iteration or by the tail. NOTE(review): when bitRevLen is a multiple of 16 the last pass loads 8 entries beyond bitRevLen that are never used - confirm this over-read is acceptable for every table */ + bitRevTabOff = vldrhq_u16(pBitRevTab); + pBitRevTab += 8; + + bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one); + bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one); + bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3); + bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3); + + inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low); + inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High); + + vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh); + vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow); + + blkCnt--; + } + + /* tail handling: only remainders of 8 and 12 entries are processed. NOTE(review): any other non-zero remainder is silently ignored - presumably no table has such a length; confirm */ + blkCnt = bitRevLen & 0xf; + if (blkCnt == 8) { /* exactly the preloaded group of 8 entries is pending */ + inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low); + inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High); + + vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh); + vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow); + } else if (blkCnt == 12) { /* the preloaded group of 8 plus 4 further entries are pending */ + /* FFT 16 special case */ + mve_pred16_t p = vctp16q(4); /* predicate enabling the first 4 halfword lanes only */ + + bitRevTabOff = vldrhq_z_u16(pBitRevTab, p); /* zeroing predicated load of the last 4 entries */ + + inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low); + inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High); + + vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh); + 
+ vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow); + + bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one); /* index expansion of the last 4 entries, done after the preloaded group has been exchanged */ + bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one); + bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3); + bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3); + + inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p); /* same predicate: covers the first two 32-bit lanes */ + inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p); + + vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p); + vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p); + } +} + +/** + @brief Out-of-place 32 bit reversal function for helium (gathers 64-bit elements of pSrc at bit-reversed offsets and stores them contiguously to pDst) + @param[out] pDst points to destination buffer of unknown 32-bit data type + @param[in] pSrc points to input buffer of unknown 32-bit data type + @param[in] fftLen FFT length (the loop runs fftLen / 4 times and writes four 64-bit elements per pass) + @return none +*/ +__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen) +{ + uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1; + uint32_t bitRevPos, blkCnt; + uint32_t *pDst32 = (uint32_t *) pDst; + + /* fwd indexes */ + idxOffs0 = vdupq_n_u32(0); + idxOffs1 = vdupq_n_u32(0); + idxOffs0[0] = 0; idxOffs0[2] = 4; /* only 32-bit lanes 0 and 2 are set: the bit-reversed vector is later cast to two 64-bit offsets */ + idxOffs1[0] = 8; idxOffs1[2] = 12; + + bitRevPos = (31 - __CLZ(fftLen)) + 5; /* bit count handed to vbrsrq: floor(log2(fftLen)) + 5 */ + blkCnt = fftLen >> 2; + + /* issued earlier to increase gather load idx update / gather load distance */ + /* bit-reverse fwd indexes */ + bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos); + bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos); + while (blkCnt > 0U) { + uint64x2_t vecIn; + + vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0); + idxOffs0 = idxOffs0 + 16; /* forward indexes advance by 16 per pass */ + vst1q(pDst32, (uint32x4_t) vecIn); + pDst32 += 4; + bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos); /* offsets for the next pass, computed ahead of their gather */ + + vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1); + idxOffs1 = idxOffs1 + 16; + vst1q(pDst32, (uint32x4_t) vecIn); + pDst32 += 4; + bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos); + + blkCnt--; + } +} + + +/** + @brief Out-of-place 16 bit reversal function for helium (gathers 32-bit elements of pSrc at bit-reversed offsets and stores them contiguously to pDst) + 
@param[out] pDst points to destination buffer of unknown 16-bit data type + @param[in] pSrc points to input buffer of unknown 16-bit data type + @param[in] fftLen FFT length (the loop runs fftLen / 8 times and writes eight 32-bit elements per pass) + @return none +*/ + +__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen) +{ + uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1; + uint32_t bitRevPos, blkCnt; + uint16_t *pDst16 = (uint16_t *) pDst; + uint32_t incrIdx = 0; /* running start value, written back by each vidupq_wb_u32 call below */ + + /* fwd indexes */ + idxOffs0 = vidupq_wb_u32(&incrIdx, 4); // {0, 4, 8, 12} + idxOffs1 = vidupq_wb_u32(&incrIdx, 4); // {16, 20, 24, 28} + + bitRevPos = (31 - __CLZ(fftLen)) + 4; + blkCnt = fftLen >> 3; + + /* issued earlier to increase gather load idx update / gather load distance */ + /* bit-reverse fwd indexes */ + bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos); + bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos); + while (blkCnt > 0U) { + uint32x4_t vecIn; + + vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0); + idxOffs0 = idxOffs0 + 32; + vst1q(pDst16, (uint16x8_t) vecIn); + pDst16 += 8; + bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos); + + vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1); + idxOffs1 = idxOffs1 + 32; + vst1q(pDst16, (uint16x8_t) vecIn); + pDst16 += 8; + bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos); + + blkCnt--; + } +} + + #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ diff --git a/Source/TransformFunctions/arm_cfft_f16.c b/Source/TransformFunctions/arm_cfft_f16.c index cc3fefc2..367e9d49 100755 --- a/Source/TransformFunctions/arm_cfft_f16.c +++ b/Source/TransformFunctions/arm_cfft_f16.c @@ -40,111 +40,51 @@ static float16_t arm_inverse_fft_length_f16(uint16_t fftLen) { float16_t retValue=1.0; - - switch (fftLen) - { - - case 4096U: - retValue = (float16_t)0.000244140625f; - break; - - case 2048U: - retValue = (float16_t)0.00048828125f; - break; - - case 1024U: - retValue = (float16_t)0.0009765625f; - break; - - case 512U: - retValue = (float16_t)0.001953125f; - 
break; - - case 256U: - retValue = (float16_t)0.00390625f; - break; - - case 128U: - retValue = (float16_t)0.0078125f; - break; - - case 64U: - retValue = (float16_t)0.015625f; - break; - - case 32U: - retValue = (float16_t)0.03125f; - break; - - case 16U: - retValue = (float16_t)0.0625f; - break; - - - default: - break; - } - return(retValue); -} + switch (fftLen) + { -static void arm_bitreversal_f16_inpl_mve( - uint16_t *pSrc, - const uint16_t bitRevLen, - const uint16_t *pBitRevTab) + case 4096U: + retValue = (float16_t)0.000244140625f; + break; -{ - uint32_t *src = (uint32_t *)pSrc; - uint32_t blkCnt; /* loop counters */ - uint32x4_t bitRevTabOff; - uint16x8_t one = vdupq_n_u16(1); + case 2048U: + retValue = (float16_t)0.00048828125f; + break; - blkCnt = (bitRevLen / 2) / 4; - while (blkCnt > 0U) { - bitRevTabOff = vldrhq_u16(pBitRevTab); - pBitRevTab += 8; + case 1024U: + retValue = (float16_t)0.0009765625f; + break; - uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); - uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); + case 512U: + retValue = (float16_t)0.001953125f; + break; - bitRevOff1 = bitRevOff1 >> 3; - bitRevOff2 = bitRevOff2 >> 3; + case 256U: + retValue = (float16_t)0.00390625f; + break; - uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1); - uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2); + case 128U: + retValue = (float16_t)0.0078125f; + break; - vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2); - vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1); + case 64U: + retValue = (float16_t)0.015625f; + break; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } + case 32U: + retValue = (float16_t)0.03125f; + break; + case 16U: + retValue = (float16_t)0.0625f; + break; - /* - * tail - * (will be merged thru tail predication) - */ - blkCnt = bitRevLen & 7; - if (blkCnt > 0U) { - mve_pred16_t p0 = vctp16q(blkCnt); - bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0); - - uint32x4_t 
bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); - uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); - - bitRevOff1 = bitRevOff1 >> 3; - bitRevOff2 = bitRevOff2 >> 3; - - uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0); - uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0); - - vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0); - vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0); - } + default: + break; + } + return(retValue); } @@ -590,53 +530,53 @@ void arm_cfft_f16( float16_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) -{ - uint32_t fftLen = S->fftLen; - - if (ifftFlag == 1U) { - - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen)); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen); - break; - } - } else { - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen); - break; - } - } - - - if (bitReverseFlag) - { - - arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); - - } +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen)); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen); + break; + } + } + + 
+ if (bitReverseFlag) + { + + arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); + + } } #else @@ -666,7 +606,7 @@ extern void arm_radix4_butterfly_f16( /** @defgroup ComplexFFT Complex FFT Functions - + @par The Fast Fourier Transform (FFT) is an efficient algorithm for computing the Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster @@ -684,7 +624,7 @@ extern void arm_radix4_butterfly_f16(
{real[0], imag[0], real[1], imag[1], ...} 
The FFT result will be contained in the same array and the frequency domain values will have the same interleaving. - + @par Floating-point The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 stages are performed along with a single radix-2 or radix-4 stage, as needed. @@ -696,12 +636,12 @@ extern void arm_radix4_butterfly_f16( inverse transform includes a scale of 1/fftLen as part of the calculation and this matches the textbook definition of the inverse FFT. @par - For the MVE version, the new arm_cfft_init_f32 initialization function is + For the MVE version, the new arm_cfft_init_f32 initialization function is mandatory. Compilation flags are available to include only the required tables for the - needed FFTs. Other FFT versions can continue to be initialized as + needed FFTs. Other FFT versions can continue to be initialized as explained below. @par - For not MVE versions, pre-initialized data structures containing twiddle factors + For not MVE versions, pre-initialized data structures containing twiddle factors and bit reversal tables are provided and defined in arm_const_structs.h. Include this header in your function and then pass one of the constant structures as an argument to arm_cfft_f32. 
For example: @@ -816,7 +756,7 @@ extern void arm_radix4_butterfly_f16( break; } @endcode - + */ @@ -875,7 +815,7 @@ void arm_cfft_f16( case 2048: arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle); break; - + } if ( bitReverseFlag ) diff --git a/Source/TransformFunctions/arm_cfft_f32.c b/Source/TransformFunctions/arm_cfft_f32.c old mode 100644 new mode 100755 index f47ba426..8948aa9e --- a/Source/TransformFunctions/arm_cfft_f32.c +++ b/Source/TransformFunctions/arm_cfft_f32.c @@ -39,87 +39,56 @@ static float32_t arm_inverse_fft_length_f32(uint16_t fftLen) { float32_t retValue=1.0; - - switch (fftLen) - { - - case 4096U: - retValue = 0.000244140625; - break; - - case 2048U: - retValue = 0.00048828125; - break; - - case 1024U: - retValue = 0.0009765625f; - break; - - case 512U: - retValue = 0.001953125; - break; - - case 256U: - retValue = 0.00390625f; - break; - - case 128U: - retValue = 0.0078125; - break; - - case 64U: - retValue = 0.015625f; - break; - - case 32U: - retValue = 0.03125; - break; - - case 16U: - retValue = 0.0625f; - break; - - - default: - break; - } - return(retValue); -} + switch (fftLen) + { -static void arm_bitreversal_f32_inpl_mve( - uint32_t *pSrc, - const uint16_t bitRevLen, - const uint16_t *pBitRevTab) + case 4096U: + retValue = 0.000244140625; + break; -{ - uint64_t *src = (uint64_t *) pSrc; - uint32_t blkCnt; /* loop counters */ - uint32x4_t bitRevTabOff; - uint32x4_t one = vdupq_n_u32(1); + case 2048U: + retValue = 0.00048828125; + break; + + case 1024U: + retValue = 0.0009765625f; + break; + + case 512U: + retValue = 0.001953125; + break; - blkCnt = (bitRevLen / 2) / 2; - while (blkCnt > 0U) { - bitRevTabOff = vldrhq_u32(pBitRevTab); - pBitRevTab += 4; + case 256U: + retValue = 0.00390625f; + break; - uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one); - uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one); + case 128U: + retValue = 0.0078125; + break; - uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1); - 
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2); + case 64U: + retValue = 0.015625f; + break; - vstrdq_scatter_offset_u64(src, bitRevOff1, in2); - vstrdq_scatter_offset_u64(src, bitRevOff2, in1); + case 32U: + retValue = 0.03125; + break; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } + case 16U: + retValue = 0.0625f; + break; + + + default: + break; + } + return(retValue); } + + static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen) { f32x4_t vecTmp0, vecTmp1; @@ -563,53 +532,53 @@ void arm_cfft_f32( float32_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) -{ - uint32_t fftLen = S->fftLen; - - if (ifftFlag == 1U) { - - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen); - break; - } - } else { - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen); - break; - } - } - - - if (bitReverseFlag) - { - - arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); - - } +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + 
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen); + break; + } + } + + + if (bitReverseFlag) + { + + arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); + + } } @@ -631,7 +600,7 @@ extern void arm_bitreversal_32( /** @defgroup ComplexFFT Complex FFT Functions - + @par The Fast Fourier Transform (FFT) is an efficient algorithm for computing the Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster @@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
{real[0], imag[0], real[1], imag[1], ...} 
The FFT result will be contained in the same array and the frequency domain values will have the same interleaving. - + @par Floating-point The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 stages are performed along with a single radix-2 or radix-4 stage, as needed. @@ -661,12 +630,12 @@ extern void arm_bitreversal_32( inverse transform includes a scale of 1/fftLen as part of the calculation and this matches the textbook definition of the inverse FFT. @par - For the MVE version, the new arm_cfft_init_f32 initialization function is + For the MVE version, the new arm_cfft_init_f32 initialization function is mandatory. Compilation flags are available to include only the required tables for the - needed FFTs. Other FFT versions can continue to be initialized as + needed FFTs. Other FFT versions can continue to be initialized as explained below. @par - For not MVE versions, pre-initialized data structures containing twiddle factors + For not MVE versions, pre-initialized data structures containing twiddle factors and bit reversal tables are provided and defined in arm_const_structs.h. Include this header in your function and then pass one of the constant structures as an argument to arm_cfft_f32. 
For example: @@ -781,7 +750,7 @@ extern void arm_bitreversal_32( break; } @endcode - + */ void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1) diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c index 00503a6e..1cfc20ee 100644 --- a/Source/TransformFunctions/arm_cfft_q15.c +++ b/Source/TransformFunctions/arm_cfft_q15.c @@ -33,65 +33,6 @@ #include "arm_vec_fft.h" -static void arm_bitreversal_16_inpl_mve( - uint16_t *pSrc, - const uint16_t bitRevLen, - const uint16_t *pBitRevTab) - -{ - uint32_t *src = (uint32_t *)pSrc; - uint32_t blkCnt; /* loop counters */ - uint32x4_t bitRevTabOff; - uint16x8_t one = vdupq_n_u16(1); - - blkCnt = (bitRevLen / 2) / 4; - while (blkCnt > 0U) { - bitRevTabOff = vldrhq_u16(pBitRevTab); - pBitRevTab += 8; - - uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); - uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); - - bitRevOff1 = bitRevOff1 >> 3; - bitRevOff2 = bitRevOff2 >> 3; - - uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1); - uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2); - - vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2); - vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1); - - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - - /* - * tail - * (will be merged thru tail predication) - */ - blkCnt = bitRevLen & 7; - if (blkCnt > 0U) { - mve_pred16_t p0 = vctp16q(blkCnt); - - bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0); - - uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); - uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); - - bitRevOff1 = bitRevOff1 >> 3; - bitRevOff2 = bitRevOff2 >> 3; - - uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0); - uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0); - - vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0); - vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, 
in1, p0); - } -} - static void _arm_radix4_butterfly_q15_mve( const arm_cfft_instance_q15 * S, q15_t *pSrc, @@ -592,53 +533,53 @@ void arm_cfft_q15( q15_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) -{ - uint32_t fftLen = S->fftLen; - - if (ifftFlag == 1U) { - - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen); - break; - } - } else { - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen); - break; - } - } - - - if (bitReverseFlag) - { - +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen); + break; + } + } + + + if (bitReverseFlag) + { + arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); - - } + + } } #else diff --git a/Source/TransformFunctions/arm_cfft_q31.c b/Source/TransformFunctions/arm_cfft_q31.c index 13c5d840..75e4e838 100644 --- a/Source/TransformFunctions/arm_cfft_q31.c +++ b/Source/TransformFunctions/arm_cfft_q31.c @@ -34,37 +34,6 @@ #include "arm_vec_fft.h" -static void arm_bitreversal_32_inpl_mve( - uint32_t *pSrc, - const uint16_t bitRevLen, - const uint16_t *pBitRevTab) - -{ - uint64_t *src = (uint64_t *) pSrc; - uint32_t blkCnt; /* 
loop counters */ - uint32x4_t bitRevTabOff; - uint32x4_t one = vdupq_n_u32(1); - - blkCnt = (bitRevLen / 2) / 2; - while (blkCnt > 0U) { - bitRevTabOff = vldrhq_u32(pBitRevTab); - pBitRevTab += 4; - - uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one); - uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one); - - uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1); - uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2); - - vstrdq_scatter_offset_u64(src, bitRevOff1, in2); - vstrdq_scatter_offset_u64(src, bitRevOff2, in1); - - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } -} static void _arm_radix4_butterfly_q31_mve( const arm_cfft_instance_q31 * S, @@ -598,55 +567,55 @@ void arm_cfft_q31( q31_t * pSrc, uint8_t ifftFlag, uint8_t bitReverseFlag) -{ - uint32_t fftLen = S->fftLen; - - if (ifftFlag == 1U) { - - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen); - break; - } - } else { - switch (fftLen) { - case 16: - case 64: - case 256: - case 1024: - case 4096: - _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen); - break; - - case 32: - case 128: - case 512: - case 2048: - arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen); - break; - } - } - - - if (bitReverseFlag) - { - +{ + uint32_t fftLen = S->fftLen; + + if (ifftFlag == 1U) { + + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen); + break; + } + } else { + switch (fftLen) { + case 16: + case 64: + case 256: + case 1024: + case 4096: + _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen); + break; + + case 32: + case 128: + case 512: + case 2048: + arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen); + 
break; + } + } + + + if (bitReverseFlag) + { + arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); - - } + + } } -#else +#else extern void arm_radix4_butterfly_q31( q31_t * pSrc,