diff --git a/PrivateInclude/arm_vec_fft.h b/PrivateInclude/arm_vec_fft.h
index b2f21049..a500eb30 100755
--- a/PrivateInclude/arm_vec_fft.h
+++ b/PrivateInclude/arm_vec_fft.h
@@ -47,6 +47,273 @@ extern "C"
 #define MVE_CMPLX_SUB_FX_A_ixB(A,B)     vhcaddq_rot270(A,B)
 
 
+/**
+  @brief         In-place 32 bit reversal function for helium
+  @param[in,out] pSrc        points to in-place buffer of unknown 32-bit data type
+  @param[in]     bitRevLen   bit reversal table length
+  @param[in]     pBitRevTab  points to bit reversal table
+  @return        none
+*/
+
+__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
+        uint32_t *pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t *pBitRevTab)
+
+{
+    uint64_t   *src = (uint64_t *) pSrc;
+    int32_t     blkCnt;     /* loop counters */
+    uint32x4_t  bitRevTabOff;
+    uint32x4_t  one = vdupq_n_u32(1);
+    uint64x2_t  inLow, inHigh;
+    uint64x2_t  bitRevOff1Low, bitRevOff0Low;
+    uint64x2_t  bitRevOff1High, bitRevOff0High;
+
+    /* load scheduling to increase gather load idx update / gather load distance */
+    bitRevTabOff = vldrhq_u32(pBitRevTab);
+    pBitRevTab += 4;
+
+    bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
+
+
+    blkCnt = bitRevLen / 8;
+    while (blkCnt > 0) {
+        bitRevTabOff = vldrhq_u32(pBitRevTab);
+        pBitRevTab += 4;
+
+        /* 64-bit index expansion */
+        bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
+
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
+
+
+        /* unrolled */
+        bitRevTabOff = vldrhq_u32(pBitRevTab);
+        pBitRevTab += 4;
+
+        bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
+
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
+
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+    if (bitRevLen & 7) {
+        /* FFT size = 16 */
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
+    }
+}
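+
+/*
+   Reference scalar form of the swap performed above (a minimal sketch, assuming the
+   same pairwise table layout as the existing arm_bitreversal_32 helper: each pair of
+   entries holds the byte offsets, expressed for 64-bit wide complex elements, of the
+   two elements to exchange):
+
+       uint64_t *buf = (uint64_t *)pSrc;
+       for (uint32_t i = 0; i < bitRevLen; i += 2)
+       {
+           uint64_t tmp                = buf[pBitRevTab[i]     >> 3];
+           buf[pBitRevTab[i]     >> 3] = buf[pBitRevTab[i + 1] >> 3];
+           buf[pBitRevTab[i + 1] >> 3] = tmp;
+       }
+*/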
+
+
+/**
+  @brief         In-place 16 bit reversal function for helium
+  @param[in,out] pSrc        points to in-place buffer of unknown 16-bit data type
+  @param[in]     bitRevLen   bit reversal table length
+  @param[in]     pBitRevTab  points to bit reversal table
+  @return        none
+*/
+
+__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
+        uint16_t *pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t *pBitRevTab)
+
+{
+    uint32_t   *src = (uint32_t *) pSrc;
+    int32_t     blkCnt;     /* loop counters */
+    uint32x4_t  bitRevTabOff;
+    uint16x8_t  one = vdupq_n_u16(1);
+    uint32x4_t  bitRevOff1Low, bitRevOff0Low;
+    uint32x4_t  bitRevOff1High, bitRevOff0High;
+    uint32x4_t  inLow, inHigh;
+
+    /* load scheduling to increase gather load idx update / gather load distance */
+    bitRevTabOff = vldrhq_u16(pBitRevTab);
+    pBitRevTab += 8;
+
+    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+    blkCnt = (bitRevLen / 16);
+    while (blkCnt > 0U) {
+        bitRevTabOff = vldrhq_u16(pBitRevTab);
+        pBitRevTab += 8;
+
+        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
+        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+
+        /* loop unrolling */
+        bitRevTabOff = vldrhq_u16(pBitRevTab);
+        pBitRevTab += 8;
+
+        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);
+
+        blkCnt--;
+    }
+
+    /* tail handling */
+    blkCnt = bitRevLen & 0xf;
+    if (blkCnt == 8) {
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+    } else if (blkCnt == 12) {
+        /* FFT 16 special case */
+        mve_pred16_t p = vctp16q(4);
+
+        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+
+        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
+        inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
+
+        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
+        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
+    }
+}
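+
+/*
+   Offset arithmetic above, spelled out (sketch, assuming the table keeps the byte-offset
+   format of the 32-bit variant, i.e. entries are multiples of 8): entry >> 3 recovers the
+   complex element index, and the shifted (word) gather/scatter rescales it by 4 bytes, so
+   the effective byte offset is
+
+       (entry >> 3) << 2   ==   entry / 2
+
+   which is what a 32-bit wide complex element (16-bit real + 16-bit imaginary) requires.
+*/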
+
+/**
+  @brief      Out-of-place 32 bit reversal function for helium
+  @param[out] pDst     points to destination buffer of unknown 32-bit data type
+  @param[in]  pSrc     points to input buffer of unknown 32-bit data type
+  @param[in]  fftLen   FFT length
+  @return     none
+*/
+__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
+{
+    uint32x4_t  idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
+    uint32_t    bitRevPos, blkCnt;
+    uint32_t   *pDst32 = (uint32_t *) pDst;
+
+    /* fwd indexes */
+    idxOffs0 = vdupq_n_u32(0);
+    idxOffs1 = vdupq_n_u32(0);
+    idxOffs0[0] = 0; idxOffs0[2] = 4;
+    idxOffs1[0] = 8; idxOffs1[2] = 12;
+
+    bitRevPos = (31 - __CLZ(fftLen)) + 5;
+    blkCnt = fftLen >> 2;
+
+    /* issued earlier to increase gather load idx update / gather load distance */
+    /* bit-reverse fwd indexes */
+    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+    while (blkCnt > 0U) {
+        uint64x2_t  vecIn;
+
+        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        idxOffs0 = idxOffs0 + 16;
+        vst1q(pDst32, (uint32x4_t) vecIn);
+        pDst32 += 4;
+        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+
+        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        idxOffs1 = idxOffs1 + 16;
+        vst1q(pDst32, (uint32x4_t) vecIn);
+        pDst32 += 4;
+        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+
+        blkCnt--;
+    }
+}
+
+
+/**
+  @brief      Out-of-place 16 bit reversal function for helium
+  @param[out] pDst     points to destination buffer of unknown 16-bit data type
+  @param[in]  pSrc     points to input buffer of unknown 16-bit data type
+  @param[in]  fftLen   FFT length
+  @return     none
+*/
+
+__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
+{
+    uint32x4_t  idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
+    uint32_t    bitRevPos, blkCnt;
+    uint16_t   *pDst16 = (uint16_t *) pDst;
+    uint32_t    incrIdx = 0;
+
+    /* fwd indexes */
+    idxOffs0 = vidupq_wb_u32(&incrIdx, 4);    // {0, 4, 8, 12}
+    idxOffs1 = vidupq_wb_u32(&incrIdx, 4);    // {16, 20, 24, 28}
+
+    bitRevPos = (31 - __CLZ(fftLen)) + 4;
+    blkCnt = fftLen >> 3;
+
+    /* issued earlier to increase gather load idx update / gather load distance */
+    /* bit-reverse fwd indexes */
+    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+    while (blkCnt > 0U) {
+        uint32x4_t  vecIn;
+
+        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
+        idxOffs0 = idxOffs0 + 32;
+        vst1q(pDst16, (uint16x8_t) vecIn);
+        pDst16 += 8;
+        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+
+        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
+        idxOffs1 = idxOffs1 + 32;
+        vst1q(pDst16, (uint16x8_t) vecIn);
+        pDst16 += 8;
+        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+
+        blkCnt--;
+    }
+}
+
+
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
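Note on the two out-of-place helpers above: vbrsrq(x, n) bit-reverses the low n bits of
each vector lane, so the gather offsets are the bit-reversed counterparts of the linear
byte offsets 4*k produced by the forward index registers. With bitRevPos = log2(fftLen) + 5
(32-bit data, 8-byte complex elements) the reversal of 4*k equals 8 * bitrev(k), and with
bitRevPos = log2(fftLen) + 4 (16-bit data, 4-byte complex elements) it equals 4 * bitrev(k).
A worked example, assuming fftLen = 16 and 32-bit data:

    k = 1  ->  4*k = 0b000000100 (9 bits)  ->  bit-reversed = 0b001000000 = 64 bytes
           ->  64 / 8 = element 8 = bitrev(1) over log2(16) = 4 bits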
diff --git a/Source/TransformFunctions/arm_cfft_f16.c b/Source/TransformFunctions/arm_cfft_f16.c
index cc3fefc2..367e9d49 100755
--- a/Source/TransformFunctions/arm_cfft_f16.c
+++ b/Source/TransformFunctions/arm_cfft_f16.c
@@ -40,111 +40,51 @@
 static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
 {
   float16_t retValue=1.0;
-
-  switch (fftLen)
-  {
-
-  case 4096U:
-    retValue = (float16_t)0.000244140625f;
-    break;
-
-  case 2048U:
-    retValue = (float16_t)0.00048828125f;
-    break;
-
-  case 1024U:
-    retValue = (float16_t)0.0009765625f;
-    break;
-
-  case 512U:
-    retValue = (float16_t)0.001953125f;
-    break;
-
-  case 256U:
-    retValue = (float16_t)0.00390625f;
-    break;
-
-  case 128U:
-    retValue = (float16_t)0.0078125f;
-    break;
-
-  case 64U:
-    retValue = (float16_t)0.015625f;
-    break;
-
-  case 32U:
-    retValue = (float16_t)0.03125f;
-    break;
-
-  case 16U:
-    retValue = (float16_t)0.0625f;
-    break;
-
-
-  default:
-    break;
-  }
-  return(retValue);
-}
+  switch (fftLen)
+  {
 
-static void arm_bitreversal_f16_inpl_mve(
-        uint16_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
+  case 4096U:
+    retValue = (float16_t)0.000244140625f;
+    break;
 
-{
-    uint32_t *src = (uint32_t *)pSrc;
-    uint32_t blkCnt;     /* loop counters */
-    uint32x4_t bitRevTabOff;
-    uint16x8_t one = vdupq_n_u16(1);
+  case 2048U:
+    retValue = (float16_t)0.00048828125f;
+    break;
 
-    blkCnt = (bitRevLen / 2) / 4;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u16(pBitRevTab);
-        pBitRevTab += 8;
+  case 1024U:
+    retValue = (float16_t)0.0009765625f;
+    break;
 
-        uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
+  case 512U:
+    retValue = (float16_t)0.001953125f;
+    break;
 
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
+  case 256U:
+    retValue = (float16_t)0.00390625f;
+    break;
 
-        uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
-        uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
+  case 128U:
+    retValue = (float16_t)0.0078125f;
+    break;
 
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
+  case 64U:
+    retValue = (float16_t)0.015625f;
+    break;
 
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
+  case 32U:
+    retValue = (float16_t)0.03125f;
+    break;
+  case 16U:
+    retValue = (float16_t)0.0625f;
+    break;
 
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = bitRevLen & 7;
-    if (blkCnt > 0U) {
-        mve_pred16_t p0 = vctp16q(blkCnt);
-        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
-
-        uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
-
-        uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
-        uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
-
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
-    }
+  default:
+    break;
+  }
+  return(retValue);
 }
@@ -590,53 +530,53 @@ void arm_cfft_f16(
   float16_t * pSrc,
   uint8_t ifftFlag,
   uint8_t bitReverseFlag)
-{
-    uint32_t fftLen = S->fftLen;
-
-    if (ifftFlag == 1U) {
-
-        switch (fftLen) {
-        case 16:
-        case 64:
-        case 256:
-        case 1024:
-        case 4096:
-            _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
-            break;
-
-        case 32:
-        case 128:
-        case 512:
-        case 2048:
-            arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
-            break;
-        }
-    } else {
-        switch (fftLen) {
-        case 16:
-        case 64:
-        case 256:
-        case 1024:
-        case 4096:
-            _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
-            break;
-
-        case 32:
-        case 128:
-        case 512:
-        case 2048:
-            arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
-            break;
-        }
-    }
-
-
-    if (bitReverseFlag)
-    {
-
-        arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
-    }
+{
+    uint32_t fftLen = S->fftLen;
+
+    if (ifftFlag == 1U) {
+
+        switch (fftLen) {
+        case 16:
+        case 64:
+        case 256:
+        case 1024:
+        case 4096:
+            _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
+            break;
+
+        case 32:
+        case 128:
+        case 512:
+        case 2048:
+            arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
+            break;
+        }
+    } else {
+        switch (fftLen) {
+        case 16:
+        case 64:
+        case 256:
+        case 1024:
+        case 4096:
+            _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
+            break;
+
+        case 32:
+        case 128:
+        case 512:
+        case 2048:
+            arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
+            break;
+        }
+    }
+
+
+    if (bitReverseFlag)
+    {
+
+        arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+    }
 }
@@ -666,7 +606,7 @@ extern void arm_radix4_butterfly_f16(
 
 /**
   @defgroup ComplexFFT Complex FFT Functions
-  
+
   @par
   The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
   Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@@ -684,7 +624,7 @@ extern void arm_radix4_butterfly_f16(
{real[0], imag[0], real[1], imag[1], ...}
The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving.
-
+
@par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed.
@@ -696,12 +636,12 @@ extern void arm_radix4_butterfly_f16(
inverse transform includes a scale of 1/fftLen as part of the
calculation and this matches the textbook definition of the inverse FFT.
@par
- For the MVE version, the new arm_cfft_init_f32 initialization function is
+ For the MVE version, the new arm_cfft_init_f32 initialization function is
mandatory. Compilation flags are available to include only the required tables for the
- needed FFTs. Other FFT versions can continue to be initialized as
+ needed FFTs. Other FFT versions can continue to be initialized as
explained below.
@par
- For not MVE versions, pre-initialized data structures containing twiddle factors
+                   For non-MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in arm_const_structs.h. Include
this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example:
@@ -816,7 +756,7 @@ extern void arm_radix4_butterfly_f16(
break;
}
@endcode
-
+
*/
@@ -875,7 +815,7 @@ void arm_cfft_f16(
case 2048:
arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
break;
-
+
}
if ( bitReverseFlag )
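In the MVE wrapper earlier in this file, the switch on fftLen routes the power-of-4 lengths
(16, 64, 256, 1024, 4096) to the pure radix-4 kernel and the remaining power-of-2 lengths
(32, 128, 512, 2048) to the radix-4-by-2 kernel. An equivalent predicate, shown only as a
sketch (the wrapper itself keeps the explicit switch), is the parity of log2(fftLen):

    if (((31 - __CLZ(fftLen)) & 1U) == 0U)
        ;   /* even log2: 16, 64, ..., 4096  -> radix-4 stages only           */
    else
        ;   /* odd  log2: 32, 128, ..., 2048 -> one radix-2 stage + radix-4   */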
diff --git a/Source/TransformFunctions/arm_cfft_f32.c b/Source/TransformFunctions/arm_cfft_f32.c
old mode 100644
new mode 100755
index f47ba426..8948aa9e
--- a/Source/TransformFunctions/arm_cfft_f32.c
+++ b/Source/TransformFunctions/arm_cfft_f32.c
@@ -39,87 +39,56 @@
static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
{
float32_t retValue=1.0;
-
- switch (fftLen)
- {
-
- case 4096U:
- retValue = 0.000244140625;
- break;
-
- case 2048U:
- retValue = 0.00048828125;
- break;
-
- case 1024U:
- retValue = 0.0009765625f;
- break;
-
- case 512U:
- retValue = 0.001953125;
- break;
-
- case 256U:
- retValue = 0.00390625f;
- break;
-
- case 128U:
- retValue = 0.0078125;
- break;
-
- case 64U:
- retValue = 0.015625f;
- break;
-
- case 32U:
- retValue = 0.03125;
- break;
-
- case 16U:
- retValue = 0.0625f;
- break;
-
-
- default:
- break;
- }
- return(retValue);
-}
+ switch (fftLen)
+ {
-static void arm_bitreversal_f32_inpl_mve(
- uint32_t *pSrc,
- const uint16_t bitRevLen,
- const uint16_t *pBitRevTab)
+ case 4096U:
+ retValue = 0.000244140625;
+ break;
-{
- uint64_t *src = (uint64_t *) pSrc;
- uint32_t blkCnt; /* loop counters */
- uint32x4_t bitRevTabOff;
- uint32x4_t one = vdupq_n_u32(1);
+ case 2048U:
+ retValue = 0.00048828125;
+ break;
+
+ case 1024U:
+ retValue = 0.0009765625f;
+ break;
+
+ case 512U:
+ retValue = 0.001953125;
+ break;
- blkCnt = (bitRevLen / 2) / 2;
- while (blkCnt > 0U) {
- bitRevTabOff = vldrhq_u32(pBitRevTab);
- pBitRevTab += 4;
+ case 256U:
+ retValue = 0.00390625f;
+ break;
- uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
- uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
+ case 128U:
+ retValue = 0.0078125;
+ break;
- uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
- uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
+ case 64U:
+ retValue = 0.015625f;
+ break;
- vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
- vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
+ case 32U:
+ retValue = 0.03125;
+ break;
- /*
- * Decrement the blockSize loop counter
- */
- blkCnt--;
- }
+ case 16U:
+ retValue = 0.0625f;
+ break;
+
+
+ default:
+ break;
+ }
+ return(retValue);
}
+
+
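+/*
+   Each entry above is exactly 1/fftLen for the supported power-of-two lengths
+   (e.g. 1/4096 = 0.000244140625, 1/16 = 0.0625). A closed-form alternative would be
+   the one-liner below (sketch only; the switch keeps the scale as a literal constant):
+
+       retValue = 1.0f / (float32_t)fftLen;
+*/
+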
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
f32x4_t vecTmp0, vecTmp1;
@@ -563,53 +532,53 @@ void arm_cfft_f32(
float32_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
-{
- uint32_t fftLen = S->fftLen;
-
- if (ifftFlag == 1U) {
-
- switch (fftLen) {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
- break;
-
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
- break;
- }
- } else {
- switch (fftLen) {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
- break;
-
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
- break;
- }
- }
-
-
- if (bitReverseFlag)
- {
-
- arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
- }
+{
+ uint32_t fftLen = S->fftLen;
+
+ if (ifftFlag == 1U) {
+
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
+ break;
+ }
+ } else {
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
+ break;
+ }
+ }
+
+
+ if (bitReverseFlag)
+ {
+
+ arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+ }
}
@@ -631,7 +600,7 @@ extern void arm_bitreversal_32(
/**
@defgroup ComplexFFT Complex FFT Functions
-
+
@par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
{real[0], imag[0], real[1], imag[1], ...}
The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving.
-
+
@par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed.
@@ -661,12 +630,12 @@ extern void arm_bitreversal_32(
inverse transform includes a scale of 1/fftLen as part of the
calculation and this matches the textbook definition of the inverse FFT.
@par
- For the MVE version, the new arm_cfft_init_f32 initialization function is
+ For the MVE version, the new arm_cfft_init_f32 initialization function is
mandatory. Compilation flags are available to include only the required tables for the
- needed FFTs. Other FFT versions can continue to be initialized as
+ needed FFTs. Other FFT versions can continue to be initialized as
explained below.
@par
- For not MVE versions, pre-initialized data structures containing twiddle factors
+                   For non-MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in arm_const_structs.h. Include
this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example:
@@ -781,7 +750,7 @@ extern void arm_bitreversal_32(
break;
}
@endcode
-
+
*/
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
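Minimal usage sketch for the MVE float32 path described in the documentation block above
(buffer name and length are illustrative; the init call is the one the documentation makes
mandatory for MVE builds):

    arm_cfft_instance_f32 S;
    float32_t buffer[2 * 1024];                 /* interleaved {real, imag} samples, transformed in place */

    if (arm_cfft_init_f32(&S, 1024) == ARM_MATH_SUCCESS)
    {
        arm_cfft_f32(&S, buffer, 0, 1);         /* forward transform, bit-reversed output reordered */
        /* arm_cfft_f32(&S, buffer, 1, 1);         inverse transform, scaled by 1/fftLen */
    }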
diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c
index 00503a6e..1cfc20ee 100644
--- a/Source/TransformFunctions/arm_cfft_q15.c
+++ b/Source/TransformFunctions/arm_cfft_q15.c
@@ -33,65 +33,6 @@
#include "arm_vec_fft.h"
-static void arm_bitreversal_16_inpl_mve(
- uint16_t *pSrc,
- const uint16_t bitRevLen,
- const uint16_t *pBitRevTab)
-
-{
- uint32_t *src = (uint32_t *)pSrc;
- uint32_t blkCnt; /* loop counters */
- uint32x4_t bitRevTabOff;
- uint16x8_t one = vdupq_n_u16(1);
-
- blkCnt = (bitRevLen / 2) / 4;
- while (blkCnt > 0U) {
- bitRevTabOff = vldrhq_u16(pBitRevTab);
- pBitRevTab += 8;
-
- uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
- uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
- bitRevOff1 = bitRevOff1 >> 3;
- bitRevOff2 = bitRevOff2 >> 3;
-
- uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
- uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
-
- vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
- vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
-
- /*
- * Decrement the blockSize loop counter
- */
- blkCnt--;
- }
-
-
- /*
- * tail
- * (will be merged thru tail predication)
- */
- blkCnt = bitRevLen & 7;
- if (blkCnt > 0U) {
- mve_pred16_t p0 = vctp16q(blkCnt);
-
- bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
-
- uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
- uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
- bitRevOff1 = bitRevOff1 >> 3;
- bitRevOff2 = bitRevOff2 >> 3;
-
- uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
- uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
-
- vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
- vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
- }
-}
-
static void _arm_radix4_butterfly_q15_mve(
const arm_cfft_instance_q15 * S,
q15_t *pSrc,
@@ -592,53 +533,53 @@ void arm_cfft_q15(
q15_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
-{
- uint32_t fftLen = S->fftLen;
-
- if (ifftFlag == 1U) {
-
- switch (fftLen) {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
- break;
-
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
- break;
- }
- } else {
- switch (fftLen) {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
- break;
-
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
- break;
- }
- }
-
-
- if (bitReverseFlag)
- {
-
+{
+ uint32_t fftLen = S->fftLen;
+
+ if (ifftFlag == 1U) {
+
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
+ break;
+ }
+ } else {
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
+ break;
+ }
+ }
+
+
+ if (bitReverseFlag)
+ {
+
arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
- }
+
+ }
}
#else
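The q15 wrapper above follows the same dispatch structure and ends with the shared
arm_bitreversal_16_inpl_mve helper. A minimal usage sketch (length and buffer are
illustrative; the init function is assumed to follow the same pattern as arm_cfft_init_f32):

    arm_cfft_instance_q15 S;
    q15_t buffer[2 * 256];                      /* interleaved real/imag fixed-point samples */

    if (arm_cfft_init_q15(&S, 256) == ARM_MATH_SUCCESS)
    {
        arm_cfft_q15(&S, buffer, 0, 1);
    }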
diff --git a/Source/TransformFunctions/arm_cfft_q31.c b/Source/TransformFunctions/arm_cfft_q31.c
index 13c5d840..75e4e838 100644
--- a/Source/TransformFunctions/arm_cfft_q31.c
+++ b/Source/TransformFunctions/arm_cfft_q31.c
@@ -34,37 +34,6 @@
#include "arm_vec_fft.h"
-static void arm_bitreversal_32_inpl_mve(
- uint32_t *pSrc,
- const uint16_t bitRevLen,
- const uint16_t *pBitRevTab)
-
-{
- uint64_t *src = (uint64_t *) pSrc;
- uint32_t blkCnt; /* loop counters */
- uint32x4_t bitRevTabOff;
- uint32x4_t one = vdupq_n_u32(1);
-
- blkCnt = (bitRevLen / 2) / 2;
- while (blkCnt > 0U) {
- bitRevTabOff = vldrhq_u32(pBitRevTab);
- pBitRevTab += 4;
-
- uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
- uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
-
- uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
- uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
-
- vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
- vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
-
- /*
- * Decrement the blockSize loop counter
- */
- blkCnt--;
- }
-}
static void _arm_radix4_butterfly_q31_mve(
const arm_cfft_instance_q31 * S,
@@ -598,55 +567,55 @@ void arm_cfft_q31(
q31_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
-{
- uint32_t fftLen = S->fftLen;
-
- if (ifftFlag == 1U) {
-
- switch (fftLen) {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
- break;
-
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
- break;
- }
- } else {
- switch (fftLen) {
- case 16:
- case 64:
- case 256:
- case 1024:
- case 4096:
- _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
- break;
-
- case 32:
- case 128:
- case 512:
- case 2048:
- arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
- break;
- }
- }
-
-
- if (bitReverseFlag)
- {
-
+{
+ uint32_t fftLen = S->fftLen;
+
+ if (ifftFlag == 1U) {
+
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
+ break;
+ }
+ } else {
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
+ break;
+ }
+ }
+
+
+ if (bitReverseFlag)
+ {
+
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-
- }
+
+ }
}
-#else
+#else
extern void arm_radix4_butterfly_q31(
q31_t * pSrc,