CMSIS-DSP: FFT bit reversal unrolling

Issue index updates earlier, adding more distance between the index
           update and the subsequent gather loads.
           Added out-of-place variants.
pull/19/head
FabKlein 5 years ago
parent 0a5a96d904
commit a509fa39d4

@ -47,6 +47,273 @@ extern "C"
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)
/**
@brief In-place 32 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 32-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
*/
__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* Each complex (real, imag) 32-bit pair is exchanged as one 64-bit unit;
   table entries appear to be byte offsets of those pairs -- TODO confirm
   against the CMSIS bit-reversal table generator. */
uint64_t *src = (uint64_t *) pSrc;
int32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
uint64x2_t inLow, inHigh;
uint64x2_t bitRevOff1Low, bitRevOff0Low;
uint64x2_t bitRevOff1High, bitRevOff0High;
/* load scheduling to increase gather load idx update / gather load distance */
/* Software pipelining: the offsets used by an iteration are expanded in the
   previous one, so each gather does not wait on its own index computation. */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* vmullb/vmullt by 1 zero-extend the even/odd u32 lanes into u64 lanes:
   even table entries -> "Low" offsets, odd entries -> "High" offsets; an
   (even, odd) entry pair addresses the two elements to exchange. */
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
blkCnt = bitRevLen / 8; /* 8 table entries (4 swaps) per unrolled iteration */
while (blkCnt > 0) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* 64-bit index expansion */
bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
/* swap the pairs addressed by the offsets expanded in the previous round */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
/* unrolled */
/* NOTE(review): on the final iteration this pre-load reads 4 table entries
   past bitRevLen when bitRevLen is a multiple of 8 -- presumably harmless
   for the const CMSIS tables, but confirm the tables are padded. */
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
if (bitRevLen & 7) {
/* FFT size = 16 */
/* Residual group of 4 entries: its offsets were already expanded above. */
inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
}
}
/**
@brief In-place 16 bit reversal function for helium
@param[in,out] pSrc points to in-place buffer of unknown 16-bit data type
@param[in] bitRevLen bit reversal table length
@param[in] pBitRevTab points to bit reversal table
@return none
*/
__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
    uint16_t *pSrc,
    const uint16_t bitRevLen,
    const uint16_t *pBitRevTab)
{
    uint32_t *src = (uint32_t *) pSrc;  /* one complex (real, imag) 16-bit pair = one 32-bit unit */
    int32_t blkCnt;                     /* loop counter */
    uint16x8_t bitRevTabOff;            /* FIX: vldrhq_u16() yields uint16x8_t (was declared uint32x4_t) */
    uint16x8_t one = vdupq_n_u16(1);
    uint32x4_t bitRevOff1Low, bitRevOff0Low;
    uint32x4_t bitRevOff1High, bitRevOff0High;
    uint32x4_t inLow, inHigh;

    /* Software pipelining: the offsets used by an iteration are expanded in
       the previous one, increasing the distance between the gather-index
       update and the dependent gather load. */
    bitRevTabOff = vldrhq_u16(pBitRevTab);
    pBitRevTab += 8;
    /* vmullb/vmullt by 1 zero-extend the even/odd u16 lanes into u32 lanes:
       even table entries -> "Low" offsets, odd entries -> "High" offsets. */
    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
    /* Table entries are pre-scaled by 8; >> 3 yields 32-bit element indices.
       FIX: shift the widened u32 vectors with the _u32 intrinsic. The former
       code passed uint32x4_t operands to vshrq_n_u16(), which relies on lax
       vector conversions (bit-identical result here since values fit in
       16 bits, but rejected under strict intrinsic typing). */
    bitRevOff0Low = vshrq_n_u32(bitRevOff0Low, 3);
    bitRevOff0High = vshrq_n_u32(bitRevOff0High, 3);

    blkCnt = bitRevLen / 16; /* 16 table entries (8 swaps) per unrolled iteration */
    while (blkCnt > 0) {
        bitRevTabOff = vldrhq_u16(pBitRevTab);
        pBitRevTab += 8;
        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
        bitRevOff1Low = vshrq_n_u32(bitRevOff1Low, 3);
        bitRevOff1High = vshrq_n_u32(bitRevOff1High, 3);

        /* swap the complex pairs addressed by the previously expanded offsets */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

        /* loop unrolling: expand the next offsets, swap with the current ones.
           NOTE(review): on the final iteration this pre-load reads 8 table
           entries past bitRevLen when bitRevLen is a multiple of 16 --
           presumably harmless for the const CMSIS tables; confirm padding. */
        bitRevTabOff = vldrhq_u16(pBitRevTab);
        pBitRevTab += 8;
        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
        bitRevOff0Low = vshrq_n_u32(bitRevOff0Low, 3);
        bitRevOff0High = vshrq_n_u32(bitRevOff0High, 3);

        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);

        blkCnt--;
    }

    /* Tail: only the residues produced by the supported FFT sizes occur
       (8 entries, or 12 for the FFT-16 special case). */
    blkCnt = bitRevLen & 0xf;
    if (blkCnt == 8) {
        /* offsets for this group were already expanded above */
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
    } else if (blkCnt == 12) {
        /* FFT 16 special case: 8 already-expanded entries + 4 predicated ones */
        mve_pred16_t p = vctp16q(4);

        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
        bitRevOff0Low = vshrq_n_u32(bitRevOff0Low, 3);
        bitRevOff0High = vshrq_n_u32(bitRevOff0High, 3);

        inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
        inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
    }
}
/**
@brief Out-of-place 32 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 32-bit data type
@param[in] pSrc points to input buffer of unknown 32-bit data type
@param[in] fftLen FFT length
@return none
*/
__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint32_t *pDst32 = (uint32_t *) pDst;
/* fwd indexes */
/* Lanes 1 and 3 stay 0 so that, reinterpreted as two 64-bit lanes, each
   vector holds two zero-extended 32-bit byte offsets (low word = offset,
   high word = 0). */
idxOffs0 = vdupq_n_u32(0);
idxOffs1 = vdupq_n_u32(0);
idxOffs0[0] = 0; idxOffs0[2] = 4;
idxOffs1[0] = 8; idxOffs1[2] = 12;
/* Forward offsets advance by 4 per complex element; bit-reversing the low
   log2(fftLen) + 5 bits of i*4 yields rev(i)*8, i.e. the byte offset of the
   i-th 64-bit complex (real, imag) 32-bit pair. */
bitRevPos = (31 - __CLZ(fftLen)) + 5;
blkCnt = fftLen >> 2; /* 4 complex elements (2 per gather) per iteration */
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0U) {
uint64x2_t vecIn;
/* gather 2 complex elements from bit-reversed source positions ... */
vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
idxOffs0 = idxOffs0 + 16;
/* ... and store them contiguously to the destination */
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
/* next reversed offsets computed early, well ahead of their gather */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
idxOffs1 = idxOffs1 + 16;
vst1q(pDst32, (uint32x4_t) vecIn);
pDst32 += 4;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
/**
@brief Out-of-place 16 bit reversal function for helium
@param[out] pDst points to destination buffer of unknown 16-bit data type
@param[in] pSrc points to input buffer of unknown 16-bit data type
@param[in] fftLen FFT length
@return none
*/
__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
{
uint32x4_t idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
uint32_t bitRevPos, blkCnt;
uint16_t *pDst16 = (uint16_t *) pDst;
uint32_t incrIdx = 0;
/* fwd indexes */
idxOffs0 = vidupq_wb_u32(&incrIdx, 4); // {0, 4, 8, 12}
idxOffs1 = vidupq_wb_u32(&incrIdx, 4); // {16, 20, 24, 28}
/* Forward offsets advance by 4 per complex element; bit-reversing the low
   log2(fftLen) + 4 bits of i*4 yields rev(i)*4, i.e. the byte offset of the
   i-th 32-bit complex (real, imag) 16-bit pair. */
bitRevPos = (31 - __CLZ(fftLen)) + 4;
blkCnt = fftLen >> 3; /* 8 complex elements (4 per gather) per iteration */
/* issued earlier to increase gather load idx update / gather load distance */
/* bit-reverse fwd indexes */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
while (blkCnt > 0U) {
uint32x4_t vecIn;
/* gather 4 complex elements from bit-reversed source positions ... */
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
idxOffs0 = idxOffs0 + 32;
/* ... and store them contiguously to the destination */
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
/* next reversed offsets computed early, well ahead of their gather */
bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
idxOffs1 = idxOffs1 + 32;
vst1q(pDst16, (uint16x8_t) vecIn);
pDst16 += 8;
bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
blkCnt--;
}
}
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/

@ -40,111 +40,51 @@
static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
{
    /* Map a supported power-of-2 FFT length (16..4096) to its reciprocal,
       used as the inverse-transform scale factor. Unsupported lengths fall
       through and return the neutral factor 1.0. */
    float16_t retValue=1.0;

    switch (fftLen)
    {
    case 4096U:
        retValue = (float16_t)0.000244140625f;
        break;
    case 2048U:
        retValue = (float16_t)0.00048828125f;
        break;
    case 1024U:
        retValue = (float16_t)0.0009765625f;
        break;
    case 512U:
        retValue = (float16_t)0.001953125f;
        break;
    case 256U:
        retValue = (float16_t)0.00390625f;
        break;
    case 128U:
        retValue = (float16_t)0.0078125f;
        break;
    case 64U:
        retValue = (float16_t)0.015625f;
        break;
    case 32U:
        retValue = (float16_t)0.03125f;
        break;
    case 16U:
        retValue = (float16_t)0.0625f;
        break;
    default:
        break;
    }
    return (retValue);
}
switch (fftLen)
{
static void arm_bitreversal_f16_inpl_mve( case 4096U:
uint16_t *pSrc, retValue = (float16_t)0.000244140625f;
const uint16_t bitRevLen, break;
const uint16_t *pBitRevTab)
{ case 2048U:
uint32_t *src = (uint32_t *)pSrc; retValue = (float16_t)0.00048828125f;
uint32_t blkCnt; /* loop counters */ break;
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
blkCnt = (bitRevLen / 2) / 4; case 1024U:
while (blkCnt > 0U) { retValue = (float16_t)0.0009765625f;
bitRevTabOff = vldrhq_u16(pBitRevTab); break;
pBitRevTab += 8;
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); case 512U:
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); retValue = (float16_t)0.001953125f;
break;
bitRevOff1 = bitRevOff1 >> 3; case 256U:
bitRevOff2 = bitRevOff2 >> 3; retValue = (float16_t)0.00390625f;
break;
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1); case 128U:
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2); retValue = (float16_t)0.0078125f;
break;
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2); case 64U:
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1); retValue = (float16_t)0.015625f;
break;
/* case 32U:
* Decrement the blockSize loop counter retValue = (float16_t)0.03125f;
*/ break;
blkCnt--;
}
case 16U:
retValue = (float16_t)0.0625f;
break;
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0); default:
break;
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one); }
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one); return(retValue);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
} }
@ -590,53 +530,53 @@ void arm_cfft_f16(
float16_t * pSrc, float16_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen)); _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_f16_mve(S, pSrc, fftLen); _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen); arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else
@ -666,7 +606,7 @@ extern void arm_radix4_butterfly_f16(
/** /**
@defgroup ComplexFFT Complex FFT Functions @defgroup ComplexFFT Complex FFT Functions
@par @par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@ -684,7 +624,7 @@ extern void arm_radix4_butterfly_f16(
<pre>{real[0], imag[0], real[1], imag[1], ...} </pre> <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
The FFT result will be contained in the same array and the frequency domain The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving. values will have the same interleaving.
@par Floating-point @par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed. stages are performed along with a single radix-2 or radix-4 stage, as needed.
@ -696,12 +636,12 @@ extern void arm_radix4_butterfly_f16(
inverse transform includes a scale of <code>1/fftLen</code> as part of the inverse transform includes a scale of <code>1/fftLen</code> as part of the
calculation and this matches the textbook definition of the inverse FFT. calculation and this matches the textbook definition of the inverse FFT.
@par @par
For the MVE version, the new arm_cfft_init_f32 initialization function is For the MVE version, the new arm_cfft_init_f32 initialization function is
<b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
needed FFTs.</b> Other FFT versions can continue to be initialized as needed FFTs.</b> Other FFT versions can continue to be initialized as
explained below. explained below.
@par @par
For not MVE versions, pre-initialized data structures containing twiddle factors For not MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
this header in your function and then pass one of the constant structures as this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example: an argument to arm_cfft_f32. For example:
@ -816,7 +756,7 @@ extern void arm_radix4_butterfly_f16(
break; break;
} }
@endcode @endcode
*/ */
@ -875,7 +815,7 @@ void arm_cfft_f16(
case 2048: case 2048:
arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle); arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
break; break;
} }
if ( bitReverseFlag ) if ( bitReverseFlag )

@ -39,87 +39,56 @@
static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
{
    /* Map a supported power-of-2 FFT length (16..4096) to its reciprocal,
       used as the inverse-transform scale factor. Unsupported lengths fall
       through and return the neutral factor 1.0f.
       Fix: all constants now carry the 'f' suffix; the previous mix of
       double and float literals forced implicit double->float conversions
       (identical values -- every constant is exactly representable -- but
       inconsistent and noisy under -Wconversion). */
    float32_t retValue = 1.0f;

    switch (fftLen)
    {
    case 4096U:
        retValue = 0.000244140625f;
        break;
    case 2048U:
        retValue = 0.00048828125f;
        break;
    case 1024U:
        retValue = 0.0009765625f;
        break;
    case 512U:
        retValue = 0.001953125f;
        break;
    case 256U:
        retValue = 0.00390625f;
        break;
    case 128U:
        retValue = 0.0078125f;
        break;
    case 64U:
        retValue = 0.015625f;
        break;
    case 32U:
        retValue = 0.03125f;
        break;
    case 16U:
        retValue = 0.0625f;
        break;
    default:
        break;
    }
    return (retValue);
}
switch (fftLen)
{
static void arm_bitreversal_f32_inpl_mve( case 4096U:
uint32_t *pSrc, retValue = 0.000244140625;
const uint16_t bitRevLen, break;
const uint16_t *pBitRevTab)
{ case 2048U:
uint64_t *src = (uint64_t *) pSrc; retValue = 0.00048828125;
uint32_t blkCnt; /* loop counters */ break;
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1); case 1024U:
retValue = 0.0009765625f;
break;
case 512U:
retValue = 0.001953125;
break;
blkCnt = (bitRevLen / 2) / 2; case 256U:
while (blkCnt > 0U) { retValue = 0.00390625f;
bitRevTabOff = vldrhq_u32(pBitRevTab); break;
pBitRevTab += 4;
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one); case 128U:
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one); retValue = 0.0078125;
break;
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1); case 64U:
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2); retValue = 0.015625f;
break;
vstrdq_scatter_offset_u64(src, bitRevOff1, in2); case 32U:
vstrdq_scatter_offset_u64(src, bitRevOff2, in1); retValue = 0.03125;
break;
/* case 16U:
* Decrement the blockSize loop counter retValue = 0.0625f;
*/ break;
blkCnt--;
}
default:
break;
}
return(retValue);
} }
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen) static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{ {
f32x4_t vecTmp0, vecTmp1; f32x4_t vecTmp0, vecTmp1;
@ -563,53 +532,53 @@ void arm_cfft_f32(
float32_t * pSrc, float32_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_f32_mve(S, pSrc, fftLen); _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen); arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
@ -631,7 +600,7 @@ extern void arm_bitreversal_32(
/** /**
@defgroup ComplexFFT Complex FFT Functions @defgroup ComplexFFT Complex FFT Functions
@par @par
The Fast Fourier Transform (FFT) is an efficient algorithm for computing the The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
<pre>{real[0], imag[0], real[1], imag[1], ...} </pre> <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
The FFT result will be contained in the same array and the frequency domain The FFT result will be contained in the same array and the frequency domain
values will have the same interleaving. values will have the same interleaving.
@par Floating-point @par Floating-point
The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
stages are performed along with a single radix-2 or radix-4 stage, as needed. stages are performed along with a single radix-2 or radix-4 stage, as needed.
@ -661,12 +630,12 @@ extern void arm_bitreversal_32(
inverse transform includes a scale of <code>1/fftLen</code> as part of the inverse transform includes a scale of <code>1/fftLen</code> as part of the
calculation and this matches the textbook definition of the inverse FFT. calculation and this matches the textbook definition of the inverse FFT.
@par @par
For the MVE version, the new arm_cfft_init_f32 initialization function is For the MVE version, the new arm_cfft_init_f32 initialization function is
<b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
needed FFTs.</b> Other FFT versions can continue to be initialized as needed FFTs.</b> Other FFT versions can continue to be initialized as
explained below. explained below.
@par @par
For not MVE versions, pre-initialized data structures containing twiddle factors For not MVE versions, pre-initialized data structures containing twiddle factors
and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
this header in your function and then pass one of the constant structures as this header in your function and then pass one of the constant structures as
an argument to arm_cfft_f32. For example: an argument to arm_cfft_f32. For example:
@ -781,7 +750,7 @@ extern void arm_bitreversal_32(
break; break;
} }
@endcode @endcode
*/ */
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1) void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)

@ -33,65 +33,6 @@
#include "arm_vec_fft.h" #include "arm_vec_fft.h"
static void arm_bitreversal_16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* In-place bit reversal of complex 16-bit data: each (real, imag) pair is
   exchanged as one 32-bit unit. */
uint32_t *src = (uint32_t *)pSrc;
uint32_t blkCnt; /* loop counters */
/* NOTE(review): vldrhq_u16() yields uint16x8_t; storing it in uint32x4_t
   relies on lax vector conversions -- confirm this builds under strict
   intrinsic typing. */
uint32x4_t bitRevTabOff;
uint16x8_t one = vdupq_n_u16(1);
blkCnt = (bitRevLen / 2) / 4; /* 8 table entries (4 swaps) per iteration */
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
/* vmullb/vmullt by 1 zero-extend even/odd u16 lanes to u32: an (even, odd)
   entry pair addresses the two elements to exchange */
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
/* table entries are pre-scaled by 8; >> 3 gives 32-bit element indices */
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
/* predicated processing of the remaining (< 8) table entries */
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
}
static void _arm_radix4_butterfly_q15_mve( static void _arm_radix4_butterfly_q15_mve(
const arm_cfft_instance_q15 * S, const arm_cfft_instance_q15 * S,
q15_t *pSrc, q15_t *pSrc,
@ -592,53 +533,53 @@ void arm_cfft_q15(
q15_t * pSrc, q15_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_q15_mve(S, pSrc, fftLen); _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen); arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else

@ -34,37 +34,6 @@
#include "arm_vec_fft.h" #include "arm_vec_fft.h"
static void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
/* In-place bit reversal of complex 32-bit data: each (real, imag) pair is
   exchanged as one 64-bit unit. */
uint64_t *src = (uint64_t *) pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
/* 4 table entries (2 swaps) per iteration.
   NOTE(review): no tail handling -- any residual (bitRevLen & 3) entries
   are skipped; presumably the CMSIS table lengths make this unreachable,
   but confirm. */
blkCnt = (bitRevLen / 2) / 2;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
/* vmullb/vmullt by 1 zero-extend even/odd u32 lanes to u64: an (even, odd)
   entry pair addresses the two elements to exchange */
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
}
static void _arm_radix4_butterfly_q31_mve( static void _arm_radix4_butterfly_q31_mve(
const arm_cfft_instance_q31 * S, const arm_cfft_instance_q31 * S,
@ -598,55 +567,55 @@ void arm_cfft_q31(
q31_t * pSrc, q31_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
uint32_t fftLen = S->fftLen; uint32_t fftLen = S->fftLen;
if (ifftFlag == 1U) { if (ifftFlag == 1U) {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
switch (fftLen) { switch (fftLen) {
case 16: case 16:
case 64: case 64:
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_q31_mve(S, pSrc, fftLen); _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen); arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
break; break;
} }
} }
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else
extern void arm_radix4_butterfly_q31( extern void arm_radix4_butterfly_q31(
q31_t * pSrc, q31_t * pSrc,

Loading…
Cancel
Save