From a509fa39d49417699769ddaf037a0a2e57a504b3 Mon Sep 17 00:00:00 2001
From: FabKlein <fabien.klein@arm.com>
Date: Tue, 30 Mar 2021 12:09:40 +0200
Subject: [PATCH] CMSIS-DSP: FFT bit reversal unrolling

Issue the index update earlier, adding more distance to the subsequent
gather loads. Added out-of-place variants.

---
 PrivateInclude/arm_vec_fft.h             | 267 +++++++++++++++++++++++
 Source/TransformFunctions/arm_cfft_f16.c | 234 ++++++++------------
 Source/TransformFunctions/arm_cfft_f32.c | 215 ++++++++----------
 Source/TransformFunctions/arm_cfft_q15.c | 151 ++++---------
 Source/TransformFunctions/arm_cfft_q31.c | 125 ++++-------
 5 files changed, 539 insertions(+), 453 deletions(-)
 mode change 100644 => 100755 Source/TransformFunctions/arm_cfft_f32.c

diff --git a/PrivateInclude/arm_vec_fft.h b/PrivateInclude/arm_vec_fft.h
index b2f21049..a500eb30 100755
--- a/PrivateInclude/arm_vec_fft.h
+++ b/PrivateInclude/arm_vec_fft.h
@@ -47,6 +47,273 @@ extern "C"
 #define MVE_CMPLX_SUB_FX_A_ixB(A,B)         vhcaddq_rot270(A,B)
 
 
+/**
+  @brief         In-place bit reversal permutation for helium; data is swapped as 64-bit elements (pairs of 32-bit values)
+  @param[in,out] pSrc        points to in-place buffer of unknown 32-bit data type
+  @param[in]     bitRevLen   bit reversal table length, counted in 16-bit table entries
+  @param[in]     pBitRevTab  points to bit reversal table; consecutive entries (2k, 2k+1) are the offsets of the two elements to swap
+  @return        none
+*/
+
+__STATIC_INLINE void arm_bitreversal_32_inpl_mve(
+        uint32_t *pSrc,
+  const uint16_t  bitRevLen,
+  const uint16_t *pBitRevTab)
+
+{
+    uint64_t       *src = (uint64_t *) pSrc;    /* elements are moved 64 bits at a time */
+    int32_t         blkCnt;     /* iteration count of the 2x unrolled loop */
+    uint32x4_t      bitRevTabOff;    /* 4 table entries, widened from 16 to 32 bits by the load */
+    uint32x4_t      one = vdupq_n_u32(1);    /* multiplying by 1 turns vmullb/vmullt into a plain widening move */
+    uint64x2_t      inLow, inHigh;
+    uint64x2_t      bitRevOff1Low, bitRevOff0Low;
+    uint64x2_t      bitRevOff1High, bitRevOff0High;
+
+    /* load scheduling: offsets for the next swap are prepared before the current gathers, to increase gather load idx update / gather load distance */
+    bitRevTabOff = vldrhq_u32(pBitRevTab);
+    pBitRevTab += 4;
+    /* first offset set (entries 0..3), expanded to 64-bit offsets before entering the loop */
+    bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);    /* even lanes: entries 0 and 2 */
+    bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);    /* odd lanes: entries 1 and 3 */
+
+    /* each loop iteration consumes 8 table entries (two vectors of 4) */
+    blkCnt = bitRevLen / 8;
+    while (blkCnt > 0) {
+        bitRevTabOff = vldrhq_u32(pBitRevTab);
+        pBitRevTab += 4;
+
+        /* 64-bit index expansion (offset set 1, used by the second half of this iteration) */
+        bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
+
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
+        /* swap: data read at the even-entry offsets is written at the odd-entry offsets and vice versa */
+        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
+
+        /* NOTE(review): when bitRevLen is a multiple of 8, the load below reads 4 entries past bitRevLen on the last iteration; the values are never used */
+        /* unrolled: second half, offset set 0 is refilled for the next iteration (or the tail) while set 1 is swapped */
+        bitRevTabOff = vldrhq_u32(pBitRevTab);
+        pBitRevTab += 4;
+
+        bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
+
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
+
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+    /* tail: any non-zero remainder is handled as exactly 4 entries, whose offsets are already in bitRevOff0Low/High */
+    if (bitRevLen & 7) {
+        /* FFT size = 16 */
+        inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
+        inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
+
+        vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
+        vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
+    }
+}
+
+
+
+/**
+  @brief         In-place bit reversal permutation for helium; data is swapped as 32-bit elements (pairs of 16-bit values)
+  @param[in,out] pSrc        points to in-place buffer of unknown 16-bit data type
+  @param[in]     bitRevLen   bit reversal table length, counted in 16-bit table entries
+  @param[in]     pBitRevTab  points to bit reversal table; consecutive entries (2k, 2k+1) designate the two elements to swap
+  @return        none
+*/
+
+__STATIC_INLINE void arm_bitreversal_16_inpl_mve(
+        uint16_t *pSrc,
+  const uint16_t bitRevLen,
+  const uint16_t *pBitRevTab)
+
+{
+    uint32_t       *src = (uint32_t *) pSrc;    /* elements are moved 32 bits at a time */
+    int32_t         blkCnt;     /* loop counter, later reused for the tail remainder */
+    uint32x4_t      bitRevTabOff;    /* NOTE(review): declared uint32x4_t but filled by vldrhq_u16 and passed to u16 intrinsics; presumably relies on lax vector conversions, confirm */
+    uint16x8_t      one = vdupq_n_u16(1);    /* multiplying by 1 turns vmullb/vmullt into a plain widening move */
+    uint32x4_t      bitRevOff1Low, bitRevOff0Low;
+    uint32x4_t      bitRevOff1High, bitRevOff0High;
+    uint32x4_t      inLow, inHigh;
+
+    /* load scheduling: offsets for the next swap are prepared before the current gathers, to increase gather load idx update / gather load distance */
+    bitRevTabOff = vldrhq_u16(pBitRevTab);
+    pBitRevTab += 8;
+    /* first offset set (entries 0..7): widen even/odd lanes to 32 bits, then divide by 8 to get the index used by the shifted-offset gather/scatter */
+    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+    /* each loop iteration consumes 16 table entries (two vectors of 8) */
+    blkCnt = (bitRevLen / 16);
+    while (blkCnt > 0U) {
+        bitRevTabOff = vldrhq_u16(pBitRevTab);
+        pBitRevTab += 8;
+        /* offset set 1, used by the second half of this iteration */
+        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
+        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+        /* swap: data read via the even-entry offsets is written via the odd-entry offsets and vice versa */
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+
+        /* loop unrolling: refill offset set 0 for the next iteration (or the tail); NOTE(review): reads 8 entries past bitRevLen on the last iteration when bitRevLen is a multiple of 16 */
+        bitRevTabOff = vldrhq_u16(pBitRevTab);
+        pBitRevTab += 8;
+
+        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);
+
+        blkCnt--;
+    }
+
+    /* tail handling: only remainders of 8 and 12 entries are processed; any other non-zero remainder is left untouched */
+    blkCnt = bitRevLen & 0xf;
+    if (blkCnt == 8) {    /* the 8 leftover entries are already expanded in bitRevOff0Low/High */
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+    } else if (blkCnt == 12) {
+        /* FFT 16 special case: 8 preloaded entries followed by 4 more handled under predication */
+        mve_pred16_t    p = vctp16q(4);    /* enables the first four 16-bit lanes only */
+
+        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);    /* predicated load: lanes beyond the last 4 entries are zeroed, not read */
+
+        inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
+        inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
+
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
+        vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
+
+        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+
+        inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
+        inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
+
+        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
+        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
+    }
+}
+
+/**
+  @brief         Out-of-place bit reversal permutation for helium; data is copied as 64-bit elements (pairs of 32-bit values)
+  @param[out]   pDst        points to destination buffer of unknown 32-bit data type, written sequentially
+  @param[in]    pSrc        points to input buffer of unknown 32-bit data type, read through bit-reversed gather offsets
+  @param[in]    fftLen      FFT length; 4 elements are produced per loop iteration, fftLen >> 2 iterations in total
+  @return       none
+*/
+__STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
+{
+    uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
+    uint32_t        bitRevPos, blkCnt;
+    uint32_t       *pDst32 = (uint32_t *) pDst;
+    /* forward indexes advance by 4 per 64-bit element; only even lanes are set here since each lane pair is later viewed as one 64-bit offset */
+    /* fwd indexes */
+    idxOffs0 = vdupq_n_u32(0);
+    idxOffs1 = vdupq_n_u32(0);
+    idxOffs0[0] = 0;    idxOffs0[2] = 4;    /* {0, 0, 4, 0}: elements 0 and 1 */
+    idxOffs1[0] = 8;    idxOffs1[2] = 12;    /* {8, 0, 12, 0}: elements 2 and 3 */
+    /* 31 - CLZ = floor(log2(fftLen)); presumably +5 so that reversing index 4*i yields bitrev(i) * 8, the byte offset of a 64-bit element (TODO confirm against VBRSR) */
+    bitRevPos = (31 - __CLZ(fftLen)) + 5;
+    blkCnt = fftLen >> 2;
+    /* software pipelining: the bit-reversed offsets of the next gather are computed one step ahead */
+    /* issued earlier to increase gather load idx update / gather load distance */
+    /* bit-reverse fwd indexes */
+    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+    while (blkCnt > 0U) {
+        uint64x2_t      vecIn;
+        /* gather 2 elements from bit-reversed positions, store them contiguously, then prepare the offsets 4 elements (16 index units) further */
+        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        idxOffs0 = idxOffs0 + 16;
+        vst1q(pDst32, (uint32x4_t) vecIn);
+        pDst32 += 4;
+        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+        /* same steps for the second pair of elements */
+        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        idxOffs1 = idxOffs1 + 16;
+        vst1q(pDst32, (uint32x4_t) vecIn);
+        pDst32 += 4;
+        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+
+        blkCnt--;
+    }
+}
+
+
+/**
+  @brief         Out-of-place bit reversal permutation for helium; data is copied as 32-bit elements (pairs of 16-bit values)
+  @param[out]   pDst        points to destination buffer of unknown 16-bit data type, written sequentially
+  @param[in]    pSrc        points to input buffer of unknown 16-bit data type, read through bit-reversed gather offsets
+  @param[in]    fftLen      FFT length; 8 elements are produced per loop iteration, fftLen >> 3 iterations in total
+  @return       none
+*/
+
+__STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
+{
+    uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
+    uint32_t        bitRevPos, blkCnt;
+    uint16_t       *pDst16 = (uint16_t *) pDst;
+    uint32_t        incrIdx = 0;    /* running start value, updated by the write-back form of vidup */
+    /* forward indexes advance by 4 per 32-bit element */
+    /* fwd indexes */
+    idxOffs0 = vidupq_wb_u32(&incrIdx, 4);    // {0, 4, 8, 12}
+    idxOffs1 = vidupq_wb_u32(&incrIdx, 4);    // {16, 20, 24, 28}
+    /* 31 - CLZ = floor(log2(fftLen)); presumably +4 so that reversing index 4*i yields bitrev(i) * 4, the byte offset of a 32-bit element (TODO confirm against VBRSR) */
+    bitRevPos = (31 - __CLZ(fftLen)) + 4;
+    blkCnt = fftLen >> 3;
+    /* software pipelining: the bit-reversed offsets of the next gather are computed one step ahead */
+    /* issued earlier to increase gather load idx update / gather load distance */
+    /* bit-reverse fwd indexes */
+    bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+    bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+    while (blkCnt > 0U) {
+        uint32x4_t      vecIn;    /* NOTE(review): unsigned vector filled by the signed (_s32) gather below; presumably relies on lax vector conversions, confirm */
+        /* gather 4 elements from bit-reversed positions, store them contiguously, then prepare the offsets 8 elements (32 index units) further */
+        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
+        idxOffs0 = idxOffs0 + 32;
+        vst1q(pDst16, (uint16x8_t) vecIn);
+        pDst16 += 8;
+        bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
+        /* same steps for the next 4 elements */
+        vecIn = vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
+        idxOffs1 = idxOffs1 + 32;
+        vst1q(pDst16, (uint16x8_t) vecIn);
+        pDst16 += 8;
+        bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
+
+        blkCnt--;
+    }
+}
+
+
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
 
 
diff --git a/Source/TransformFunctions/arm_cfft_f16.c b/Source/TransformFunctions/arm_cfft_f16.c
index cc3fefc2..367e9d49 100755
--- a/Source/TransformFunctions/arm_cfft_f16.c
+++ b/Source/TransformFunctions/arm_cfft_f16.c
@@ -40,111 +40,51 @@
 static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
 {
   float16_t retValue=1.0;
-                                                      
-  switch (fftLen)                                     
-  {                                                   
-                                                      
-  case 4096U:                                         
-    retValue = (float16_t)0.000244140625f;                        
-    break;                                            
-                                                      
-  case 2048U:                                         
-    retValue = (float16_t)0.00048828125f;                         
-    break;                                            
-                                                      
-  case 1024U:                                         
-    retValue = (float16_t)0.0009765625f;                         
-    break;                                            
-                                                      
-  case 512U:                                          
-    retValue = (float16_t)0.001953125f;                           
-    break;                                            
-                                                      
-  case 256U:                                          
-    retValue = (float16_t)0.00390625f;                           
-    break;                                            
-                                                      
-  case 128U:                                          
-    retValue = (float16_t)0.0078125f;                             
-    break;                                            
-                                                      
-  case 64U:                                           
-    retValue = (float16_t)0.015625f;                             
-    break;                                            
-                                                      
-  case 32U:                                           
-    retValue = (float16_t)0.03125f;                               
-    break;                                            
-                                                      
-  case 16U:                                           
-    retValue = (float16_t)0.0625f;                               
-    break;                                            
-                                                      
-                                                      
-  default:                                            
-    break;                                            
-  }                                                   
-  return(retValue); 
-}
 
+  switch (fftLen)
+  {
 
-static void arm_bitreversal_f16_inpl_mve(
-        uint16_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
+  case 4096U:
+    retValue = (float16_t)0.000244140625f;
+    break;
 
-{
-    uint32_t       *src = (uint32_t *)pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint16x8_t      one = vdupq_n_u16(1);
+  case 2048U:
+    retValue = (float16_t)0.00048828125f;
+    break;
 
-    blkCnt = (bitRevLen / 2) / 4;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u16(pBitRevTab);
-        pBitRevTab += 8;
+  case 1024U:
+    retValue = (float16_t)0.0009765625f;
+    break;
 
-        uint32x4_t      bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t      bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
+  case 512U:
+    retValue = (float16_t)0.001953125f;
+    break;
 
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
+  case 256U:
+    retValue = (float16_t)0.00390625f;
+    break;
 
-        uint32x4_t      in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
-        uint32x4_t      in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
+  case 128U:
+    retValue = (float16_t)0.0078125f;
+    break;
 
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
+  case 64U:
+    retValue = (float16_t)0.015625f;
+    break;
 
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
+  case 32U:
+    retValue = (float16_t)0.03125f;
+    break;
 
+  case 16U:
+    retValue = (float16_t)0.0625f;
+    break;
 
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = bitRevLen & 7;
-    if (blkCnt > 0U) {
-        mve_pred16_t    p0 = vctp16q(blkCnt);
 
-        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
-
-        uint32x4_t      bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t      bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
-
-        uint32x4_t      in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
-        uint32x4_t      in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
-
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
-    }
+  default:
+    break;
+  }
+  return(retValue);
 }
 
 
@@ -590,53 +530,53 @@ void arm_cfft_f16(
         float16_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                                
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen)); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
-            arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-                    
-        } 
+{
+        uint32_t fftLen = S->fftLen;
+
+        if (ifftFlag == 1U) {
+
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
+                break;
+
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+
+
+        if (bitReverseFlag)
+        {
+
+            arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+        }
 }
 
 #else
@@ -666,7 +606,7 @@ extern void arm_radix4_butterfly_f16(
 
 /**
   @defgroup ComplexFFT Complex FFT Functions
- 
+
   @par
                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
@@ -684,7 +624,7 @@ extern void arm_radix4_butterfly_f16(
                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
                    The FFT result will be contained in the same array and the frequency domain
                    values will have the same interleaving.
- 
+
   @par Floating-point
                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
@@ -696,12 +636,12 @@ extern void arm_radix4_butterfly_f16(
                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
                    calculation and this matches the textbook definition of the inverse FFT.
   @par
-                   For the MVE version, the new arm_cfft_init_f32 initialization function is 
+                   For the MVE version, the new arm_cfft_init_f32 initialization function is
                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
-                   needed FFTs.</b> Other FFT versions can continue to be initialized as 
+                   needed FFTs.</b> Other FFT versions can continue to be initialized as
                    explained below.
   @par
-                   For not MVE versions, pre-initialized data structures containing twiddle factors 
+                   For not MVE versions, pre-initialized data structures containing twiddle factors
                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
                    this header in your function and then pass one of the constant structures as
                    an argument to arm_cfft_f32.  For example:
@@ -816,7 +756,7 @@ extern void arm_radix4_butterfly_f16(
                          break;
                      }
   @endcode
- 
+
  */
 
 
@@ -875,7 +815,7 @@ void arm_cfft_f16(
         case 2048:
         arm_cfft_radix4by2_f16  ( p1, L, (float16_t*)S->pTwiddle);
         break;
-   
+
     }
 
     if ( bitReverseFlag )
diff --git a/Source/TransformFunctions/arm_cfft_f32.c b/Source/TransformFunctions/arm_cfft_f32.c
old mode 100644
new mode 100755
index f47ba426..8948aa9e
--- a/Source/TransformFunctions/arm_cfft_f32.c
+++ b/Source/TransformFunctions/arm_cfft_f32.c
@@ -39,87 +39,56 @@
 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
 {
   float32_t retValue=1.0;
-                                                      
-  switch (fftLen)                                     
-  {                                                   
-                                                      
-  case 4096U:                                         
-    retValue = 0.000244140625;                        
-    break;                                            
-                                                      
-  case 2048U:                                         
-    retValue = 0.00048828125;                         
-    break;                                            
-                                                      
-  case 1024U:                                         
-    retValue = 0.0009765625f;                         
-    break;                                            
-                                                      
-  case 512U:                                          
-    retValue = 0.001953125;                           
-    break;                                            
-                                                      
-  case 256U:                                          
-    retValue = 0.00390625f;                           
-    break;                                            
-                                                      
-  case 128U:                                          
-    retValue = 0.0078125;                             
-    break;                                            
-                                                      
-  case 64U:                                           
-    retValue = 0.015625f;                             
-    break;                                            
-                                                      
-  case 32U:                                           
-    retValue = 0.03125;                               
-    break;                                            
-                                                      
-  case 16U:                                           
-    retValue = 0.0625f;                               
-    break;                                            
-                                                      
-                                                      
-  default:                                            
-    break;                                            
-  }                                                   
-  return(retValue); 
-}
 
+  switch (fftLen)
+  {
 
-static void arm_bitreversal_f32_inpl_mve(
-        uint32_t *pSrc,
-  const uint16_t  bitRevLen,
-  const uint16_t *pBitRevTab)
+  case 4096U:
+    retValue = 0.000244140625;
+    break;
 
-{
-    uint64_t       *src = (uint64_t *) pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint32x4_t      one = vdupq_n_u32(1);
+  case 2048U:
+    retValue = 0.00048828125;
+    break;
+
+  case 1024U:
+    retValue = 0.0009765625f;
+    break;
+
+  case 512U:
+    retValue = 0.001953125;
+    break;
 
-    blkCnt = (bitRevLen / 2) / 2;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u32(pBitRevTab);
-        pBitRevTab += 4;
+  case 256U:
+    retValue = 0.00390625f;
+    break;
 
-        uint64x2_t      bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
-        uint64x2_t      bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
+  case 128U:
+    retValue = 0.0078125;
+    break;
 
-        uint64x2_t      in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
-        uint64x2_t      in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
+  case 64U:
+    retValue = 0.015625f;
+    break;
 
-        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
-        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
+  case 32U:
+    retValue = 0.03125;
+    break;
 
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
+  case 16U:
+    retValue = 0.0625f;
+    break;
+
+
+  default:
+    break;
+  }
+  return(retValue);
 }
 
 
+
+
 static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
 {
     f32x4_t vecTmp0, vecTmp1;
@@ -563,53 +532,53 @@ void arm_cfft_f32(
         float32_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                                
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
-            arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-                    
-        } 
+{
+        uint32_t fftLen = S->fftLen;
+        /* ifftFlag == 1U selects the inverse routines; any other value selects the forward ones. */
+        if (ifftFlag == 1U) {
+            /* These lengths go to the radix4 butterfly routine, which also receives arm_inverse_fft_length_f32(fftLen). */
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
+                break;
+            /* These lengths go to the radix4by2 routine. */
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
+                break;
+            /* These lengths go to the radix4by2 routine. */
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+        /* Neither switch has a default case, so any other fftLen reaches this point with pSrc untouched. */
+        /* A non-zero bitReverseFlag requests the in-place bit-reversal pass over pSrc. */
+        if (bitReverseFlag)
+        {
+
+            arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+        }
 }
 
 
@@ -631,7 +600,7 @@ extern void arm_bitreversal_32(
 
 /**
   @defgroup ComplexFFT Complex FFT Functions
- 
+
   @par
                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
@@ -649,7 +618,7 @@ extern void arm_bitreversal_32(
                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
                    The FFT result will be contained in the same array and the frequency domain
                    values will have the same interleaving.
- 
+
   @par Floating-point
                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
@@ -661,12 +630,12 @@ extern void arm_bitreversal_32(
                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
                    calculation and this matches the textbook definition of the inverse FFT.
   @par
-                   For the MVE version, the new arm_cfft_init_f32 initialization function is 
+                   For the MVE version, the new arm_cfft_init_f32 initialization function is
                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
-                   needed FFTs.</b> Other FFT versions can continue to be initialized as 
+                   needed FFTs.</b> Other FFT versions can continue to be initialized as
                    explained below.
   @par
-                   For not MVE versions, pre-initialized data structures containing twiddle factors 
+                   For non-MVE versions, pre-initialized data structures containing twiddle factors
                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
                    this header in your function and then pass one of the constant structures as
                    an argument to arm_cfft_f32.  For example:
@@ -781,7 +750,7 @@ extern void arm_bitreversal_32(
                          break;
                      }
   @endcode
- 
+
  */
 
 void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c
index 00503a6e..1cfc20ee 100644
--- a/Source/TransformFunctions/arm_cfft_q15.c
+++ b/Source/TransformFunctions/arm_cfft_q15.c
@@ -33,65 +33,6 @@
 #include "arm_vec_fft.h"
 
 
-static void arm_bitreversal_16_inpl_mve(
-        uint16_t *pSrc,
-  const uint16_t bitRevLen,
-  const uint16_t *pBitRevTab)
-
-{
-    uint32_t       *src = (uint32_t *)pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint16x8_t      one = vdupq_n_u16(1);
-
-    blkCnt = (bitRevLen / 2) / 4;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u16(pBitRevTab);
-        pBitRevTab += 8;
-
-        uint32x4_t      bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t      bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
-
-        uint32x4_t      in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
-        uint32x4_t      in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
-
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
-        vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
-
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-
-
-    /*
-     * tail
-     * (will be merged thru tail predication)
-     */
-    blkCnt = bitRevLen & 7;
-    if (blkCnt > 0U) {
-        mve_pred16_t    p0 = vctp16q(blkCnt);
-
-        bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
-
-        uint32x4_t      bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
-        uint32x4_t      bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
-
-        bitRevOff1 = bitRevOff1 >> 3;
-        bitRevOff2 = bitRevOff2 >> 3;
-
-        uint32x4_t      in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
-        uint32x4_t      in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
-
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
-        vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
-    }
-}
-
 static void _arm_radix4_butterfly_q15_mve(
     const arm_cfft_instance_q15 * S,
     q15_t   *pSrc,
@@ -592,53 +533,53 @@ void arm_cfft_q15(
         q15_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                             
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:    
-                _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
+{
+        uint32_t fftLen = S->fftLen;
+        /* ifftFlag == 1U selects the inverse routines; any other value selects the forward ones. */
+        if (ifftFlag == 1U) {
+            /* These lengths go to the radix4 butterfly routine. */
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
+                break;
+            /* These lengths go to the radix4by2 routine. */
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
+                break;
+            /* These lengths go to the radix4by2 routine. */
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+        /* Neither switch has a default case, so any other fftLen reaches this point with pSrc untouched. */
+        /* A non-zero bitReverseFlag requests the bit-reversal pass over pSrc. */
+        if (bitReverseFlag)
+        {
+
             arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-       
-        } 
+
+        }
 }
 
 #else
diff --git a/Source/TransformFunctions/arm_cfft_q31.c b/Source/TransformFunctions/arm_cfft_q31.c
index 13c5d840..75e4e838 100644
--- a/Source/TransformFunctions/arm_cfft_q31.c
+++ b/Source/TransformFunctions/arm_cfft_q31.c
@@ -34,37 +34,6 @@
 
 #include "arm_vec_fft.h"
 
-static void arm_bitreversal_32_inpl_mve(
-        uint32_t *pSrc,
-  const uint16_t  bitRevLen,
-  const uint16_t *pBitRevTab)
-
-{
-    uint64_t       *src = (uint64_t *) pSrc;
-    uint32_t        blkCnt;     /* loop counters */
-    uint32x4_t      bitRevTabOff;
-    uint32x4_t      one = vdupq_n_u32(1);
-
-    blkCnt = (bitRevLen / 2) / 2;
-    while (blkCnt > 0U) {
-        bitRevTabOff = vldrhq_u32(pBitRevTab);
-        pBitRevTab += 4;
-
-        uint64x2_t      bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
-        uint64x2_t      bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
-
-        uint64x2_t      in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
-        uint64x2_t      in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
-
-        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
-        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
-
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-}
 
 static void _arm_radix4_butterfly_q31_mve(
     const arm_cfft_instance_q31 * S,
@@ -598,55 +567,55 @@ void arm_cfft_q31(
         q31_t * pSrc,
         uint8_t ifftFlag,
         uint8_t bitReverseFlag)
-{                                                                             
-        uint32_t fftLen = S->fftLen;     
-
-        if (ifftFlag == 1U) {                                                            
-                                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:                                                                   
-                _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen); 
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);              
-                break;                                                                   
-            }  
-        } else {                                                                         
-            switch (fftLen) {                                                            
-            case 16:                                                                     
-            case 64:                                                                     
-            case 256:                                                                    
-            case 1024:                                                                   
-            case 4096:    
-                _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);         
-                break;                                                                   
-                                                                                         
-            case 32:                                                                     
-            case 128:                                                                    
-            case 512:                                                                    
-            case 2048:                                                                   
-                arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);                      
-                break;                                                                   
-            }                                                                            
-        }                                                                                
-                                                                                         
-                                                                                         
-        if (bitReverseFlag) 
-        {                                                            
-            
+{
+        uint32_t fftLen = S->fftLen;
+        /* ifftFlag == 1U selects the inverse routines; any other value selects the forward ones. */
+        if (ifftFlag == 1U) {
+            /* These lengths go to the radix4 butterfly routine. */
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
+                break;
+            /* These lengths go to the radix4by2 routine. */
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
+                break;
+            }
+        } else {
+            switch (fftLen) {
+            case 16:
+            case 64:
+            case 256:
+            case 1024:
+            case 4096:
+                _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
+                break;
+            /* These lengths go to the radix4by2 routine. */
+            case 32:
+            case 128:
+            case 512:
+            case 2048:
+                arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
+                break;
+            }
+        }
+        /* Neither switch has a default case, so any other fftLen reaches this point with pSrc untouched. */
+        /* A non-zero bitReverseFlag requests the in-place bit-reversal pass over pSrc. */
+        if (bitReverseFlag)
+        {
+
             arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
-       
-        } 
+
+        }
 }
-#else 
+#else
 
 extern void arm_radix4_butterfly_q31(
         q31_t * pSrc,