From 266d88ba4a6aa9aeb5af1c297acce69accc9ad20 Mon Sep 17 00:00:00 2001
From: Christophe Favergeon <Christophe.Favergeon@arm.com>
Date: Wed, 15 Jan 2020 11:26:16 +0100
Subject: [PATCH] CMSIS-DSP: Added MVE version of stage_rfft_f32

---
 Source/TransformFunctions/arm_cfft_q15.c      |   2 +-
 Source/TransformFunctions/arm_rfft_fast_f32.c | 225 +++++++++++++++++-
 Source/TransformFunctions/arm_rfft_q15.c      |   2 +-
 Source/TransformFunctions/arm_rfft_q31.c      |   2 +-
 4 files changed, 227 insertions(+), 4 deletions(-)

diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c
index 1fa9abb0..a25fc824 100644
--- a/Source/TransformFunctions/arm_cfft_q15.c
+++ b/Source/TransformFunctions/arm_cfft_q15.c
@@ -946,4 +946,4 @@ void arm_cfft_radix4by2_inverse_q15(
   }
 }
 
-#endif /* defined(ARM_MATH_MVEI) */
\ No newline at end of file
+#endif /* defined(ARM_MATH_MVEI) */
diff --git a/Source/TransformFunctions/arm_rfft_fast_f32.c b/Source/TransformFunctions/arm_rfft_fast_f32.c
index e2be2a5d..f7e79f90 100644
--- a/Source/TransformFunctions/arm_rfft_fast_f32.c
+++ b/Source/TransformFunctions/arm_rfft_fast_f32.c
@@ -28,6 +28,7 @@
 
 #include "arm_math.h"
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 void stage_rfft_f32(
   const arm_rfft_fast_instance_f32 * S,
         float32_t * p,
@@ -42,6 +43,16 @@ void stage_rfft_f32(
         float32_t t1a, t1b;                         /* temporary variables */
         float32_t p0, p1, p2, p3;                   /* temporary variables */
 
+        float32x4x2_t tw,xA,xB;
+        float32x4x2_t tmp1, tmp2, res;
+
+        uint32x4_t     vecStridesFwd, vecStridesBkwd;
+
+        vecStridesFwd = vidupq_u32(0, 2);
+        vecStridesBkwd = -vecStridesFwd;
+
+        int blockCnt;
+
 
    k = (S->Sint).fftLen - 1;
 
@@ -55,6 +66,216 @@ void stage_rfft_f32(
    twR = *pCoeff++ ;
    twI = *pCoeff++ ;
 
+   // U1 = XA(1) + XB(1); % It is real
+   t1a = xBR + xAR  ;
+
+   // U2 = XB(1) - XA(1); % It is imaginary
+   t1b = xBI + xAI  ;
+
+   // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+   // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+   *pOut++ = 0.5f * ( t1a + t1b );
+   *pOut++ = 0.5f * ( t1a - t1b );
+
+   // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
+   pB  = p + 2*k;
+   pA += 2;
+
+   blockCnt = k >> 2;
+   while (blockCnt > 0)
+   {
+      /*
+         function X = my_split_rfft(X, ifftFlag)
+         % X is a series of real numbers
+         L  = length(X);
+         XC = X(1:2:end) +i*X(2:2:end);
+         XA = fft(XC);
+         XB = conj(XA([1 end:-1:2]));
+         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+         for l = 2:L/2
+            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+         end
+         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+         X = XA;
+      */
+
+
+      xA = vld2q_f32(pA);
+      pA += 8;
+
+      xB = vld2q_f32(pB);
+
+      xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
+      xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
+
+      xB.val[1] = vnegq_f32(xB.val[1]);
+      pB -= 8;
+
+
+      tw = vld2q_f32(pCoeff);
+      pCoeff += 8;
+
+
+      tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
+      tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
+
+      tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
+      tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
+
+      res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
+      res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+      res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+      res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+      res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
+      res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
+
+      res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
+      res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
+
+
+      vst2q_f32(pOut, res);
+      pOut += 8;
+
+    
+      blockCnt--;
+   } 
+
+   blockCnt = k & 3;
+   while (blockCnt > 0)
+   {
+      /*
+         function X = my_split_rfft(X, ifftFlag)
+         % X is a series of real numbers
+         L  = length(X);
+         XC = X(1:2:end) +i*X(2:2:end);
+         XA = fft(XC);
+         XB = conj(XA([1 end:-1:2]));
+         TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+         for l = 2:L/2
+            XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+         end
+         XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+         X = XA;
+      */
+
+      xBI = pB[1];
+      xBR = pB[0];
+      xAR = pA[0];
+      xAI = pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xBR - xAR ;
+      t1b = xBI + xAI ;
+
+      // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+      // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+      p0 = twR * t1a;
+      p1 = twI * t1a;
+      p2 = twR * t1b;
+      p3 = twI * t1b;
+
+      *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+      pA += 2;
+      pB -= 2;
+      blockCnt--;
+   }
+}
+
+/* Prepares data for inverse cfft */
+void merge_rfft_f32(
+  const arm_rfft_fast_instance_f32 * S,
+        float32_t * p,
+        float32_t * pOut)
+{
+        uint32_t  k;                                /* Loop Counter */
+        float32_t twR, twI;                         /* RFFT Twiddle coefficients */
+  const float32_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
+        float32_t *pA = p;                          /* increasing pointer */
+        float32_t *pB = p;                          /* decreasing pointer */
+        float32_t xAR, xAI, xBR, xBI;               /* temporary variables */
+        float32_t t1a, t1b, r, s, t, u;             /* temporary variables */
+        
+
+   k = (S->Sint).fftLen - 1;
+
+   xAR = pA[0];
+   xAI = pA[1];
+
+   pCoeff += 2 ;
+
+   *pOut++ = 0.5f * ( xAR + xAI );
+   *pOut++ = 0.5f * ( xAR - xAI );
+
+   pB  =  p + 2*k ;
+   pA +=  2    ;
+
+   while (k > 0U)
+   {
+      /* G is half of the frequency complex spectrum */
+      //for k = 2:N
+      //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+      xBI =   pB[1]    ;
+      xBR =   pB[0]    ;
+      xAR =  pA[0];
+      xAI =  pA[1];
+
+      twR = *pCoeff++;
+      twI = *pCoeff++;
+
+      t1a = xAR - xBR ;
+      t1b = xAI + xBI ;
+
+      r = twR * t1a;
+      s = twI * t1b;
+      t = twI * t1a;
+      u = twR * t1b;
+
+      // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+      // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+      *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+      *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+      pA += 2;
+      pB -= 2;
+      k--;
+   }
+
+}
+#else
+void stage_rfft_f32(
+  const arm_rfft_fast_instance_f32 * S,
+        float32_t * p,
+        float32_t * pOut)
+{
+        uint32_t  k;                                /* Loop Counter */
+        float32_t twR, twI;                         /* RFFT Twiddle coefficients */
+  const float32_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
+        float32_t *pA = p;                          /* increasing pointer */
+        float32_t *pB = p;                          /* decreasing pointer */
+        float32_t xAR, xAI, xBR, xBI;               /* temporary variables */
+        float32_t t1a, t1b;                         /* temporary variables */
+        float32_t p0, p1, p2, p3;                   /* temporary variables */
+
+
+   k = (S->Sint).fftLen - 1;
+
+   /* Pack first and last sample of the frequency domain together */
+
+   xBR = pB[0];
+   xBI = pB[1];
+   xAR = pA[0];
+   xAI = pA[1];
+
+   twR = *pCoeff++ ;
+   twI = *pCoeff++ ;
+
+
    // U1 = XA(1) + XB(1); % It is real
    t1a = xBR + xAR  ;
 
@@ -108,6 +329,7 @@ void stage_rfft_f32(
       *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
       *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
 
+
       pA += 2;
       pB -= 2;
       k--;
@@ -174,6 +396,8 @@ void merge_rfft_f32(
 
 }
 
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
 /**
   @ingroup groupTransforms
 */
@@ -303,7 +527,6 @@ void arm_rfft_fast_f32(
    {
       /*  Real FFT compression */
       merge_rfft_f32(S, p, pOut);
-
       /* Complex radix-4 IFFT process */
       arm_cfft_f32( Sint, pOut, ifftFlag, 1);
    }
diff --git a/Source/TransformFunctions/arm_rfft_q15.c b/Source/TransformFunctions/arm_rfft_q15.c
index 8aa24355..ca4c9d45 100644
--- a/Source/TransformFunctions/arm_rfft_q15.c
+++ b/Source/TransformFunctions/arm_rfft_q15.c
@@ -586,4 +586,4 @@ void arm_split_rifft_q15(
   }
 
 }
-#endif /* defined(ARM_MATH_MVEI) */
\ No newline at end of file
+#endif /* defined(ARM_MATH_MVEI) */
diff --git a/Source/TransformFunctions/arm_rfft_q31.c b/Source/TransformFunctions/arm_rfft_q31.c
index 0b8d7f9a..9ba0d521 100644
--- a/Source/TransformFunctions/arm_rfft_q31.c
+++ b/Source/TransformFunctions/arm_rfft_q31.c
@@ -494,4 +494,4 @@ void arm_split_rifft_q31(
 
 }
 
-#endif /* defined(ARM_MATH_MVEI) */
\ No newline at end of file
+#endif /* defined(ARM_MATH_MVEI) */