diff --git a/PrivateInclude/arm_vec_fft.h b/PrivateInclude/arm_vec_fft.h index b372d5a6..b2f21049 100755 --- a/PrivateInclude/arm_vec_fft.h +++ b/PrivateInclude/arm_vec_fft.h @@ -40,8 +40,8 @@ extern "C" #define MVE_CMPLX_MULT_FLT_AxB(A,B) vcmlaq_rot90(vcmulq(A, B), A, B) #define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B) vcmlaq_rot270(vcmulq(A, B), A, B) -#define MVE_CMPLX_MULT_FX_AxB(A,B) vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B); -#define MVE_CMPLX_MULT_FX_AxConjB(A,B) vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B); +#define MVE_CMPLX_MULT_FX_AxB(A,B) vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B) +#define MVE_CMPLX_MULT_FX_AxConjB(A,B) vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B) #define MVE_CMPLX_ADD_FX_A_ixB(A, B) vhcaddq_rot90(A,B) #define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B) diff --git a/Source/TransformFunctions/arm_rfft_q15.c b/Source/TransformFunctions/arm_rfft_q15.c index 6d68bf5e..d2d34c16 100644 --- a/Source/TransformFunctions/arm_rfft_q15.c +++ b/Source/TransformFunctions/arm_rfft_q15.c @@ -90,7 +90,6 @@ void arm_rfft_q15( const arm_cfft_instance_q15 *S_CFFT = S->pCfft; #endif uint32_t L2 = S->fftLenReal >> 1U; - uint32_t i; /* Calculation of RIFFT of input */ if (S->ifftFlagR == 1U) @@ -101,10 +100,7 @@ void arm_rfft_q15( /* Complex IFFT process */ arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR); - for(i = 0; i < S->fftLenReal; i++) - { - pDst[i] = pDst[i] << 1U; - } + arm_shift_q15(pDst, 1, pDst, S->fftLenReal); } else { @@ -138,6 +134,10 @@ void arm_rfft_q15( */ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" + void arm_split_rfft_q15( q15_t * pSrc, uint32_t fftLen, @@ -146,101 +146,54 @@ void arm_split_rfft_q15( q15_t * pDst, uint32_t modifier) { - q15_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ - q15_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U - 14]; /* temp pointers for output buffer */ - q15_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U - 14]; /* temp pointers for input buffer */ - q15_t const *pVecSrc1; - q15_t *pVecDst1; - q15x8x2_t vecIn, vecSum; - uint32_t blkCnt; - uint16x8_t vecStridesFwd, vecStridesBkwd; - q15x8_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; - - /* - * Init coefficient pointers - */ - pCoefA = &pATable[modifier * 2U]; - pCoefB = &pBTable[modifier * 2U]; - /* - * scatter / gather offsets - * for ascending & descending addressing - */ - vecStridesFwd = vidupq_u16((uint32_t)0, 2); // 0, 2, 4, 6, 8, 10, 12, 14 - vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0 - vecStridesFwd = vecStridesFwd * (uint16_t) modifier; - - pVecSrc1 = (q15_t const *) pSrc1; - pVecDst1 = pDst1; - - blkCnt = fftLen >> 3; - while (blkCnt > 0U) - { - vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd); - vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); - vecIn = vld2q(pVecSrc1); - pVecSrc1 += 16; - /* - * outR = *pSrc1 * CoefA1; - */ - vecSum.val[0] = vrmulhq(vecIn.val[0], vecCoefFwd0); - /* - * outI = *pSrc1++ * CoefA2; - */ - vecSum.val[1] = vrmulhq(vecIn.val[0], vecCoefFwd1); - - vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outR -= (*pSrc1 + *pSrc2) * CoefA2; - */ - vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd); - vecSum.val[0] = vqsubq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd1)); - - vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outI += *pSrc1++ * CoefA1; - */ - vecSum.val[1] = vqaddq(vecSum.val[1], vrmulhq(vecIn.val[1], vecCoefFwd0)); - - vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd); - /* - * outI -= *pSrc2-- * CoefB1; - */ - vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd0)); - - vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd); - /* - * outI -= *pSrc2 * CoefA2; - */ - vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd1)); - /* - * outR += *pSrc2-- * CoefB1; - */ - vecSum.val[0] = vqaddq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd0)); - - vst2q(pVecDst1, vecSum); - pVecDst1 += 16; - /* - * write complex conjugate output - */ - vecSum.val[1] = -vecSum.val[1]; - vstrhq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]); - vstrhq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]); - /* - * update fwd and backwd offsets - */ - vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U); - /* cannot use negative 16-bit offsets (would lead to positive 32-65K jump*/ - //vecStridesBkwd = vecStridesBkwd - (uint16_t)16; - pSrc2 = pSrc2 - 16; - pDst2 = pDst2 - 16; - - blkCnt--; + uint32_t i; /* Loop Counter */ + const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q15_t *pOut1 = &pDst[2]; + q15_t *pIn1 = &pSrc[2]; + uint16x8_t offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 }; + uint16x8_t offsetCoef; + const uint16_t offsetCoefArr[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, + 0, 1, 0, 1, 0, 1, 0, 1 + }; + + offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8); + offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8)); + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + const q15_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[2]; + + i = fftLen - 1U; + i = i / 4 + 1; + while (i > 0U) { + q15x8_t in1 = vld1q(pIn1); + q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn); + q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef); + q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef); + + q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA), + MVE_CMPLX_MULT_FX_AxConjB(coefB, in2)); + + vst1q(pOut1, out); + pOut1 += 8; + + offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8); + offsetIn -= 8; + pIn1 += 8; + i -= 1; } - pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1; - pDst[(2U * fftLen) + 1U] = 0; + pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U; + pDst[2 * fftLen + 1] = 0; - pDst[0] = (pSrc[0] + pSrc[1]) >> 1; + pDst[0] = (pSrc[0] + pSrc[1]) >> 1U; pDst[1] = 0; } #else @@ -404,6 +357,9 @@ void arm_split_rfft_q15( #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" + void arm_split_rifft_q15( q15_t * pSrc, uint32_t fftLen, @@ -412,90 +368,52 @@ void arm_split_rifft_q15( q15_t * pDst, uint32_t modifier) { - q15_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ - q15_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U - 14U]; - q15_t *pDst1 = &pDst[0]; - q15_t const *pVecSrc1; - q15_t *pVecDst1; - q15x8x2_t vecIn, vecSum; - uint32_t blkCnt; - uint16x8_t vecStridesFwd, vecStridesBkwd; - q15x8_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; - - /* - * Init coefficient pointers - */ + uint32_t i; /* Loop Counter */ + const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q15_t *pIn1; + uint16x8_t offset = { 6, 7, 4, 5, 2, 3, 0, 1 }; + uint16x8_t offsetCoef; + int16x8_t conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */ + const uint16_t offsetCoefArr[16] = { + 0, 0, 2, 2, 4, 4, 6, 6, + 0, 1, 0, 1, 0, 1, 0, 1 + }; + + offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8); + + offset = vaddq_n_u16(offset, (2 * fftLen - 6)); + + /* Init coefficient pointers */ pCoefA = &pATable[0]; pCoefB = &pBTable[0]; - /* - * scatter / gather offsets - * for ascending & descending addressing - */ - vecStridesFwd = vidupq_u16((uint32_t)0, 2); // 0, 2, 4, 6, 8, 10, 12, 14 - vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0 - vecStridesFwd = vecStridesFwd * (uint16_t) modifier; + const q15_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; - pVecSrc1 = (q15_t const *) pSrc1; - pVecDst1 = pDst1; - - blkCnt = fftLen >> 3; - while (blkCnt > 0U) - { - vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd); - vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); - vecIn = vld2q(pVecSrc1); - pVecSrc1 += 16; - /* - * outR = *pSrc1 * CoefA1; - */ - vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0); - /* - * outI = -(*pSrc1++) * CoefA2; - */ - vecIn.val[0] = vnegq(vecIn.val[0]); - vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1); + pIn1 = &pSrc[0]; - vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outR += (*pSrc1 + *pSrc2) * CoefA2; - */ - vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd); - vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1)); + i = fftLen; + i = i / 4; - vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outI += *pSrc1++ * CoefA1; - */ - vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0)); + while (i > 0U) { + q15x8_t in1 = vld1q(pIn1); + q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset); + q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef); + q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef); - vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd); - /* - * outI -= *pSrc2-- * CoefB1; - */ - vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0)); + /* can we avoid the conjugate here ? */ + q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA), + vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB))); - vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd); - /* - * outI += *pSrc2 * CoefA2; - */ - vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1)); - /* - * outR += *pSrc2-- * CoefB1; - */ - vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0)); + vst1q(pDst, out); + pDst += 8; - vst2q(pVecDst1, vecSum); - pVecDst1 += 16; - /* - * update fwd and backwd offsets - */ - vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U); + offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8); + offset -= 8; - /* cannot use negative 16-bit offsets (would lead to positive 32-65K jump*/ - //vecStridesBkwd = vecStridesBkwd - (uint16_t)16; - pSrc2 = pSrc2 - 16; - blkCnt--; + pIn1 += 8; + i -= 1; } } #else diff --git a/Source/TransformFunctions/arm_rfft_q31.c b/Source/TransformFunctions/arm_rfft_q31.c index 190ac91a..a79d7e0a 100644 --- a/Source/TransformFunctions/arm_rfft_q31.c +++ b/Source/TransformFunctions/arm_rfft_q31.c @@ -91,7 +91,6 @@ void arm_rfft_q31( const arm_cfft_instance_q31 *S_CFFT = S->pCfft; #endif uint32_t L2 = S->fftLenReal >> 1U; - uint32_t i; /* Calculation of RIFFT of input */ if (S->ifftFlagR == 1U) @@ -102,10 +101,7 @@ void arm_rfft_q31( /* Complex IFFT process */ arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR); - for(i = 0; i < S->fftLenReal; i++) - { - pDst[i] = pDst[i] << 1U; - } + arm_shift_q31(pDst, 1, pDst, S->fftLenReal); } else { @@ -137,6 +133,9 @@ void arm_rfft_q31( #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_helium_utils.h" +#include "arm_vec_fft.h" + void arm_split_rfft_q31( q31_t *pSrc, uint32_t fftLen, @@ -145,98 +144,50 @@ void arm_split_rfft_q31( q31_t *pDst, uint32_t modifier) { - q31_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ - q31_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U]; /* temp pointers for output buffer */ - q31_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U]; /* temp pointers for input buffer */ - q31_t const *pVecSrc1; - q31_t *pVecDst1; - q31x4x2_t vecIn, vecSum; - uint32_t blkCnt; - uint32x4_t vecStridesFwd, vecStridesBkwd; - q31x4_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; - - /* - * Init coefficient pointers - */ - pCoefA = &pATable[modifier * 2U]; - pCoefB = &pBTable[modifier * 2U]; - /* - * scatter / gather offsets - * for ascending & descending addressing - */ - vecStridesFwd = vidupq_u32((uint32_t)0, 2); - vecStridesBkwd = -vecStridesFwd; - vecStridesFwd = vecStridesFwd * modifier; - - pVecSrc1 = (q31_t const *) pSrc1; - pVecDst1 = pDst1; - - blkCnt = fftLen >> 2; - while (blkCnt > 0U) - { - vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd); - vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); - vecIn = vld2q(pVecSrc1); - pVecSrc1 += 8; - /* - * outR = *pSrc1 * CoefA1; - */ - vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0); - /* - * outI = *pSrc1++ * CoefA2; - */ - vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1); - - vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outR -= (*pSrc1 + *pSrc2) * CoefA2; - */ - vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd); - vecSum.val[0] = vqsubq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1)); - - vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outI += *pSrc1++ * CoefA1; - */ - vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0)); - - vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd); - /* - * outI -= *pSrc2-- * CoefB1; - */ - vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0)); - - vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd); - /* - * outI -= *pSrc2 * CoefA2; - */ - vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1)); - /* - * outR += *pSrc2-- * CoefB1; - */ - vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0)); - - vst2q(pVecDst1, vecSum); - pVecDst1 += 8; - /* - * write complex conjugate output - */ - vecSum.val[1] = -vecSum.val[1]; - vstrwq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]); - vstrwq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]); - /* - * update fwd and backwd offsets - */ - vecStridesFwd = vecStridesFwd + (modifier * 8U); - vecStridesBkwd = vecStridesBkwd - 8; - - blkCnt--; + uint32_t i; /* Loop Counter */ + const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q31_t *pOut1 = &pDst[2]; + q31_t *pIn1 = &pSrc[2]; + uint32x4_t offset = { 2, 3, 0, 1 }; + uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 }; + + offset = offset + (2 * fftLen - 4); + + + /* Init coefficient pointers */ + pCoefA = &pATable[modifier * 2]; + pCoefB = &pBTable[modifier * 2]; + + const q31_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[2]; + + i = fftLen - 1U; + i = i / 2 + 1; + while (i > 0U) { + q31x4_t in1 = vld1q(pIn1); + q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset); + q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef); + q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef); + + q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2)); + + vst1q(pOut1, out); + pOut1 += 4; + + offsetCoef += modifier * 4; + offset -= 4; + + pIn1 += 4; + i -= 1; } - pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1; - pDst[(2U * fftLen) + 1U] = 0; + pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U; + pDst[2 * fftLen + 1] = 0; - pDst[0] = (pSrc[0] + pSrc[1]) >> 1; + pDst[0] = (pSrc[0] + pSrc[1]) >> 1U; pDst[1] = 0; } #else @@ -348,87 +299,45 @@ void arm_split_rifft_q31( q31_t * pDst, uint32_t modifier) { - q31_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ - q31_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U]; - q31_t const *pVecSrc1; - q31_t *pVecDst; - q31x4x2_t vecIn, vecSum; - uint32_t blkCnt; - uint32x4_t vecStridesFwd, vecStridesBkwd; - q31x4_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; - - - /* - * Init coefficient pointers - */ + uint32_t i; /* Loop Counter */ + const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ + q31_t *pIn1; + uint32x4_t offset = { 2, 3, 0, 1 }; + uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 }; + int32x4_t conj = { 1, -1, 1, -1 }; + + offset = offset + (2 * fftLen - 2); + + /* Init coefficient pointers */ pCoefA = &pATable[0]; pCoefB = &pBTable[0]; - /* - * scatter / gather offsets - * for ascending & descending addressing - */ - vecStridesFwd = vidupq_u32((uint32_t)0, 2); - vecStridesBkwd = -vecStridesFwd; - vecStridesFwd = vecStridesFwd * modifier; - - pVecSrc1 = (q31_t const *) pSrc1; - pVecDst = pDst; - - blkCnt = fftLen >> 2; - while (blkCnt > 0U) - { - vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd); - vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); - vecIn = vld2q(pVecSrc1); - pVecSrc1 += 8; - /* - * outR = *pSrc1 * CoefA1; - */ - vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0); - /* - * outI = -(*pSrc1++) * CoefA2; - */ - vecIn.val[0] = (-vecIn.val[0]); - vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1); - - vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outR += (*pSrc1 + *pSrc2) * CoefA2; - */ - vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd); - vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1)); - - vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd); - /* - * outI += *pSrc1++ * CoefA1; - */ - vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0)); - - vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd); - /* - * outI -= *pSrc2-- * CoefB1; - */ - vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0)); - - vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd); - /* - * outI += *pSrc2-- * CoefA2;; - */ - vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1)); - /* - * outR += *pSrc2-- * CoefB1; - */ - vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0)); - - vst2q(pVecDst, vecSum); - pVecDst += 8; - /* - * update fwd and backwd offsets - */ - vecStridesFwd = vecStridesFwd + (modifier * 8U); - vecStridesBkwd = vecStridesBkwd - 8; - - blkCnt--; + + const q31_t *pCoefAb, *pCoefBb; + pCoefAb = pCoefA; + pCoefBb = pCoefB; + + pIn1 = &pSrc[0]; + + i = fftLen; + i = i >> 1; + while (i > 0U) { + q31x4_t in1 = vld1q(pIn1); + q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset); + q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef); + q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef); + + /* can we avoid the conjugate here ? */ + q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA), + vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB))); + + vst1q(pDst, out); + pDst += 4; + + offsetCoef += modifier * 4; + offset -= 4; + + pIn1 += 4; + i -= 1; } } #else