CMSIS-DSP: Improvements to Helium implementation of RFFT Q15 and Q31.

pull/19/head
Christophe Favergeon 5 years ago
parent 49a4c065b8
commit 43f79ca92b

@ -40,8 +40,8 @@ extern "C"
#define MVE_CMPLX_MULT_FLT_AxB(A,B) vcmlaq_rot90(vcmulq(A, B), A, B) #define MVE_CMPLX_MULT_FLT_AxB(A,B) vcmlaq_rot90(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B) vcmlaq_rot270(vcmulq(A, B), A, B) #define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B) vcmlaq_rot270(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxB(A,B) vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B); #define MVE_CMPLX_MULT_FX_AxB(A,B) vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxConjB(A,B) vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B); #define MVE_CMPLX_MULT_FX_AxConjB(A,B) vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_ADD_FX_A_ixB(A, B) vhcaddq_rot90(A,B) #define MVE_CMPLX_ADD_FX_A_ixB(A, B) vhcaddq_rot90(A,B)
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B) #define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)

@ -90,7 +90,6 @@ void arm_rfft_q15(
const arm_cfft_instance_q15 *S_CFFT = S->pCfft; const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
#endif #endif
uint32_t L2 = S->fftLenReal >> 1U; uint32_t L2 = S->fftLenReal >> 1U;
uint32_t i;
/* Calculation of RIFFT of input */ /* Calculation of RIFFT of input */
if (S->ifftFlagR == 1U) if (S->ifftFlagR == 1U)
@ -101,10 +100,7 @@ void arm_rfft_q15(
/* Complex IFFT process */ /* Complex IFFT process */
arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR); arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
for(i = 0; i < S->fftLenReal; i++) arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
{
pDst[i] = pDst[i] << 1U;
}
} }
else else
{ {
@ -138,6 +134,10 @@ void arm_rfft_q15(
*/ */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
void arm_split_rfft_q15( void arm_split_rfft_q15(
q15_t * pSrc, q15_t * pSrc,
uint32_t fftLen, uint32_t fftLen,
@ -146,101 +146,54 @@ void arm_split_rfft_q15(
q15_t * pDst, q15_t * pDst,
uint32_t modifier) uint32_t modifier)
{ {
q15_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ uint32_t i; /* Loop Counter */
q15_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U - 14]; /* temp pointers for output buffer */ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q15_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U - 14]; /* temp pointers for input buffer */ q15_t *pOut1 = &pDst[2];
q15_t const *pVecSrc1; q15_t *pIn1 = &pSrc[2];
q15_t *pVecDst1; uint16x8_t offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
q15x8x2_t vecIn, vecSum; uint16x8_t offsetCoef;
uint32_t blkCnt; const uint16_t offsetCoefArr[16] = {
uint16x8_t vecStridesFwd, vecStridesBkwd; 0, 0, 2, 2, 4, 4, 6, 6,
q15x8_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; 0, 1, 0, 1, 0, 1, 0, 1
};
/*
* Init coefficient pointers offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8);
*/ offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
pCoefA = &pATable[modifier * 2U];
pCoefB = &pBTable[modifier * 2U]; /* Init coefficient pointers */
/* pCoefA = &pATable[modifier * 2];
* scatter / gather offsets pCoefB = &pBTable[modifier * 2];
* for ascending & descending addressing
*/ const q15_t *pCoefAb, *pCoefBb;
vecStridesFwd = vidupq_u16((uint32_t)0, 2); // 0, 2, 4, 6, 8, 10, 12, 14 pCoefAb = pCoefA;
vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0 pCoefBb = pCoefB;
vecStridesFwd = vecStridesFwd * (uint16_t) modifier;
pIn1 = &pSrc[2];
pVecSrc1 = (q15_t const *) pSrc1;
pVecDst1 = pDst1; i = fftLen - 1U;
i = i / 4 + 1;
blkCnt = fftLen >> 3; while (i > 0U) {
while (blkCnt > 0U) q15x8_t in1 = vld1q(pIn1);
{ q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd); q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
vecIn = vld2q(pVecSrc1);
pVecSrc1 += 16; q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA),
/* MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
* outR = *pSrc1 * CoefA1;
*/ vst1q(pOut1, out);
vecSum.val[0] = vrmulhq(vecIn.val[0], vecCoefFwd0); pOut1 += 8;
/*
* outI = *pSrc1++ * CoefA2; offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
*/ offsetIn -= 8;
vecSum.val[1] = vrmulhq(vecIn.val[0], vecCoefFwd1); pIn1 += 8;
i -= 1;
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR -= (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqsubq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vrmulhq(vecIn.val[1], vecCoefFwd0));
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI -= *pSrc2 * CoefA2;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst1, vecSum);
pVecDst1 += 16;
/*
* write complex conjugate output
*/
vecSum.val[1] = -vecSum.val[1];
vstrhq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
vstrhq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
/* cannot use negative 16-bit offsets (would lead to positive 32-65K jump*/
//vecStridesBkwd = vecStridesBkwd - (uint16_t)16;
pSrc2 = pSrc2 - 16;
pDst2 = pDst2 - 16;
blkCnt--;
} }
pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1; pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
pDst[(2U * fftLen) + 1U] = 0; pDst[2 * fftLen + 1] = 0;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1; pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
pDst[1] = 0; pDst[1] = 0;
} }
#else #else
@ -404,6 +357,9 @@ void arm_split_rfft_q15(
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
void arm_split_rifft_q15( void arm_split_rifft_q15(
q15_t * pSrc, q15_t * pSrc,
uint32_t fftLen, uint32_t fftLen,
@ -412,90 +368,52 @@ void arm_split_rifft_q15(
q15_t * pDst, q15_t * pDst,
uint32_t modifier) uint32_t modifier)
{ {
q15_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ uint32_t i; /* Loop Counter */
q15_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U - 14U]; const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q15_t *pDst1 = &pDst[0]; q15_t *pIn1;
q15_t const *pVecSrc1; uint16x8_t offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
q15_t *pVecDst1; uint16x8_t offsetCoef;
q15x8x2_t vecIn, vecSum; int16x8_t conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
uint32_t blkCnt; const uint16_t offsetCoefArr[16] = {
uint16x8_t vecStridesFwd, vecStridesBkwd; 0, 0, 2, 2, 4, 4, 6, 6,
q15x8_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; 0, 1, 0, 1, 0, 1, 0, 1
};
/*
* Init coefficient pointers offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8);
*/
offset = vaddq_n_u16(offset, (2 * fftLen - 6));
/* Init coefficient pointers */
pCoefA = &pATable[0]; pCoefA = &pATable[0];
pCoefB = &pBTable[0]; pCoefB = &pBTable[0];
/*
* scatter / gather offsets
* for ascending & descending addressing
*/
vecStridesFwd = vidupq_u16((uint32_t)0, 2); // 0, 2, 4, 6, 8, 10, 12, 14
vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0
vecStridesFwd = vecStridesFwd * (uint16_t) modifier;
const q15_t *pCoefAb, *pCoefBb;
pCoefAb = pCoefA;
pCoefBb = pCoefB;
pVecSrc1 = (q15_t const *) pSrc1; pIn1 = &pSrc[0];
pVecDst1 = pDst1;
blkCnt = fftLen >> 3;
while (blkCnt > 0U)
{
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
vecIn = vld2q(pVecSrc1);
pVecSrc1 += 16;
/*
* outR = *pSrc1 * CoefA1;
*/
vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
/*
* outI = -(*pSrc1++) * CoefA2;
*/
vecIn.val[0] = vnegq(vecIn.val[0]);
vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd); i = fftLen;
/* i = i / 4;
* outR += (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd); while (i > 0U) {
/* q15x8_t in1 = vld1q(pIn1);
* outI += *pSrc1++ * CoefA1; q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
*/ q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0)); q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd); /* can we avoid the conjugate here ? */
/* q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
* outI -= *pSrc2-- * CoefB1; vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd); vst1q(pDst, out);
/* pDst += 8;
* outI += *pSrc2 * CoefA2;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst1, vecSum); offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
pVecDst1 += 16; offset -= 8;
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
/* cannot use negative 16-bit offsets (would lead to positive 32-65K jump*/ pIn1 += 8;
//vecStridesBkwd = vecStridesBkwd - (uint16_t)16; i -= 1;
pSrc2 = pSrc2 - 16;
blkCnt--;
} }
} }
#else #else

@ -91,7 +91,6 @@ void arm_rfft_q31(
const arm_cfft_instance_q31 *S_CFFT = S->pCfft; const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
#endif #endif
uint32_t L2 = S->fftLenReal >> 1U; uint32_t L2 = S->fftLenReal >> 1U;
uint32_t i;
/* Calculation of RIFFT of input */ /* Calculation of RIFFT of input */
if (S->ifftFlagR == 1U) if (S->ifftFlagR == 1U)
@ -102,10 +101,7 @@ void arm_rfft_q31(
/* Complex IFFT process */ /* Complex IFFT process */
arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR); arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
for(i = 0; i < S->fftLenReal; i++) arm_shift_q31(pDst, 1, pDst, S->fftLenReal);
{
pDst[i] = pDst[i] << 1U;
}
} }
else else
{ {
@ -137,6 +133,9 @@ void arm_rfft_q31(
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
void arm_split_rfft_q31( void arm_split_rfft_q31(
q31_t *pSrc, q31_t *pSrc,
uint32_t fftLen, uint32_t fftLen,
@ -145,98 +144,50 @@ void arm_split_rfft_q31(
q31_t *pDst, q31_t *pDst,
uint32_t modifier) uint32_t modifier)
{ {
q31_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ uint32_t i; /* Loop Counter */
q31_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U]; /* temp pointers for output buffer */ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q31_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U]; /* temp pointers for input buffer */ q31_t *pOut1 = &pDst[2];
q31_t const *pVecSrc1; q31_t *pIn1 = &pSrc[2];
q31_t *pVecDst1; uint32x4_t offset = { 2, 3, 0, 1 };
q31x4x2_t vecIn, vecSum; uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
uint32_t blkCnt;
uint32x4_t vecStridesFwd, vecStridesBkwd; offset = offset + (2 * fftLen - 4);
q31x4_t vecInBkwd, vecCoefFwd0, vecCoefFwd1;
/* /* Init coefficient pointers */
* Init coefficient pointers pCoefA = &pATable[modifier * 2];
*/ pCoefB = &pBTable[modifier * 2];
pCoefA = &pATable[modifier * 2U];
pCoefB = &pBTable[modifier * 2U]; const q31_t *pCoefAb, *pCoefBb;
/* pCoefAb = pCoefA;
* scatter / gather offsets pCoefBb = pCoefB;
* for ascending & descending addressing
*/ pIn1 = &pSrc[2];
vecStridesFwd = vidupq_u32((uint32_t)0, 2);
vecStridesBkwd = -vecStridesFwd; i = fftLen - 1U;
vecStridesFwd = vecStridesFwd * modifier; i = i / 2 + 1;
while (i > 0U) {
pVecSrc1 = (q31_t const *) pSrc1; q31x4_t in1 = vld1q(pIn1);
pVecDst1 = pDst1; q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
blkCnt = fftLen >> 2; q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
while (blkCnt > 0U)
{ q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); vst1q(pOut1, out);
vecIn = vld2q(pVecSrc1); pOut1 += 4;
pVecSrc1 += 8;
/* offsetCoef += modifier * 4;
* outR = *pSrc1 * CoefA1; offset -= 4;
*/
vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0); pIn1 += 4;
/* i -= 1;
* outI = *pSrc1++ * CoefA2;
*/
vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR -= (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqsubq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI -= *pSrc2 * CoefA2;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst1, vecSum);
pVecDst1 += 8;
/*
* write complex conjugate output
*/
vecSum.val[1] = -vecSum.val[1];
vstrwq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
vstrwq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (modifier * 8U);
vecStridesBkwd = vecStridesBkwd - 8;
blkCnt--;
} }
pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1; pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
pDst[(2U * fftLen) + 1U] = 0; pDst[2 * fftLen + 1] = 0;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1; pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
pDst[1] = 0; pDst[1] = 0;
} }
#else #else
@ -348,87 +299,45 @@ void arm_split_rifft_q31(
q31_t * pDst, q31_t * pDst,
uint32_t modifier) uint32_t modifier)
{ {
q31_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */ uint32_t i; /* Loop Counter */
q31_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U]; const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q31_t const *pVecSrc1; q31_t *pIn1;
q31_t *pVecDst; uint32x4_t offset = { 2, 3, 0, 1 };
q31x4x2_t vecIn, vecSum; uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
uint32_t blkCnt; int32x4_t conj = { 1, -1, 1, -1 };
uint32x4_t vecStridesFwd, vecStridesBkwd;
q31x4_t vecInBkwd, vecCoefFwd0, vecCoefFwd1; offset = offset + (2 * fftLen - 2);
/* Init coefficient pointers */
/*
* Init coefficient pointers
*/
pCoefA = &pATable[0]; pCoefA = &pATable[0];
pCoefB = &pBTable[0]; pCoefB = &pBTable[0];
/*
* scatter / gather offsets const q31_t *pCoefAb, *pCoefBb;
* for ascending & descending addressing pCoefAb = pCoefA;
*/ pCoefBb = pCoefB;
vecStridesFwd = vidupq_u32((uint32_t)0, 2);
vecStridesBkwd = -vecStridesFwd; pIn1 = &pSrc[0];
vecStridesFwd = vecStridesFwd * modifier;
i = fftLen;
pVecSrc1 = (q31_t const *) pSrc1; i = i >> 1;
pVecDst = pDst; while (i > 0U) {
q31x4_t in1 = vld1q(pIn1);
blkCnt = fftLen >> 2; q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
while (blkCnt > 0U) q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
{ q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd); /* can we avoid the conjugate here ? */
vecIn = vld2q(pVecSrc1); q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
pVecSrc1 += 8; vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
/*
* outR = *pSrc1 * CoefA1; vst1q(pDst, out);
*/ pDst += 4;
vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
/* offsetCoef += modifier * 4;
* outI = -(*pSrc1++) * CoefA2; offset -= 4;
*/
vecIn.val[0] = (-vecIn.val[0]); pIn1 += 4;
vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1); i -= 1;
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR += (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI += *pSrc2-- * CoefA2;;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst, vecSum);
pVecDst += 8;
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (modifier * 8U);
vecStridesBkwd = vecStridesBkwd - 8;
blkCnt--;
} }
} }
#else #else

Loading…
Cancel
Save