CMSIS-DSP: Improvements to the Helium implementation of RFFT Q15 and Q31.

Branch: pull/19/head
Author: Christophe Favergeon, 5 years ago
Parent: 49a4c065b8
Commit: 43f79ca92b

@ -40,8 +40,8 @@ extern "C"
#define MVE_CMPLX_MULT_FLT_AxB(A,B) vcmlaq_rot90(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FLT_Conj_AxB(A,B) vcmlaq_rot270(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxB(A,B) vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B);
#define MVE_CMPLX_MULT_FX_AxConjB(A,B) vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B);
#define MVE_CMPLX_MULT_FX_AxB(A,B) vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxConjB(A,B) vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_ADD_FX_A_ixB(A, B) vhcaddq_rot90(A,B)
#define MVE_CMPLX_SUB_FX_A_ixB(A,B) vhcaddq_rot270(A,B)
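The two fixed-point macros lose their trailing semicolons so that they can be used inside expressions, which the new split kernels below rely on when passing them straight to vhaddq/vmulq. For reference, a hedged scalar model of the Q15 complex product the AxB form vectorizes (the helper name and the plain >> 15 rounding are illustrative, not part of this patch):

/* Scalar sketch of one Q15 complex product, operands interleaved as
   {re, im}. The MVE macros use saturating doubling multiplies that keep
   the high halves, which matches the >> 15 below up to rounding and
   saturation details. */
static inline void cmplx_mult_q15_model(const q15_t a[2], const q15_t b[2], q15_t out[2])
{
    out[0] = (q15_t) (((q31_t) a[0] * b[0] - (q31_t) a[1] * b[1]) >> 15);   /* real      */
    out[1] = (q15_t) (((q31_t) a[0] * b[1] + (q31_t) a[1] * b[0]) >> 15);   /* imaginary */
}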

@ -90,7 +90,6 @@ void arm_rfft_q15(
const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
#endif
uint32_t L2 = S->fftLenReal >> 1U;
uint32_t i;
/* Calculation of RIFFT of input */
if (S->ifftFlagR == 1U)
@ -101,10 +100,7 @@ void arm_rfft_q15(
/* Complex IFFT process */
arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
for(i = 0; i < S->fftLenReal; i++)
{
pDst[i] = pDst[i] << 1U;
}
arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
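/* Note: the scalar doubling loop above is the code being removed and this
   arm_shift_q15 call is its replacement. arm_shift_q15 saturates on
   overflow (the plain << 1 wraps) and has a vectorized Helium build;
   the Q31 hunk further down gets the analogous arm_shift_q31 change. */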
}
else
{
@ -138,6 +134,10 @@ void arm_rfft_q15(
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
void arm_split_rfft_q15(
q15_t * pSrc,
uint32_t fftLen,
@ -146,101 +146,54 @@ void arm_split_rfft_q15(
q15_t * pDst,
uint32_t modifier)
{
q15_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q15_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U - 14]; /* temp pointers for output buffer */
q15_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U - 14]; /* temp pointers for input buffer */
q15_t const *pVecSrc1;
q15_t *pVecDst1;
q15x8x2_t vecIn, vecSum;
uint32_t blkCnt;
uint16x8_t vecStridesFwd, vecStridesBkwd;
q15x8_t vecInBkwd, vecCoefFwd0, vecCoefFwd1;
/*
* Init coefficient pointers
*/
pCoefA = &pATable[modifier * 2U];
pCoefB = &pBTable[modifier * 2U];
/*
* scatter / gather offsets
* for ascending & descending addressing
*/
vecStridesFwd = vidupq_u16((uint32_t)0, 2); // 0, 2, 4, 6, 8, 10, 12, 14
vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0
vecStridesFwd = vecStridesFwd * (uint16_t) modifier;
pVecSrc1 = (q15_t const *) pSrc1;
pVecDst1 = pDst1;
blkCnt = fftLen >> 3;
while (blkCnt > 0U)
{
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
vecIn = vld2q(pVecSrc1);
pVecSrc1 += 16;
/*
* outR = *pSrc1 * CoefA1;
*/
vecSum.val[0] = vrmulhq(vecIn.val[0], vecCoefFwd0);
/*
* outI = *pSrc1++ * CoefA2;
*/
vecSum.val[1] = vrmulhq(vecIn.val[0], vecCoefFwd1);
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR -= (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqsubq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vrmulhq(vecIn.val[1], vecCoefFwd0));
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI -= *pSrc2 * CoefA2;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst1, vecSum);
pVecDst1 += 16;
/*
* write complex conjugate output
*/
vecSum.val[1] = -vecSum.val[1];
vstrhq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
vstrhq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
/* cannot use negative 16-bit offsets (would lead to a positive 32-65K jump) */
//vecStridesBkwd = vecStridesBkwd - (uint16_t)16;
pSrc2 = pSrc2 - 16;
pDst2 = pDst2 - 16;
blkCnt--;
uint32_t i; /* Loop Counter */
const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q15_t *pOut1 = &pDst[2];
q15_t *pIn1 = &pSrc[2];
uint16x8_t offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
uint16x8_t offsetCoef;
const uint16_t offsetCoefArr[16] = {
0, 0, 2, 2, 4, 4, 6, 6,
0, 1, 0, 1, 0, 1, 0, 1
};
offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8);
offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
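/* Gather patterns for the loop below: after the bias above, offsetIn reads
   pSrc[2*fftLen-8 .. 2*fftLen-1] as the complex samples with indices
   fftLen-1 down to fftLen-4 (descending bin order, re/im order kept),
   and offsetCoef evaluates to { 0, 1, 2m, 2m+1, 4m, 4m+1, 6m, 6m+1 }
   with m = modifier, i.e. four consecutive twiddle pairs of the A/B
   tables sampled every `modifier` complex entries. */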
/* Init coefficient pointers */
pCoefA = &pATable[modifier * 2];
pCoefB = &pBTable[modifier * 2];
const q15_t *pCoefAb, *pCoefBb;
pCoefAb = pCoefA;
pCoefBb = pCoefB;
pIn1 = &pSrc[2];
i = fftLen - 1U;
i = i / 4 + 1;
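/* (fftLen - 1) output bins k = 1 .. fftLen-1 are needed and the loop
   below produces four complex bins per iteration, so the count is
   rounded up; the extra bin written by the last iteration lands at
   pDst[2*fftLen .. 2*fftLen+1] and is overwritten by the scalar
   fix-ups after the loop. */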
while (i > 0U) {
q15x8_t in1 = vld1q(pIn1);
q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA),
MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
vst1q(pOut1, out);
pOut1 += 8;
offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
offsetIn -= 8;
pIn1 += 8;
i -= 1;
}
pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
pDst[(2U * fftLen) + 1U] = 0;
pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
pDst[2 * fftLen + 1] = 0;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
pDst[1] = 0;
}
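For reference, a hedged scalar model of one output bin produced by the loop above (names are illustrative: in1 is the forward sample, in2 its conjugate-symmetric partner gathered through offsetIn, cA/cB the gathered twiddles; the final halving add mirrors vhaddq, and >> 15 stands in for the doubling-high-half multiplies):

/* One forward-split output bin, Q15: out = (cA * in1 + cB * conj(in2)) / 2 */
static void split_rfft_q15_bin_model(const q15_t in1[2], const q15_t in2[2],
                                     const q15_t cA[2], const q15_t cB[2],
                                     q15_t out[2])
{
    /* cA * in1 */
    q31_t aR = ((q31_t) cA[0] * in1[0] - (q31_t) cA[1] * in1[1]) >> 15;
    q31_t aI = ((q31_t) cA[0] * in1[1] + (q31_t) cA[1] * in1[0]) >> 15;
    /* cB * conj(in2) */
    q31_t bR = ((q31_t) cB[0] * in2[0] + (q31_t) cB[1] * in2[1]) >> 15;
    q31_t bI = ((q31_t) cB[1] * in2[0] - (q31_t) cB[0] * in2[1]) >> 15;
    /* halving add, as vhaddq does lane-wise */
    out[0] = (q15_t) ((aR + bR) >> 1);
    out[1] = (q15_t) ((aI + bI) >> 1);
}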
#else
@ -404,6 +357,9 @@ void arm_split_rfft_q15(
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
void arm_split_rifft_q15(
q15_t * pSrc,
uint32_t fftLen,
@ -412,90 +368,52 @@ void arm_split_rifft_q15(
q15_t * pDst,
uint32_t modifier)
{
q15_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q15_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U - 14U];
q15_t *pDst1 = &pDst[0];
q15_t const *pVecSrc1;
q15_t *pVecDst1;
q15x8x2_t vecIn, vecSum;
uint32_t blkCnt;
uint16x8_t vecStridesFwd, vecStridesBkwd;
q15x8_t vecInBkwd, vecCoefFwd0, vecCoefFwd1;
/*
* Init coefficient pointers
*/
uint32_t i; /* Loop Counter */
const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q15_t *pIn1;
uint16x8_t offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
uint16x8_t offsetCoef;
int16x8_t conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
const uint16_t offsetCoefArr[16] = {
0, 0, 2, 2, 4, 4, 6, 6,
0, 1, 0, 1, 0, 1, 0, 1
};
offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8);
offset = vaddq_n_u16(offset, (2 * fftLen - 6));
/* Init coefficient pointers */
pCoefA = &pATable[0];
pCoefB = &pBTable[0];
/*
* scatter / gather offsets
* for ascending & descending addressing
*/
vecStridesFwd = vidupq_u16((uint32_t)0, 2); // 0, 2, 4, 6, 8, 10, 12, 14
vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0
vecStridesFwd = vecStridesFwd * (uint16_t) modifier;
const q15_t *pCoefAb, *pCoefBb;
pCoefAb = pCoefA;
pCoefBb = pCoefB;
pVecSrc1 = (q15_t const *) pSrc1;
pVecDst1 = pDst1;
blkCnt = fftLen >> 3;
while (blkCnt > 0U)
{
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
vecIn = vld2q(pVecSrc1);
pVecSrc1 += 16;
/*
* outR = *pSrc1 * CoefA1;
*/
vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
/*
* outI = -(*pSrc1++) * CoefA2;
*/
vecIn.val[0] = vnegq(vecIn.val[0]);
vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
pIn1 = &pSrc[0];
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR += (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
i = fftLen;
i = i / 4;
vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
while (i > 0U) {
q15x8_t in1 = vld1q(pIn1);
q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
/* can we avoid the conjugate here ? */
q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
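/* Multiplying by `conj` ({1,-1,1,-1,...}) negates the imaginary lanes,
   i.e. it conjugates the interleaved complex values, so the halving add
   combines in1 * conj(coefA) with conj(in2 * coefB). */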
vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI += *pSrc2 * CoefA2;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
vst1q(pDst, out);
pDst += 8;
vst2q(pVecDst1, vecSum);
pVecDst1 += 16;
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
offset -= 8;
/* cannot use negative 16-bit offsets (would lead to a positive 32-65K jump) */
//vecStridesBkwd = vecStridesBkwd - (uint16_t)16;
pSrc2 = pSrc2 - 16;
blkCnt--;
pIn1 += 8;
i -= 1;
}
}
#else

@ -91,7 +91,6 @@ void arm_rfft_q31(
const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
#endif
uint32_t L2 = S->fftLenReal >> 1U;
uint32_t i;
/* Calculation of RIFFT of input */
if (S->ifftFlagR == 1U)
@ -102,10 +101,7 @@ void arm_rfft_q31(
/* Complex IFFT process */
arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
for(i = 0; i < S->fftLenReal; i++)
{
pDst[i] = pDst[i] << 1U;
}
arm_shift_q31(pDst, 1, pDst, S->fftLenReal);
}
else
{
@ -137,6 +133,9 @@ void arm_rfft_q31(
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
void arm_split_rfft_q31(
q31_t *pSrc,
uint32_t fftLen,
@ -145,98 +144,50 @@ void arm_split_rfft_q31(
q31_t *pDst,
uint32_t modifier)
{
q31_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q31_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U]; /* temp pointers for output buffer */
q31_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U]; /* temp pointers for input buffer */
q31_t const *pVecSrc1;
q31_t *pVecDst1;
q31x4x2_t vecIn, vecSum;
uint32_t blkCnt;
uint32x4_t vecStridesFwd, vecStridesBkwd;
q31x4_t vecInBkwd, vecCoefFwd0, vecCoefFwd1;
/*
* Init coefficient pointers
*/
pCoefA = &pATable[modifier * 2U];
pCoefB = &pBTable[modifier * 2U];
/*
* scatter / gather offsets
* for ascending & descending addressing
*/
vecStridesFwd = vidupq_u32((uint32_t)0, 2);
vecStridesBkwd = -vecStridesFwd;
vecStridesFwd = vecStridesFwd * modifier;
pVecSrc1 = (q31_t const *) pSrc1;
pVecDst1 = pDst1;
blkCnt = fftLen >> 2;
while (blkCnt > 0U)
{
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
vecIn = vld2q(pVecSrc1);
pVecSrc1 += 8;
/*
* outR = *pSrc1 * CoefA1;
*/
vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
/*
* outI = *pSrc1++ * CoefA2;
*/
vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR -= (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqsubq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI -= *pSrc2 * CoefA2;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst1, vecSum);
pVecDst1 += 8;
/*
* write complex conjugate output
*/
vecSum.val[1] = -vecSum.val[1];
vstrwq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
vstrwq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (modifier * 8U);
vecStridesBkwd = vecStridesBkwd - 8;
blkCnt--;
uint32_t i; /* Loop Counter */
const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q31_t *pOut1 = &pDst[2];
q31_t *pIn1 = &pSrc[2];
uint32x4_t offset = { 2, 3, 0, 1 };
uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
offset = offset + (2 * fftLen - 4);
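/* Same gather scheme as the Q15 kernel, two bins per iteration: offset
   reads pSrc[2*fftLen-4 .. 2*fftLen-1] as the complex samples with
   indices fftLen-1 and fftLen-2 (descending bin order, re/im order
   kept), and offsetCoef picks two consecutive twiddle pairs with a
   stride of `modifier` complex entries. */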
/* Init coefficient pointers */
pCoefA = &pATable[modifier * 2];
pCoefB = &pBTable[modifier * 2];
const q31_t *pCoefAb, *pCoefBb;
pCoefAb = pCoefA;
pCoefBb = pCoefB;
pIn1 = &pSrc[2];
i = fftLen - 1U;
i = i / 2 + 1;
while (i > 0U) {
q31x4_t in1 = vld1q(pIn1);
q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
vst1q(pOut1, out);
pOut1 += 4;
offsetCoef += modifier * 4;
offset -= 4;
pIn1 += 4;
i -= 1;
}
pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
pDst[(2U * fftLen) + 1U] = 0;
pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
pDst[2 * fftLen + 1] = 0;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
pDst[1] = 0;
}
#else
@ -348,87 +299,45 @@ void arm_split_rifft_q31(
q31_t * pDst,
uint32_t modifier)
{
q31_t const *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q31_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U];
q31_t const *pVecSrc1;
q31_t *pVecDst;
q31x4x2_t vecIn, vecSum;
uint32_t blkCnt;
uint32x4_t vecStridesFwd, vecStridesBkwd;
q31x4_t vecInBkwd, vecCoefFwd0, vecCoefFwd1;
/*
* Init coefficient pointers
*/
uint32_t i; /* Loop Counter */
const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
q31_t *pIn1;
uint32x4_t offset = { 2, 3, 0, 1 };
uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
int32x4_t conj = { 1, -1, 1, -1 };
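/* lane-wise sign pattern: negating the odd (imaginary) lanes conjugates the interleaved complex values */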
offset = offset + (2 * fftLen - 2);
/* Init coefficient pointers */
pCoefA = &pATable[0];
pCoefB = &pBTable[0];
/*
* scatter / gather offsets
* for ascending & descending addressing
*/
vecStridesFwd = vidupq_u32((uint32_t)0, 2);
vecStridesBkwd = -vecStridesFwd;
vecStridesFwd = vecStridesFwd * modifier;
pVecSrc1 = (q31_t const *) pSrc1;
pVecDst = pDst;
blkCnt = fftLen >> 2;
while (blkCnt > 0U)
{
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefA, vecStridesFwd);
vecCoefFwd1 = vldrwq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);
vecIn = vld2q(pVecSrc1);
pVecSrc1 += 8;
/*
* outR = *pSrc1 * CoefA1;
*/
vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
/*
* outI = -(*pSrc1++) * CoefA2;
*/
vecIn.val[0] = (-vecIn.val[0]);
vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outR += (*pSrc1 + *pSrc2) * CoefA2;
*/
vecInBkwd = vqaddq(vecIn.val[1], vecInBkwd);
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd1));
vecInBkwd = vldrwq_gather_shifted_offset(pSrc2, vecStridesBkwd);
/*
* outI += *pSrc1++ * CoefA1;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
vecCoefFwd0 = vldrwq_gather_shifted_offset(pCoefB, vecStridesFwd);
/*
* outI -= *pSrc2-- * CoefB1;
*/
vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
vecInBkwd = vldrwq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);
/*
* outI += *pSrc2-- * CoefA2;
*/
vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
/*
* outR += *pSrc2-- * CoefB1;
*/
vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
vst2q(pVecDst, vecSum);
pVecDst += 8;
/*
* update fwd and backwd offsets
*/
vecStridesFwd = vecStridesFwd + (modifier * 8U);
vecStridesBkwd = vecStridesBkwd - 8;
blkCnt--;
const q31_t *pCoefAb, *pCoefBb;
pCoefAb = pCoefA;
pCoefBb = pCoefB;
pIn1 = &pSrc[0];
i = fftLen;
i = i >> 1;
while (i > 0U) {
q31x4_t in1 = vld1q(pIn1);
q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
/* can we avoid the conjugate here ? */
q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
vst1q(pDst, out);
pDst += 4;
offsetCoef += modifier * 4;
offset -= 4;
pIn1 += 4;
i -= 1;
}
}
#else
