From 0c8bacd043f3629d5d1ab65ae54528c4bb3b482a Mon Sep 17 00:00:00 2001
From: Christophe Favergeon
Date: Tue, 14 Jan 2020 15:33:50 +0100
Subject: [PATCH] CMSIS-DSP: Added MVE code for rfft q15

---
 Source/TransformFunctions/arm_rfft_q15.c | 207 ++++++++++++++++++++++-
 1 file changed, 206 insertions(+), 1 deletion(-)

diff --git a/Source/TransformFunctions/arm_rfft_q15.c b/Source/TransformFunctions/arm_rfft_q15.c
index 6013e8cd..8aa24355 100644
--- a/Source/TransformFunctions/arm_rfft_q15.c
+++ b/Source/TransformFunctions/arm_rfft_q15.c
@@ -128,6 +128,113 @@ void arm_rfft_q15(
                    The function implements a Real FFT
  */
 
+#if defined(ARM_MATH_MVEI) /* MVE variant: the split step is vectorized, 8 complex bins per loop pass */
+void arm_split_rfft_q15(
+        q15_t * pSrc,            /* input buffer, read as interleaved pairs */
+        uint32_t fftLen,         /* bin count; the loop runs fftLen >> 3 times */
+  const q15_t * pATable,         /* A twiddle table (CoefA1, CoefA2 pairs) */
+  const q15_t * pBTable,         /* B twiddle table (only CoefB1 is read) */
+        q15_t * pDst,            /* output buffer; indices 0 .. 4 * fftLen - 1 are written */
+        uint32_t modifier)       /* stride, in coefficient pairs, between twiddles of adjacent bins */
+{
+    q15_t const *pCoefA, *pCoefB;       /* Temporary pointers for twiddle factors */
+    q15_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U - 14];   /* output cursors: pDst2 + 14 is the last element written */
+    q15_t const *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U - 14];  /* input cursors: pSrc2 + 14 is pSrc[2 * fftLen - 1] */
+    q15_t const *pVecSrc1;
+    q15_t *pVecDst1;
+    q15x8x2_t vecIn, vecSum;
+    uint32_t blkCnt;
+    uint16x8_t vecStridesFwd, vecStridesBkwd;
+    q15x8_t vecInBkwd, vecImagSum, vecCoefFwd0, vecCoefFwd1;
+
+    /*
+     * Init coefficient pointers: skip the first `modifier` coefficient pairs of each table
+     */
+    pCoefA = &pATable[modifier * 2U];
+    pCoefB = &pBTable[modifier * 2U];
+    /*
+     * scatter / gather offsets (in q15 elements)
+     * for ascending & descending addressing
+     */
+    vecStridesFwd = vidupq_u16(0, 2);   // 0, 2, 4, 6, 8, 10, 12, 14
+    vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0
+    vecStridesFwd = vecStridesFwd * (uint16_t) modifier;
+
+    pVecSrc1 = (q15_t const *) pSrc1;
+    pVecDst1 = pDst1;
+
+    blkCnt = fftLen >> 3;
+    while (blkCnt > 0U)
+    {
+        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);      /* CoefA1 for 8 bins */
+        vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);  /* CoefA2 for 8 bins */
+        vecIn = vld2q(pVecSrc1);    /* de-interleave: val[0] = even elements, val[1] = odd elements */
+        pVecSrc1 += 16;             /* NOTE(review): the final pass loads up to pSrc[2 * fftLen + 1]; confirm the buffer covers it */
+        /*
+         * outR = *pSrc1 * CoefA1;
+         */
+        vecSum.val[0] = vrmulhq(vecIn.val[0], vecCoefFwd0);
+        /*
+         * outI = *pSrc1++ * CoefA2;
+         */
+        vecSum.val[1] = vrmulhq(vecIn.val[0], vecCoefFwd1);
+
+        vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);        /* *pSrc2 for 8 bins, descending */
+        /*
+         * outR -= (*pSrc1 + *pSrc2) * CoefA2;
+         */
+        vecImagSum = vqaddq(vecIn.val[1], vecInBkwd);
+        vecSum.val[0] = vqsubq(vecSum.val[0], vrmulhq(vecImagSum, vecCoefFwd1));
+
+        /* vecInBkwd still holds the *pSrc2 values gathered above, so no second gather is needed */
+        /*
+         * outI += *pSrc1++ * CoefA1;
+         */
+        vecSum.val[1] = vqaddq(vecSum.val[1], vrmulhq(vecIn.val[1], vecCoefFwd0));
+
+        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);      /* register reused: now CoefB1 */
+        /*
+         * outI -= *pSrc2-- * CoefB1;
+         */
+        vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd0));
+
+        vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);   /* the element just below each *pSrc2 */
+        /*
+         * outI -= *pSrc2 * CoefA2;
+         */
+        vecSum.val[1] = vqsubq(vecSum.val[1], vrmulhq(vecInBkwd, vecCoefFwd1));
+        /*
+         * outR += *pSrc2-- * CoefB1;
+         */
+        vecSum.val[0] = vqaddq(vecSum.val[0], vrmulhq(vecInBkwd, vecCoefFwd0));
+
+        vst2q(pVecDst1, vecSum);
+        pVecDst1 += 16;
+        /*
+         * write complex conjugate output (descending from the end of pDst)
+         */
+        vecSum.val[1] = -vecSum.val[1];
+        vstrhq_scatter_shifted_offset(pDst2, vecStridesBkwd, vecSum.val[1]);
+        vstrhq_scatter_shifted_offset(&pDst2[-1], vecStridesBkwd, vecSum.val[0]);
+        /*
+         * update forward offsets and backward base pointers
+         */
+        vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
+        /* cannot use negative 16-bit offsets (would lead to a positive 32K-65K jump), */
+        /* so vecStridesBkwd stays fixed and the base pointers step back by 16 elements */
+        pSrc2 = pSrc2 - 16;
+        pDst2 = pDst2 - 16;
+
+        blkCnt--;
+    }
+
+    pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;   /* stored after the loop: replaces the loop's value in this slot */
+    pDst[(2U * fftLen) + 1U] = 0;
+
+    pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+    pDst[1] = 0;
+}
+#else
 void arm_split_rfft_q15(
         q15_t * pSrc,
         uint32_t fftLen,
@@ -270,7 +377,7 @@ void arm_split_rfft_q15(
 #endif /* #if defined (ARM_MATH_DSP) */
 
 }
-
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @brief         Core Real IFFT process
@@ 
-286,6 +393,103 @@ void arm_split_rifft_q15(
                    The function implements a Real IFFT
  */
 
+#if defined(ARM_MATH_MVEI) /* MVE variant: the split step is vectorized, 8 complex bins per loop pass */
+
+void arm_split_rifft_q15(
+        q15_t * pSrc,            /* input buffer; indices 0 .. 2 * fftLen + 1 are read */
+        uint32_t fftLen,         /* bin count; the loop runs fftLen >> 3 times */
+  const q15_t * pATable,         /* A twiddle table (CoefA1, CoefA2 pairs) */
+  const q15_t * pBTable,         /* B twiddle table (only CoefB1 is read) */
+        q15_t * pDst,            /* output buffer; indices 0 .. 2 * fftLen - 1 are written */
+        uint32_t modifier)       /* stride, in coefficient pairs, between twiddles of adjacent bins */
+{
+    q15_t const *pCoefA, *pCoefB;       /* Temporary pointers for twiddle factors */
+    q15_t const *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U - 14U];   /* pSrc2 + 14 is pSrc[2 * fftLen + 1] */
+    q15_t *pDst1 = &pDst[0];
+    q15_t const *pVecSrc1;
+    q15_t *pVecDst1;
+    q15x8x2_t vecIn, vecSum;
+    uint32_t blkCnt;
+    uint16x8_t vecStridesFwd, vecStridesBkwd;
+    q15x8_t vecInBkwd, vecImagSum, vecCoefFwd0, vecCoefFwd1;
+
+    /*
+     * Init coefficient pointers: both tables are read from their first entry
+     */
+    pCoefA = &pATable[0];
+    pCoefB = &pBTable[0];
+    /*
+     * scatter / gather offsets (in q15 elements)
+     * for ascending & descending addressing
+     */
+    vecStridesFwd = vidupq_u16(0, 2);   // 0, 2, 4, 6, 8, 10, 12, 14
+    vecStridesBkwd = vddupq_u16(14, 2); // 14, 12, 10, 8, 6, 4, 2, 0
+    vecStridesFwd = vecStridesFwd * (uint16_t) modifier;
+
+
+    pVecSrc1 = (q15_t const *) pSrc1;
+    pVecDst1 = pDst1;
+
+    blkCnt = fftLen >> 3;
+    while (blkCnt > 0U)
+    {
+        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefA, vecStridesFwd);      /* CoefA1 for 8 bins */
+        vecCoefFwd1 = vldrhq_gather_shifted_offset(&pCoefA[1], vecStridesFwd);  /* CoefA2 for 8 bins */
+        vecIn = vld2q(pVecSrc1);    /* de-interleave: val[0] = even elements, val[1] = odd elements */
+        pVecSrc1 += 16;
+        /*
+         * outR = *pSrc1 * CoefA1;
+         */
+        vecSum.val[0] = vmulhq(vecIn.val[0], vecCoefFwd0);
+        /*
+         * outI = -(*pSrc1++) * CoefA2;
+         */
+        vecIn.val[0] = vqnegq(vecIn.val[0]);    /* saturating: a plain negate would wrap -32768 back to -32768 */
+        vecSum.val[1] = vmulhq(vecIn.val[0], vecCoefFwd1);
+
+        vecInBkwd = vldrhq_gather_shifted_offset(pSrc2, vecStridesBkwd);        /* *pSrc2 for 8 bins, descending */
+        /*
+         * outR += (*pSrc1 + *pSrc2) * CoefA2;
+         */
+        vecImagSum = vqaddq(vecIn.val[1], vecInBkwd);
+        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecImagSum, vecCoefFwd1));
+
+        /* vecInBkwd still holds the *pSrc2 values gathered above, so no second gather is needed */
+        /*
+         * outI += *pSrc1++ * CoefA1;
+         */
+        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecIn.val[1], vecCoefFwd0));
+
+        vecCoefFwd0 = vldrhq_gather_shifted_offset(pCoefB, vecStridesFwd);      /* register reused: now CoefB1 */
+        /*
+         * outI -= *pSrc2-- * CoefB1;
+         */
+        vecSum.val[1] = vqsubq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd0));
+
+        vecInBkwd = vldrhq_gather_shifted_offset(&pSrc2[-1], vecStridesBkwd);   /* the element just below each *pSrc2 */
+        /*
+         * outI += *pSrc2 * CoefA2;
+         */
+        vecSum.val[1] = vqaddq(vecSum.val[1], vmulhq(vecInBkwd, vecCoefFwd1));
+        /*
+         * outR += *pSrc2-- * CoefB1;
+         */
+        vecSum.val[0] = vqaddq(vecSum.val[0], vmulhq(vecInBkwd, vecCoefFwd0));
+
+        vst2q(pVecDst1, vecSum);
+        pVecDst1 += 16;
+        /*
+         * update forward offsets and backward base pointer
+         */
+        vecStridesFwd = vecStridesFwd + (uint16_t)(modifier * 16U);
+
+        /* cannot use negative 16-bit offsets (would lead to a positive 32K-65K jump), */
+        /* so vecStridesBkwd stays fixed and the base pointer steps back by 16 elements */
+        pSrc2 = pSrc2 - 16;
+        blkCnt--;
+    }
+}
+#else
 void arm_split_rifft_q15(
         q15_t * pSrc,
         uint32_t fftLen,
@@ -382,3 +586,4 @@ void arm_split_rifft_q15(
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
\ No newline at end of file