From 68b219bb1f7ba7d5c037fbdff8ddc31204b3c47c Mon Sep 17 00:00:00 2001 From: Christophe Favergeon Date: Thu, 5 Nov 2020 10:56:11 +0100 Subject: [PATCH] CMSIS-DSP: New MVE implementation of the FIR F16 --- Include/dsp/filtering_functions.h | 2 + Include/dsp/filtering_functions_f16.h | 2 + Source/FilteringFunctions/arm_fir_f16.c | 382 +++++++++++-------- Source/FilteringFunctions/arm_fir_f32.c | 11 +- Source/FilteringFunctions/arm_fir_init_f16.c | 13 +- Testing/Source/Benchmarks/FIRF16.cpp | 19 +- Testing/Source/Benchmarks/FIRF32.cpp | 3 +- Testing/Source/Tests/FIRF16.cpp | 4 +- 8 files changed, 276 insertions(+), 160 deletions(-) diff --git a/Include/dsp/filtering_functions.h b/Include/dsp/filtering_functions.h index 0cbd7cf4..1f9e0fac 100755 --- a/Include/dsp/filtering_functions.h +++ b/Include/dsp/filtering_functions.h @@ -39,6 +39,8 @@ extern "C" { #endif + + #define DELTA_Q31 ((q31_t)(0x100)) #define DELTA_Q15 ((q15_t)0x5) diff --git a/Include/dsp/filtering_functions_f16.h b/Include/dsp/filtering_functions_f16.h index 4a99e831..0265f04e 100755 --- a/Include/dsp/filtering_functions_f16.h +++ b/Include/dsp/filtering_functions_f16.h @@ -40,6 +40,8 @@ extern "C" #if defined(ARM_FLOAT16_SUPPORTED) +#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) + /** * @brief Instance structure for the floating-point FIR filter. */ diff --git a/Source/FilteringFunctions/arm_fir_f16.c b/Source/FilteringFunctions/arm_fir_f16.c index d8713cf3..0cc55843 100755 --- a/Source/FilteringFunctions/arm_fir_f16.c +++ b/Source/FilteringFunctions/arm_fir_f16.c @@ -47,6 +47,8 @@ #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#define FIR_F32_MAX_COEF_BLK 8 + #define FIR_F16_CORE(pSamples, c, NB_TAPS) \ vecAcc0 = vdupq_n_f16(0.0f16); \ for (int i = 0; i < NB_TAPS; i++) { \ @@ -54,7 +56,9 @@ vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \ } -static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize) +__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, + const float16_t * __restrict pSrc, + float16_t * __restrict pDst, uint32_t blockSize) { float16_t *pState = S->pState; /* State pointer */ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ @@ -65,8 +69,8 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t float16_t *pTempDest; /* Temporary pointer to the destination buffer */ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ int32_t blkCnt; - f16x8_t vecIn0; - f16x8_t vecAcc0; + float16x8_t vecIn0; + float16x8_t vecAcc0; const int NB_TAPS=4; float16_t c[NB_TAPS]; @@ -107,6 +111,7 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t } blkCnt = blockSize & 7; + if (blkCnt) { mve_pred16_t p0 = vctp16q(blkCnt); @@ -141,7 +146,9 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t } -static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize) +__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, + const float16_t * __restrict pSrc, + float16_t * __restrict pDst, uint32_t blockSize) { float16_t *pState = S->pState; /* State pointer */ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ @@ -152,8 +159,8 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t float16_t *pTempDest; /* Temporary pointer to the destination buffer */ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ int32_t blkCnt; - f16x8_t vecIn0; - f16x8_t vecAcc0; + float16x8_t vecIn0; + float16x8_t vecAcc0; const int NB_TAPS=8; float16_t c[NB_TAPS]; @@ -194,6 +201,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t } blkCnt = blockSize & 7; + if (blkCnt) { mve_pred16_t p0 = vctp16q(blkCnt); @@ -224,6 +232,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t mve_pred16_t p0 = vctp16q(blkCnt); vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0); } + } @@ -232,224 +241,299 @@ void arm_fir_f16(const arm_fir_instance_f16 * S, float16_t * pDst, uint32_t blockSize) { - float16_t *pState = S->pState; /* State pointer */ - const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - float16_t *pStateCur; /* Points to the current sample of the state */ - const float16_t *pSamples; /* Temporary pointer to the sample buffer */ - float16_t *pOutput; /* Temporary pointer to the output buffer */ - const float16_t *pTempSrc; /* Temporary pointer to the source data */ - float16_t *pTempDest; /* Temporary pointer to the destination buffer */ - int32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ - uint32_t blkCnt; - f16x8_t vecIn0; - f16x8_t vecAcc0; - float16_t c0, c1, c2, c3; - float16_t c4, c5, c6, c7; + float16_t *pRefStatePtr = S->pState + ROUND_UP(blockSize, 8); + float16_t *pState = pRefStatePtr ; /* State pointer */ + const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + const float16_t *pSamples; /* Temporary pointer to the sample buffer */ + float16_t *pOutput; /* Temporary pointer to the output buffer */ + const float16_t *pTempSrc; /* Temporary pointer to the source data */ + float16_t *pTempDest; /* Temporary pointer to the destination buffer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t blkCnt; + float16_t c0, c1, c2, c3; + float16_t c4, c5, c6, c7; /* * [1 to 8 taps] specialized routines */ - if (numTaps <= 4) - { + if (numTaps <= 4) { arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize); return; - } - else if (numTaps <= 8) - { + } else if (numTaps <= 8) { arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize); return; } - /* - * pState points to state array which contains previous frame (numTaps - 1) samples - * pStateCur points to the location where the new input data should be written - */ - pStateCur = &(pState[(numTaps - 1u)]); - /* - * Copy new data into state so that we obtain a continuous sample buffer - * containing both the tail end of the old data and the new data. - */ - pSamples = pState; pTempSrc = pSrc; - pOutput = pDst; - - blkCnt = blockSize >> 3; - while (blkCnt > 0U) - { - int i; - const float16_t *pCoeffsCur = pCoeffs; - - /* - * Save 8 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 8; + pTempDest = &(pState[(numTaps - 1u)]); + int cnt = blockSize; + do { + mve_pred16_t p0 = vctp16q(cnt); + vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0); + pTempDest += 8; pTempSrc += 8; + cnt -= 8; + } while (cnt > 0); + + float16_t *partial_accu_ptr = S->pState; - c0 = *pCoeffsCur++; - c1 = *pCoeffsCur++; - c2 = *pCoeffsCur++; - c3 = *pCoeffsCur++; - c4 = *pCoeffsCur++; - c5 = *pCoeffsCur++; - c6 = *pCoeffsCur++; - c7 = *pCoeffsCur++; + pSamples = pState; + c0 = *pCoeffs++; + c1 = *pCoeffs++; + c2 = *pCoeffs++; + c3 = *pCoeffs++; + c4 = *pCoeffs++; + c5 = *pCoeffs++; + c6 = *pCoeffs++; + c7 = *pCoeffs++; + + cnt = blockSize >> 3; + while (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; vecIn0 = vld1q(pSamples); vecAcc0 = vmulq(vecIn0, c0); - vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); + vecIn0 = vld1q(&pSamples[7]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); + pSamples += 8; + vst1q(partial_accu_ptr, vecAcc0); + cnt--; + partial_accu_ptr += 8; + } + + cnt = blockSize & 7; + if (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; + + mve_pred16_t p0 = vctp16q(cnt); + + vecIn0 = vld1q(pSamples); + vecAcc0 = vmulq(vecIn0, c0); + vecIn0 = vld1q(&pSamples[1]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); + vecIn0 = vld1q(&pSamples[2]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); + vecIn0 = vld1q(&pSamples[3]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); + vecIn0 = vld1q(&pSamples[4]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); + vecIn0 = vld1q(&pSamples[5]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); + vecIn0 = vld1q(&pSamples[6]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); + vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0); + } - pSamples += 8; + int localTaps = numTaps - FIR_F32_MAX_COEF_BLK; + int sample_offset = FIR_F32_MAX_COEF_BLK; + while (localTaps > FIR_F32_MAX_COEF_BLK) { + c0 = *pCoeffs++; + c1 = *pCoeffs++; + c2 = *pCoeffs++; + c3 = *pCoeffs++; + c4 = *pCoeffs++; + c5 = *pCoeffs++; + c6 = *pCoeffs++; + c7 = *pCoeffs++; + + partial_accu_ptr = S->pState; + pSamples = pState + sample_offset; + int cnt = blockSize >> 3; + while (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; - for (i = 0; i <= ((numTaps - 9) / 8); i++) - { - c0 = *pCoeffsCur++; - c1 = *pCoeffsCur++; - c2 = *pCoeffsCur++; - c3 = *pCoeffsCur++; - c4 = *pCoeffsCur++; - c5 = *pCoeffsCur++; - c6 = *pCoeffsCur++; - c7 = *pCoeffsCur++; vecIn0 = vld1q(pSamples); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c0); - + vecAcc0 = vmulq(vecIn0, c0); vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); - vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); - pSamples += 8; + vecAcc0 += vld1q_f16(partial_accu_ptr); + vst1q(partial_accu_ptr, vecAcc0); + cnt--; + partial_accu_ptr += 8; } - vst1q(pOutput, vecAcc0); - pOutput += 8; - pSamples = pSamples - (i + 1) * 8 + 8; - - blkCnt--; - } - - blkCnt = blockSize & 7; - { - mve_pred16_t p0 = vctp16q(blkCnt); - int i; - const float16_t *pCoeffsCur = pCoeffs; - - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 8; - pTempSrc += 8; - - c0 = *pCoeffsCur++; - c1 = *pCoeffsCur++; - c2 = *pCoeffsCur++; - c3 = *pCoeffsCur++; - c4 = *pCoeffsCur++; - c5 = *pCoeffsCur++; - c6 = *pCoeffsCur++; - c7 = *pCoeffsCur++; - - vecIn0 = vld1q(pSamples); - vecAcc0 = vmulq(vecIn0, c0); - - vecIn0 = vld1q(&pSamples[1]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); + cnt = blockSize & 7; + if (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; - vecIn0 = vld1q(&pSamples[2]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); + mve_pred16_t p0 = vctp16q(cnt); - vecIn0 = vld1q(&pSamples[3]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - - vecIn0 = vld1q(&pSamples[4]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); + vecIn0 = vld1q(pSamples); + vecAcc0 = vmulq(vecIn0, c0); + vecIn0 = vld1q(&pSamples[1]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); + vecIn0 = vld1q(&pSamples[2]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); + vecIn0 = vld1q(&pSamples[3]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); + vecIn0 = vld1q(&pSamples[4]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); + vecIn0 = vld1q(&pSamples[5]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); + vecIn0 = vld1q(&pSamples[6]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); + vecIn0 = vld1q(&pSamples[7]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); + vecAcc0 += vld1q_f16(partial_accu_ptr); + vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0); + } - vecIn0 = vld1q(&pSamples[5]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); + localTaps -= FIR_F32_MAX_COEF_BLK; + sample_offset += FIR_F32_MAX_COEF_BLK; + } - vecIn0 = vld1q(&pSamples[6]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); + pSamples = pState + sample_offset; + + if (localTaps > 4) { + c0 = *pCoeffs++; + c1 = *pCoeffs++; + c2 = *pCoeffs++; + c3 = *pCoeffs++; + c4 = *pCoeffs++; + c5 = *pCoeffs++; + c6 = *pCoeffs++; + c7 = *pCoeffs++; + pOutput = pDst; + + partial_accu_ptr = S->pState; + cnt = blockSize >> 3; + while (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; - vecIn0 = vld1q(&pSamples[7]); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); + vecIn0 = vld1q(pSamples); + vecAcc0 = vmulq(vecIn0, c0); + vecIn0 = vld1q(&pSamples[1]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); + vecIn0 = vld1q(&pSamples[2]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); + vecIn0 = vld1q(&pSamples[3]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); + vecIn0 = vld1q(&pSamples[4]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); + vecIn0 = vld1q(&pSamples[5]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); + vecIn0 = vld1q(&pSamples[6]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); + vecIn0 = vld1q(&pSamples[7]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); + pSamples += 8; + float16x8_t pap = vld1q_f16(partial_accu_ptr); + vst1q(pOutput, vecAcc0 + pap); + cnt--; + partial_accu_ptr += 8; + pOutput += 8; + } - pSamples += 8; + cnt = blockSize & 7; + if (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; - for (i = 0; i <= ((numTaps - 9) / 8); i++) - { - c0 = *pCoeffsCur++; - c1 = *pCoeffsCur++; - c2 = *pCoeffsCur++; - c3 = *pCoeffsCur++; - c4 = *pCoeffsCur++; - c5 = *pCoeffsCur++; - c6 = *pCoeffsCur++; - c7 = *pCoeffsCur++; + mve_pred16_t p0 = vctp16q(cnt); vecIn0 = vld1q(pSamples); - vecAcc0 = vfmaq(vecAcc0, vecIn0, c0); - + vecAcc0 = vmulq(vecIn0, c0); vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); - vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); + float16x8_t pap = vld1q_f16(partial_accu_ptr); + vstrhq_p_f16(pOutput, vecAcc0 + pap, p0); + pOutput += cnt; + } + + } else { + c0 = *pCoeffs++; + c1 = *pCoeffs++; + c2 = *pCoeffs++; + c3 = *pCoeffs++; + pOutput = pDst; + partial_accu_ptr = S->pState; + cnt = blockSize >> 3; + while (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; + + vecIn0 = vld1q(pSamples); + vecAcc0 = vmulq(vecIn0, c0); + vecIn0 = vld1q(&pSamples[1]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); + vecIn0 = vld1q(&pSamples[2]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); + vecIn0 = vld1q(&pSamples[3]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); pSamples += 8; + float16x8_t pap = vld1q_f16(partial_accu_ptr); + vst1q(pOutput, vecAcc0 + pap); + cnt--; + partial_accu_ptr += 8; + pOutput += 8; } - vstrhq_p_f16(pOutput, vecAcc0, p0); + cnt = blockSize & 7; + if (cnt > 0) { + float16x8_t vecAcc0; + float16x8_t vecIn0; + + mve_pred16_t p0 = vctp16q(cnt); + + vecIn0 = vld1q(pSamples); + vecAcc0 = vmulq(vecIn0, c0); + vecIn0 = vld1q(&pSamples[1]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); + vecIn0 = vld1q(&pSamples[2]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); + vecIn0 = vld1q(&pSamples[3]); + vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); + float16x8_t pap = vld1q_f16(partial_accu_ptr); + vstrhq_p_f16(pOutput, vecAcc0 + pap, p0); + pOutput += cnt; + } } /* @@ -459,17 +543,15 @@ void arm_fir_f16(const arm_fir_instance_f16 * S, pTempDest = pState; blkCnt = numTaps >> 3; - while (blkCnt > 0U) - { + while (blkCnt > 0U) { vst1q(pTempDest, vld1q(pTempSrc)); pTempSrc += 8; pTempDest += 8; blkCnt--; } blkCnt = numTaps & 7; - if (blkCnt > 0U) - { - mve_pred16_t p0 = vctp16q(blkCnt); + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp16q(blkCnt); vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0); } } diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c index 3731e9cc..6fa87565 100644 --- a/Source/FilteringFunctions/arm_fir_f32.c +++ b/Source/FilteringFunctions/arm_fir_f32.c @@ -107,9 +107,12 @@ @par Helium state buffer The state buffer must contain some additional temporary data used during the computation but which is not the state of the FIR. - The first blockSize samples are temporary data. + The first A samples are temporary data. The remaining samples are the state of the FIR filter. - So the state buffer has size numTaps + 2 * blockSize - 1 + @par + So the state buffer has size numTaps + A * blockSize - 1 : + - A is blockSize for f32 + - A is 8*ceil(blockSize/8) for f16 @par Fixed-Point Behavior Care must be taken when using the fixed-point versions of the FIR filter functions. @@ -144,7 +147,7 @@ } -static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, +__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t * __restrict pSrc, float32_t * __restrict pDst, uint32_t blockSize) { @@ -229,7 +232,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, -static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, +__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * __restrict pSrc, float32_t * __restrict pDst, uint32_t blockSize) { diff --git a/Source/FilteringFunctions/arm_fir_init_f16.c b/Source/FilteringFunctions/arm_fir_init_f16.c index 4e300d7c..8b63796b 100755 --- a/Source/FilteringFunctions/arm_fir_init_f16.c +++ b/Source/FilteringFunctions/arm_fir_init_f16.c @@ -52,7 +52,7 @@ @par pState points to the array of state variables. - pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f16(). + pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_f16(). @par Initialization of Helium version For Helium version the array of coefficients must be a multiple of 16 even if less then 16 coefficients are used. The additional coefficients must be set to 0. @@ -61,6 +61,13 @@ the implementation may require to read more coefficients due to the vectorization and to avoid having to manage too many different cases in the code. + @par Helium state buffer + The state buffer must contain some additional temporary data + used during the computation but which is not the state of the FIR. + The first 8*ceil(blockSize/8) samples are temporary data. + The remaining samples are the state of the FIR filter. + So the state buffer has size numTaps + 8*ceil(blockSize/8) + blockSize - 1 + */ void arm_fir_init_f16( @@ -77,7 +84,11 @@ void arm_fir_init_f16( S->pCoeffs = pCoeffs; /* Clear state buffer. The size is always (blockSize + numTaps - 1) */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t)); +#else memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t)); +#endif /* Assign state pointer */ S->pState = pState; diff --git a/Testing/Source/Benchmarks/FIRF16.cpp b/Testing/Source/Benchmarks/FIRF16.cpp index d3605e1b..86b54bca 100755 --- a/Testing/Source/Benchmarks/FIRF16.cpp +++ b/Testing/Source/Benchmarks/FIRF16.cpp @@ -1,6 +1,9 @@ #include "FIRF16.h" #include "Error.h" +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +static __ALIGNED(8) float16_t coeffArray[64]; +#endif void FIRF16::test_fir_f16() { @@ -21,16 +24,28 @@ samples.reload(FIRF16::SAMPLES1_F16_ID,mgr,this->nbSamples); coefs.reload(FIRF16::COEFS1_F16_ID,mgr,this->nbTaps); - state.create(this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr); + state.create(ROUND_UP(this->nbSamples,8) + this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr); output.create(this->nbSamples,FIRF16::OUT_SAMPLES_F16_ID,mgr); switch(id) { case TEST_FIR_F16_1: +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + /* Copy coefficients and pad to zero + */ + memset(coeffArray,0,32*sizeof(float16_t)); + float16_t *ptr; + + ptr=coefs.ptr(); + memcpy(coeffArray,ptr,this->nbTaps*sizeof(float16_t)); + this->pCoefs = coeffArray; +#else + this->pCoefs=coefs.ptr(); +#endif + arm_fir_init_f16(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples); this->pSrc=samples.ptr(); - this->pCoefs=coefs.ptr(); this->pDst=output.ptr(); break; diff --git a/Testing/Source/Benchmarks/FIRF32.cpp b/Testing/Source/Benchmarks/FIRF32.cpp index 41a32018..bbe9ed55 100755 --- a/Testing/Source/Benchmarks/FIRF32.cpp +++ b/Testing/Source/Benchmarks/FIRF32.cpp @@ -39,7 +39,6 @@ static __ALIGNED(8) float32_t coeffArray[64]; switch(id) { case TEST_FIR_F32_1: - arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples); #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) /* Copy coefficients and pad to zero @@ -57,6 +56,8 @@ static __ALIGNED(8) float32_t coeffArray[64]; this->pSrc=samples.ptr(); this->pDst=output.ptr(); + + arm_fir_init_f32(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples); break; case TEST_LMS_F32_2: diff --git a/Testing/Source/Tests/FIRF16.cpp b/Testing/Source/Tests/FIRF16.cpp index 0573caa3..0933c743 100755 --- a/Testing/Source/Tests/FIRF16.cpp +++ b/Testing/Source/Tests/FIRF16.cpp @@ -137,8 +137,8 @@ void checkInnerTail(float16_t *b) ref.reload(FIRF16::FIRREFS_F16_ID,mgr); output.create(ref.nbSamples(),FIRF16::OUT_F16_ID,mgr); - /* Max blockSize + numTaps - 1 as generated by Python script */ - state.create(47,FIRF16::OUT_F16_ID,mgr); + /* > Max 8*ceil(blockSize,8) + blockSize + numTaps - 1 as generated by Python script */ + state.create(47+47,FIRF16::OUT_F16_ID,mgr); } void FIRF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)