From a108d6763e0c7df369d0814b5f2388effcd497fe Mon Sep 17 00:00:00 2001 From: Christophe Favergeon Date: Thu, 5 Nov 2020 14:59:01 +0100 Subject: [PATCH] CMSIS-DSP: Added new MVE implementation of FIR Q31 --- Include/dsp/filtering_functions_f16.h | 2 - Include/dsp/utils.h | 1 + Source/FilteringFunctions/arm_fir_f32.c | 5 +- Source/FilteringFunctions/arm_fir_init_q31.c | 22 +- Source/FilteringFunctions/arm_fir_q31.c | 1024 ++++++++++-------- Testing/Source/Benchmarks/FIRQ31.cpp | 19 +- Testing/Source/Tests/FIRQ31.cpp | 7 +- 7 files changed, 611 insertions(+), 469 deletions(-) diff --git a/Include/dsp/filtering_functions_f16.h b/Include/dsp/filtering_functions_f16.h index 0265f04e..4a99e831 100755 --- a/Include/dsp/filtering_functions_f16.h +++ b/Include/dsp/filtering_functions_f16.h @@ -40,8 +40,6 @@ extern "C" #if defined(ARM_FLOAT16_SUPPORTED) -#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) - /** * @brief Instance structure for the floating-point FIR filter. */ diff --git a/Include/dsp/utils.h b/Include/dsp/utils.h index 794023c5..7f5acb37 100755 --- a/Include/dsp/utils.h +++ b/Include/dsp/utils.h @@ -42,6 +42,7 @@ extern "C" #define SQ(x) ((x) * (x)) +#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) /** diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c index 6fa87565..7f3da5e0 100644 --- a/Source/FilteringFunctions/arm_fir_f32.c +++ b/Source/FilteringFunctions/arm_fir_f32.c @@ -110,9 +110,11 @@ The first A samples are temporary data. The remaining samples are the state of the FIR filter. @par - So the state buffer has size numTaps + A * blockSize - 1 : + So the state buffer has size numTaps + A + blockSize - 1 : - A is blockSize for f32 - A is 8*ceil(blockSize/8) for f16 + - A is 8*ceil(blockSize/4) for q31 + @par Fixed-Point Behavior Care must be taken when using the fixed-point versions of the FIR filter functions. 
@@ -200,6 +202,7 @@ __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, } blkCnt = blockSize & 3; + if (blkCnt) { mve_pred16_t p0 = vctp32q(blkCnt); diff --git a/Source/FilteringFunctions/arm_fir_init_q31.c b/Source/FilteringFunctions/arm_fir_init_q31.c index 2a9600c1..e491437e 100644 --- a/Source/FilteringFunctions/arm_fir_init_q31.c +++ b/Source/FilteringFunctions/arm_fir_init_q31.c @@ -52,7 +52,23 @@ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]} pState points to the array of state variables. - pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q31(). + pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_q31(). + + @par Initialization of Helium version + For the Helium version the length of the coefficient array must be a multiple of 16 even if fewer + than 16 coefficients are used. The additional coefficients must be set to 0. + It does not mean that all the coefficients will be used in the filter (numTaps + is still set to its actual value in the init function.) It just means that + the implementation may need to read more coefficients because of the vectorization and + to avoid having to manage too many different cases in the code. + + @par Helium state buffer + The state buffer must contain some additional temporary data + used during the computation but which is not the state of the FIR. + The first 2*4*ceil(blockSize/4) samples are temporary data. + The remaining samples are the state of the FIR filter. + So the state buffer has size numTaps + 8*ceil(blockSize/4) + blockSize - 1 + */ void arm_fir_init_q31( @@ -69,7 +85,11 @@ void arm_fir_init_q31( S->pCoeffs = pCoeffs; /* Clear state buffer. 
The size is always (blockSize + numTaps - 1) */ + #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t)); + #else memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t)); + #endif /* Assign state pointer */ S->pState = pState; diff --git a/Source/FilteringFunctions/arm_fir_q31.c b/Source/FilteringFunctions/arm_fir_q31.c index bf406350..eda1d4f0 100644 --- a/Source/FilteringFunctions/arm_fir_q31.c +++ b/Source/FilteringFunctions/arm_fir_q31.c @@ -28,6 +28,7 @@ #include "dsp/filtering_functions.h" + /** @ingroup groupFilters */ @@ -58,12 +59,160 @@ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" - -static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize) +#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs) \ + for (int j = 0; j < nbAcc; j++) { \ + const q31_t *pSmp = &pSamples[j]; \ + q31x4_t vecIn0; \ + q63_t acc[4]; \ + \ + acc[j] = 0; \ + for (int i = 0; i < nbVecTaps; i++) { \ + vecIn0 = vld1q(pSmp + 4 * i); \ + acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \ + } \ + *pOutput++ = (q31_t)asrl(acc[j], 23); \ + } + + +#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \ + for (int j = 0; j < nbAcc; j++) { \ + const q31_t *pSmp = &pSamples[j]; \ + q31x4_t vecIn0; \ + \ + acc[j] = 0; \ + for (int i = 0; i < nbVecTaps; i++) { \ + vecIn0 = vld1q(pSmp + 4 * i); \ + acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \ + } \ + *arm_fir_partial_accu_ptr++ = acc[j]; \ + } + + +#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \ + for (int j = 0; j < nbAcc; j++) { \ + const q31_t *pSmp = &pSamples[j]; \ + q31x4_t vecIn0; \ + \ + acc[j] = *arm_fir_partial_accu_ptr++; \ + \ + for (int i = 0; i < nbVecTaps; i++) { \ + vecIn0 = vld1q(pSmp + 4 * i); \ + acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \ + } \ + 
*pOutput++ = (q31_t)asrl(acc[j], 23); \ + } + + +#define FIR_Q31_MAIN_CORE() \ +{ \ + q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); \ + q31_t *pState = pRefStatePtr; /* State pointer */ \ + const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ + q31_t *pStateCur; /* Points to the current sample of the state */ \ + const q31_t *pSamples; /* Temporary pointer to the sample buffer */ \ + q31_t *pOutput; /* Temporary pointer to the output buffer */ \ + const q31_t *pTempSrc; /* Temporary pointer to the source data */ \ + q31_t *pTempDest; /* Temporary pointer to the destination buffer */\ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ + int32_t blkCnt; \ + const int32_t nbVecTaps = (NBTAPS / 4); \ + \ + /* \ + * load coefs \ + */ \ + q31x4_t vecCoeffs[nbVecTaps]; \ + \ + for (int i = 0; i < nbVecTaps; i++) \ + vecCoeffs[i] = vld1q(pCoeffs + 4 * i); \ + \ + /* \ + * pState points to state array which contains previous frame (numTaps - 1) samples \ + * pStateCur points to the location where the new input data should be written \ + */ \ + pStateCur = &(pState[(numTaps - 1u)]); \ + pTempSrc = pSrc; \ + pSamples = pState; \ + pOutput = pDst; \ + \ + blkCnt = blockSize >> 2; \ + while (blkCnt > 0) { \ + /* \ + * Save 4 input samples in the history buffer \ + */ \ + vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); \ + pStateCur += 4; \ + pTempSrc += 4; \ + \ + FIR_Q31_CORE(4, nbVecTaps, pSamples, vecCoeffs); \ + \ + pSamples += 4; \ + /* \ + * Decrement the sample block loop counter \ + */ \ + blkCnt--; \ + } \ + \ + /* tail */ \ + int32_t residual = blockSize & 3; \ + switch (residual) { \ + case 3: \ + { \ + for (int i = 0; i < residual; i++) \ + *pStateCur++ = *pTempSrc++; \ + \ + FIR_Q31_CORE(3, nbVecTaps, pSamples, vecCoeffs); \ + } \ + break; \ + \ + case 2: \ + { \ + for (int i = 0; i < residual; i++) \ + *pStateCur++ = *pTempSrc++; \ + \ + FIR_Q31_CORE(2, nbVecTaps, pSamples, vecCoeffs); \ + } \ + break; \ + \ + 
case 1: \ + { \ + for (int i = 0; i < residual; i++) \ + *pStateCur++ = *pTempSrc++; \ + \ + FIR_Q31_CORE(1, nbVecTaps, pSamples, vecCoeffs); \ + } \ + break; \ + } \ + \ + /* \ + * Copy the samples back into the history buffer start \ + */ \ + pTempSrc = &pState[blockSize]; \ + pTempDest = pState; \ + \ + blkCnt =(numTaps - 1) >> 2; \ + while (blkCnt > 0) \ + { \ + vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc)); \ + pTempSrc += 4; \ + pTempDest += 4; \ + blkCnt--; \ + } \ + blkCnt = (numTaps - 1) & 3; \ + if (blkCnt > 0) \ + { \ + mve_pred16_t p0 = vctp32q(blkCnt); \ + vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0); \ + } \ +} + +static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) { - q31_t *pState = S->pState; /* State pointer */ + q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); + q31_t *pState = pRefStatePtr; /* State pointer */ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ q31_t *pStateCur; /* Points to the current sample of the state */ const q31_t *pSamples; /* Temporary pointer to the sample buffer */ @@ -74,6 +223,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS uint32_t blkCnt; q31x4_t vecIn0; + /* * pState points to state array which contains previous frame (numTaps - 1) samples * pStateCur points to the location where the new input data should be written @@ -83,7 +233,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS pSamples = pState; pOutput = pDst; - q63_t acc0, acc1, acc2, acc3; + q63_t acc0=0, acc1=0, acc2=0, acc3=0; /* * load 4 coefs */ @@ -131,7 +281,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS } uint32_t residual = blockSize & 3; - switch (residual) { case 3: @@ -139,7 +288,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS /* * Save 4 input samples in the history buffer */ - 
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc; pStateCur += 4; pTempSrc += 4; @@ -205,14 +353,13 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS break; } - /* * Copy the samples back into the history buffer start */ - pTempSrc = &S->pState[blockSize]; - pTempDest = S->pState; + pTempSrc = &pState[blockSize]; + pTempDest = pState; - blkCnt = numTaps >> 2; + blkCnt = (numTaps-1) >> 2; while (blkCnt > 0U) { vst1q(pTempDest, vld1q(pTempSrc)); @@ -220,7 +367,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS pTempDest += 4; blkCnt--; } - blkCnt = numTaps & 3; + blkCnt = (numTaps-1) & 3; if (blkCnt > 0U) { mve_pred16_t p0 = vctp32q(blkCnt); @@ -228,9 +375,274 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS } } -static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize) + + +static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) { - q31_t *pState = S->pState; /* State pointer */ + #define NBTAPS 8 + FIR_Q31_MAIN_CORE(); + #undef NBTAPS +} + + +static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 12 + FIR_Q31_MAIN_CORE(); + #undef NBTAPS +} + + +static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 16 + FIR_Q31_MAIN_CORE(); + #undef NBTAPS +} + + +static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 20 + FIR_Q31_MAIN_CORE(); + #undef NBTAPS +} + + +static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) 
+{ + #define NBTAPS 24 + FIR_Q31_MAIN_CORE(); + #undef NBTAPS +} + + +static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 28 + FIR_Q31_MAIN_CORE(); + #undef NBTAPS +} + +static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S, + const q31_t * __restrict pSrc, + q31_t * __restrict pDst, + uint32_t blockSize) +{ + q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); + q31_t *pState = pRefStatePtr; /* State pointer */ + const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + q31_t *pStateCur; /* Points to the current sample of the state */ + const q31_t *pSamples; /* Temporary pointer to the sample buffer */ + q31_t *pOutput; /* Temporary pointer to the output buffer */ + const q31_t *pTempSrc; /* Temporary pointer to the source data */ + q31_t *pTempDest; /* Temporary pointer to the destination buffer */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + int32_t blkCnt; + q63_t acc0, acc1, acc2, acc3; + +#define MAX_VECT_BATCH 7 + + /* + * pre-load 28 1st coefs + */ + q31x4_t vecCoeffs0 = vld1q(pCoeffs + 4 * 0); + q31x4_t vecCoeffs1 = vld1q(pCoeffs + 4 * 1); + q31x4_t vecCoeffs2 = vld1q(pCoeffs + 4 * 2); + q31x4_t vecCoeffs3 = vld1q(pCoeffs + 4 * 3); + q31x4_t vecCoeffs4 = vld1q(pCoeffs + 4 * 4); + q31x4_t vecCoeffs5 = vld1q(pCoeffs + 4 * 5); + q31x4_t vecCoeffs6 = vld1q(pCoeffs + 4 * 6); + + /* + * pState points to state array which contains previous frame (numTaps - 1) samples + * pStateCur points to the location where the new input data should be written + */ + pStateCur = &(pState[(numTaps - 1u)]); + pTempSrc = pSrc; + pSamples = pState; + + q63_t *arm_fir_partial_accu_ptr = (q63_t*)S->pState; + + blkCnt = blockSize >> 2; + while (blkCnt > 0) { + /* + * Save 4 input samples in the history buffer + */ + vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); + pStateCur += 4; + pTempSrc += 4; + + const 
q31_t *pSmp; + q31x4_t vecIn0; + + pSmp = &pSamples[0]; + + vecIn0 = vld1q(pSmp); + acc0 = vrmlaldavhq(vecIn0, vecCoeffs0); + vecIn0 = vld1q(pSmp + 4 * 1); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1); + vecIn0 = vld1q(pSmp + 4 * 2); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2); + vecIn0 = vld1q(pSmp + 4 * 3); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3); + vecIn0 = vld1q(pSmp + 4 * 4); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4); + vecIn0 = vld1q(pSmp + 4 * 5); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5); + vecIn0 = vld1q(pSmp + 4 * 6); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6); + + *arm_fir_partial_accu_ptr++ = acc0; + + pSmp = &pSamples[1]; + + vecIn0 = vld1q(pSmp); + acc1 = vrmlaldavhq(vecIn0, vecCoeffs0); + vecIn0 = vld1q(pSmp + 4 * 1); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1); + vecIn0 = vld1q(pSmp + 4 * 2); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2); + vecIn0 = vld1q(pSmp + 4 * 3); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3); + vecIn0 = vld1q(pSmp + 4 * 4); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4); + vecIn0 = vld1q(pSmp + 4 * 5); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5); + vecIn0 = vld1q(pSmp + 4 * 6); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6); + + *arm_fir_partial_accu_ptr++ = acc1; + + pSmp = &pSamples[2]; + + vecIn0 = vld1q(pSmp); + acc2 = vrmlaldavhq(vecIn0, vecCoeffs0); + vecIn0 = vld1q(pSmp + 4 * 1); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1); + vecIn0 = vld1q(pSmp + 4 * 2); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2); + vecIn0 = vld1q(pSmp + 4 * 3); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3); + vecIn0 = vld1q(pSmp + 4 * 4); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4); + vecIn0 = vld1q(pSmp + 4 * 5); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5); + vecIn0 = vld1q(pSmp + 4 * 6); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6); + *arm_fir_partial_accu_ptr++ = acc2; + + pSmp = &pSamples[3]; + + vecIn0 = vld1q(pSmp); + acc3 = vrmlaldavhq(vecIn0, vecCoeffs0); + vecIn0 = 
vld1q(pSmp + 4 * 1); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1); + vecIn0 = vld1q(pSmp + 4 * 2); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2); + vecIn0 = vld1q(pSmp + 4 * 3); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3); + vecIn0 = vld1q(pSmp + 4 * 4); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4); + vecIn0 = vld1q(pSmp + 4 * 5); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5); + vecIn0 = vld1q(pSmp + 4 * 6); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6); + + *arm_fir_partial_accu_ptr++ = acc3; + + pSamples += 4; + /* + * Decrement the sample block loop counter + */ + blkCnt--; + } + + + /* reminder */ + + /* load last 4 coef */ + vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH); + arm_fir_partial_accu_ptr = (q63_t*)S->pState; + pOutput = pDst; + pSamples = pState + (MAX_VECT_BATCH * 4); + + + blkCnt = blockSize >> 2; + while (blkCnt > 0) { + q31x4_t vecIn0; + + /* reload intermediate MAC */ + acc0 = *arm_fir_partial_accu_ptr++; + acc1 = *arm_fir_partial_accu_ptr++; + acc2 = *arm_fir_partial_accu_ptr++; + acc3 = *arm_fir_partial_accu_ptr++; + + + vecIn0 = vld1q(&pSamples[0]); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0); + + vecIn0 = vld1q(&pSamples[1]); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0); + + vecIn0 = vld1q(&pSamples[2]); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0); + + vecIn0 = vld1q(&pSamples[3]); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0); + + *pOutput++ = asrl(acc0, 23); + *pOutput++ = asrl(acc1, 23); + *pOutput++ = asrl(acc2, 23); + *pOutput++ = asrl(acc3, 23); + + pSamples += 4; + /* + * Decrement the sample block loop counter + */ + blkCnt--; + } + + /* + * Copy the samples back into the history buffer start + */ + pTempSrc = &pState[blockSize]; + pTempDest = pState; + + blkCnt = numTaps - 1; + do { + mve_pred16_t p = vctp32q(blkCnt); + + vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p); + pTempSrc += 4; + pTempDest += 4; + blkCnt -= 4; + } + while (blkCnt > 0); +} + + + +void arm_fir_q31( + const 
arm_fir_instance_q31 * S, + const q31_t * pSrc, + q31_t * pDst, + uint32_t blockSize) +{ + q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); + q31_t *pState = pRefStatePtr; /* State pointer */ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ q31_t *pStateCur; /* Points to the current sample of the state */ const q31_t *pSamples; /* Temporary pointer to the sample buffer */ @@ -240,60 +652,110 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ uint32_t blkCnt; q31x4_t vecIn0; + uint32_t tapsBlkCnt = (numTaps + 3) / 4; q63_t acc0, acc1, acc2, acc3; - q31x4_t vecCoeffs1_4, vecCoeffs5_8; + q31x4_t vecCoeffs; + /* - * pState points to state array which contains previous frame (numTaps - 1) samples - * pStateCur points to the location where the new input data should be written + * [1 to 32 taps] specialized routines */ - pStateCur = &(pState[(numTaps - 1u)]); - pTempSrc = pSrc; - pSamples = pState; - pOutput = pDst; - + if (numTaps <= 4) + { + arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 8) + { + arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 12) + { + arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 16) + { + arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 20) + { + arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 24) + { + arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 28) + { + arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize); + return; + } + else if ((numTaps <= 32) && (blockSize >= 32)) + { + arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize); + return; + } /* - * load 8 coefs + * pState points to state array which contains previous frame (numTaps - 1) samples + * pStateCur points to the location where the new input data 
should be written */ - vecCoeffs1_4 = *(q31x4_t *) pCoeffs; - vecCoeffs5_8 = *(q31x4_t *) (pCoeffs + 4); - - blkCnt = blockSize >> 2; - while (blkCnt > 0U) + pStateCur = &(pState[(numTaps - 1u)]); + pSamples = pState; + pTempSrc = pSrc; + pOutput = pDst; + blkCnt = blockSize >> 2; + while (blkCnt > 0) { + const q31_t *pCoeffsTmp = pCoeffs; const q31_t *pSamplesTmp = pSamples; + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + acc3 = 0LL; + /* * Save 4 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); + pStateCur += 4; + pTempSrc += 4; - vecIn0 = vld1q(pSamplesTmp); - acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4); - - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4); - - vecIn0 = vld1q(&pSamplesTmp[3]); - acc3 = vrmlaldavhq(vecIn0, vecCoeffs1_4); + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 4 coefs + */ + vecCoeffs = *(q31x4_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamplesTmp[4]); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[5]); - acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(&pSamplesTmp[1]); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[6]); - acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(&pSamplesTmp[2]); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[7]); - acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(&pSamplesTmp[3]); + acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs); + pSamplesTmp += 4; + pCoeffsTmp += 4; + /* + * Decrement the taps block loop counter + */ + i--; + } + /* .54-> .31 conversion and store accumulators */ acc0 = asrl(acc0, 23); acc1 = asrl(acc1, 23); acc2 = asrl(acc2, 23); @@ -305,8 +767,6 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS *pOutput++ = 
(q31_t) acc3; pSamples += 4; - pStateCur += 4; - pTempSrc += 4; /* * Decrement the sample block loop counter @@ -314,11 +774,18 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS blkCnt--; } - uint32_t residual = blockSize & 3; + int32_t residual = blockSize & 3; switch (residual) { case 3: { + const q31_t *pCoeffsTmp = pCoeffs; + const q31_t *pSamplesTmp = pSamples; + + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + /* * Save 4 input samples in the history buffer */ @@ -326,23 +793,24 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS pStateCur += 4; pTempSrc += 4; - vecIn0 = vld1q(pSamples); - acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4); - - vecIn0 = vld1q(&pSamples[1]); - acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4); + int i = tapsBlkCnt; + while (i > 0) + { + vecCoeffs = *(q31x4_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamples[2]); - acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[4]); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(&pSamplesTmp[1]); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[5]); - acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(&pSamplesTmp[2]); + acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[6]); - acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8); + pSamplesTmp += 4; + pCoeffsTmp += 4; + i--; + } acc0 = asrl(acc0, 23); acc1 = asrl(acc1, 23); @@ -356,6 +824,12 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS case 2: { + const q31_t *pCoeffsTmp = pCoeffs; + const q31_t *pSamplesTmp = pSamples; + + acc0 = 0LL; + acc1 = 0LL; + /* * Save 4 input samples in the history buffer */ @@ -363,17 +837,21 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS pStateCur += 4; pTempSrc += 4; - vecIn0 = vld1q(pSamples); - acc0 = vrmlaldavhq(vecIn0, 
vecCoeffs1_4); + int i = tapsBlkCnt; + while (i > 0) + { + vecCoeffs = *(q31x4_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamples[1]); - acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[4]); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8); + vecIn0 = vld1q(&pSamplesTmp[1]); + acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[5]); - acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8); + pSamplesTmp += 4; + pCoeffsTmp += 4; + i--; + } acc0 = asrl(acc0, 23); acc1 = asrl(acc1, 23); @@ -384,431 +862,55 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS break; case 1: - { - /* - * Save 4 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 4; - pTempSrc += 4; - - vecIn0 = vld1q(pSamples); - acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4); - - vecIn0 = vld1q(&pSamples[4]); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8); - - acc0 = asrl(acc0, 23); - - *pOutput++ = (q31_t) acc0; - } - break; - } - - /* - * Copy the samples back into the history buffer start - */ - pTempSrc = &S->pState[blockSize]; - pTempDest = S->pState; - - blkCnt = numTaps >> 2; - while (blkCnt > 0U) - { - vst1q(pTempDest, vld1q(pTempSrc)); - pTempSrc += 4; - pTempDest += 4; - blkCnt--; - } - blkCnt = numTaps & 3; - if (blkCnt > 0U) - { - mve_pred16_t p0 = vctp32q(blkCnt); - vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0); - } -} - -void arm_fir_q31( - const arm_fir_instance_q31 * S, - const q31_t * pSrc, - q31_t * pDst, - uint32_t blockSize) -{ - q31_t *pState = S->pState; /* State pointer */ - const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - q31_t *pStateCur; /* Points to the current sample of the state */ - const q31_t *pSamples; /* Temporary pointer to the sample buffer */ - q31_t *pOutput; /* Temporary pointer to the output buffer */ - const q31_t *pTempSrc; /* Temporary pointer to the source data */ - q31_t 
*pTempDest; /* Temporary pointer to the destination buffer */ - uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ - uint32_t blkCnt; - q31x4_t vecIn0; - uint32_t tapsBlkCnt = (numTaps + 3) / 4; - q63_t acc0, acc1, acc2, acc3; - q31x4_t vecCoeffs; - - /* - * [1 to 8 taps] specialized routines - */ - - if (blockSize >= 8) - { - if (numTaps <= 4) - { - arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize); - return; - } - else if (numTaps <= 8) - { - arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize); - return; - } - } - - - /* - * pState points to state array which contains previous frame (numTaps - 1) samples - * pStateCur points to the location where the new input data should be written - */ - if (blockSize >= 8) - { - pStateCur = &(pState[(numTaps - 1u)]); - pSamples = pState; - pTempSrc = pSrc; - pOutput = pDst; - blkCnt = blockSize >> 2; - while (blkCnt > 0U) { const q31_t *pCoeffsTmp = pCoeffs; const q31_t *pSamplesTmp = pSamples; - + acc0 = 0LL; - acc1 = 0LL; - acc2 = 0LL; - acc3 = 0LL; - + /* * Save 4 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 4; pTempSrc += 4; - - tapsBlkCnt = (numTaps ) / 4; - uint32_t i = tapsBlkCnt ; - while (i > 0U) + + int i = tapsBlkCnt; + while (i > 0) { - /* - * load 4 coefs - */ vecCoeffs = *(q31x4_t *) pCoeffsTmp; - + vecIn0 = vld1q(pSamplesTmp); acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[3]); - acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs); - + pSamplesTmp += 4; pCoeffsTmp += 4; - /* - * Decrement the taps block loop counter - */ i--; } - tapsBlkCnt = (numTaps ) & 3; - i = tapsBlkCnt ; - while (i > 0U) - { - /* - * load 4 coefs - */ - - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - acc0 += 
((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8; - acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8; - acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8; - acc3 += ((q63_t) pSamplesTmp[3] * *pCoeffsTmp) >> 8; - - - pSamplesTmp += 1; - pCoeffsTmp += 1; - /* - * Decrement the taps block loop counter - */ - i--; - } - - /* .54-> .31 conversion and store accumulators */ acc0 = asrl(acc0, 23); - acc1 = asrl(acc1, 23); - acc2 = asrl(acc2, 23); - acc3 = asrl(acc3, 23); - - *pOutput++ = (q31_t) acc0; - *pOutput++ = (q31_t) acc1; - *pOutput++ = (q31_t) acc2; - *pOutput++ = (q31_t) acc3; - - pSamples += 4; - - /* - * Decrement the sample block loop counter - */ - blkCnt--; - } - - uint32_t residual = blockSize & 3; - switch (residual) - { - case 3: - { - const q31_t *pCoeffsTmp = pCoeffs; - const q31_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - acc1 = 0LL; - acc2 = 0LL; - - /* - * Save 4 input samples in the history buffer - */ - - *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc; - pStateCur += 4; - pTempSrc += 4; - - tapsBlkCnt = numTaps / 4; - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - vecCoeffs = *(q31x4_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs); - - pSamplesTmp += 4; - pCoeffsTmp += 4; - i--; - } - - tapsBlkCnt = (numTaps ) & 3; - - i = tapsBlkCnt ; - while (i > 0U) - { - - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8; - acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8; - acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8; - - pSamplesTmp += 1; - pCoeffsTmp += 1; - /* - * Decrement the taps block loop counter - */ - i--; - } - - - acc0 = asrl(acc0, 23); - acc1 = asrl(acc1, 23); - acc2 = asrl(acc2, 23); - - 
*pOutput++ = (q31_t) acc0; - *pOutput++ = (q31_t) acc1; - *pOutput++ = (q31_t) acc2; - } - break; - - case 2: - { - const q31_t *pCoeffsTmp = pCoeffs; - const q31_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - acc1 = 0LL; - - /* - * Save 4 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 4; - pTempSrc += 4; - - tapsBlkCnt = (numTaps ) / 4; - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - vecCoeffs = *(q31x4_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs); - - pSamplesTmp += 4; - pCoeffsTmp += 4; - i--; - } - - tapsBlkCnt = (numTaps ) & 3; - i = tapsBlkCnt ; - while (i > 0U) - { - - - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8; - acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8; - - pSamplesTmp += 1; - pCoeffsTmp += 1; - /* - * Decrement the taps block loop counter - */ - i--; - } - - acc0 = asrl(acc0, 23); - acc1 = asrl(acc1, 23); - - *pOutput++ = (q31_t) acc0; - *pOutput++ = (q31_t) acc1; - } - break; - - case 1: - { - const q31_t *pCoeffsTmp = pCoeffs; - const q31_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - - /* - * Save 4 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 4; - pTempSrc += 4; - - tapsBlkCnt = (numTaps ) / 4; - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - vecCoeffs = *(q31x4_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs); - - pSamplesTmp += 4; - pCoeffsTmp += 4; - i--; - } - - tapsBlkCnt = (numTaps ) & 3; - i = tapsBlkCnt ; - while (i > 0U) - { - - - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8; - - pSamplesTmp += 1; 
- pCoeffsTmp += 1; - /* - * Decrement the taps block loop counter - */ - i--; - } - - acc0 = asrl(acc0, 23); - - *pOutput++ = (q31_t) acc0; - } - break; - } - } - else - { - - q31_t *pStateCurnt; /* Points to the current sample of the state */ - q31_t *px; /* Temporary pointer for state buffer */ - const q31_t *pb; /* Temporary pointer for coefficient buffer */ - q63_t acc0; /* Accumulator */ - uint32_t i, blkCnt; /* Loop counters */ - pStateCurnt = &(S->pState[(numTaps - 1U)]); - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Copy one sample at a time into state buffer */ - *pStateCurnt++ = *pSrc++; - - /* Set the accumulator to zero */ - acc0 = 0; - - /* Initialize state pointer */ - px = pState; - - /* Initialize Coefficient pointer */ - pb = pCoeffs; - - i = numTaps; - - /* Perform the multiply-accumulates */ - do - { - /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ - acc0 += (q63_t) *px++ * *pb++; - - i--; - } while (i > 0U); - - /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. 
*/ - *pDst++ = (q31_t) (acc0 >> 31U); - - /* Advance state pointer by 1 for the next sample */ - pState = pState + 1U; - - /* Decrement loop counter */ - blkCnt--; + *pOutput++ = (q31_t) acc0; } + break; } /* * Copy the samples back into the history buffer start */ - pTempSrc = &S->pState[blockSize]; - pTempDest = S->pState; + pTempSrc = &pState[blockSize]; + pTempDest = pState; - blkCnt = numTaps >> 2; - while (blkCnt > 0U) + blkCnt = (numTaps - 1U) >> 2; + while (blkCnt > 0) { vst1q(pTempDest, vld1q(pTempSrc)); pTempSrc += 4; pTempDest += 4; blkCnt--; } - blkCnt = numTaps & 3; - if (blkCnt > 0U) + blkCnt = (numTaps - 1U) & 3; + if (blkCnt > 0) { mve_pred16_t p0 = vctp32q(blkCnt); vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0); diff --git a/Testing/Source/Benchmarks/FIRQ31.cpp b/Testing/Source/Benchmarks/FIRQ31.cpp index 3626a134..25cc206e 100755 --- a/Testing/Source/Benchmarks/FIRQ31.cpp +++ b/Testing/Source/Benchmarks/FIRQ31.cpp @@ -1,6 +1,9 @@ #include "FIRQ31.h" #include "Error.h" +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +static __ALIGNED(8) q31_t coeffArray[64]; +#endif void FIRQ31::test_fir_q31() { @@ -30,16 +33,28 @@ samples.reload(FIRQ31::SAMPLES1_Q31_ID,mgr,this->nbSamples); coefs.reload(FIRQ31::COEFS1_Q31_ID,mgr,this->nbTaps); - state.create(this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr); + state.create(2*ROUND_UP(this->nbSamples,4) + this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr); output.create(this->nbSamples,FIRQ31::OUT_SAMPLES_Q31_ID,mgr); switch(id) { case TEST_FIR_Q31_1: +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + /* Copy coefficients and pad to zero + */ + memset(coeffArray,0,32*sizeof(q31_t)); + q31_t *ptr; + + ptr=coefs.ptr(); + memcpy(coeffArray,ptr,this->nbTaps*sizeof(q31_t)); + this->pCoefs = coeffArray; +#else + this->pCoefs=coefs.ptr(); +#endif + arm_fir_init_q31(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples); this->pSrc=samples.ptr(); - 
this->pCoefs=coefs.ptr(); this->pDst=output.ptr(); break; diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp index dfee9e1e..80f8195e 100644 --- a/Testing/Source/Tests/FIRQ31.cpp +++ b/Testing/Source/Tests/FIRQ31.cpp @@ -37,6 +37,7 @@ void checkInnerTail(q31_t *b) #endif int blockSize; int numTaps; + int nb=1; /* @@ -98,6 +99,8 @@ void checkInnerTail(q31_t *b) configp += 2; orgcoefsp += numTaps; + nb += blockSize + blockSize; + } @@ -129,8 +132,8 @@ void checkInnerTail(q31_t *b) ref.reload(FIRQ31::FIRREFS_Q31_ID,mgr); output.create(ref.nbSamples(),FIRQ31::OUT_Q31_ID,mgr); - /* Max blockSize + numTaps - 1 as generated by Python script */ - state.create(47,FIRQ31::OUT_Q31_ID,mgr); + /* > Max blockSize + numTaps - 1 as generated by Python script */ + state.create(47 + 47+47,FIRQ31::OUT_Q31_ID,mgr); } void FIRQ31::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)