diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c index 7f3da5e0..0813ac29 100644 --- a/Source/FilteringFunctions/arm_fir_f32.c +++ b/Source/FilteringFunctions/arm_fir_f32.c @@ -114,6 +114,7 @@ - A is blockSize for f32 - A is 8*ceil(blockSize/8) for f16 - A is 8*ceil(blockSize/4) for q31 + - A is 0 for other datatypes (q15 and q7) @par Fixed-Point Behavior diff --git a/Source/FilteringFunctions/arm_fir_q15.c b/Source/FilteringFunctions/arm_fir_q15.c index 208f67c3..cda633cd 100644 --- a/Source/FilteringFunctions/arm_fir_q15.c +++ b/Source/FilteringFunctions/arm_fir_q15.c @@ -60,7 +60,140 @@ #define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff) -static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pSrc, q15_t * pDst, uint32_t blockSize) + +#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \ + for (int j = 0; j < nbAcc; j++) { \ + const q15_t *pSmp = &pSample[j]; \ + q63_t acc[4]; \ + \ + acc[j] = 0; \ + for (int i = 0; i < nbVecTaps; i++) { \ + vecIn0 = vld1q(pSmp + 8 * i); \ + acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \ + } \ + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \ + } + +#define FIR_Q15_MAIN_CORE() \ +{ \ + q15_t *pState = S->pState; /* State pointer */ \ + const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ + q15_t *pStateCur; /* Points to the current sample of the state */ \ + const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \ + q15_t *pOutput; /* Temporary pointer to the output buffer */ \ + const q15_t *pTempSrc; /* Temporary pointer to the source data */ \ + q15_t *pTempDest; /* Temporary pointer to the destination buffer */\ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ + int32_t blkCnt; \ + q15x8_t vecIn0; \ + const int32_t nbVecTaps = (NBTAPS / 8); \ + \ + /* \ + * load coefs \ + */ \ + q15x8_t vecCoeffs[nbVecTaps]; \ + \ + for (int i = 0; i < 
nbVecTaps; i++) \ + vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \ + \ + /* \ + * pState points to state array which contains previous frame (numTaps - 1) samples \ + * pStateCur points to the location where the new input data should be written \ + */ \ + pStateCur = &(pState[(numTaps - 1u)]); \ + pTempSrc = pSrc; \ + pSamples = pState; \ + pOutput = pDst; \ + \ + blkCnt = blockSize >> 2; \ + while (blkCnt > 0) { \ + /* \ + * Save 4 input samples in the history buffer \ + */ \ + vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \ + pStateCur += 4; \ + pTempSrc += 4; \ + \ + FIR_Q15_CORE(pOutput, 4, nbVecTaps, pSamples, vecCoeffs); \ + pSamples += 4; \ + \ + blkCnt--; \ + } \ + \ + /* tail */ \ + int32_t residual = blockSize & 3; \ + \ + for (int i = 0; i < residual; i++) \ + *pStateCur++ = *pTempSrc++; \ + \ + FIR_Q15_CORE(pOutput, residual, nbVecTaps, pSamples, vecCoeffs); \ + \ + /* \ + * Copy the samples back into the history buffer start \ + */ \ + pTempSrc = &pState[blockSize]; \ + pTempDest = pState; \ + \ + /* current compiler limitation */ \ + blkCnt = (numTaps - 1) >> 3; \ + while (blkCnt > 0) \ + { \ + vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \ + pTempSrc += 8; \ + pTempDest += 8; \ + blkCnt--; \ + } \ + blkCnt = (numTaps - 1) & 7; \ + if (blkCnt > 0) \ + { \ + mve_pred16_t p = vctp16q(blkCnt); \ + vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \ + } \ +} + +static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 32 + FIR_Q15_MAIN_CORE(); + #undef NBTAPS +} + +static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 24 + FIR_Q15_MAIN_CORE(); + #undef NBTAPS +} + + +static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 16 + 
FIR_Q15_MAIN_CORE(); + #undef NBTAPS +} + +static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, + const q15_t * __restrict pSrc, + q15_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 8 + FIR_Q15_MAIN_CORE(); + #undef NBTAPS +} + + +void arm_fir_q15( + const arm_fir_instance_q15 * S, + const q15_t * pSrc, + q15_t * pDst, + uint32_t blockSize) { q15_t *pState = S->pState; /* State pointer */ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ @@ -72,46 +205,81 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ uint32_t blkCnt; q15x8_t vecIn0; - /* - * load 8 coefs - */ - q15x8_t vecCoeffs = *(q15x8_t *) pCoeffs; + uint32_t tapsBlkCnt = (numTaps + 7) / 8; + q63_t acc0, acc1, acc2, acc3; + + +int32_t nbTaps = (numTaps + 7) >> 3; +switch(nbTaps) { + + case 1: + arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize); + return; + case 2: + arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize); + return; + case 3: + arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize); + return; + case 4: + arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize); + return; + } /* * pState points to state array which contains previous frame (numTaps - 1) samples * pStateCur points to the location where the new input data should be written */ - pStateCur = &(pState[(numTaps - 1u)]); - pTempSrc = pSrc; - pSamples = pState; - pOutput = pDst; - - q63_t acc0, acc1, acc2, acc3; - - blkCnt = blockSize >> 2; + pStateCur = &(pState[(numTaps - 1u)]); + pTempSrc = pSrc; + pSamples = pState; + pOutput = pDst; + blkCnt = blockSize >> 2; while (blkCnt > 0U) { + const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + acc3 = 0LL; + /* - * Save 4 input samples in the history buffer + * Save 8 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 8; pTempSrc += 8; - vecIn0 = vld1q(pSamplesTmp); - acc0 
= vmlaldavq(vecIn0, vecCoeffs); + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[1]); + acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmlaldavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[2]); + acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vmlaldavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[3]); + acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[3]); - acc3 = vmlaldavq(vecIn0, vecCoeffs); + pSamplesTmp += 8; + pCoeffsTmp += 8; + /* + * Decrement the taps block loop counter + */ + i--; + } *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); @@ -130,6 +298,7 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS { case 3: { + const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; @@ -137,20 +306,40 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS acc2 = 0LL; /* - * Save 4 input samples in the history buffer + * Save 8 input samples in the history buffer */ *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc; pStateCur += 8; pTempSrc += 8; - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavq(vecIn0, vecCoeffs); + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmlaldavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vmlaldavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[1]); /* 2nd output: sample window shifted by 1, as in the 4-output loop */ + acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[2]); /* 3rd output: sample window shifted by 2 */ + acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); + + pSamplesTmp += 8; + pCoeffsTmp += 8; 
+ /* + * Decrement the taps block loop counter + */ + i--; + } + + /* + * No asrl pre-shift here: MVE_ASRL_SAT16(acc, 15) below already applies the >> 15 scaling + */ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); @@ -160,23 +349,39 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS case 2: { + const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; acc1 = 0LL; - /* - * Save 4 input samples in the history buffer + * Save 8 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 8; pTempSrc += 8; - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavq(vecIn0, vecCoeffs); + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 8 coefs + */ + q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmlaldavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[1]); /* 2nd output: sample window shifted by 1, as in the 4-output loop */ + acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); + + pSamplesTmp += 8; + pCoeffsTmp += 8; + /* + * Decrement the taps block loop counter + */ + i--; + } *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); @@ -185,126 +390,29 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS case 1: { + const q15_t *pCoeffsTmp = pCoeffs; const q15_t *pSamplesTmp = pSamples; acc0 = 0LL; - /* - * Save 4 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 8; - pTempSrc += 8; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavq(vecIn0, vecCoeffs); - - pSamplesTmp += 4; - - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); - } - break; - } - - /* - * Copy the samples back into the history buffer start - */ - pTempSrc = &S->pState[blockSize]; - pTempDest = S->pState; - - blkCnt = numTaps >> 3; - while (blkCnt > 0U) - { - vst1q(pTempDest, vld1q(pTempSrc)); - pTempSrc += 8; - pTempDest += 8; - blkCnt--; - } - 
blkCnt = numTaps & 7; - if (blkCnt > 0U) - { - mve_pred16_t p0 = vctp16q(blkCnt); - vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0); - } -} - -void arm_fir_q15( - const arm_fir_instance_q15 * S, - const q15_t * pSrc, - q15_t * pDst, - uint32_t blockSize) -{ - q15_t *pState = S->pState; /* State pointer */ - const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - q15_t *pStateCur; /* Points to the current sample of the state */ - const q15_t *pSamples; /* Temporary pointer to the sample buffer */ - q15_t *pOutput; /* Temporary pointer to the output buffer */ - const q15_t *pTempSrc; /* Temporary pointer to the source data */ - q15_t *pTempDest; /* Temporary pointer to the destination buffer */ - uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ - uint32_t blkCnt; - q15x8_t vecIn0; - uint32_t tapsBlkCnt = (numTaps + 7) / 8; - q63_t acc0, acc1, acc2, acc3; - - if (blockSize >= 12) - { - if(numTaps <= 8) { - /* [1 to 8 taps] specialized routine */ - arm_fir_q15_1_8_mve(S,pSrc, pDst, blockSize); - return; - } - } - - if (blockSize >= 12) - { - /* - * pState points to state array which contains previous frame (numTaps - 1) samples - * pStateCur points to the location where the new input data should be written - */ - pStateCur = &(pState[(numTaps - 1u)]); - pTempSrc = pSrc; - pSamples = pState; - pOutput = pDst; - blkCnt = blockSize >> 2; - - while (blkCnt > 0U) - { - const q15_t *pCoeffsTmp = pCoeffs; - const q15_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - acc1 = 0LL; - acc2 = 0LL; - acc3 = 0LL; - /* * Save 8 input samples in the history buffer */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 8; pTempSrc += 8; - - uint32_t i = tapsBlkCnt; - while (i > 0U) + + int i = tapsBlkCnt; + while (i > 0) { /* * load 8 coefs */ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; - + vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); - - vecIn0 = 
vld1q(&pSamplesTmp[2]); - acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[3]); - acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs); - + acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); + pSamplesTmp += 8; pCoeffsTmp += 8; /* @@ -312,197 +420,17 @@ void arm_fir_q15( */ i--; } - + *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15); - - pSamples += 4; - /* - * Decrement the sample block loop counter - */ - blkCnt--; } - - uint32_t residual = blockSize & 3; - switch (residual) - { - case 3: - { - const q15_t *pCoeffsTmp = pCoeffs; - const q15_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - acc1 = 0LL; - acc2 = 0LL; - - /* - * Save 8 input samples in the history buffer - */ - *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc; - pStateCur += 8; - pTempSrc += 8; - - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - /* - * load 8 coefs - */ - q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs); - - pSamplesTmp += 8; - pCoeffsTmp += 8; - /* - * Decrement the taps block loop counter - */ - i--; - } - - - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15); - } - break; - - case 2: - { - const q15_t *pCoeffsTmp = pCoeffs; - const q15_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - acc1 = 0LL; - /* - * Save 8 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 8; - pTempSrc += 8; - - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - /* - * load 8 coefs - */ - q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavaq(acc0, vecIn0, 
vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs); - - pSamplesTmp += 8; - pCoeffsTmp += 8; - /* - * Decrement the taps block loop counter - */ - i--; - } - - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15); - } - break; - - case 1: - { - const q15_t *pCoeffsTmp = pCoeffs; - const q15_t *pSamplesTmp = pSamples; - - acc0 = 0LL; - - /* - * Save 8 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 8; - pTempSrc += 8; - - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - /* - * load 8 coefs - */ - q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs); - - pSamplesTmp += 8; - pCoeffsTmp += 8; - /* - * Decrement the taps block loop counter - */ - i--; - } - - *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15); - } - break; - } - } - else - { - q15_t *pStateCurnt; /* Points to the current sample of the state */ - q15_t *px; /* Temporary pointer for state buffer */ - const q15_t *pb; /* Temporary pointer for coefficient buffer */ - q63_t acc0; /* Accumulator */ - uint32_t blkCnt,tapCnt; /* Loop counters */ - pStateCurnt = &(S->pState[(numTaps - 1U)]); - blkCnt = blockSize; - while (blkCnt > 0U) - { - /* Copy two samples into state buffer */ - *pStateCurnt++ = *pSrc++; - - /* Set the accumulator to zero */ - acc0 = 0; - - /* Use SIMD to hold states and coefficients */ - px = pState; - pb = pCoeffs; - - tapCnt = numTaps >> 1U; - - while (tapCnt > 0U) - { - acc0 += (q15_t) *px++ * *pb++; - acc0 += (q15_t) *px++ * *pb++; - - tapCnt--; - } - - - /* The result is in 2.30 format. Convert to 1.15 with saturation. - Then store the output in the destination buffer. 
*/ - *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16)); - - /* Advance state pointer by 1 for the next sample */ - pState = pState + 1U; - - /* Decrement loop counter */ - blkCnt--; - } + break; } + /* * Copy the samples back into the history buffer start */ - pTempSrc = &S->pState[blockSize]; - pTempDest = S->pState; + pTempSrc = &pState[blockSize]; + pTempDest = pState; blkCnt = numTaps >> 3; while (blkCnt > 0U) diff --git a/Source/FilteringFunctions/arm_fir_q7.c b/Source/FilteringFunctions/arm_fir_q7.c index bb2aaf5b..9222ba90 100644 --- a/Source/FilteringFunctions/arm_fir_q7.c +++ b/Source/FilteringFunctions/arm_fir_q7.c @@ -56,7 +56,115 @@ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) -void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t * pDst, uint32_t blockSize) +#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \ + for (int j = 0; j < nbAcc; j++) { \ + const q7_t *pSmp = &pSample[j]; \ + q31_t acc[4]; \ + \ + acc[j] = 0; \ + for (int i = 0; i < nbVecTaps; i++) { \ + vecIn0 = vld1q(pSmp + 16 * i); \ + acc[j] = vmladavaq(acc[j], vecIn0, vecCoeffs[i]); \ + } \ + *pOutput++ = (q7_t) __SSAT((acc[j] >> 7U), 8); \ + } + +#define FIR_Q7_MAIN_CORE() \ +{ \ + q7_t *pState = S->pState; /* State pointer */ \ + const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \ + q7_t *pStateCur; /* Points to the current sample of the state */ \ + const q7_t *pSamples; /* Temporary pointer to the sample buffer */ \ + q7_t *pOutput; /* Temporary pointer to the output buffer */ \ + const q7_t *pTempSrc; /* Temporary pointer to the source data */ \ + q7_t *pTempDest; /* Temporary pointer to the destination buffer */\ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\ + int32_t blkCnt; \ + q7x16_t vecIn0; \ + const int32_t nbVecTaps = (NBTAPS / 16); \ + \ + /* \ + * load coefs \ + */ \ + q7x16_t vecCoeffs[nbVecTaps]; \ + \ + for (int i = 0; i < nbVecTaps; i++) \ + vecCoeffs[i] = 
vldrbq_s8(pCoeffs + 16 * i); \ + \ + /* \ + * pState points to state array which contains previous frame (numTaps - 1) samples \ + * pStateCur points to the location where the new input data should be written \ + */ \ + pStateCur = &(pState[(numTaps - 1u)]); \ + pTempSrc = pSrc; \ + pSamples = pState; \ + pOutput = pDst; \ + \ + blkCnt = blockSize >> 2; \ + while (blkCnt > 0) { \ + /* \ + * Save 4 input samples in the history buffer \ + */ \ + vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc)); \ + pStateCur += 4; \ + pTempSrc += 4; \ + \ + FIR_Q7_CORE(pOutput, 4, nbVecTaps, pSamples, vecCoeffs); \ + pSamples += 4; \ + \ + blkCnt--; \ + } \ + \ + /* tail */ \ + int32_t residual = blockSize & 3; \ + \ + for (int i = 0; i < residual; i++) \ + *pStateCur++ = *pTempSrc++; \ + \ + FIR_Q7_CORE(pOutput, residual, nbVecTaps, pSamples, vecCoeffs); \ + \ + \ + /* \ + * Copy the samples back into the history buffer start \ + */ \ + pTempSrc = &pState[blockSize]; \ + pTempDest = pState; \ + blkCnt = numTaps - 1; \ + do { \ + mve_pred16_t p = vctp8q(blkCnt); \ + \ + vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p); \ + pTempSrc += 16; \ + pTempDest += 16; \ + blkCnt -= 16; \ + } \ + while (blkCnt > 0); \ +} + +static void arm_fir_q7_17_32_mve(const arm_fir_instance_q7 * S, + const q7_t * __restrict pSrc, + q7_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 32 + FIR_Q7_MAIN_CORE(); + #undef NBTAPS +} + + +void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, + const q7_t * __restrict pSrc, + q7_t * __restrict pDst, uint32_t blockSize) +{ + #define NBTAPS 16 + FIR_Q7_MAIN_CORE(); + #undef NBTAPS +} + +void arm_fir_q7( + const arm_fir_instance_q7 * S, + const q7_t * pSrc, + q7_t * pDst, + uint32_t blockSize) { q7_t *pState = S->pState; /* State pointer */ const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ @@ -68,9 +176,27 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t uint32_t numTaps = S->numTaps; /* Number of filter 
coefficients in the filter */ uint32_t blkCnt; q7x16_t vecIn0; + uint32_t tapsBlkCnt = (numTaps + 15) / 16; q31_t acc0, acc1, acc2, acc3; q7x16_t vecCoeffs; + if (numTaps <= 16) + { + /* + * [1 to 16 taps] specialized routine + */ + arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize); + return; + } + else if (numTaps <= 32) + { + /* + * [17 to 32 taps] specialized routine + */ + arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize); + return; + } + /* * pState points to state array which contains previous frame (numTaps - 1) samples * pStateCur points to the location where the new input data should be written @@ -82,12 +208,17 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t blkCnt = blockSize >> 2; /* - * load 16 coefs + * outer samples loop */ - vecCoeffs = *(q7x16_t *) pCoeffs; - while (blkCnt > 0U) { + const q7_t *pCoeffsTmp = pCoeffs; + const q7_t *pSamplesTmp = pSamples; + + acc0 = 0; + acc1 = 0; + acc2 = 0; + acc3 = 0; /* * Save 16 input samples in the history buffer */ @@ -95,18 +226,36 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t pStateCur += 16; pTempSrc += 16; - vecIn0 = vld1q(pSamples); - acc0 = vmladavq(vecIn0, vecCoeffs); + /* + * inner coefficients loop + */ + int i = tapsBlkCnt; + while (i > 0) + { + /* + * load 16 coefs + */ + vecCoeffs = *(q7x16_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamples[1]);; - acc1 = vmladavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[2]);; - acc2 = vmladavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[1]); + acc1 = vmladavaq(acc1, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[3]); - acc3 = vmladavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[2]); + acc2 = vmladavaq(acc2, vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[3]); + acc3 = vmladavaq(acc3, vecIn0, vecCoeffs); + + pSamplesTmp += 16; + pCoeffsTmp += 16; + /* + * Decrement the taps block loop counter + */ + i--; + } /* * 
Store the 1.7 format filter output in destination buffer */ @@ -127,18 +276,37 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t { case 3: { + const q7_t *pCoeffsTmp = pCoeffs; + const q7_t *pSamplesTmp = pSamples; + + acc0 = 0; + acc1 = 0; + acc2 = 0; + /* + * Save 16 input samples in the history buffer + */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 16; pTempSrc += 16; - vecIn0 = vld1q(pSamples); - acc0 = vmladavq(vecIn0, vecCoeffs); + int i = tapsBlkCnt; + while (i > 0) + { + vecCoeffs = *(q7x16_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[1]); /* 2nd output: sample window shifted by 1, as in the 4-output loop */ + acc1 = vmladavaq(acc1, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[1]); - acc1 = vmladavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(&pSamplesTmp[2]); /* 3rd output: sample window shifted by 2 */ + acc2 = vmladavaq(acc2, vecIn0, vecCoeffs); - vecIn0 = vld1q(&pSamples[2]); - acc2 = vmladavq(vecIn0, vecCoeffs); + pSamplesTmp += 16; + pCoeffsTmp += 16; + i--; + } *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8); @@ -148,15 +316,33 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t case 2: { + const q7_t *pCoeffsTmp = pCoeffs; + const q7_t *pSamplesTmp = pSamples; + + acc0 = 0; + acc1 = 0; + /* + * Save 16 input samples in the history buffer + */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 16; pTempSrc += 16; - vecIn0 = vld1q(pSamples); - acc0 = vmladavq(vecIn0, vecCoeffs); + int i = tapsBlkCnt; + while (i > 0) + { + vecCoeffs = *(q7x16_t *) pCoeffsTmp; - vecIn0 = vld1q(&pSamples[1]); - acc1 = vmladavq(vecIn0, vecCoeffs); + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); + + vecIn0 = vld1q(&pSamplesTmp[1]); /* 2nd output: sample window shifted by 1, as in the 4-output loop */ + acc1 = vmladavaq(acc1, vecIn0, vecCoeffs); + + pSamplesTmp += 16; + pCoeffsTmp += 16; + i--; + } *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8); @@ -165,13 +351,29 @@ void arm_fir_q7_1_16_mve(const 
arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t case 1: { + const q7_t *pCoeffsTmp = pCoeffs; + const q7_t *pSamplesTmp = pSamples; + + acc0 = 0; + /* + * Save 16 input samples in the history buffer + */ vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 16; pTempSrc += 16; - vecIn0 = vld1q(pSamples); - acc0 = vmladavq(vecIn0, vecCoeffs); + int i = tapsBlkCnt; + while (i > 0) + { + vecCoeffs = *(q7x16_t *) pCoeffsTmp; + + vecIn0 = vld1q(pSamplesTmp); + acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); + pSamplesTmp += 16; + pCoeffsTmp += 16; + i--; + } *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); } break; @@ -198,288 +400,6 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0); } } - -void arm_fir_q7( - const arm_fir_instance_q7 * S, - const q7_t * pSrc, - q7_t * pDst, - uint32_t blockSize) -{ - q7_t *pState = S->pState; /* State pointer */ - const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ - q7_t *pStateCur; /* Points to the current sample of the state */ - const q7_t *pSamples; /* Temporary pointer to the sample buffer */ - q7_t *pOutput; /* Temporary pointer to the output buffer */ - const q7_t *pTempSrc; /* Temporary pointer to the source data */ - q7_t *pTempDest; /* Temporary pointer to the destination buffer */ - uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ - uint32_t blkCnt; - q7x16_t vecIn0; - uint32_t tapsBlkCnt = (numTaps + 15) / 16; - q31_t acc0, acc1, acc2, acc3; - q7x16_t vecCoeffs; - - if (blockSize >= 20) - { - if (numTaps <= 16) - { - /* - * [1 to 16 taps] specialized routine - */ - arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize); - return; - } - } - - if (blockSize >= 20) - { - /* - * pState points to state array which contains previous frame (numTaps - 1) samples - * pStateCur points to the location where the new input data should be written - */ - pStateCur = &(pState[(numTaps - 1u)]); - pSamples = pState; - pTempSrc = pSrc; - pOutput = 
pDst; - blkCnt = blockSize >> 2; - - /* - * outer samples loop - */ - while (blkCnt > 0U) - { - const q7_t *pCoeffsTmp = pCoeffs; - const q7_t *pSamplesTmp = pSamples; - - acc0 = 0; - acc1 = 0; - acc2 = 0; - acc3 = 0; - /* - * Save 16 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 16; - pTempSrc += 16; - - /* - * inner coefficients loop - */ - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - /* - * load 16 coefs - */ - vecCoeffs = *(q7x16_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmladavaq(acc1, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vmladavaq(acc2, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[3]); - acc3 = vmladavaq(acc3, vecIn0, vecCoeffs); - - pSamplesTmp += 16; - pCoeffsTmp += 16; - /* - * Decrement the taps block loop counter - */ - i--; - } - /* - * Store the 1.7 format filter output in destination buffer - */ - *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); - *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8); - *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8); - *pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8); - - pSamples += 4; - /* - * Decrement the sample block loop counter - */ - blkCnt--; - } - - uint32_t residual = blockSize & 3; - switch (residual) - { - case 3: - { - const q7_t *pCoeffsTmp = pCoeffs; - const q7_t *pSamplesTmp = pSamples; - - acc0 = 0; - acc1 = 0; - acc2 = 0; - /* - * Save 16 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 16; - pTempSrc += 16; - - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - vecCoeffs = *(q7x16_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmladavaq(acc1, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[2]); - acc2 = vmladavaq(acc2, vecIn0, vecCoeffs); - - pSamplesTmp += 16; - pCoeffsTmp += 16; - i--; - } - - 
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); - *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8); - *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8); - } - break; - - case 2: - { - const q7_t *pCoeffsTmp = pCoeffs; - const q7_t *pSamplesTmp = pSamples; - - acc0 = 0; - acc1 = 0; - /* - * Save 16 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 16; - pTempSrc += 16; - - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - vecCoeffs = *(q7x16_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); - - vecIn0 = vld1q(&pSamplesTmp[1]); - acc1 = vmladavaq(acc1, vecIn0, vecCoeffs); - - pSamplesTmp += 16; - pCoeffsTmp += 16; - i--; - } - - *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); - *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8); - } - break; - - case 1: - { - const q7_t *pCoeffsTmp = pCoeffs; - const q7_t *pSamplesTmp = pSamples; - - acc0 = 0; - /* - * Save 16 input samples in the history buffer - */ - vst1q(pStateCur, vld1q(pTempSrc)); - pStateCur += 16; - pTempSrc += 16; - - uint32_t i = tapsBlkCnt; - while (i > 0U) - { - vecCoeffs = *(q7x16_t *) pCoeffsTmp; - - vecIn0 = vld1q(pSamplesTmp); - acc0 = vmladavaq(acc0, vecIn0, vecCoeffs); - - pSamplesTmp += 16; - pCoeffsTmp += 16; - i--; - } - *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8); - } - break; - } - } - else - { - q7_t *pStateCurnt; /* Points to the current sample of the state */ - q7_t *px; /* Temporary pointer for state buffer */ - const q7_t *pb; /* Temporary pointer for coefficient buffer */ - q31_t acc0; /* Accumulator */ - uint32_t i,blkCnt; /* Loop counters */ - pStateCurnt = &(S->pState[(numTaps - 1U)]); - blkCnt = blockSize; - - while (blkCnt > 0U) - { - /* Copy one sample at a time into state buffer */ - *pStateCurnt++ = *pSrc++; - - /* Set the accumulator to zero */ - acc0 = 0; - - /* Initialize state pointer */ - px = pState; - - /* Initialize Coefficient pointer */ - pb = pCoeffs; - - i = numTaps; - - /* Perform the multiply-accumulates 
*/ - while (i > 0U) - { - acc0 += (q15_t) * (px++) * (*(pb++)); - i--; - } - - /* The result is in 2.14 format. Convert to 1.7 - Then store the output in the destination buffer. */ - *pDst++ = __SSAT((acc0 >> 7U), 8); - - /* Advance state pointer by 1 for the next sample */ - pState = pState + 1U; - - /* Decrement loop counter */ - blkCnt--; - } - } - /* - * Copy the samples back into the history buffer start - */ - pTempSrc = &S->pState[blockSize]; - pTempDest = S->pState; - - blkCnt = numTaps >> 4; - while (blkCnt > 0U) - { - vst1q(pTempDest, vld1q(pTempSrc)); - pTempSrc += 16; - pTempDest += 16; - blkCnt--; - } - blkCnt = numTaps & 0xF; - if (blkCnt > 0U) - { - mve_pred16_t p0 = vctp8q(blkCnt); - vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0); - } -} #else void arm_fir_q7( const arm_fir_instance_q7 * S, diff --git a/Testing/CMakeLists.txt b/Testing/CMakeLists.txt index 28cc8db4..a0a14c0f 100644 --- a/Testing/CMakeLists.txt +++ b/Testing/CMakeLists.txt @@ -168,6 +168,7 @@ set (NNSRC Source/Benchmarks/FIRF32.cpp Source/Benchmarks/FIRQ31.cpp Source/Benchmarks/FIRQ15.cpp + Source/Benchmarks/FIRQ7.cpp Source/Benchmarks/MISCF32.cpp Source/Benchmarks/MISCQ31.cpp Source/Benchmarks/MISCQ15.cpp diff --git a/Testing/Include/Benchmarks/FIRQ7.h b/Testing/Include/Benchmarks/FIRQ7.h new file mode 100755 index 00000000..c07ef798 --- /dev/null +++ b/Testing/Include/Benchmarks/FIRQ7.h @@ -0,0 +1,33 @@ +#include "Test.h" +#include "Pattern.h" + +#include "dsp/filtering_functions.h" + +class FIRQ7:public Client::Suite + { + public: + FIRQ7(Testing::testID_t id); + virtual void setUp(Testing::testID_t,std::vector& params,Client::PatternMgr *mgr); + virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr); + private: + #include "FIRQ7_decl.h" + Client::Pattern coefs; + Client::Pattern samples; + Client::Pattern refs; + + Client::LocalPattern output; + Client::LocalPattern error; + Client::LocalPattern state; + + int nbTaps; + int nbSamples; + + arm_fir_instance_q7 instFir; + + 
const q7_t *pSrc;
+        const q7_t *pCoefs;
+        q7_t *pDst;
+        const q7_t *pRef;
+        q7_t *pErr;
+
+    };
diff --git a/Testing/Source/Benchmarks/FIRQ15.cpp b/Testing/Source/Benchmarks/FIRQ15.cpp
index 72677ad3..27fdc144 100755
--- a/Testing/Source/Benchmarks/FIRQ15.cpp
+++ b/Testing/Source/Benchmarks/FIRQ15.cpp
@@ -1,6 +1,9 @@
 #include "FIRQ15.h"
 #include "Error.h"
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) q15_t coeffArray[64];
+#endif
     void FIRQ15::test_fir_q15()
     {
@@ -35,10 +38,21 @@
        switch(id)
        {
            case TEST_FIR_Q15_1:
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+             /* Copy the coefficients into the zero-padded array; the filter is
+                initialised below with that copy (pCoefs), not with coefs.ptr() */
+             memset(coeffArray,0,32*sizeof(q15_t));
+             q15_t *ptr;
+
+             ptr=coefs.ptr();
+             memcpy(coeffArray,ptr,this->nbTaps*sizeof(q15_t));
+             this->pCoefs = coeffArray;
+#else
+             this->pCoefs=coefs.ptr();
+#endif
-             arm_fir_init_q15(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
+             arm_fir_init_q15(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
              this->pSrc=samples.ptr();
-             this->pCoefs=coefs.ptr();
              this->pDst=output.ptr();
            break;
diff --git a/Testing/Source/Benchmarks/FIRQ31.cpp b/Testing/Source/Benchmarks/FIRQ31.cpp
index 25cc206e..d590e1e2 100755
--- a/Testing/Source/Benchmarks/FIRQ31.cpp
+++ b/Testing/Source/Benchmarks/FIRQ31.cpp
@@ -1,7 +1,7 @@
 #include "FIRQ31.h"
 #include "Error.h"
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 static __ALIGNED(8) q31_t coeffArray[64];
 #endif
@@ -39,7 +39,7 @@ static __ALIGNED(8) q31_t coeffArray[64];
        switch(id)
        {
            case TEST_FIR_Q31_1:
-#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
              /* Copy coefficients and pad to zero
              */
              memset(coeffArray,0,32*sizeof(q31_t));
diff --git a/Testing/Source/Benchmarks/FIRQ7.cpp b/Testing/Source/Benchmarks/FIRQ7.cpp
new file mode 100755
index 00000000..174cb34d
--- /dev/null
+++ b/Testing/Source/Benchmarks/FIRQ7.cpp
@@ -0,0 +1,60 @@
+#include "FIRQ7.h"
+#include "Error.h"
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) q7_t coeffArray[64];
+#endif
+    /* Runs arm_fir_q7 once on the buffers prepared by setUp. */
+    void FIRQ7::test_fir_q7()
+    {
+       arm_fir_q7(&instFir, this->pSrc, this->pDst, this->nbSamples);
+    }
+
+    /* Reads the tap and sample counts from params, loads the input patterns,
+       creates the state and output buffers, then initialises the FIR instance
+       for the selected test. */
+    void FIRQ7::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
+    {
+       /* params: number of taps first, then number of samples */
+
+       std::vector<Testing::param_t>::iterator it = params.begin();
+       this->nbTaps = *it++;
+       this->nbSamples = *it;
+
+       samples.reload(FIRQ7::SAMPLES1_Q7_ID,mgr,this->nbSamples);
+       coefs.reload(FIRQ7::COEFS1_Q7_ID,mgr,this->nbTaps);
+
+       state.create(this->nbSamples + this->nbTaps - 1,FIRQ7::STATE_Q7_ID,mgr);
+       output.create(this->nbSamples,FIRQ7::OUT_SAMPLES_Q7_ID,mgr);
+
+       switch(id)
+       {
+           case TEST_FIR_Q7_1:
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+             /* Zero the whole padded array, copy the coefficients into it and
+                initialise the filter with that copy (pCoefs), not coefs.ptr() */
+             memset(coeffArray,0,sizeof(coeffArray));
+             q7_t *ptr;
+
+             ptr=coefs.ptr();
+             memcpy(coeffArray,ptr,this->nbTaps*sizeof(q7_t));
+             this->pCoefs = coeffArray;
+#else
+             this->pCoefs=coefs.ptr();
+#endif
+             arm_fir_init_q7(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
+
+             this->pSrc=samples.ptr();
+             this->pDst=output.ptr();
+           break;
+
+
+
+
+       }
+
+    }
+    /* No per-test cleanup is performed. */
+    void FIRQ7::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
+    {
+    }
diff --git a/Testing/Source/Tests/FIRQ15.cpp b/Testing/Source/Tests/FIRQ15.cpp
index 05dfa2cd..91ef8bb6 100644
--- a/Testing/Source/Tests/FIRQ15.cpp
+++ b/Testing/Source/Tests/FIRQ15.cpp
@@ -130,10 +130,10 @@ void checkInnerTail(q15_t *b)
        ref.reload(FIRQ15::FIRREFS_Q15_ID,mgr);
        output.create(ref.nbSamples(),FIRQ15::OUT_Q15_ID,mgr);
-       /* Max blockSize + numTaps as generated by Python script
+       /* > Max blockSize + numTaps as generated by Python script
        numTaps may be increased by 1 by Python script to force it to even values
        */
-       state.create(41,FIRQ15::OUT_Q15_ID,mgr);
+       state.create(3 * 41,FIRQ15::OUT_Q15_ID,mgr);
     }
     void FIRQ15::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
diff --git
a/Testing/bench.txt b/Testing/bench.txt index 723cca82..3e8bfee3 100755 --- a/Testing/bench.txt +++ b/Testing/bench.txt @@ -499,6 +499,35 @@ group Root { Normalized LMS Filter:test_lms_norm_q15 } -> PARAM1_ID } + + suite FIR Q7 { + class = FIRQ7 + folder = FIRQ7 + + ParamList { + NumTaps, NB + Summary NumTaps, NB + Names "Number of taps","Number of samples" + Formula "NumTaps * NB" + } + + Pattern SAMPLES1_Q7_ID : Samples1_q7.txt + Pattern REFS1_Q7_ID : Refs1_q7.txt + Pattern COEFS1_Q7_ID : Coefs1_q7.txt + + Output OUT_SAMPLES_Q7_ID : Output + Output STATE_Q7_ID : State + Output ERR_Q7_ID : Err + + Params PARAM1_ID = { + NumTaps = [16,32,64] + NB = [64,128,256] + } + + Functions { + FIR Filter:test_fir_q7 + } -> PARAM1_ID + } } group Convolutions / Correlations {