From f51e95f5a8afe22b524dcaea9a16c33d38561ca3 Mon Sep 17 00:00:00 2001 From: GuentherMartin Date: Mon, 23 Mar 2020 09:04:23 +0100 Subject: [PATCH] CMSIS-DSP: Corrected tail management bug in MVE version of arm_fir_f32.c --- Source/FilteringFunctions/arm_fir_f32.c | 136 ++++++++++++------------ 1 file changed, 69 insertions(+), 67 deletions(-) diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c index bb2da425..4ce8dfaa 100644 --- a/Source/FilteringFunctions/arm_fir_f32.c +++ b/Source/FilteringFunctions/arm_fir_f32.c @@ -188,7 +188,9 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t blkCnt--; } + blkCnt = blockSize & 3; + if (blkCnt > 0U) { mve_pred16_t p0 = vctp32q(blkCnt); @@ -217,6 +219,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t */ pTempSrc = &S->pState[blockSize]; pTempDest = S->pState; + blkCnt = numTaps >> 2; while (blkCnt > 0U) { @@ -225,6 +228,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t pTempDest += 4; blkCnt--; } + blkCnt = numTaps & 3; if (blkCnt > 0U) { @@ -234,7 +238,6 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t } - static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) { float32_t *pState = S->pState; /* State pointer */ @@ -314,9 +317,8 @@ static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t } blkCnt = blockSize & 3; - if (blkCnt > 0) + if (blkCnt > 0U) { - mve_pred16_t p0 = vctp32q(blkCnt); vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0); @@ -364,6 +366,7 @@ static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t pTempDest += 4; blkCnt--; } + blkCnt = numTaps & 3; if (blkCnt > 0U) { @@ -372,6 +375,7 @@ static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t } } + void arm_fir_f32( const arm_fir_instance_f32 * S, const float32_t * pSrc, @@ -413,9 +417,6 @@ uint32_t blockSize) } } - - - if (blockSize >= 8) { /* @@ -432,7 +433,7 @@ uint32_t blockSize) { int32_t i; const float32_t *pCoeffsCur = pCoeffs; - + c0 = *pCoeffsCur++; c1 = *pCoeffsCur++; c2 = *pCoeffsCur++; @@ -441,37 +442,37 @@ uint32_t blockSize) c5 = *pCoeffsCur++; c6 = *pCoeffsCur++; c7 = *pCoeffsCur++; - + vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 4; pTempSrc += 4; - + vecIn0 = vld1q(pSamples); vecAcc0 = vmulq(vecIn0, c0); - + vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - + vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - + vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - + vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - + vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - + vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); - + vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); - + pSamples += 8; - + numCnt = ((int32_t)numTaps - 8) / 8; for (i = 0; i < numCnt; i++) @@ -484,31 +485,31 @@ uint32_t blockSize) c5 = *pCoeffsCur++; c6 = *pCoeffsCur++; c7 = *pCoeffsCur++; - + vecIn0 = vld1q(pSamples); vecAcc0 = vfmaq(vecAcc0, vecIn0, c0); - + vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - + vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - + vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - + vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - + vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - + vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); - + vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); - + pSamples += 8; } @@ -523,24 +524,25 @@ uint32_t blockSize) numCnt --; } - + vst1q(pOutput, vecAcc0); pOutput += 4; pSamples = pSamples - numTaps + 4; - + blkCnt--; } - + blkCnt = blockSize & 3; + if (blkCnt > 0U) { mve_pred16_t p0 = vctp32q(blkCnt); int32_t i; const float32_t *pCoeffsCur = pCoeffs; - + vst1q(pStateCur, vld1q(pTempSrc)); pStateCur += 4; pTempSrc += 4; - + c0 = *pCoeffsCur++; c1 = *pCoeffsCur++; c2 = *pCoeffsCur++; @@ -549,33 +551,33 @@ uint32_t blockSize) c5 = *pCoeffsCur++; c6 = *pCoeffsCur++; c7 = *pCoeffsCur++; - + vecIn0 = vld1q(pSamples); vecAcc0 = vmulq(vecIn0, c0); - + vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - + vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - + vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - + vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - + vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - + vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); - + vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); - + pSamples += 8; - + numCnt = ((int32_t)numTaps - 8) / 8; for (i = 0; i < numCnt; i++) @@ -588,31 +590,31 @@ uint32_t blockSize) c5 = *pCoeffsCur++; c6 = *pCoeffsCur++; c7 = *pCoeffsCur++; - + vecIn0 = vld1q(pSamples); vecAcc0 = vfmaq(vecAcc0, vecIn0, c0); - + vecIn0 = vld1q(&pSamples[1]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); - + vecIn0 = vld1q(&pSamples[2]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); - + vecIn0 = vld1q(&pSamples[3]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); - + vecIn0 = vld1q(&pSamples[4]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); - + vecIn0 = vld1q(&pSamples[5]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); - + vecIn0 = vld1q(&pSamples[6]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); - + vecIn0 = vld1q(&pSamples[7]); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); - + pSamples += 8; } @@ -627,8 +629,7 @@ uint32_t blockSize) numCnt --; } - - + vstrwq_p_f32(pOutput, vecAcc0, p0); } } @@ -641,40 +642,40 @@ uint32_t blockSize) uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ uint32_t i, blkCnt; /* Loop counters */ pStateCurnt = &(S->pState[(numTaps - 1U)]); + blkCnt = blockSize; while (blkCnt > 0U) { /* Copy one sample at a time into state buffer */ *pStateCurnt++ = *pSrc++; - + /* Set the accumulator to zero */ acc0 = 0.0f; - + /* Initialize state pointer */ px = pState; - + /* Initialize Coefficient pointer */ pb = pCoeffs; - + i = numTaps; - + /* Perform the multiply-accumulates */ while (i > 0U) { /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */ acc0 += *px++ * *pb++; - + i--; } - /* Store result in destination buffer. */ *pDst++ = acc0; - + /* Advance state pointer by 1 for the next sample */ pState = pState + 1U; - + /* Decrement loop counter */ blkCnt--; } @@ -694,6 +695,7 @@ uint32_t blockSize) pTempDest += 4; blkCnt--; } + blkCnt = numTaps & 3; if (blkCnt > 0U) { @@ -773,7 +775,7 @@ uint32_t blockSize) xa = vextq_f32(x0,x1,1); xb = vextq_f32(x1,x2,1); - + accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1)); accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1)); @@ -785,7 +787,7 @@ uint32_t blockSize) xa = vextq_f32(x0,x1,3); xb = vextq_f32(x1,x2,3); - + accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3)); accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3)); @@ -821,7 +823,7 @@ uint32_t blockSize) xa = vextq_f32(x0,x1,2); xb = vextq_f32(x1,x2,2); - + accv0 = vmlaq_n_f32(accv0,xa,*pb); accv1 = vmlaq_n_f32(accv1,xb,*pb); @@ -836,7 +838,7 @@ uint32_t blockSize) xa = vextq_f32(x0,x1,1); xb = vextq_f32(x1,x2,1); - + accv0 = vmlaq_n_f32(accv0,xa,*pb); accv1 = vmlaq_n_f32(accv1,xb,*pb); @@ -844,7 +846,7 @@ uint32_t blockSize) break; case 1: { - + accv0 = vmlaq_n_f32(accv0,x0,*pb); accv1 = vmlaq_n_f32(accv1,x1,*pb);