diff --git a/Include/dsp/filtering_functions.h b/Include/dsp/filtering_functions.h
index 0cbd7cf4..1f9e0fac 100755
--- a/Include/dsp/filtering_functions.h
+++ b/Include/dsp/filtering_functions.h
@@ -39,6 +39,8 @@ extern "C"
{
#endif
+
+
#define DELTA_Q31 ((q31_t)(0x100))
#define DELTA_Q15 ((q15_t)0x5)
diff --git a/Include/dsp/filtering_functions_f16.h b/Include/dsp/filtering_functions_f16.h
index 4a99e831..0265f04e 100755
--- a/Include/dsp/filtering_functions_f16.h
+++ b/Include/dsp/filtering_functions_f16.h
@@ -40,6 +40,8 @@ extern "C"
#if defined(ARM_FLOAT16_SUPPORTED)
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
+
/**
* @brief Instance structure for the floating-point FIR filter.
*/
diff --git a/Source/FilteringFunctions/arm_fir_f16.c b/Source/FilteringFunctions/arm_fir_f16.c
index d8713cf3..0cc55843 100755
--- a/Source/FilteringFunctions/arm_fir_f16.c
+++ b/Source/FilteringFunctions/arm_fir_f16.c
@@ -47,6 +47,8 @@
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+#define FIR_F32_MAX_COEF_BLK 8
+
#define FIR_F16_CORE(pSamples, c, NB_TAPS) \
vecAcc0 = vdupq_n_f16(0.0f16); \
for (int i = 0; i < NB_TAPS; i++) { \
@@ -54,7 +56,9 @@
vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \
}
-static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,
+ const float16_t * __restrict pSrc,
+ float16_t * __restrict pDst, uint32_t blockSize)
{
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@@ -65,8 +69,8 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
float16_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
int32_t blkCnt;
- f16x8_t vecIn0;
- f16x8_t vecAcc0;
+ float16x8_t vecIn0;
+ float16x8_t vecAcc0;
const int NB_TAPS=4;
float16_t c[NB_TAPS];
@@ -107,6 +111,7 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
}
blkCnt = blockSize & 7;
+ if (blkCnt)
{
mve_pred16_t p0 = vctp16q(blkCnt);
@@ -141,7 +146,9 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
}
-static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
+__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,
+ const float16_t * __restrict pSrc,
+ float16_t * __restrict pDst, uint32_t blockSize)
{
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@@ -152,8 +159,8 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
float16_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
int32_t blkCnt;
- f16x8_t vecIn0;
- f16x8_t vecAcc0;
+ float16x8_t vecIn0;
+ float16x8_t vecAcc0;
const int NB_TAPS=8;
float16_t c[NB_TAPS];
@@ -194,6 +201,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
}
blkCnt = blockSize & 7;
+ if (blkCnt)
{
mve_pred16_t p0 = vctp16q(blkCnt);
@@ -224,6 +232,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
mve_pred16_t p0 = vctp16q(blkCnt);
vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
}
+
}
@@ -232,224 +241,299 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
float16_t * pDst,
uint32_t blockSize)
{
- float16_t *pState = S->pState; /* State pointer */
- const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- float16_t *pStateCur; /* Points to the current sample of the state */
- const float16_t *pSamples; /* Temporary pointer to the sample buffer */
- float16_t *pOutput; /* Temporary pointer to the output buffer */
- const float16_t *pTempSrc; /* Temporary pointer to the source data */
- float16_t *pTempDest; /* Temporary pointer to the destination buffer */
- int32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t blkCnt;
- f16x8_t vecIn0;
- f16x8_t vecAcc0;
- float16_t c0, c1, c2, c3;
- float16_t c4, c5, c6, c7;
+ float16_t *pRefStatePtr = S->pState + ROUND_UP(blockSize, 8);
+ float16_t *pState = pRefStatePtr ; /* State pointer */
+ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ const float16_t *pSamples; /* Temporary pointer to the sample buffer */
+ float16_t *pOutput; /* Temporary pointer to the output buffer */
+ const float16_t *pTempSrc; /* Temporary pointer to the source data */
+ float16_t *pTempDest; /* Temporary pointer to the destination buffer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t blkCnt;
+ float16_t c0, c1, c2, c3;
+ float16_t c4, c5, c6, c7;
/*
* [1 to 8 taps] specialized routines
*/
- if (numTaps <= 4)
- {
+ if (numTaps <= 4) {
arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
return;
- }
- else if (numTaps <= 8)
- {
+ } else if (numTaps <= 8) {
arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
return;
}
- /*
- * pState points to state array which contains previous frame (numTaps - 1) samples
- * pStateCur points to the location where the new input data should be written
- */
- pStateCur = &(pState[(numTaps - 1u)]);
- /*
- * Copy new data into state so that we obtain a continuous sample buffer
- * containing both the tail end of the old data and the new data.
- */
- pSamples = pState;
pTempSrc = pSrc;
- pOutput = pDst;
-
- blkCnt = blockSize >> 3;
- while (blkCnt > 0U)
- {
- int i;
- const float16_t *pCoeffsCur = pCoeffs;
-
- /*
- * Save 8 input samples in the history buffer
- */
- vst1q(pStateCur, vld1q(pTempSrc));
- pStateCur += 8;
+ pTempDest = &(pState[(numTaps - 1u)]);
+ int cnt = blockSize;
+ do {
+ mve_pred16_t p0 = vctp16q(cnt);
+ vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+ pTempDest += 8;
pTempSrc += 8;
+ cnt -= 8;
+ } while (cnt > 0);
+
+ float16_t *partial_accu_ptr = S->pState;
- c0 = *pCoeffsCur++;
- c1 = *pCoeffsCur++;
- c2 = *pCoeffsCur++;
- c3 = *pCoeffsCur++;
- c4 = *pCoeffsCur++;
- c5 = *pCoeffsCur++;
- c6 = *pCoeffsCur++;
- c7 = *pCoeffsCur++;
+ pSamples = pState;
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ c4 = *pCoeffs++;
+ c5 = *pCoeffs++;
+ c6 = *pCoeffs++;
+ c7 = *pCoeffs++;
+
+ cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
-
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ pSamples += 8;
+ vst1q(partial_accu_ptr, vecAcc0);
+ cnt--;
+ partial_accu_ptr += 8;
+ }
+
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ mve_pred16_t p0 = vctp16q(cnt);
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+ }
- pSamples += 8;
+ int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+ int sample_offset = FIR_F32_MAX_COEF_BLK;
+ while (localTaps > FIR_F32_MAX_COEF_BLK) {
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ c4 = *pCoeffs++;
+ c5 = *pCoeffs++;
+ c6 = *pCoeffs++;
+ c7 = *pCoeffs++;
+
+ partial_accu_ptr = S->pState;
+ pSamples = pState + sample_offset;
+ int cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
- for (i = 0; i <= ((numTaps - 9) / 8); i++)
- {
- c0 = *pCoeffsCur++;
- c1 = *pCoeffsCur++;
- c2 = *pCoeffsCur++;
- c3 = *pCoeffsCur++;
- c4 = *pCoeffsCur++;
- c5 = *pCoeffsCur++;
- c6 = *pCoeffsCur++;
- c7 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
+ vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
-
pSamples += 8;
+ vecAcc0 += vld1q_f16(partial_accu_ptr);
+ vst1q(partial_accu_ptr, vecAcc0);
+ cnt--;
+ partial_accu_ptr += 8;
}
- vst1q(pOutput, vecAcc0);
- pOutput += 8;
- pSamples = pSamples - (i + 1) * 8 + 8;
-
- blkCnt--;
- }
-
- blkCnt = blockSize & 7;
- {
- mve_pred16_t p0 = vctp16q(blkCnt);
- int i;
- const float16_t *pCoeffsCur = pCoeffs;
-
- vst1q(pStateCur, vld1q(pTempSrc));
- pStateCur += 8;
- pTempSrc += 8;
-
- c0 = *pCoeffsCur++;
- c1 = *pCoeffsCur++;
- c2 = *pCoeffsCur++;
- c3 = *pCoeffsCur++;
- c4 = *pCoeffsCur++;
- c5 = *pCoeffsCur++;
- c6 = *pCoeffsCur++;
- c7 = *pCoeffsCur++;
-
- vecIn0 = vld1q(pSamples);
- vecAcc0 = vmulq(vecIn0, c0);
-
- vecIn0 = vld1q(&pSamples[1]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
- vecIn0 = vld1q(&pSamples[2]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ mve_pred16_t p0 = vctp16q(cnt);
- vecIn0 = vld1q(&pSamples[3]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
- vecIn0 = vld1q(&pSamples[4]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ vecAcc0 += vld1q_f16(partial_accu_ptr);
+ vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+ }
- vecIn0 = vld1q(&pSamples[5]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ localTaps -= FIR_F32_MAX_COEF_BLK;
+ sample_offset += FIR_F32_MAX_COEF_BLK;
+ }
- vecIn0 = vld1q(&pSamples[6]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ pSamples = pState + sample_offset;
+
+ if (localTaps > 4) {
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ c4 = *pCoeffs++;
+ c5 = *pCoeffs++;
+ c6 = *pCoeffs++;
+ c7 = *pCoeffs++;
+ pOutput = pDst;
+
+ partial_accu_ptr = S->pState;
+ cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
- vecIn0 = vld1q(&pSamples[7]);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ pSamples += 8;
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vst1q(pOutput, vecAcc0 + pap);
+ cnt--;
+ partial_accu_ptr += 8;
+ pOutput += 8;
+ }
- pSamples += 8;
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
- for (i = 0; i <= ((numTaps - 9) / 8); i++)
- {
- c0 = *pCoeffsCur++;
- c1 = *pCoeffsCur++;
- c2 = *pCoeffsCur++;
- c3 = *pCoeffsCur++;
- c4 = *pCoeffsCur++;
- c5 = *pCoeffsCur++;
- c6 = *pCoeffsCur++;
- c7 = *pCoeffsCur++;
+ mve_pred16_t p0 = vctp16q(cnt);
vecIn0 = vld1q(pSamples);
- vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
-
+ vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
-
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
-
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
-
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
-
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
-
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
-
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+ pOutput += cnt;
+ }
+
+ } else {
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ pOutput = pDst;
+ partial_accu_ptr = S->pState;
+ cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
pSamples += 8;
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vst1q(pOutput, vecAcc0 + pap);
+ cnt--;
+ partial_accu_ptr += 8;
+ pOutput += 8;
}
- vstrhq_p_f16(pOutput, vecAcc0, p0);
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ mve_pred16_t p0 = vctp16q(cnt);
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+ pOutput += cnt;
+ }
}
/*
@@ -459,17 +543,15 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
pTempDest = pState;
blkCnt = numTaps >> 3;
- while (blkCnt > 0U)
- {
+ while (blkCnt > 0U) {
vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 8;
pTempDest += 8;
blkCnt--;
}
blkCnt = numTaps & 7;
- if (blkCnt > 0U)
- {
- mve_pred16_t p0 = vctp16q(blkCnt);
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
}
}
diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c
index 3731e9cc..6fa87565 100644
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@@ -107,9 +107,12 @@
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
- The first blockSize samples are temporary data.
+ The first A samples are temporary data.
The remaining samples are the state of the FIR filter.
- So the state buffer has size numTaps + 2 * blockSize - 1
+ @par
+ So the state buffer has size numTaps + A * blockSize - 1 :
+ - A is blockSize for f32
+ - A is 8*ceil(blockSize/8) for f16
@par Fixed-Point Behavior
Care must be taken when using the fixed-point versions of the FIR filter functions.
@@ -144,7 +147,7 @@
}
-static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
+__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
const float32_t * __restrict pSrc,
float32_t * __restrict pDst, uint32_t blockSize)
{
@@ -229,7 +232,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
-static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
+__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
const float32_t * __restrict pSrc,
float32_t * __restrict pDst, uint32_t blockSize)
{
diff --git a/Source/FilteringFunctions/arm_fir_init_f16.c b/Source/FilteringFunctions/arm_fir_init_f16.c
index 4e300d7c..8b63796b 100755
--- a/Source/FilteringFunctions/arm_fir_init_f16.c
+++ b/Source/FilteringFunctions/arm_fir_init_f16.c
@@ -52,7 +52,7 @@
@par
pState points to the array of state variables.
- pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f16().
+ pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_f16().
@par Initialization of Helium version
For Helium version the array of coefficients must be a multiple of 16 even if less
then 16 coefficients are used. The additional coefficients must be set to 0.
@@ -61,6 +61,13 @@
the implementation may require to read more coefficients due to the vectorization and
to avoid having to manage too many different cases in the code.
+ @par Helium state buffer
+ The state buffer must contain some additional temporary data
+ used during the computation but which is not the state of the FIR.
+ The first 8*ceil(blockSize/8) samples are temporary data.
+ The remaining samples are the state of the FIR filter.
+ So the state buffer has size numTaps + 8*ceil(blockSize/8) + blockSize - 1
+
*/
void arm_fir_init_f16(
@@ -77,7 +84,11 @@ void arm_fir_init_f16(
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+ memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t));
+#else
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t));
+#endif
/* Assign state pointer */
S->pState = pState;
diff --git a/Testing/Source/Benchmarks/FIRF16.cpp b/Testing/Source/Benchmarks/FIRF16.cpp
index d3605e1b..86b54bca 100755
--- a/Testing/Source/Benchmarks/FIRF16.cpp
+++ b/Testing/Source/Benchmarks/FIRF16.cpp
@@ -1,6 +1,9 @@
#include "FIRF16.h"
#include "Error.h"
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+static __ALIGNED(8) float16_t coeffArray[64];
+#endif
void FIRF16::test_fir_f16()
{
@@ -21,16 +24,28 @@
samples.reload(FIRF16::SAMPLES1_F16_ID,mgr,this->nbSamples);
coefs.reload(FIRF16::COEFS1_F16_ID,mgr,this->nbTaps);
- state.create(this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
+ state.create(ROUND_UP(this->nbSamples,8) + this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
output.create(this->nbSamples,FIRF16::OUT_SAMPLES_F16_ID,mgr);
switch(id)
{
case TEST_FIR_F16_1:
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+ /* Copy coefficients and pad to zero
+ */
+ memset(coeffArray,0,32*sizeof(float16_t));
+ float16_t *ptr;
+
+ ptr=coefs.ptr();
+ memcpy(coeffArray,ptr,this->nbTaps*sizeof(float16_t));
+ this->pCoefs = coeffArray;
+#else
+ this->pCoefs=coefs.ptr();
+#endif
+
arm_fir_init_f16(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
this->pSrc=samples.ptr();
- this->pCoefs=coefs.ptr();
this->pDst=output.ptr();
break;
diff --git a/Testing/Source/Benchmarks/FIRF32.cpp b/Testing/Source/Benchmarks/FIRF32.cpp
index 41a32018..bbe9ed55 100755
--- a/Testing/Source/Benchmarks/FIRF32.cpp
+++ b/Testing/Source/Benchmarks/FIRF32.cpp
@@ -39,7 +39,6 @@ static __ALIGNED(8) float32_t coeffArray[64];
switch(id)
{
case TEST_FIR_F32_1:
- arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
@@ -57,6 +56,8 @@ static __ALIGNED(8) float32_t coeffArray[64];
this->pSrc=samples.ptr();
this->pDst=output.ptr();
+
+ arm_fir_init_f32(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
break;
case TEST_LMS_F32_2:
diff --git a/Testing/Source/Tests/FIRF16.cpp b/Testing/Source/Tests/FIRF16.cpp
index 0573caa3..0933c743 100755
--- a/Testing/Source/Tests/FIRF16.cpp
+++ b/Testing/Source/Tests/FIRF16.cpp
@@ -137,8 +137,8 @@ void checkInnerTail(float16_t *b)
ref.reload(FIRF16::FIRREFS_F16_ID,mgr);
output.create(ref.nbSamples(),FIRF16::OUT_F16_ID,mgr);
- /* Max blockSize + numTaps - 1 as generated by Python script */
- state.create(47,FIRF16::OUT_F16_ID,mgr);
+ /* > Max 8*ceil(blockSize,8) + blockSize + numTaps - 1 as generated by Python script */
+ state.create(47+47,FIRF16::OUT_F16_ID,mgr);
}
void FIRF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)