diff --git a/Include/dsp/filtering_functions_f16.h b/Include/dsp/filtering_functions_f16.h
index 0265f04e..4a99e831 100755
--- a/Include/dsp/filtering_functions_f16.h
+++ b/Include/dsp/filtering_functions_f16.h
@@ -40,8 +40,6 @@ extern "C"
#if defined(ARM_FLOAT16_SUPPORTED)
-#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
-
/**
* @brief Instance structure for the floating-point FIR filter.
*/
diff --git a/Include/dsp/utils.h b/Include/dsp/utils.h
index 794023c5..7f5acb37 100755
--- a/Include/dsp/utils.h
+++ b/Include/dsp/utils.h
@@ -42,6 +42,7 @@ extern "C"
#define SQ(x) ((x) * (x))
+#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) /* Round N up to the next multiple of S (integer arithmetic; assumes N >= 0 and S > 0) */
/**
diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c
index 6fa87565..7f3da5e0 100644
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@@ -110,9 +110,11 @@
The first A samples are temporary data.
The remaining samples are the state of the FIR filter.
@par
- So the state buffer has size numTaps + A * blockSize - 1 :
+ So the state buffer has size numTaps + A + blockSize - 1 :
- A is blockSize for f32
- A is 8*ceil(blockSize/8) for f16
+ - A is 8*ceil(blockSize/4) for q31
+
@par Fixed-Point Behavior
Care must be taken when using the fixed-point versions of the FIR filter functions.
@@ -200,6 +202,7 @@ __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
}
blkCnt = blockSize & 3;
+ if (blkCnt)
{
mve_pred16_t p0 = vctp32q(blkCnt);
diff --git a/Source/FilteringFunctions/arm_fir_init_q31.c b/Source/FilteringFunctions/arm_fir_init_q31.c
index 2a9600c1..e491437e 100644
--- a/Source/FilteringFunctions/arm_fir_init_q31.c
+++ b/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -52,7 +52,23 @@
{b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
pState points to the array of state variables.
- pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q31().
+ pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_q31().
+
+ @par Initialization of Helium version
+    For Helium version the length of the array of coefficients must be a multiple of 4 (4a) even if fewer
+    than 4a coefficients are used. The additional coefficients (4a - numTaps) must be set to 0.
+ It does not mean that all the coefficients will be used in the filter (numTaps
+ is still set to its right value in the init function.) It just means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+
+ @par Helium state buffer
+ The state buffer must contain some additional temporary data
+ used during the computation but which is not the state of the FIR.
+ The first 2*4*ceil(blockSize/4) samples are temporary data.
+ The remaining samples are the state of the FIR filter.
+ So the state buffer has size numTaps + 8*ceil(blockSize/4) + blockSize - 1
+
*/
void arm_fir_init_q31(
@@ -69,7 +85,11 @@ void arm_fir_init_q31(
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t));
+ #else
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+ #endif
/* Assign state pointer */
S->pState = pState;
diff --git a/Source/FilteringFunctions/arm_fir_q31.c b/Source/FilteringFunctions/arm_fir_q31.c
index bf406350..eda1d4f0 100644
--- a/Source/FilteringFunctions/arm_fir_q31.c
+++ b/Source/FilteringFunctions/arm_fir_q31.c
@@ -28,6 +28,7 @@
#include "dsp/filtering_functions.h"
+
/**
@ingroup groupFilters
*/
@@ -58,12 +59,160 @@
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
-
-static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
+/*
+ * Compute nbAcc consecutive FIR outputs and store them through pOutput.
+ *
+ *  nbAcc     : number of outputs to produce
+ *  nbVecTaps : number of 4-coefficient vectors in vecCoeffs
+ *  pSample   : oldest sample contributing to the first output
+ *  vecCoeffs : array of coefficient vectors
+ *
+ * pOutput must be in scope at the expansion site; it is advanced by nbAcc.
+ * Each 64-bit accumulator is shifted right by 23 before being narrowed to q31.
+ */
+#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs) \
+        for (int j = 0; j < (nbAcc); j++) { \
+            const q31_t    *pSmp = &(pSample)[j]; \
+            q63_t           acc = 0; \
+ \
+            for (int i = 0; i < (nbVecTaps); i++) { \
+                acc = vrmlaldavhaq(acc, vld1q(pSmp + 4 * i), (vecCoeffs)[i]); \
+            } \
+            *pOutput++ = (q31_t)asrl(acc, 23); \
+        }
+
+
+/*
+ * First half of a two-pass FIR: accumulate nbVecTaps coefficient vectors for
+ * nbAcc consecutive outputs and save the 64-bit partial sums.
+ *
+ *  nbAcc     : number of partial sums to produce
+ *  nbVecTaps : number of 4-coefficient vectors in vecCoeffs
+ *  pSample   : oldest sample contributing to the first output
+ *  vecCoeffs : array of coefficient vectors
+ *
+ * The expansion site must provide an accumulator array acc[] (at least nbAcc
+ * entries) and the pointer arm_fir_partial_accu_ptr, which is advanced by nbAcc.
+ */
+#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \
+        for (int j = 0; j < (nbAcc); j++) { \
+            const q31_t    *pSmp = &(pSample)[j]; \
+ \
+            acc[j] = 0; \
+            for (int i = 0; i < (nbVecTaps); i++) { \
+                acc[j] = vrmlaldavhaq(acc[j], vld1q(pSmp + 4 * i), (vecCoeffs)[i]); \
+            } \
+            *arm_fir_partial_accu_ptr++ = acc[j]; \
+        }
+
+
+/*
+ * Second half of a two-pass FIR: reload the saved 64-bit partial sums,
+ * accumulate nbVecTaps more coefficient vectors and store nbAcc outputs.
+ *
+ *  nbAcc     : number of outputs to produce
+ *  nbVecTaps : number of 4-coefficient vectors in vecCoeffs
+ *  pSample   : oldest sample contributing to the first output in this pass
+ *  vecCoeffs : array of coefficient vectors
+ *
+ * The expansion site must provide an accumulator array acc[] (at least nbAcc
+ * entries), arm_fir_partial_accu_ptr and pOutput; both pointers are advanced
+ * by nbAcc.
+ */
+#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \
+        for (int j = 0; j < (nbAcc); j++) { \
+            const q31_t    *pSmp = &(pSample)[j]; \
+ \
+            acc[j] = *arm_fir_partial_accu_ptr++; \
+ \
+            for (int i = 0; i < (nbVecTaps); i++) { \
+                acc[j] = vrmlaldavhaq(acc[j], vld1q(pSmp + 4 * i), (vecCoeffs)[i]); \
+            } \
+            *pOutput++ = (q31_t)asrl(acc[j], 23); \
+        }
+
+
+/*
+ * Body shared by the Helium FIR kernels that handle a fixed number of taps.
+ *
+ * Must be expanded inside a function where S, pSrc, pDst and blockSize are in
+ * scope, with NBTAPS defined as the number of coefficients to load (a multiple
+ * of 4).
+ *
+ * Steps:
+ *  - skip the 2*ROUND_UP(blockSize, 4) leading words of S->pState,
+ *  - load NBTAPS coefficients,
+ *  - append the input samples to the history and compute the outputs 4 at a
+ *    time, then the 1 to 3 remaining ones,
+ *  - move the last numTaps - 1 samples back to the start of the history.
+ */
+#define FIR_Q31_MAIN_CORE() \
+{ \
+    q31_t          *pState = S->pState + 2*ROUND_UP(blockSize, 4); /* FIR history */ \
+    const q31_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */ \
+    q31_t          *pStateCur;              /* Where the next input sample is stored */ \
+    const q31_t    *pSamples;               /* Oldest sample of the next output */ \
+    q31_t          *pOutput;                /* Next output location */ \
+    const q31_t    *pTempSrc;               /* Read cursor (input, then history) */ \
+    q31_t          *pTempDest;              /* Write cursor for the history move */ \
+    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients */ \
+    int32_t         blkCnt; \
+    const int32_t   nbVecTaps = (NBTAPS / 4); \
+    /* sized with the constant expression so that it is not a variable length array */ \
+    q31x4_t         vecCoeffs[NBTAPS / 4]; \
+ \
+    /* load coefs */ \
+    for (int i = 0; i < nbVecTaps; i++) \
+        vecCoeffs[i] = vld1q(pCoeffs + 4 * i); \
+ \
+    /* \
+     * pState holds the (numTaps - 1) samples of the previous frame, \
+     * the new input data is written right after them \
+     */ \
+    pStateCur = &(pState[(numTaps - 1u)]); \
+    pTempSrc = pSrc; \
+    pSamples = pState; \
+    pOutput = pDst; \
+ \
+    for (blkCnt = blockSize >> 2; blkCnt > 0; blkCnt--) { \
+        /* Save 4 input samples in the history buffer */ \
+        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); \
+        pStateCur += 4; \
+        pTempSrc += 4; \
+ \
+        FIR_Q31_CORE(4, nbVecTaps, pSamples, vecCoeffs); \
+ \
+        pSamples += 4; \
+    } \
+ \
+    /* tail: save the 0 to 3 remaining input samples ... */ \
+    int32_t residual = blockSize & 3; \
+    for (int i = 0; i < residual; i++) \
+        *pStateCur++ = *pTempSrc++; \
+ \
+    /* ... and compute their outputs with a constant trip count */ \
+    switch (residual) { \
+    case 3: \
+        FIR_Q31_CORE(3, nbVecTaps, pSamples, vecCoeffs); \
+        break; \
+ \
+    case 2: \
+        FIR_Q31_CORE(2, nbVecTaps, pSamples, vecCoeffs); \
+        break; \
+ \
+    case 1: \
+        FIR_Q31_CORE(1, nbVecTaps, pSamples, vecCoeffs); \
+        break; \
+    } \
+ \
+    /* Copy the samples back into the history buffer start */ \
+    pTempSrc = &pState[blockSize]; \
+    pTempDest = pState; \
+ \
+    for (blkCnt = (numTaps - 1) >> 2; blkCnt > 0; blkCnt--) \
+    { \
+        vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc)); \
+        pTempSrc += 4; \
+        pTempDest += 4; \
+    } \
+    blkCnt = (numTaps - 1) & 3; \
+    if (blkCnt > 0) \
+    { \
+        /* predicated load/store: only the blkCnt remaining lanes are touched */ \
+        mve_pred16_t p0 = vctp32q(blkCnt); \
+        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0); \
+    } \
+}
+
+static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
{
- q31_t *pState = S->pState; /* State pointer */
+ q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+ q31_t *pState = pRefStatePtr; /* State pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t *pStateCur; /* Points to the current sample of the state */
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
@@ -74,6 +223,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
uint32_t blkCnt;
q31x4_t vecIn0;
+
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
@@ -83,7 +233,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pSamples = pState;
pOutput = pDst;
- q63_t acc0, acc1, acc2, acc3;
+ q63_t acc0=0, acc1=0, acc2=0, acc3=0;
/*
* load 4 coefs
*/
@@ -131,7 +281,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
}
uint32_t residual = blockSize & 3;
-
switch (residual)
{
case 3:
@@ -139,7 +288,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
/*
* Save 4 input samples in the history buffer
*/
-
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
pStateCur += 4;
pTempSrc += 4;
@@ -205,14 +353,13 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
break;
}
-
/*
* Copy the samples back into the history buffer start
*/
- pTempSrc = &S->pState[blockSize];
- pTempDest = S->pState;
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
- blkCnt = numTaps >> 2;
+ blkCnt = (numTaps-1) >> 2;
while (blkCnt > 0U)
{
vst1q(pTempDest, vld1q(pTempSrc));
@@ -220,7 +367,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pTempDest += 4;
blkCnt--;
}
- blkCnt = numTaps & 3;
+ blkCnt = (numTaps-1) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
@@ -228,9 +375,274 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
}
}
-static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
+
+
+/* Helium FIR kernel selected for 5 to 8 taps: shared kernel body expanded for 8 coefficients. */
+static void arm_fir_q31_5_8_mve(
+    const arm_fir_instance_q31 * S,
+    const q31_t * __restrict pSrc,
+    q31_t * __restrict pDst,
+    uint32_t blockSize)
{
-    q31_t *pState = S->pState; /* State pointer */
+#define NBTAPS 8
+    FIR_Q31_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Helium FIR kernel selected for 9 to 12 taps: shared kernel body expanded for 12 coefficients. */
+static void arm_fir_q31_9_12_mve(
+    const arm_fir_instance_q31 * S,
+    const q31_t * __restrict pSrc,
+    q31_t * __restrict pDst,
+    uint32_t blockSize)
+{
+#define NBTAPS 12
+    FIR_Q31_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Helium FIR kernel selected for 13 to 16 taps: shared kernel body expanded for 16 coefficients. */
+static void arm_fir_q31_13_16_mve(
+    const arm_fir_instance_q31 * S,
+    const q31_t * __restrict pSrc,
+    q31_t * __restrict pDst,
+    uint32_t blockSize)
+{
+#define NBTAPS 16
+    FIR_Q31_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Helium FIR kernel selected for 17 to 20 taps: shared kernel body expanded for 20 coefficients. */
+static void arm_fir_q31_17_20_mve(
+    const arm_fir_instance_q31 * S,
+    const q31_t * __restrict pSrc,
+    q31_t * __restrict pDst,
+    uint32_t blockSize)
+{
+#define NBTAPS 20
+    FIR_Q31_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Helium FIR kernel selected for 21 to 24 taps: shared kernel body expanded for 24 coefficients. */
+static void arm_fir_q31_21_24_mve(
+    const arm_fir_instance_q31 * S,
+    const q31_t * __restrict pSrc,
+    q31_t * __restrict pDst,
+    uint32_t blockSize)
+{
+#define NBTAPS 24
+    FIR_Q31_MAIN_CORE();
+#undef NBTAPS
+}
+
+
+/* Helium FIR kernel selected for 25 to 28 taps: shared kernel body expanded for 28 coefficients. */
+static void arm_fir_q31_25_28_mve(
+    const arm_fir_instance_q31 * S,
+    const q31_t * __restrict pSrc,
+    q31_t * __restrict pDst,
+    uint32_t blockSize)
+{
+#define NBTAPS 28
+    FIR_Q31_MAIN_CORE();
+#undef NBTAPS
+}
+
+/*
+ * Helium FIR kernel selected for 29 to 32 taps (32 coefficients are read).
+ *
+ * The groups of 4 samples are processed in two passes:
+ *  - pass 1 accumulates the first 28 taps and saves the 64-bit partial sums
+ *    in the scratch words located at the start of S->pState,
+ *  - pass 2 reloads the partial sums, adds the last 4 taps and writes the
+ *    outputs.
+ * The (blockSize & 3) trailing samples are then filtered one output at a
+ * time, so that any blockSize is supported.
+ */
+static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S,
+  const q31_t * __restrict pSrc,
+  q31_t * __restrict pDst,
+                           uint32_t blockSize)
+{
+    /* the FIR history starts after the 2*ROUND_UP(blockSize, 4) scratch words */
+    q31_t          *pState = S->pState + 2*ROUND_UP(blockSize, 4);
+    const q31_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */
+    q31_t          *pStateCur;              /* Where the next input sample is stored */
+    const q31_t    *pSamples;               /* Oldest sample of the next output */
+    q31_t          *pOutput;                /* Next output location */
+    const q31_t    *pTempSrc;               /* Read cursor (input, then history) */
+    q31_t          *pTempDest;              /* Write cursor for the history move */
+    uint32_t        numTaps = S->numTaps;   /* Number of filter coefficients */
+    int32_t         blkCnt;
+    int32_t         residual = (int32_t) (blockSize & 3);
+    q63_t           acc0, acc1, acc2, acc3;
+    q63_t          *arm_fir_partial_accu_ptr;
+
+/* number of coefficient vectors handled by the first pass */
+#define MAX_VECT_BATCH 7
+
+/* acc = accumulation of the 28 samples starting at pSmp with the 28 first coefs */
+#define FIR_Q31_MAC_28(acc, pSmp) \
+    do { \
+        acc = vrmlaldavhq(vld1q((pSmp)), vecCoeffs0); \
+        acc = vrmlaldavhaq(acc, vld1q((pSmp) + 4 * 1), vecCoeffs1); \
+        acc = vrmlaldavhaq(acc, vld1q((pSmp) + 4 * 2), vecCoeffs2); \
+        acc = vrmlaldavhaq(acc, vld1q((pSmp) + 4 * 3), vecCoeffs3); \
+        acc = vrmlaldavhaq(acc, vld1q((pSmp) + 4 * 4), vecCoeffs4); \
+        acc = vrmlaldavhaq(acc, vld1q((pSmp) + 4 * 5), vecCoeffs5); \
+        acc = vrmlaldavhaq(acc, vld1q((pSmp) + 4 * 6), vecCoeffs6); \
+    } while (0)
+
+    /*
+     * pre-load 28 1st coefs
+     */
+    const q31x4_t   vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
+    const q31x4_t   vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
+    const q31x4_t   vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
+    const q31x4_t   vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
+    const q31x4_t   vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
+    const q31x4_t   vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
+    const q31x4_t   vecCoeffs6 = vld1q(pCoeffs + 4 * 6);
+
+    /*
+     * pState holds the (numTaps - 1) samples of the previous frame,
+     * the new input data is written right after them
+     */
+    pStateCur = &(pState[(numTaps - 1u)]);
+    pTempSrc = pSrc;
+    pSamples = pState;
+
+    /* pass 1: partial sums over the 28 first taps, saved in the scratch words */
+    arm_fir_partial_accu_ptr = (q63_t*)S->pState;
+
+    for (blkCnt = blockSize >> 2; blkCnt > 0; blkCnt--) {
+        /*
+         * Save 4 input samples in the history buffer
+         */
+        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
+
+        FIR_Q31_MAC_28(acc0, &pSamples[0]);
+        *arm_fir_partial_accu_ptr++ = acc0;
+
+        FIR_Q31_MAC_28(acc1, &pSamples[1]);
+        *arm_fir_partial_accu_ptr++ = acc1;
+
+        FIR_Q31_MAC_28(acc2, &pSamples[2]);
+        *arm_fir_partial_accu_ptr++ = acc2;
+
+        FIR_Q31_MAC_28(acc3, &pSamples[3]);
+        *arm_fir_partial_accu_ptr++ = acc3;
+
+        pSamples += 4;
+    }
+
+    /* save the 0 to 3 trailing input samples in the history buffer */
+    for (int i = 0; i < residual; i++)
+        *pStateCur++ = *pTempSrc++;
+
+    /* pass 2: remaining taps */
+
+    /* load last 4 coef */
+    const q31x4_t   vecCoeffsLast = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
+
+    arm_fir_partial_accu_ptr = (q63_t*)S->pState;
+    pOutput = pDst;
+    pSamples = pState + (MAX_VECT_BATCH * 4);
+
+    for (blkCnt = blockSize >> 2; blkCnt > 0; blkCnt--) {
+        /* reload intermediate MAC */
+        acc0 = *arm_fir_partial_accu_ptr++;
+        acc1 = *arm_fir_partial_accu_ptr++;
+        acc2 = *arm_fir_partial_accu_ptr++;
+        acc3 = *arm_fir_partial_accu_ptr++;
+
+        acc0 = vrmlaldavhaq(acc0, vld1q(&pSamples[0]), vecCoeffsLast);
+        acc1 = vrmlaldavhaq(acc1, vld1q(&pSamples[1]), vecCoeffsLast);
+        acc2 = vrmlaldavhaq(acc2, vld1q(&pSamples[2]), vecCoeffsLast);
+        acc3 = vrmlaldavhaq(acc3, vld1q(&pSamples[3]), vecCoeffsLast);
+
+        *pOutput++ = (q31_t) asrl(acc0, 23);
+        *pOutput++ = (q31_t) asrl(acc1, 23);
+        *pOutput++ = (q31_t) asrl(acc2, 23);
+        *pOutput++ = (q31_t) asrl(acc3, 23);
+
+        pSamples += 4;
+    }
+
+    /*
+     * tail: outputs of the 0 to 3 trailing samples, all the 32 taps in one go
+     * (pSamples is moved back to the oldest sample of the first missing output)
+     */
+    pSamples -= (MAX_VECT_BATCH * 4);
+    for (int j = 0; j < residual; j++) {
+        acc0 = 0;
+        for (int i = 0; i <= MAX_VECT_BATCH; i++)
+            acc0 = vrmlaldavhaq(acc0, vld1q(&pSamples[j + 4 * i]), vld1q(pCoeffs + 4 * i));
+        *pOutput++ = (q31_t) asrl(acc0, 23);
+    }
+
+    /*
+     * Copy the samples back into the history buffer start
+     */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    blkCnt = numTaps - 1;
+    do {
+        /* predicated so that only the remaining samples are moved */
+        mve_pred16_t p = vctp32q(blkCnt);
+
+        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
+        pTempSrc += 4;
+        pTempDest += 4;
+        blkCnt -= 4;
+    }
+    while (blkCnt > 0);
+
+#undef FIR_Q31_MAC_28
+}
+
+
+
+void arm_fir_q31(
+ const arm_fir_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+ q31_t *pState = pRefStatePtr; /* State pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t *pStateCur; /* Points to the current sample of the state */
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
@@ -240,60 +652,110 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
q31x4_t vecIn0;
+ uint32_t tapsBlkCnt = (numTaps + 3) / 4;
q63_t acc0, acc1, acc2, acc3;
- q31x4_t vecCoeffs1_4, vecCoeffs5_8;
+ q31x4_t vecCoeffs;
+
/*
- * pState points to state array which contains previous frame (numTaps - 1) samples
- * pStateCur points to the location where the new input data should be written
+ * [1 to 32 taps] specialized routines
*/
- pStateCur = &(pState[(numTaps - 1u)]);
- pTempSrc = pSrc;
- pSamples = pState;
- pOutput = pDst;
-
+ if (numTaps <= 4)
+ {
+ arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if (numTaps <= 8)
+ {
+ arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if (numTaps <= 12)
+ {
+ arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if (numTaps <= 16)
+ {
+ arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if (numTaps <= 20)
+ {
+ arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if (numTaps <= 24)
+ {
+ arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if (numTaps <= 28)
+ {
+ arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ else if ((numTaps <= 32) && (blockSize >= 32))
+ {
+ arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
/*
- * load 8 coefs
+ * pState points to state array which contains previous frame (numTaps - 1) samples
+ * pStateCur points to the location where the new input data should be written
*/
- vecCoeffs1_4 = *(q31x4_t *) pCoeffs;
- vecCoeffs5_8 = *(q31x4_t *) (pCoeffs + 4);
-
- blkCnt = blockSize >> 2;
- while (blkCnt > 0U)
+ pStateCur = &(pState[(numTaps - 1u)]);
+ pSamples = pState;
+ pTempSrc = pSrc;
+ pOutput = pDst;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0)
{
+ const q31_t *pCoeffsTmp = pCoeffs;
const q31_t *pSamplesTmp = pSamples;
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 4;
+ pTempSrc += 4;
- vecIn0 = vld1q(pSamplesTmp);
- acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
- vecIn0 = vld1q(&pSamplesTmp[1]);
- acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
- vecIn0 = vld1q(&pSamplesTmp[2]);
- acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
- vecIn0 = vld1q(&pSamplesTmp[3]);
- acc3 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ /*
+ * load 4 coefs
+ */
+ vecCoeffs = *(q31x4_t *) pCoeffsTmp;
- vecIn0 = vld1q(&pSamplesTmp[4]);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamplesTmp[5]);
- acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamplesTmp[6]);
- acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(&pSamplesTmp[2]);
+ acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamplesTmp[7]);
- acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(&pSamplesTmp[3]);
+ acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
+ pSamplesTmp += 4;
+ pCoeffsTmp += 4;
+ /*
+ * Decrement the taps block loop counter
+ */
+ i--;
+ }
+ /* .54-> .31 conversion and store accumulators */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
@@ -305,8 +767,6 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
*pOutput++ = (q31_t) acc3;
pSamples += 4;
- pStateCur += 4;
- pTempSrc += 4;
/*
* Decrement the sample block loop counter
@@ -314,11 +774,18 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
blkCnt--;
}
- uint32_t residual = blockSize & 3;
+ int32_t residual = blockSize & 3;
switch (residual)
{
case 3:
{
+ const q31_t *pCoeffsTmp = pCoeffs;
+ const q31_t *pSamplesTmp = pSamples;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+
/*
* Save 4 input samples in the history buffer
*/
@@ -326,23 +793,24 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pStateCur += 4;
pTempSrc += 4;
- vecIn0 = vld1q(pSamples);
- acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
- vecIn0 = vld1q(&pSamples[1]);
- acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ vecCoeffs = *(q31x4_t *) pCoeffsTmp;
- vecIn0 = vld1q(&pSamples[2]);
- acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamples[4]);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamples[5]);
- acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(&pSamplesTmp[2]);
+ acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamples[6]);
- acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
+ pSamplesTmp += 4;
+ pCoeffsTmp += 4;
+ i--;
+ }
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
@@ -356,6 +824,12 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
case 2:
{
+ const q31_t *pCoeffsTmp = pCoeffs;
+ const q31_t *pSamplesTmp = pSamples;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+
/*
* Save 4 input samples in the history buffer
*/
@@ -363,17 +837,21 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pStateCur += 4;
pTempSrc += 4;
- vecIn0 = vld1q(pSamples);
- acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ vecCoeffs = *(q31x4_t *) pCoeffsTmp;
- vecIn0 = vld1q(&pSamples[1]);
- acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamples[4]);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
- vecIn0 = vld1q(&pSamples[5]);
- acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
+ pSamplesTmp += 4;
+ pCoeffsTmp += 4;
+ i--;
+ }
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
@@ -384,431 +862,55 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
break;
case 1:
- {
- /*
- * Save 4 input samples in the history buffer
- */
- vst1q(pStateCur, vld1q(pTempSrc));
- pStateCur += 4;
- pTempSrc += 4;
-
- vecIn0 = vld1q(pSamples);
- acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
-
- vecIn0 = vld1q(&pSamples[4]);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
-
- acc0 = asrl(acc0, 23);
-
- *pOutput++ = (q31_t) acc0;
- }
- break;
- }
-
- /*
- * Copy the samples back into the history buffer start
- */
- pTempSrc = &S->pState[blockSize];
- pTempDest = S->pState;
-
- blkCnt = numTaps >> 2;
- while (blkCnt > 0U)
- {
- vst1q(pTempDest, vld1q(pTempSrc));
- pTempSrc += 4;
- pTempDest += 4;
- blkCnt--;
- }
- blkCnt = numTaps & 3;
- if (blkCnt > 0U)
- {
- mve_pred16_t p0 = vctp32q(blkCnt);
- vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
- }
-}
-
-void arm_fir_q31(
- const arm_fir_instance_q31 * S,
- const q31_t * pSrc,
- q31_t * pDst,
- uint32_t blockSize)
-{
- q31_t *pState = S->pState; /* State pointer */
- const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
- q31_t *pStateCur; /* Points to the current sample of the state */
- const q31_t *pSamples; /* Temporary pointer to the sample buffer */
- q31_t *pOutput; /* Temporary pointer to the output buffer */
- const q31_t *pTempSrc; /* Temporary pointer to the source data */
- q31_t *pTempDest; /* Temporary pointer to the destination buffer */
- uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
- uint32_t blkCnt;
- q31x4_t vecIn0;
- uint32_t tapsBlkCnt = (numTaps + 3) / 4;
- q63_t acc0, acc1, acc2, acc3;
- q31x4_t vecCoeffs;
-
- /*
- * [1 to 8 taps] specialized routines
- */
-
- if (blockSize >= 8)
- {
- if (numTaps <= 4)
- {
- arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
- return;
- }
- else if (numTaps <= 8)
- {
- arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
- return;
- }
- }
-
-
- /*
- * pState points to state array which contains previous frame (numTaps - 1) samples
- * pStateCur points to the location where the new input data should be written
- */
- if (blockSize >= 8)
- {
- pStateCur = &(pState[(numTaps - 1u)]);
- pSamples = pState;
- pTempSrc = pSrc;
- pOutput = pDst;
- blkCnt = blockSize >> 2;
- while (blkCnt > 0U)
{
const q31_t *pCoeffsTmp = pCoeffs;
const q31_t *pSamplesTmp = pSamples;
-
+
acc0 = 0LL;
- acc1 = 0LL;
- acc2 = 0LL;
- acc3 = 0LL;
-
+
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
-
- tapsBlkCnt = (numTaps ) / 4;
- uint32_t i = tapsBlkCnt ;
- while (i > 0U)
+
+ int i = tapsBlkCnt;
+ while (i > 0)
{
- /*
- * load 4 coefs
- */
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-
+
vecIn0 = vld1q(pSamplesTmp);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-
- vecIn0 = vld1q(&pSamplesTmp[1]);
- acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-
- vecIn0 = vld1q(&pSamplesTmp[2]);
- acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
-
- vecIn0 = vld1q(&pSamplesTmp[3]);
- acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);
-
+
pSamplesTmp += 4;
pCoeffsTmp += 4;
- /*
- * Decrement the taps block loop counter
- */
i--;
}
- tapsBlkCnt = (numTaps ) & 3;
- i = tapsBlkCnt ;
- while (i > 0U)
- {
- /*
- * load 4 coefs
- */
-
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
- acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
- acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
- acc3 += ((q63_t) pSamplesTmp[3] * *pCoeffsTmp) >> 8;
-
-
- pSamplesTmp += 1;
- pCoeffsTmp += 1;
- /*
- * Decrement the taps block loop counter
- */
- i--;
- }
-
- /* .54-> .31 conversion and store accumulators */
acc0 = asrl(acc0, 23);
- acc1 = asrl(acc1, 23);
- acc2 = asrl(acc2, 23);
- acc3 = asrl(acc3, 23);
-
- *pOutput++ = (q31_t) acc0;
- *pOutput++ = (q31_t) acc1;
- *pOutput++ = (q31_t) acc2;
- *pOutput++ = (q31_t) acc3;
-
- pSamples += 4;
-
- /*
- * Decrement the sample block loop counter
- */
- blkCnt--;
- }
-
- uint32_t residual = blockSize & 3;
- switch (residual)
- {
- case 3:
- {
- const q31_t *pCoeffsTmp = pCoeffs;
- const q31_t *pSamplesTmp = pSamples;
-
- acc0 = 0LL;
- acc1 = 0LL;
- acc2 = 0LL;
-
- /*
- * Save 4 input samples in the history buffer
- */
-
- *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
- pStateCur += 4;
- pTempSrc += 4;
-
- tapsBlkCnt = numTaps / 4;
- uint32_t i = tapsBlkCnt;
- while (i > 0U)
- {
- vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-
- vecIn0 = vld1q(pSamplesTmp);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-
- vecIn0 = vld1q(&pSamplesTmp[1]);
- acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-
- vecIn0 = vld1q(&pSamplesTmp[2]);
- acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);
-
- pSamplesTmp += 4;
- pCoeffsTmp += 4;
- i--;
- }
-
- tapsBlkCnt = (numTaps ) & 3;
-
- i = tapsBlkCnt ;
- while (i > 0U)
- {
-
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
- acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
- acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
-
- pSamplesTmp += 1;
- pCoeffsTmp += 1;
- /*
- * Decrement the taps block loop counter
- */
- i--;
- }
-
-
- acc0 = asrl(acc0, 23);
- acc1 = asrl(acc1, 23);
- acc2 = asrl(acc2, 23);
-
- *pOutput++ = (q31_t) acc0;
- *pOutput++ = (q31_t) acc1;
- *pOutput++ = (q31_t) acc2;
- }
- break;
-
- case 2:
- {
- const q31_t *pCoeffsTmp = pCoeffs;
- const q31_t *pSamplesTmp = pSamples;
-
- acc0 = 0LL;
- acc1 = 0LL;
-
- /*
- * Save 4 input samples in the history buffer
- */
- vst1q(pStateCur, vld1q(pTempSrc));
- pStateCur += 4;
- pTempSrc += 4;
-
- tapsBlkCnt = (numTaps ) / 4;
- uint32_t i = tapsBlkCnt;
- while (i > 0U)
- {
- vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-
- vecIn0 = vld1q(pSamplesTmp);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-
- vecIn0 = vld1q(&pSamplesTmp[1]);
- acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);
-
- pSamplesTmp += 4;
- pCoeffsTmp += 4;
- i--;
- }
-
- tapsBlkCnt = (numTaps ) & 3;
- i = tapsBlkCnt ;
- while (i > 0U)
- {
-
-
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
- acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
-
- pSamplesTmp += 1;
- pCoeffsTmp += 1;
- /*
- * Decrement the taps block loop counter
- */
- i--;
- }
-
- acc0 = asrl(acc0, 23);
- acc1 = asrl(acc1, 23);
-
- *pOutput++ = (q31_t) acc0;
- *pOutput++ = (q31_t) acc1;
- }
- break;
-
- case 1:
- {
- const q31_t *pCoeffsTmp = pCoeffs;
- const q31_t *pSamplesTmp = pSamples;
-
- acc0 = 0LL;
-
- /*
- * Save 4 input samples in the history buffer
- */
- vst1q(pStateCur, vld1q(pTempSrc));
- pStateCur += 4;
- pTempSrc += 4;
-
- tapsBlkCnt = (numTaps ) / 4;
- uint32_t i = tapsBlkCnt;
- while (i > 0U)
- {
- vecCoeffs = *(q31x4_t *) pCoeffsTmp;
-
- vecIn0 = vld1q(pSamplesTmp);
- acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);
-
- pSamplesTmp += 4;
- pCoeffsTmp += 4;
- i--;
- }
-
- tapsBlkCnt = (numTaps ) & 3;
- i = tapsBlkCnt ;
- while (i > 0U)
- {
-
-
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
-
- pSamplesTmp += 1;
- pCoeffsTmp += 1;
- /*
- * Decrement the taps block loop counter
- */
- i--;
- }
-
- acc0 = asrl(acc0, 23);
-
- *pOutput++ = (q31_t) acc0;
- }
- break;
- }
- }
- else
- {
-
- q31_t *pStateCurnt; /* Points to the current sample of the state */
- q31_t *px; /* Temporary pointer for state buffer */
- const q31_t *pb; /* Temporary pointer for coefficient buffer */
- q63_t acc0; /* Accumulator */
- uint32_t i, blkCnt; /* Loop counters */
- pStateCurnt = &(S->pState[(numTaps - 1U)]);
- blkCnt = blockSize;
-
- while (blkCnt > 0U)
- {
- /* Copy one sample at a time into state buffer */
- *pStateCurnt++ = *pSrc++;
-
- /* Set the accumulator to zero */
- acc0 = 0;
-
- /* Initialize state pointer */
- px = pState;
-
- /* Initialize Coefficient pointer */
- pb = pCoeffs;
-
- i = numTaps;
-
- /* Perform the multiply-accumulates */
- do
- {
- /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
- acc0 += (q63_t) *px++ * *pb++;
-
- i--;
- } while (i > 0U);
-
- /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
- *pDst++ = (q31_t) (acc0 >> 31U);
-
- /* Advance state pointer by 1 for the next sample */
- pState = pState + 1U;
-
- /* Decrement loop counter */
- blkCnt--;
+ *pOutput++ = (q31_t) acc0;
}
+ break;
}
/*
* Copy the samples back into the history buffer start
*/
- pTempSrc = &S->pState[blockSize];
- pTempDest = S->pState;
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
- blkCnt = numTaps >> 2;
- while (blkCnt > 0U)
+ blkCnt = (numTaps - 1U) >> 2;
+ while (blkCnt > 0)
{
vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 4;
pTempDest += 4;
blkCnt--;
}
- blkCnt = numTaps & 3;
- if (blkCnt > 0U)
+ blkCnt = (numTaps - 1U) & 3;
+ if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
diff --git a/Testing/Source/Benchmarks/FIRQ31.cpp b/Testing/Source/Benchmarks/FIRQ31.cpp
index 3626a134..25cc206e 100755
--- a/Testing/Source/Benchmarks/FIRQ31.cpp
+++ b/Testing/Source/Benchmarks/FIRQ31.cpp
@@ -1,6 +1,9 @@
#include "FIRQ31.h"
#include "Error.h"
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Zero padded copy of the coefficients given to the Helium q31 FIR */
+static __ALIGNED(8) q31_t coeffArray[64];
+#endif
void FIRQ31::test_fir_q31()
{
@@ -30,16 +33,28 @@
samples.reload(FIRQ31::SAMPLES1_Q31_ID,mgr,this->nbSamples);
coefs.reload(FIRQ31::COEFS1_Q31_ID,mgr,this->nbTaps);
- state.create(this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
+ state.create(2*ROUND_UP(this->nbSamples,4) + this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
output.create(this->nbSamples,FIRQ31::OUT_SAMPLES_Q31_ID,mgr);
switch(id)
{
case TEST_FIR_Q31_1:
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ /* Copy the coefficients into a zero padded array: the Helium
+ FIR may read more coefficients than nbTaps.
+ The q31 Helium FIR is built under ARM_MATH_MVEI, hence the guard. */
+ memset(coeffArray,0,sizeof(coeffArray));
+ memcpy(coeffArray,coefs.ptr(),this->nbTaps*sizeof(q31_t));
+ this->pCoefs = coeffArray;
+#else
+ this->pCoefs=coefs.ptr();
+#endif
+
- arm_fir_init_q31(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
+ /* pCoefs is the padded array when the Helium version is used */
+ arm_fir_init_q31(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
this->pSrc=samples.ptr();
- this->pCoefs=coefs.ptr();
this->pDst=output.ptr();
break;
diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp
index dfee9e1e..80f8195e 100644
--- a/Testing/Source/Tests/FIRQ31.cpp
+++ b/Testing/Source/Tests/FIRQ31.cpp
@@ -37,6 +37,7 @@ void checkInnerTail(q31_t *b)
#endif
int blockSize;
int numTaps;
+ int nb=1;
/*
@@ -98,6 +99,8 @@ void checkInnerTail(q31_t *b)
configp += 2;
orgcoefsp += numTaps;
+ nb += blockSize + blockSize;
+
}
@@ -129,8 +132,8 @@ void checkInnerTail(q31_t *b)
ref.reload(FIRQ31::FIRREFS_Q31_ID,mgr);
output.create(ref.nbSamples(),FIRQ31::OUT_Q31_ID,mgr);
- /* Max blockSize + numTaps - 1 as generated by Python script */
- state.create(47,FIRQ31::OUT_Q31_ID,mgr);
+    /* Larger than the max blockSize + numTaps - 1 (47) generated by the Python script: the Helium version stores additional temporary samples in the state buffer */
+ state.create(47 + 47+47,FIRQ31::OUT_Q31_ID,mgr);
}
void FIRQ31::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)