CMSIS-DSP: Added new MVE implementation of FIR Q31

pull/19/head
Christophe Favergeon 5 years ago
parent 68b219bb1f
commit a108d6763e

@ -40,8 +40,6 @@ extern "C"
#if defined(ARM_FLOAT16_SUPPORTED)
#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
/**
* @brief Instance structure for the floating-point FIR filter.
*/

@ -42,6 +42,7 @@ extern "C"
#define SQ(x) ((x) * (x))
#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
/**

@ -110,9 +110,11 @@
The first A samples are temporary data.
The remaining samples are the state of the FIR filter.
@par
So the state buffer has size <code> numTaps + A * blockSize - 1 </code> :
So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
- A is blockSize for f32
- A is 8*ceil(blockSize/8) for f16
- A is 8*ceil(blockSize/4) for q31
@par Fixed-Point Behavior
Care must be taken when using the fixed-point versions of the FIR filter functions.
@ -200,6 +202,7 @@ __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
}
blkCnt = blockSize & 3;
if (blkCnt)
{
mve_pred16_t p0 = vctp32q(blkCnt);

@ -52,7 +52,23 @@
{b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
</pre>
<code>pState</code> points to the array of state variables.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
@par Initialization of Helium version
For the Helium version, the length of the coefficient array must be a multiple of 16, even if fewer
than 16 coefficients are used. The additional coefficients must be set to 0.
It does not mean that all the coefficients will be used in the filter (numTaps
is still set to its right value in the init function.) It just means that
the implementation may require to read more coefficients due to the vectorization and
to avoid having to manage too many different cases in the code.
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
The first 2*4*ceil(blockSize/4) samples are temporary data.
The remaining samples are the state of the FIR filter.
So the state buffer has size <code> numTaps + 8*ceil(blockSize/4) + blockSize - 1 </code>
*/
void arm_fir_init_q31(
@ -69,7 +85,11 @@ void arm_fir_init_q31(
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is (blockSize + numTaps - 1) samples, plus 2*ROUND_UP(blockSize, 4) extra samples in the Helium (MVE) build */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t));
#else
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
#endif
/* Assign state pointer */
S->pState = pState;

@ -28,6 +28,7 @@
#include "dsp/filtering_functions.h"
/**
@ingroup groupFilters
*/
@ -60,10 +61,158 @@
#include "arm_helium_utils.h"
/*
 * Compute nbAcc consecutive Q31 FIR outputs and append them to the output buffer.
 *
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 4-element coefficient vectors applied to each output
 *   pSample    pointer to the first state sample used by the first output;
 *              output j starts one sample further, at pSample[j]
 *   vecCoeffs  array of at least nbVecTaps coefficient vectors
 *
 * The expansion site must provide a `q31_t *pOutput`; the macro stores each
 * result through it and advances it by nbAcc in total.
 */
#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs)                     \
    for (int j = 0; j < (nbAcc); j++) {                                        \
        /* Use the macro argument, not a caller variable that happens to   */  \
        /* be called pSamples.                                             */  \
        const q31_t *pSmp = &(pSample)[j];                                     \
        q31x4_t      vecIn0;                                                   \
        q63_t        acc = 0;                                                  \
                                                                               \
        for (int i = 0; i < (nbVecTaps); i++) {                                \
            vecIn0 = vld1q(pSmp + 4 * i);                                      \
            acc    = vrmlaldavhaq(acc, vecIn0, (vecCoeffs)[i]);                \
        }                                                                      \
        /* Shift the 64-bit accumulator right by 23 and narrow to q31_t. */    \
        *pOutput++ = (q31_t)asrl(acc, 23);                                     \
    }
/*
 * First pass of a split FIR computation: accumulate nbVecTaps coefficient
 * vectors for nbAcc consecutive outputs and save the raw 64-bit accumulators
 * instead of producing final outputs.
 *
 *   nbAcc      number of accumulators to compute
 *   nbVecTaps  number of 4-element coefficient vectors applied to each one
 *   pSample    pointer to the first state sample used by the first
 *              accumulator; accumulator j starts at pSample[j]
 *   vecCoeffs  array of at least nbVecTaps coefficient vectors
 *
 * The expansion site must provide `q63_t acc[]` (at least nbAcc entries) and
 * `q63_t *arm_fir_partial_accu_ptr`; each accumulator is stored through that
 * pointer, which is advanced by nbAcc in total.
 */
#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)         \
    for (int j = 0; j < (nbAcc); j++) {                                        \
        /* Use the macro argument, not a caller variable that happens to   */  \
        /* be called pSamples.                                             */  \
        const q31_t *pSmp = &(pSample)[j];                                     \
        q31x4_t      vecIn0;                                                   \
                                                                               \
        acc[j] = 0;                                                            \
        for (int i = 0; i < (nbVecTaps); i++) {                                \
            vecIn0 = vld1q(pSmp + 4 * i);                                      \
            acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);             \
        }                                                                      \
        /* Keep the unshifted accumulator for a later pass. */                 \
        *arm_fir_partial_accu_ptr++ = acc[j];                                  \
    }
/*
 * Second pass of a split FIR computation: reload previously saved 64-bit
 * accumulators, add the contribution of nbVecTaps more coefficient vectors,
 * and write the final Q31 outputs.
 *
 *   nbAcc      number of outputs to produce
 *   nbVecTaps  number of 4-element coefficient vectors applied to each output
 *   pSample    pointer to the first state sample used by the first output in
 *              this pass; output j starts at pSample[j]
 *   vecCoeffs  array of at least nbVecTaps coefficient vectors
 *
 * The expansion site must provide `q63_t acc[]` (at least nbAcc entries),
 * `q63_t *arm_fir_partial_accu_ptr` (read and advanced) and `q31_t *pOutput`
 * (written and advanced).
 */
#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs)          \
    for (int j = 0; j < (nbAcc); j++) {                                        \
        /* Use the macro argument, not a caller variable that happens to   */  \
        /* be called pSamples.                                             */  \
        const q31_t *pSmp = &(pSample)[j];                                     \
        q31x4_t      vecIn0;                                                   \
                                                                               \
        /* Resume from the accumulator saved by an earlier pass. */            \
        acc[j] = *arm_fir_partial_accu_ptr++;                                  \
                                                                               \
        for (int i = 0; i < (nbVecTaps); i++) {                                \
            vecIn0 = vld1q(pSmp + 4 * i);                                      \
            acc[j] = vrmlaldavhaq(acc[j], vecIn0, (vecCoeffs)[i]);             \
        }                                                                      \
        /* Shift the 64-bit accumulator right by 23 and narrow to q31_t. */    \
        *pOutput++ = (q31_t)asrl(acc[j], 23);                                  \
    }
#define FIR_Q31_MAIN_CORE() \
{ \
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); \
q31_t *pState = pRefStatePtr; /* State pointer */ \
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
q31_t *pStateCur; /* Points to the current sample of the state */ \
const q31_t *pSamples; /* Temporary pointer to the sample buffer */ \
q31_t *pOutput; /* Temporary pointer to the output buffer */ \
const q31_t *pTempSrc; /* Temporary pointer to the source data */ \
q31_t *pTempDest; /* Temporary pointer to the destination buffer */\
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
int32_t blkCnt; \
const int32_t nbVecTaps = (NBTAPS / 4); \
\
/* \
* load coefs \
*/ \
q31x4_t vecCoeffs[nbVecTaps]; \
\
for (int i = 0; i < nbVecTaps; i++) \
vecCoeffs[i] = vld1q(pCoeffs + 4 * i); \
\
/* \
* pState points to state array which contains previous frame (numTaps - 1) samples \
* pStateCur points to the location where the new input data should be written \
*/ \
pStateCur = &(pState[(numTaps - 1u)]); \
pTempSrc = pSrc; \
pSamples = pState; \
pOutput = pDst; \
\
blkCnt = blockSize >> 2; \
while (blkCnt > 0) { \
/* \
* Save 4 input samples in the history buffer \
*/ \
vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); \
pStateCur += 4; \
pTempSrc += 4; \
\
FIR_Q31_CORE(4, nbVecTaps, pSamples, vecCoeffs); \
\
pSamples += 4; \
/* \
* Decrement the sample block loop counter \
*/ \
blkCnt--; \
} \
\
/* tail */ \
int32_t residual = blockSize & 3; \
switch (residual) { \
case 3: \
{ \
for (int i = 0; i < residual; i++) \
*pStateCur++ = *pTempSrc++; \
\
FIR_Q31_CORE(3, nbVecTaps, pSamples, vecCoeffs); \
} \
break; \
\
case 2: \
{ \
for (int i = 0; i < residual; i++) \
*pStateCur++ = *pTempSrc++; \
\
FIR_Q31_CORE(2, nbVecTaps, pSamples, vecCoeffs); \
} \
break; \
\
case 1: \
{ \
for (int i = 0; i < residual; i++) \
*pStateCur++ = *pTempSrc++; \
\
FIR_Q31_CORE(1, nbVecTaps, pSamples, vecCoeffs); \
} \
break; \
} \
\
/* \
* Copy the samples back into the history buffer start \
*/ \
pTempSrc = &pState[blockSize]; \
pTempDest = pState; \
\
blkCnt =(numTaps - 1) >> 2; \
while (blkCnt > 0) \
{ \
vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc)); \
pTempSrc += 4; \
pTempDest += 4; \
blkCnt--; \
} \
blkCnt = (numTaps - 1) & 3; \
if (blkCnt > 0) \
{ \
mve_pred16_t p0 = vctp32q(blkCnt); \
vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0); \
} \
}
static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst, uint32_t blockSize)
{
q31_t *pState = S->pState; /* State pointer */
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
q31_t *pState = pRefStatePtr; /* State pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t *pStateCur; /* Points to the current sample of the state */
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
@ -74,6 +223,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
uint32_t blkCnt;
q31x4_t vecIn0;
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
@ -83,7 +233,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pSamples = pState;
pOutput = pDst;
q63_t acc0, acc1, acc2, acc3;
q63_t acc0=0, acc1=0, acc2=0, acc3=0;
/*
* load 4 coefs
*/
@ -131,7 +281,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
}
uint32_t residual = blockSize & 3;
switch (residual)
{
case 3:
@ -139,7 +288,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
/*
* Save 4 input samples in the history buffer
*/
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
pStateCur += 4;
pTempSrc += 4;
@ -205,14 +353,13 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
break;
}
/*
* Copy the samples back into the history buffer start
*/
pTempSrc = &S->pState[blockSize];
pTempDest = S->pState;
pTempSrc = &pState[blockSize];
pTempDest = pState;
blkCnt = numTaps >> 2;
blkCnt = (numTaps-1) >> 2;
while (blkCnt > 0U)
{
vst1q(pTempDest, vld1q(pTempSrc));
@ -220,7 +367,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pTempDest += 4;
blkCnt--;
}
blkCnt = numTaps & 3;
blkCnt = (numTaps-1) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
@ -228,9 +375,74 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
}
}
static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
/*
 * Q31 FIR kernel built from FIR_Q31_MAIN_CORE with NBTAPS = 8, i.e. two
 * vectors of four coefficients are loaded from S->pCoeffs.
 * NOTE(review): by its name this serves filters of 5 to 8 taps; the dispatch
 * condition is not visible in this part of the file - confirm in arm_fir_q31().
 *
 * S          filter instance (read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS is only defined for the duration of the macro expansion below */
#define NBTAPS 8
FIR_Q31_MAIN_CORE();
#undef NBTAPS
}
/*
 * Q31 FIR kernel built from FIR_Q31_MAIN_CORE with NBTAPS = 12, i.e. three
 * vectors of four coefficients are loaded from S->pCoeffs.
 * arm_fir_q31() selects it in its `numTaps <= 12` branch.
 *
 * S          filter instance (read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S,
                                 const q31_t * __restrict pSrc,
                                 q31_t * __restrict pDst, uint32_t blockSize)
{
    /*
     * No local state pointer is declared here: FIR_Q31_MAIN_CORE declares its
     * own pState inside the block it expands to, so an outer one would be
     * shadowed and never used.
     */
#define NBTAPS 12
    FIR_Q31_MAIN_CORE();
#undef NBTAPS
}
/*
 * Q31 FIR kernel built from FIR_Q31_MAIN_CORE with NBTAPS = 16, i.e. four
 * vectors of four coefficients are loaded from S->pCoeffs.
 * arm_fir_q31() selects it in its `numTaps <= 16` branch.
 *
 * S          filter instance (read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS is only defined for the duration of the macro expansion below */
#define NBTAPS 16
FIR_Q31_MAIN_CORE();
#undef NBTAPS
}
/*
 * Q31 FIR kernel built from FIR_Q31_MAIN_CORE with NBTAPS = 20, i.e. five
 * vectors of four coefficients are loaded from S->pCoeffs.
 * arm_fir_q31() selects it in its `numTaps <= 20` branch.
 *
 * S          filter instance (read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS is only defined for the duration of the macro expansion below */
#define NBTAPS 20
FIR_Q31_MAIN_CORE();
#undef NBTAPS
}
/*
 * Q31 FIR kernel built from FIR_Q31_MAIN_CORE with NBTAPS = 24, i.e. six
 * vectors of four coefficients are loaded from S->pCoeffs.
 * arm_fir_q31() selects it in its `numTaps <= 24` branch.
 *
 * S          filter instance (read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS is only defined for the duration of the macro expansion below */
#define NBTAPS 24
FIR_Q31_MAIN_CORE();
#undef NBTAPS
}
/*
 * Q31 FIR kernel built from FIR_Q31_MAIN_CORE with NBTAPS = 28, i.e. seven
 * vectors of four coefficients are loaded from S->pCoeffs.
 * arm_fir_q31() selects it in its `numTaps <= 28` branch.
 *
 * S          filter instance (read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS is only defined for the duration of the macro expansion below */
#define NBTAPS 28
FIR_Q31_MAIN_CORE();
#undef NBTAPS
}
static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S,
const q31_t * __restrict pSrc,
q31_t * __restrict pDst,
uint32_t blockSize)
{
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
q31_t *pState = pRefStatePtr; /* State pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t *pStateCur; /* Points to the current sample of the state */
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
@ -238,10 +450,21 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
const q31_t *pTempSrc; /* Temporary pointer to the source data */
q31_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
q31x4_t vecIn0;
int32_t blkCnt;
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecCoeffs1_4, vecCoeffs5_8;
#define MAX_VECT_BATCH 7
/*
* pre-load 28 1st coefs
*/
q31x4_t vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
q31x4_t vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
q31x4_t vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
q31x4_t vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
q31x4_t vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
q31x4_t vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
q31x4_t vecCoeffs6 = vld1q(pCoeffs + 4 * 6);
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
@ -250,190 +473,176 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
pStateCur = &(pState[(numTaps - 1u)]);
pTempSrc = pSrc;
pSamples = pState;
pOutput = pDst;
/*
* load 8 coefs
*/
vecCoeffs1_4 = *(q31x4_t *) pCoeffs;
vecCoeffs5_8 = *(q31x4_t *) (pCoeffs + 4);
q63_t *arm_fir_partial_accu_ptr = (q63_t*)S->pState;
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
const q31_t *pSamplesTmp = pSamples;
while (blkCnt > 0) {
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
vecIn0 = vld1q(pSamplesTmp);
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamplesTmp[3]);
acc3 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamplesTmp[4]);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
vecIn0 = vld1q(&pSamplesTmp[5]);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
vecIn0 = vld1q(&pSamplesTmp[6]);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
vecIn0 = vld1q(&pSamplesTmp[7]);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5_8);
vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
const q31_t *pSmp;
q31x4_t vecIn0;
*pOutput++ = (q31_t) acc0;
*pOutput++ = (q31_t) acc1;
*pOutput++ = (q31_t) acc2;
*pOutput++ = (q31_t) acc3;
pSmp = &pSamples[0];
vecIn0 = vld1q(pSmp);
acc0 = vrmlaldavhq(vecIn0, vecCoeffs0);
vecIn0 = vld1q(pSmp + 4 * 1);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1);
vecIn0 = vld1q(pSmp + 4 * 2);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2);
vecIn0 = vld1q(pSmp + 4 * 3);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3);
vecIn0 = vld1q(pSmp + 4 * 4);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4);
vecIn0 = vld1q(pSmp + 4 * 5);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5);
vecIn0 = vld1q(pSmp + 4 * 6);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6);
*arm_fir_partial_accu_ptr++ = acc0;
pSmp = &pSamples[1];
vecIn0 = vld1q(pSmp);
acc1 = vrmlaldavhq(vecIn0, vecCoeffs0);
vecIn0 = vld1q(pSmp + 4 * 1);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1);
vecIn0 = vld1q(pSmp + 4 * 2);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2);
vecIn0 = vld1q(pSmp + 4 * 3);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3);
vecIn0 = vld1q(pSmp + 4 * 4);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4);
vecIn0 = vld1q(pSmp + 4 * 5);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5);
vecIn0 = vld1q(pSmp + 4 * 6);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6);
*arm_fir_partial_accu_ptr++ = acc1;
pSmp = &pSamples[2];
vecIn0 = vld1q(pSmp);
acc2 = vrmlaldavhq(vecIn0, vecCoeffs0);
vecIn0 = vld1q(pSmp + 4 * 1);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1);
vecIn0 = vld1q(pSmp + 4 * 2);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2);
vecIn0 = vld1q(pSmp + 4 * 3);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3);
vecIn0 = vld1q(pSmp + 4 * 4);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4);
vecIn0 = vld1q(pSmp + 4 * 5);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5);
vecIn0 = vld1q(pSmp + 4 * 6);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6);
*arm_fir_partial_accu_ptr++ = acc2;
pSmp = &pSamples[3];
vecIn0 = vld1q(pSmp);
acc3 = vrmlaldavhq(vecIn0, vecCoeffs0);
vecIn0 = vld1q(pSmp + 4 * 1);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1);
vecIn0 = vld1q(pSmp + 4 * 2);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2);
vecIn0 = vld1q(pSmp + 4 * 3);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3);
vecIn0 = vld1q(pSmp + 4 * 4);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4);
vecIn0 = vld1q(pSmp + 4 * 5);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5);
vecIn0 = vld1q(pSmp + 4 * 6);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6);
*arm_fir_partial_accu_ptr++ = acc3;
pSamples += 4;
pStateCur += 4;
pTempSrc += 4;
/*
* Decrement the sample block loop counter
*/
blkCnt--;
}
uint32_t residual = blockSize & 3;
switch (residual)
{
case 3:
{
/*
* Save 4 input samples in the history buffer
*/
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
pStateCur += 4;
pTempSrc += 4;
vecIn0 = vld1q(pSamples);
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamples[1]);
acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamples[2]);
acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamples[4]);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
/* remainder */
vecIn0 = vld1q(&pSamples[5]);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
/* load last 4 coef */
vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
arm_fir_partial_accu_ptr = (q63_t*)S->pState;
pOutput = pDst;
pSamples = pState + (MAX_VECT_BATCH * 4);
vecIn0 = vld1q(&pSamples[6]);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
blkCnt = blockSize >> 2;
while (blkCnt > 0) {
q31x4_t vecIn0;
*pOutput++ = (q31_t) acc0;
*pOutput++ = (q31_t) acc1;
*pOutput++ = (q31_t) acc2;
}
break;
/* reload intermediate MAC */
acc0 = *arm_fir_partial_accu_ptr++;
acc1 = *arm_fir_partial_accu_ptr++;
acc2 = *arm_fir_partial_accu_ptr++;
acc3 = *arm_fir_partial_accu_ptr++;
case 2:
{
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
vecIn0 = vld1q(pSamples);
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamples[0]);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);
vecIn0 = vld1q(&pSamples[1]);
acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);
vecIn0 = vld1q(&pSamples[4]);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
vecIn0 = vld1q(&pSamples[2]);
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);
vecIn0 = vld1q(&pSamples[5]);
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
vecIn0 = vld1q(&pSamples[3]);
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
*pOutput++ = asrl(acc0, 23);
*pOutput++ = asrl(acc1, 23);
*pOutput++ = asrl(acc2, 23);
*pOutput++ = asrl(acc3, 23);
*pOutput++ = (q31_t) acc0;
*pOutput++ = (q31_t) acc1;
}
break;
case 1:
{
pSamples += 4;
/*
* Save 4 input samples in the history buffer
* Decrement the sample block loop counter
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
vecIn0 = vld1q(pSamples);
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
vecIn0 = vld1q(&pSamples[4]);
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
acc0 = asrl(acc0, 23);
*pOutput++ = (q31_t) acc0;
}
break;
blkCnt--;
}
/*
* Copy the samples back into the history buffer start
*/
pTempSrc = &S->pState[blockSize];
pTempDest = S->pState;
pTempSrc = &pState[blockSize];
pTempDest = pState;
blkCnt = numTaps >> 2;
while (blkCnt > 0U)
{
vst1q(pTempDest, vld1q(pTempSrc));
blkCnt = numTaps - 1;
do {
mve_pred16_t p = vctp32q(blkCnt);
vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
pTempSrc += 4;
pTempDest += 4;
blkCnt--;
}
blkCnt = numTaps & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
blkCnt -= 4;
}
while (blkCnt > 0);
}
void arm_fir_q31(
const arm_fir_instance_q31 * S,
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
q31_t *pState = S->pState; /* State pointer */
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
q31_t *pState = pRefStatePtr; /* State pointer */
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q31_t *pStateCur; /* Points to the current sample of the state */
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
@ -447,12 +656,10 @@ void arm_fir_q31(
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecCoeffs;
/*
* [1 to 8 taps] specialized routines
* [1 to 32 taps] specialized routines
*/
if (blockSize >= 8)
{
if (numTaps <= 4)
{
arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
@ -463,21 +670,47 @@ void arm_fir_q31(
arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 12)
{
arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 16)
{
arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 20)
{
arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 24)
{
arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 28)
{
arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
return;
}
else if ((numTaps <= 32) && (blockSize >= 32))
{
arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
return;
}
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
*/
if (blockSize >= 8)
{
pStateCur = &(pState[(numTaps - 1u)]);
pSamples = pState;
pTempSrc = pSrc;
pOutput = pDst;
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
while (blkCnt > 0)
{
const q31_t *pCoeffsTmp = pCoeffs;
const q31_t *pSamplesTmp = pSamples;
@ -494,9 +727,8 @@ void arm_fir_q31(
pStateCur += 4;
pTempSrc += 4;
tapsBlkCnt = (numTaps ) / 4;
uint32_t i = tapsBlkCnt ;
while (i > 0U)
int i = tapsBlkCnt;
while (i > 0)
{
/*
* load 4 coefs
@ -523,29 +755,6 @@ void arm_fir_q31(
i--;
}
tapsBlkCnt = (numTaps ) & 3;
i = tapsBlkCnt ;
while (i > 0U)
{
/*
* load 4 coefs
*/
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
acc3 += ((q63_t) pSamplesTmp[3] * *pCoeffsTmp) >> 8;
pSamplesTmp += 1;
pCoeffsTmp += 1;
/*
* Decrement the taps block loop counter
*/
i--;
}
/* .54-> .31 conversion and store accumulators */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
@ -559,14 +768,13 @@ void arm_fir_q31(
pSamples += 4;
/*
* Decrement the sample block loop counter
*/
blkCnt--;
}
uint32_t residual = blockSize & 3;
int32_t residual = blockSize & 3;
switch (residual)
{
case 3:
@ -581,14 +789,12 @@ void arm_fir_q31(
/*
* Save 4 input samples in the history buffer
*/
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
pStateCur += 4;
pTempSrc += 4;
tapsBlkCnt = numTaps / 4;
uint32_t i = tapsBlkCnt;
while (i > 0U)
int i = tapsBlkCnt;
while (i > 0)
{
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
@ -606,26 +812,6 @@ void arm_fir_q31(
i--;
}
tapsBlkCnt = (numTaps ) & 3;
i = tapsBlkCnt ;
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
pSamplesTmp += 1;
pCoeffsTmp += 1;
/*
* Decrement the taps block loop counter
*/
i--;
}
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
@ -651,9 +837,8 @@ void arm_fir_q31(
pStateCur += 4;
pTempSrc += 4;
tapsBlkCnt = (numTaps ) / 4;
uint32_t i = tapsBlkCnt;
while (i > 0U)
int i = tapsBlkCnt;
while (i > 0)
{
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
@ -668,24 +853,6 @@ void arm_fir_q31(
i--;
}
tapsBlkCnt = (numTaps ) & 3;
i = tapsBlkCnt ;
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
pSamplesTmp += 1;
pCoeffsTmp += 1;
/*
* Decrement the taps block loop counter
*/
i--;
}
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
@ -708,9 +875,8 @@ void arm_fir_q31(
pStateCur += 4;
pTempSrc += 4;
tapsBlkCnt = (numTaps ) / 4;
uint32_t i = tapsBlkCnt;
while (i > 0U)
int i = tapsBlkCnt;
while (i > 0)
{
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
@ -722,93 +888,29 @@ void arm_fir_q31(
i--;
}
tapsBlkCnt = (numTaps ) & 3;
i = tapsBlkCnt ;
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
pSamplesTmp += 1;
pCoeffsTmp += 1;
/*
* Decrement the taps block loop counter
*/
i--;
}
acc0 = asrl(acc0, 23);
*pOutput++ = (q31_t) acc0;
}
break;
}
}
else
{
q31_t *pStateCurnt; /* Points to the current sample of the state */
q31_t *px; /* Temporary pointer for state buffer */
const q31_t *pb; /* Temporary pointer for coefficient buffer */
q63_t acc0; /* Accumulator */
uint32_t i, blkCnt; /* Loop counters */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0;
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps;
/* Perform the multiply-accumulates */
do
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += (q63_t) *px++ * *pb++;
i--;
} while (i > 0U);
/* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
*pDst++ = (q31_t) (acc0 >> 31U);
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
}
/*
* Copy the samples back into the history buffer start
*/
pTempSrc = &S->pState[blockSize];
pTempDest = S->pState;
pTempSrc = &pState[blockSize];
pTempDest = pState;
blkCnt = numTaps >> 2;
while (blkCnt > 0U)
blkCnt = (numTaps - 1U) >> 2;
while (blkCnt > 0)
{
vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 4;
pTempDest += 4;
blkCnt--;
}
blkCnt = numTaps & 3;
if (blkCnt > 0U)
blkCnt = (numTaps - 1U) & 3;
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);

@ -1,6 +1,9 @@
#include "FIRQ31.h"
#include "Error.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) q31_t coeffArray[64];
#endif
void FIRQ31::test_fir_q31()
{
@ -30,16 +33,28 @@
samples.reload(FIRQ31::SAMPLES1_Q31_ID,mgr,this->nbSamples);
coefs.reload(FIRQ31::COEFS1_Q31_ID,mgr,this->nbTaps);
state.create(this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
state.create(2*ROUND_UP(this->nbSamples,4) + this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
output.create(this->nbSamples,FIRQ31::OUT_SAMPLES_Q31_ID,mgr);
switch(id)
{
case TEST_FIR_Q31_1:
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
*/
memset(coeffArray,0,32*sizeof(q31_t));
q31_t *ptr;
ptr=coefs.ptr();
memcpy(coeffArray,ptr,this->nbTaps*sizeof(q31_t));
this->pCoefs = coeffArray;
#else
this->pCoefs=coefs.ptr();
#endif
arm_fir_init_q31(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
this->pSrc=samples.ptr();
this->pCoefs=coefs.ptr();
this->pDst=output.ptr();
break;

@ -37,6 +37,7 @@ void checkInnerTail(q31_t *b)
#endif
int blockSize;
int numTaps;
int nb=1;
/*
@ -98,6 +99,8 @@ void checkInnerTail(q31_t *b)
configp += 2;
orgcoefsp += numTaps;
nb += blockSize + blockSize;
}
@ -129,8 +132,8 @@ void checkInnerTail(q31_t *b)
ref.reload(FIRQ31::FIRREFS_Q31_ID,mgr);
output.create(ref.nbSamples(),FIRQ31::OUT_Q31_ID,mgr);
/* Max blockSize + numTaps - 1 as generated by Python script */
state.create(47,FIRQ31::OUT_Q31_ID,mgr);
/* > Max blockSize + numTaps - 1 as generated by Python script */
state.create(47 + 47+47,FIRQ31::OUT_Q31_ID,mgr);
}
void FIRQ31::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)

Loading…
Cancel
Save