|
|
|
|
@ -104,10 +104,18 @@
|
|
|
|
|
the implementation may need to read more coefficients due to the vectorization and
|
|
|
|
|
to avoid having to manage too many different cases in the code.
|
|
|
|
|
|
|
|
|
|
@par Helium state buffer
|
|
|
|
|
The state buffer must contain some additional temporary data
|
|
|
|
|
used during the computation but which is not part of the FIR state.
|
|
|
|
|
The first blockSize samples are temporary data.
|
|
|
|
|
The remaining samples are the state of the FIR filter.
|
|
|
|
|
So the total state buffer size is <code> numTaps + 2 * blockSize - 1 </code> samples.
|
|
|
|
|
|
|
|
|
|
@par Fixed-Point Behavior
|
|
|
|
|
Care must be taken when using the fixed-point versions of the FIR filter functions.
|
|
|
|
|
In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
|
|
|
|
|
Refer to the function specific documentation below for usage guidelines.
|
|
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@ -126,9 +134,22 @@
|
|
|
|
|
|
|
|
|
|
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
|
|
|
|
|
|
|
|
|
|
static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
|
|
|
|
|
/* Number of coefficients handled per pass over the block: the generic path
 * consumes taps in groups of this size (c0..c7) and steps its sample offset
 * by the same amount. */
#define FIR_F32_MAX_COEF_BLK 8
|
|
|
|
|
|
|
|
|
|
/* Computes one vector of FIR outputs for a short filter.
 *
 * Clears vecAcc0, then for each tap i in [0, NB_TAPS) loads a vector starting
 * at &pSamples[i] into vecIn0 and accumulates vecIn0 * c[i] into vecAcc0.
 *
 * The macro does not declare vecIn0 or vecAcc0; both must already exist in
 * the expanding scope. The result is left in vecAcc0 for the caller to store.
 * The expansion is a bare statement sequence (not wrapped in do/while), so it
 * must not be used as the unbraced body of an if/else.
 * NOTE(review): pSamples, c and NB_TAPS are expanded unparenthesized; callers
 * here pass plain identifiers only. */
#define FIR_F32_CORE(pSamples, c, NB_TAPS) \
|
|
|
|
|
vecAcc0 = vdupq_n_f32(0.0f); \
|
|
|
|
|
for (int i = 0; i < NB_TAPS; i++) { \
|
|
|
|
|
vecIn0 = vld1q(&pSamples[i]); \
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
|
|
|
|
|
const float32_t * __restrict pSrc,
|
|
|
|
|
float32_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
float32_t *pState = S->pState; /* State pointer */
|
|
|
|
|
float32_t *pRefStatePtr = S->pState + blockSize;
|
|
|
|
|
float32_t *pState = pRefStatePtr; /* State pointer */
|
|
|
|
|
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
|
|
|
|
float32_t *pStateCur; /* Points to the current sample of the state */
|
|
|
|
|
const float32_t *pSamples; /* Temporary pointer to the sample buffer */
|
|
|
|
|
@ -136,32 +157,28 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t
|
|
|
|
|
const float32_t *pTempSrc; /* Temporary pointer to the source data */
|
|
|
|
|
float32_t *pTempDest; /* Temporary pointer to the destination buffer */
|
|
|
|
|
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
|
|
|
|
|
uint32_t blkCnt;
|
|
|
|
|
f32x4_t vecIn0;
|
|
|
|
|
f32x4_t vecAcc0;
|
|
|
|
|
float32_t c0, c1, c2, c3;
|
|
|
|
|
int32_t blkCnt;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
const int NB_TAPS=4;
|
|
|
|
|
float32_t c[NB_TAPS];
|
|
|
|
|
const float32_t *pCoeffsCur = pCoeffs;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples
|
|
|
|
|
* pStateCur points to the location where the new input data should be written
|
|
|
|
|
*/
|
|
|
|
|
pStateCur = &(pState[(numTaps - 1u)]);
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pTempSrc = pSrc;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
if (((numTaps - 1) / 4) == 0)
|
|
|
|
|
{
|
|
|
|
|
const float32_t *pCoeffsCur = pCoeffs;
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
c1 = *pCoeffsCur++;
|
|
|
|
|
c2 = *pCoeffsCur++;
|
|
|
|
|
c3 = *pCoeffsCur++;
|
|
|
|
|
for (int i = 0; i < NB_TAPS; i++)
|
|
|
|
|
c[i] = *pCoeffsCur++;
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
while (blkCnt > 0) {
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
@ -169,17 +186,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
FIR_F32_CORE(pSamples, c, NB_TAPS);
|
|
|
|
|
|
|
|
|
|
vst1q(pOutput, vecAcc0);
|
|
|
|
|
|
|
|
|
|
@ -190,514 +197,479 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
|
|
|
|
|
vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
|
|
|
|
|
pStateCur += blkCnt;
|
|
|
|
|
pTempSrc += blkCnt;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
FIR_F32_CORE(pSamples, c, NB_TAPS);
|
|
|
|
|
|
|
|
|
|
vstrwq_p_f32(pOutput, vecAcc0, p0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy the samples back into the history buffer start
|
|
|
|
|
*/
|
|
|
|
|
pTempSrc = &S->pState[blockSize];
|
|
|
|
|
pTempDest = S->pState;
|
|
|
|
|
pTempSrc = &pState[blockSize];
|
|
|
|
|
pTempDest = pState;
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
vst1q(pTempDest, vld1q(pTempSrc));
|
|
|
|
|
blkCnt = numTaps - 1;
|
|
|
|
|
do {
|
|
|
|
|
mve_pred16_t p = vctp32q(blkCnt);
|
|
|
|
|
|
|
|
|
|
vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
blkCnt -= 4;
|
|
|
|
|
}
|
|
|
|
|
while (blkCnt > 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
|
|
|
|
|
|
|
|
|
|
static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
|
|
|
|
|
const float32_t * __restrict pSrc,
|
|
|
|
|
float32_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
float32_t *pState = S->pState; /* State pointer */
|
|
|
|
|
float32_t *pRefStatePtr = S->pState + blockSize;
|
|
|
|
|
float32_t *pState = pRefStatePtr; /* State pointer */
|
|
|
|
|
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
|
|
|
|
float32_t *pStateCur; /* Points to the current sample of the state */
|
|
|
|
|
const float32_t *pSamples; /* Temporary pointer to the sample buffer */
|
|
|
|
|
float32_t *pOutput; /* Temporary pointer to the output buffer */
|
|
|
|
|
const float32_t *pTempSrc; /* Temporary pointer to the source data */
|
|
|
|
|
float32_t *pTempDest; /* Temporary pointer to the destination buffer */
|
|
|
|
|
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
|
|
|
|
|
uint32_t blkCnt;
|
|
|
|
|
f32x4_t vecIn0;
|
|
|
|
|
f32x4_t vecAcc0;
|
|
|
|
|
int32_t blkCnt;
|
|
|
|
|
float32_t c0, c1, c2, c3;
|
|
|
|
|
float32_t c4, c5, c6, c7;
|
|
|
|
|
const float32_t *pCoeffsCur = pCoeffs;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples
|
|
|
|
|
* pStateCur points to the location where the new input data should be written
|
|
|
|
|
*/
|
|
|
|
|
pStateCur = &(pState[(numTaps - 1u)]);
|
|
|
|
|
|
|
|
|
|
pTempSrc = pSrc;
|
|
|
|
|
pTempDest = &(pState[(numTaps - 1u)]);
|
|
|
|
|
int cnt = blockSize;
|
|
|
|
|
do {
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
cnt -= 4;
|
|
|
|
|
} while(cnt > 0);
|
|
|
|
|
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
c1 = *pCoeffsCur++;
|
|
|
|
|
c2 = *pCoeffsCur++;
|
|
|
|
|
c3 = *pCoeffsCur++;
|
|
|
|
|
c4 = *pCoeffsCur++;
|
|
|
|
|
c5 = *pCoeffsCur++;
|
|
|
|
|
c6 = *pCoeffsCur++;
|
|
|
|
|
c7 = *pCoeffsCur++;
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
c0 = *pCoeffs++;
|
|
|
|
|
c1 = *pCoeffs++;
|
|
|
|
|
c2 = *pCoeffs++;
|
|
|
|
|
c3 = *pCoeffs++;
|
|
|
|
|
c4 = *pCoeffs++;
|
|
|
|
|
c5 = *pCoeffs++;
|
|
|
|
|
c6 = *pCoeffs++;
|
|
|
|
|
c7 = *pCoeffs++;
|
|
|
|
|
|
|
|
|
|
cnt = blockSize >> 2;
|
|
|
|
|
while(cnt > 0)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
|
|
|
|
|
vst1q(pOutput, vecAcc0);
|
|
|
|
|
|
|
|
|
|
pOutput += 4;
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
|
|
|
|
|
blkCnt--;
|
|
|
|
|
vst1q(pDst, vecAcc0);
|
|
|
|
|
cnt--;
|
|
|
|
|
pDst += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
cnt = blockSize & 3;
|
|
|
|
|
if (cnt > 0)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0);
|
|
|
|
|
pStateCur += blkCnt;
|
|
|
|
|
pTempSrc += blkCnt;
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
|
|
|
|
|
vstrwq_p_f32(pOutput, vecAcc0, p0);
|
|
|
|
|
vstrwq_p_f32(pDst, vecAcc0,p0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy the samples back into the history buffer start
|
|
|
|
|
*/
|
|
|
|
|
pTempSrc = &S->pState[blockSize];
|
|
|
|
|
pTempDest = S->pState;
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
pTempSrc = &pState[blockSize];
|
|
|
|
|
pTempDest = pState;
|
|
|
|
|
blkCnt = numTaps;
|
|
|
|
|
while (blkCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
vst1q(pTempDest, vld1q(pTempSrc));
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
*pTempDest++ = *pTempSrc++;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void arm_fir_f32(
|
|
|
|
|
const arm_fir_instance_f32 * S,
|
|
|
|
|
const float32_t * pSrc,
|
|
|
|
|
float32_t * pDst,
|
|
|
|
|
uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
float32_t *pState = S->pState; /* State pointer */
|
|
|
|
|
/*
|
|
|
|
|
S->pState is the arm_fir_partial_accu
|
|
|
|
|
S->pState + blockSize is the FIR state
|
|
|
|
|
*/
|
|
|
|
|
float32_t *pRefStatePtr = S->pState + blockSize;
|
|
|
|
|
float32_t *pState = pRefStatePtr ; /* State pointer */
|
|
|
|
|
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
|
|
|
|
float32_t *pStateCur; /* Points to the current sample of the state */
|
|
|
|
|
const float32_t *pSamples; /* Temporary pointer to the sample buffer */
|
|
|
|
|
float32_t *pOutput; /* Temporary pointer to the output buffer */
|
|
|
|
|
const float32_t *pTempSrc; /* Temporary pointer to the source data */
|
|
|
|
|
float32_t *pTempDest; /* Temporary pointer to the destination buffer */
|
|
|
|
|
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
|
|
|
|
|
uint32_t blkCnt;
|
|
|
|
|
int32_t numCnt;
|
|
|
|
|
f32x4_t vecIn0;
|
|
|
|
|
f32x4_t vecAcc0;
|
|
|
|
|
float32_t c0, c1, c2, c3;
|
|
|
|
|
float32_t c4, c5, c6, c7;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* [1 to 8 taps] specialized routines
|
|
|
|
|
*/
|
|
|
|
|
if (blockSize >= 8)
|
|
|
|
|
{
|
|
|
|
|
if (numTaps <= 4)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (blockSize >= 8)
|
|
|
|
|
{
|
|
|
|
|
if (numTaps <= 8)
|
|
|
|
|
else if (numTaps <= 8)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (blockSize >= 8)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples
|
|
|
|
|
* pStateCur points to the location where the new input data should be written
|
|
|
|
|
*/
|
|
|
|
|
pStateCur = &(pState[(numTaps - 1u)]);
|
|
|
|
|
pTempSrc = pSrc;
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
int32_t i;
|
|
|
|
|
const float32_t *pCoeffsCur = pCoeffs;
|
|
|
|
|
pTempDest = &(pState[(numTaps - 1u)]);
|
|
|
|
|
int cnt = blockSize;
|
|
|
|
|
do {
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
cnt -= 4;
|
|
|
|
|
} while(cnt > 0);
|
|
|
|
|
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
c1 = *pCoeffsCur++;
|
|
|
|
|
c2 = *pCoeffsCur++;
|
|
|
|
|
c3 = *pCoeffsCur++;
|
|
|
|
|
c4 = *pCoeffsCur++;
|
|
|
|
|
c5 = *pCoeffsCur++;
|
|
|
|
|
c6 = *pCoeffsCur++;
|
|
|
|
|
c7 = *pCoeffsCur++;
|
|
|
|
|
float32_t *partial_accu_ptr = S->pState;
|
|
|
|
|
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
c0 = *pCoeffs++;
|
|
|
|
|
c1 = *pCoeffs++;
|
|
|
|
|
c2 = *pCoeffs++;
|
|
|
|
|
c3 = *pCoeffs++;
|
|
|
|
|
c4 = *pCoeffs++;
|
|
|
|
|
c5 = *pCoeffs++;
|
|
|
|
|
c6 = *pCoeffs++;
|
|
|
|
|
c7 = *pCoeffs++;
|
|
|
|
|
|
|
|
|
|
cnt = blockSize >> 2;
|
|
|
|
|
while(cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
vst1q(partial_accu_ptr, vecAcc0);
|
|
|
|
|
cnt--;
|
|
|
|
|
partial_accu_ptr += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pSamples += 8;
|
|
|
|
|
|
|
|
|
|
numCnt = ((int32_t)numTaps - 8) / 8;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < numCnt; i++)
|
|
|
|
|
cnt = blockSize & 3;
|
|
|
|
|
if (cnt > 0)
|
|
|
|
|
{
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
c1 = *pCoeffsCur++;
|
|
|
|
|
c2 = *pCoeffsCur++;
|
|
|
|
|
c3 = *pCoeffsCur++;
|
|
|
|
|
c4 = *pCoeffsCur++;
|
|
|
|
|
c5 = *pCoeffsCur++;
|
|
|
|
|
c6 = *pCoeffsCur++;
|
|
|
|
|
c7 = *pCoeffsCur++;
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
|
|
|
|
|
pSamples += 8;
|
|
|
|
|
pSamples += cnt;
|
|
|
|
|
vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
|
|
|
|
|
partial_accu_ptr += cnt;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
numCnt = ((int32_t)numTaps - 8) & 7;
|
|
|
|
|
int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
|
|
|
|
|
int sample_offset = FIR_F32_MAX_COEF_BLK;
|
|
|
|
|
while (localTaps > FIR_F32_MAX_COEF_BLK) {
|
|
|
|
|
c0 = *pCoeffs++;
|
|
|
|
|
c1 = *pCoeffs++;
|
|
|
|
|
c2 = *pCoeffs++;
|
|
|
|
|
c3 = *pCoeffs++;
|
|
|
|
|
c4 = *pCoeffs++;
|
|
|
|
|
c5 = *pCoeffs++;
|
|
|
|
|
c6 = *pCoeffs++;
|
|
|
|
|
c7 = *pCoeffs++;
|
|
|
|
|
|
|
|
|
|
partial_accu_ptr = S->pState;
|
|
|
|
|
pSamples = pState + sample_offset;
|
|
|
|
|
int cnt = blockSize >> 2;
|
|
|
|
|
while(cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
while (numCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
|
|
|
|
|
pSamples ++;
|
|
|
|
|
|
|
|
|
|
numCnt --;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
vst1q(pOutput, vecAcc0);
|
|
|
|
|
pOutput += 4;
|
|
|
|
|
pSamples = pSamples - numTaps + 4;
|
|
|
|
|
|
|
|
|
|
blkCnt--;
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
vecAcc0 += vld1q_f32(partial_accu_ptr);
|
|
|
|
|
vst1q(partial_accu_ptr, vecAcc0);
|
|
|
|
|
cnt--;
|
|
|
|
|
partial_accu_ptr += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
int32_t i;
|
|
|
|
|
const float32_t *pCoeffsCur = pCoeffs;
|
|
|
|
|
cnt = blockSize & 3;
|
|
|
|
|
if (cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
c1 = *pCoeffsCur++;
|
|
|
|
|
c2 = *pCoeffsCur++;
|
|
|
|
|
c3 = *pCoeffsCur++;
|
|
|
|
|
c4 = *pCoeffsCur++;
|
|
|
|
|
c5 = *pCoeffsCur++;
|
|
|
|
|
c6 = *pCoeffsCur++;
|
|
|
|
|
c7 = *pCoeffsCur++;
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
pSamples += cnt;
|
|
|
|
|
vecAcc0 += vld1q_f32(partial_accu_ptr);
|
|
|
|
|
vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
|
|
|
|
|
partial_accu_ptr += cnt;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pSamples += 8;
|
|
|
|
|
localTaps -= FIR_F32_MAX_COEF_BLK;
|
|
|
|
|
sample_offset += FIR_F32_MAX_COEF_BLK;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
numCnt = ((int32_t)numTaps - 8) / 8;
|
|
|
|
|
pSamples = pState + sample_offset;
|
|
|
|
|
|
|
|
|
|
if (localTaps > 4) {
|
|
|
|
|
c0 = *pCoeffs++;
|
|
|
|
|
c1 = *pCoeffs++;
|
|
|
|
|
c2 = *pCoeffs++;
|
|
|
|
|
c3 = *pCoeffs++;
|
|
|
|
|
c4 = *pCoeffs++;
|
|
|
|
|
c5 = *pCoeffs++;
|
|
|
|
|
c6 = *pCoeffs++;
|
|
|
|
|
c7 = *pCoeffs++;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < numCnt; i++)
|
|
|
|
|
{
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
c1 = *pCoeffsCur++;
|
|
|
|
|
c2 = *pCoeffsCur++;
|
|
|
|
|
c3 = *pCoeffsCur++;
|
|
|
|
|
c4 = *pCoeffsCur++;
|
|
|
|
|
c5 = *pCoeffsCur++;
|
|
|
|
|
c6 = *pCoeffsCur++;
|
|
|
|
|
c7 = *pCoeffsCur++;
|
|
|
|
|
partial_accu_ptr = S->pState;
|
|
|
|
|
cnt = blockSize >> 2;
|
|
|
|
|
while(cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
|
|
|
|
|
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
|
|
|
|
|
pSamples += 8;
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
float32x4_t pap = vld1q_f32(partial_accu_ptr);
|
|
|
|
|
vst1q(pOutput, vecAcc0+pap);
|
|
|
|
|
cnt--;
|
|
|
|
|
partial_accu_ptr += 4;
|
|
|
|
|
pOutput += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
numCnt = ((int32_t)numTaps - 8) & 7;
|
|
|
|
|
cnt = blockSize & 3;
|
|
|
|
|
if (cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
while (numCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
c0 = *pCoeffsCur++;
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
|
|
|
|
|
pSamples ++;
|
|
|
|
|
|
|
|
|
|
numCnt --;
|
|
|
|
|
}
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
|
|
|
|
|
vstrwq_p_f32(pOutput, vecAcc0, p0);
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[7]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
|
|
|
|
|
pSamples += cnt;
|
|
|
|
|
float32x4_t pap = vld1q_f32(partial_accu_ptr);
|
|
|
|
|
vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
|
|
|
|
|
partial_accu_ptr += cnt;
|
|
|
|
|
pOutput += cnt;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
float32_t *pStateCurnt; /* Points to the current sample of the state */
|
|
|
|
|
float32_t *px; /* Temporary pointer for state buffer */
|
|
|
|
|
const float32_t *pb; /* Temporary pointer for coefficient buffer */
|
|
|
|
|
float32_t acc0; /* Accumulator */
|
|
|
|
|
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
|
|
|
|
|
uint32_t i, blkCnt; /* Loop counters */
|
|
|
|
|
pStateCurnt = &(S->pState[(numTaps - 1U)]);
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
/* Copy one sample at a time into state buffer */
|
|
|
|
|
*pStateCurnt++ = *pSrc++;
|
|
|
|
|
|
|
|
|
|
/* Set the accumulator to zero */
|
|
|
|
|
acc0 = 0.0f;
|
|
|
|
|
|
|
|
|
|
/* Initialize state pointer */
|
|
|
|
|
px = pState;
|
|
|
|
|
|
|
|
|
|
/* Initialize Coefficient pointer */
|
|
|
|
|
pb = pCoeffs;
|
|
|
|
|
|
|
|
|
|
i = numTaps;
|
|
|
|
|
else {
|
|
|
|
|
c0 = *pCoeffs++;
|
|
|
|
|
c1 = *pCoeffs++;
|
|
|
|
|
c2 = *pCoeffs++;
|
|
|
|
|
c3 = *pCoeffs++;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
{
|
|
|
|
|
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
|
|
|
|
|
acc0 += *px++ * *pb++;
|
|
|
|
|
partial_accu_ptr = S->pState;
|
|
|
|
|
cnt = blockSize >> 2;
|
|
|
|
|
while(cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
i--;
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
float32x4_t pap = vld1q_f32(partial_accu_ptr);
|
|
|
|
|
vst1q(pOutput, vecAcc0+pap);
|
|
|
|
|
cnt--;
|
|
|
|
|
partial_accu_ptr += 4;
|
|
|
|
|
pOutput += 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cnt = blockSize & 3;
|
|
|
|
|
if (cnt > 0) {
|
|
|
|
|
float32x4_t vecAcc0;
|
|
|
|
|
float32x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
/* Store result in destination buffer. */
|
|
|
|
|
*pDst++ = acc0;
|
|
|
|
|
mve_pred16_t p0 = vctp32q(cnt);
|
|
|
|
|
|
|
|
|
|
/* Advance state pointer by 1 for the next sample */
|
|
|
|
|
pState = pState + 1U;
|
|
|
|
|
|
|
|
|
|
/* Decrement loop counter */
|
|
|
|
|
blkCnt--;
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
vecAcc0 = vmulq(vecIn0, c0);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
|
|
|
|
|
pSamples += cnt;
|
|
|
|
|
float32x4_t pap = vld1q_f32(partial_accu_ptr);
|
|
|
|
|
vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
|
|
|
|
|
partial_accu_ptr += cnt;
|
|
|
|
|
pOutput += cnt;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy the samples back into the history buffer start
|
|
|
|
|
*/
|
|
|
|
|
pTempSrc = &S->pState[blockSize];
|
|
|
|
|
pTempDest = S->pState;
|
|
|
|
|
pTempSrc = &pRefStatePtr[blockSize];
|
|
|
|
|
pTempDest = pRefStatePtr;
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
while (blkCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
vst1q(pTempDest, vld1q(pTempSrc));
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
if (blkCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
|