|
|
|
|
@ -28,6 +28,7 @@
|
|
|
|
|
|
|
|
|
|
#include "dsp/filtering_functions.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@ingroup groupFilters
|
|
|
|
|
*/
|
|
|
|
|
@ -60,10 +61,158 @@
|
|
|
|
|
#include "arm_helium_utils.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs) \
|
|
|
|
|
for (int j = 0; j < nbAcc; j++) { \
|
|
|
|
|
const q31_t *pSmp = &pSamples[j]; \
|
|
|
|
|
q31x4_t vecIn0; \
|
|
|
|
|
q63_t acc[4]; \
|
|
|
|
|
\
|
|
|
|
|
acc[j] = 0; \
|
|
|
|
|
for (int i = 0; i < nbVecTaps; i++) { \
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * i); \
|
|
|
|
|
acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \
|
|
|
|
|
} \
|
|
|
|
|
*pOutput++ = (q31_t)asrl(acc[j], 23); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \
|
|
|
|
|
for (int j = 0; j < nbAcc; j++) { \
|
|
|
|
|
const q31_t *pSmp = &pSamples[j]; \
|
|
|
|
|
q31x4_t vecIn0; \
|
|
|
|
|
\
|
|
|
|
|
acc[j] = 0; \
|
|
|
|
|
for (int i = 0; i < nbVecTaps; i++) { \
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * i); \
|
|
|
|
|
acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \
|
|
|
|
|
} \
|
|
|
|
|
*arm_fir_partial_accu_ptr++ = acc[j]; \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \
|
|
|
|
|
for (int j = 0; j < nbAcc; j++) { \
|
|
|
|
|
const q31_t *pSmp = &pSamples[j]; \
|
|
|
|
|
q31x4_t vecIn0; \
|
|
|
|
|
\
|
|
|
|
|
acc[j] = *arm_fir_partial_accu_ptr++; \
|
|
|
|
|
\
|
|
|
|
|
for (int i = 0; i < nbVecTaps; i++) { \
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * i); \
|
|
|
|
|
acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \
|
|
|
|
|
} \
|
|
|
|
|
*pOutput++ = (q31_t)asrl(acc[j], 23); \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define FIR_Q31_MAIN_CORE() \
|
|
|
|
|
{ \
|
|
|
|
|
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); \
|
|
|
|
|
q31_t *pState = pRefStatePtr; /* State pointer */ \
|
|
|
|
|
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
|
|
|
|
|
q31_t *pStateCur; /* Points to the current sample of the state */ \
|
|
|
|
|
const q31_t *pSamples; /* Temporary pointer to the sample buffer */ \
|
|
|
|
|
q31_t *pOutput; /* Temporary pointer to the output buffer */ \
|
|
|
|
|
const q31_t *pTempSrc; /* Temporary pointer to the source data */ \
|
|
|
|
|
q31_t *pTempDest; /* Temporary pointer to the destination buffer */\
|
|
|
|
|
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
|
|
|
|
|
int32_t blkCnt; \
|
|
|
|
|
const int32_t nbVecTaps = (NBTAPS / 4); \
|
|
|
|
|
\
|
|
|
|
|
/* \
|
|
|
|
|
* load coefs \
|
|
|
|
|
*/ \
|
|
|
|
|
q31x4_t vecCoeffs[nbVecTaps]; \
|
|
|
|
|
\
|
|
|
|
|
for (int i = 0; i < nbVecTaps; i++) \
|
|
|
|
|
vecCoeffs[i] = vld1q(pCoeffs + 4 * i); \
|
|
|
|
|
\
|
|
|
|
|
/* \
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples \
|
|
|
|
|
* pStateCur points to the location where the new input data should be written \
|
|
|
|
|
*/ \
|
|
|
|
|
pStateCur = &(pState[(numTaps - 1u)]); \
|
|
|
|
|
pTempSrc = pSrc; \
|
|
|
|
|
pSamples = pState; \
|
|
|
|
|
pOutput = pDst; \
|
|
|
|
|
\
|
|
|
|
|
blkCnt = blockSize >> 2; \
|
|
|
|
|
while (blkCnt > 0) { \
|
|
|
|
|
/* \
|
|
|
|
|
* Save 4 input samples in the history buffer \
|
|
|
|
|
*/ \
|
|
|
|
|
vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); \
|
|
|
|
|
pStateCur += 4; \
|
|
|
|
|
pTempSrc += 4; \
|
|
|
|
|
\
|
|
|
|
|
FIR_Q31_CORE(4, nbVecTaps, pSamples, vecCoeffs); \
|
|
|
|
|
\
|
|
|
|
|
pSamples += 4; \
|
|
|
|
|
/* \
|
|
|
|
|
* Decrement the sample block loop counter \
|
|
|
|
|
*/ \
|
|
|
|
|
blkCnt--; \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
/* tail */ \
|
|
|
|
|
int32_t residual = blockSize & 3; \
|
|
|
|
|
switch (residual) { \
|
|
|
|
|
case 3: \
|
|
|
|
|
{ \
|
|
|
|
|
for (int i = 0; i < residual; i++) \
|
|
|
|
|
*pStateCur++ = *pTempSrc++; \
|
|
|
|
|
\
|
|
|
|
|
FIR_Q31_CORE(3, nbVecTaps, pSamples, vecCoeffs); \
|
|
|
|
|
} \
|
|
|
|
|
break; \
|
|
|
|
|
\
|
|
|
|
|
case 2: \
|
|
|
|
|
{ \
|
|
|
|
|
for (int i = 0; i < residual; i++) \
|
|
|
|
|
*pStateCur++ = *pTempSrc++; \
|
|
|
|
|
\
|
|
|
|
|
FIR_Q31_CORE(2, nbVecTaps, pSamples, vecCoeffs); \
|
|
|
|
|
} \
|
|
|
|
|
break; \
|
|
|
|
|
\
|
|
|
|
|
case 1: \
|
|
|
|
|
{ \
|
|
|
|
|
for (int i = 0; i < residual; i++) \
|
|
|
|
|
*pStateCur++ = *pTempSrc++; \
|
|
|
|
|
\
|
|
|
|
|
FIR_Q31_CORE(1, nbVecTaps, pSamples, vecCoeffs); \
|
|
|
|
|
} \
|
|
|
|
|
break; \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
/* \
|
|
|
|
|
* Copy the samples back into the history buffer start \
|
|
|
|
|
*/ \
|
|
|
|
|
pTempSrc = &pState[blockSize]; \
|
|
|
|
|
pTempDest = pState; \
|
|
|
|
|
\
|
|
|
|
|
blkCnt =(numTaps - 1) >> 2; \
|
|
|
|
|
while (blkCnt > 0) \
|
|
|
|
|
{ \
|
|
|
|
|
vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc)); \
|
|
|
|
|
pTempSrc += 4; \
|
|
|
|
|
pTempDest += 4; \
|
|
|
|
|
blkCnt--; \
|
|
|
|
|
} \
|
|
|
|
|
blkCnt = (numTaps - 1) & 3; \
|
|
|
|
|
if (blkCnt > 0) \
|
|
|
|
|
{ \
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt); \
|
|
|
|
|
vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0); \
|
|
|
|
|
} \
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
|
|
|
|
|
static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
q31_t *pState = S->pState; /* State pointer */
|
|
|
|
|
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
|
|
|
|
|
q31_t *pState = pRefStatePtr; /* State pointer */
|
|
|
|
|
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
|
|
|
|
q31_t *pStateCur; /* Points to the current sample of the state */
|
|
|
|
|
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
|
|
|
|
|
@ -74,6 +223,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
uint32_t blkCnt;
|
|
|
|
|
q31x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples
|
|
|
|
|
* pStateCur points to the location where the new input data should be written
|
|
|
|
|
@ -83,7 +233,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
q63_t acc0, acc1, acc2, acc3;
|
|
|
|
|
q63_t acc0=0, acc1=0, acc2=0, acc3=0;
|
|
|
|
|
/*
|
|
|
|
|
* load 4 coefs
|
|
|
|
|
*/
|
|
|
|
|
@ -131,7 +281,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint32_t residual = blockSize & 3;
|
|
|
|
|
|
|
|
|
|
switch (residual)
|
|
|
|
|
{
|
|
|
|
|
case 3:
|
|
|
|
|
@ -139,7 +288,6 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
@ -205,14 +353,13 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy the samples back into the history buffer start
|
|
|
|
|
*/
|
|
|
|
|
pTempSrc = &S->pState[blockSize];
|
|
|
|
|
pTempDest = S->pState;
|
|
|
|
|
pTempSrc = &pState[blockSize];
|
|
|
|
|
pTempDest = pState;
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps >> 2;
|
|
|
|
|
blkCnt = (numTaps-1) >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
vst1q(pTempDest, vld1q(pTempSrc));
|
|
|
|
|
@ -220,7 +367,7 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
blkCnt = numTaps & 3;
|
|
|
|
|
blkCnt = (numTaps-1) & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
@ -228,9 +375,74 @@ static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pSrc, q31_t * pDst, uint32_t blockSize)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
#define NBTAPS 8
|
|
|
|
|
FIR_Q31_MAIN_CORE();
|
|
|
|
|
#undef NBTAPS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
q31_t *pState = S->pState; /* State pointer */
|
|
|
|
|
#define NBTAPS 12
|
|
|
|
|
FIR_Q31_MAIN_CORE();
|
|
|
|
|
#undef NBTAPS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
#define NBTAPS 16
|
|
|
|
|
FIR_Q31_MAIN_CORE();
|
|
|
|
|
#undef NBTAPS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
#define NBTAPS 20
|
|
|
|
|
FIR_Q31_MAIN_CORE();
|
|
|
|
|
#undef NBTAPS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
#define NBTAPS 24
|
|
|
|
|
FIR_Q31_MAIN_CORE();
|
|
|
|
|
#undef NBTAPS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst, uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
#define NBTAPS 28
|
|
|
|
|
FIR_Q31_MAIN_CORE();
|
|
|
|
|
#undef NBTAPS
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * __restrict pSrc,
|
|
|
|
|
q31_t * __restrict pDst,
|
|
|
|
|
uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
|
|
|
|
|
q31_t *pState = pRefStatePtr; /* State pointer */
|
|
|
|
|
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
|
|
|
|
q31_t *pStateCur; /* Points to the current sample of the state */
|
|
|
|
|
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
|
|
|
|
|
@ -238,10 +450,21 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
const q31_t *pTempSrc; /* Temporary pointer to the source data */
|
|
|
|
|
q31_t *pTempDest; /* Temporary pointer to the destination buffer */
|
|
|
|
|
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
|
|
|
|
|
uint32_t blkCnt;
|
|
|
|
|
q31x4_t vecIn0;
|
|
|
|
|
int32_t blkCnt;
|
|
|
|
|
q63_t acc0, acc1, acc2, acc3;
|
|
|
|
|
q31x4_t vecCoeffs1_4, vecCoeffs5_8;
|
|
|
|
|
|
|
|
|
|
#define MAX_VECT_BATCH 7
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* pre-load 28 1st coefs
|
|
|
|
|
*/
|
|
|
|
|
q31x4_t vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
|
|
|
|
|
q31x4_t vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
|
|
|
|
|
q31x4_t vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
|
|
|
|
|
q31x4_t vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
|
|
|
|
|
q31x4_t vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
|
|
|
|
|
q31x4_t vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
|
|
|
|
|
q31x4_t vecCoeffs6 = vld1q(pCoeffs + 4 * 6);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples
|
|
|
|
|
@ -250,190 +473,176 @@ static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S, const q31_t * pS
|
|
|
|
|
pStateCur = &(pState[(numTaps - 1u)]);
|
|
|
|
|
pTempSrc = pSrc;
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* load 8 coefs
|
|
|
|
|
*/
|
|
|
|
|
vecCoeffs1_4 = *(q31x4_t *) pCoeffs;
|
|
|
|
|
vecCoeffs5_8 = *(q31x4_t *) (pCoeffs + 4);
|
|
|
|
|
q63_t *arm_fir_partial_accu_ptr = (q63_t*)S->pState;
|
|
|
|
|
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
const q31_t *pSamplesTmp = pSamples;
|
|
|
|
|
|
|
|
|
|
while (blkCnt > 0) {
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamplesTmp);
|
|
|
|
|
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[1]);
|
|
|
|
|
acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[2]);
|
|
|
|
|
acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[3]);
|
|
|
|
|
acc3 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[4]);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[5]);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[6]);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamplesTmp[7]);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5_8);
|
|
|
|
|
|
|
|
|
|
vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
acc1 = asrl(acc1, 23);
|
|
|
|
|
acc2 = asrl(acc2, 23);
|
|
|
|
|
acc3 = asrl(acc3, 23);
|
|
|
|
|
const q31_t *pSmp;
|
|
|
|
|
q31x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
*pOutput++ = (q31_t) acc0;
|
|
|
|
|
*pOutput++ = (q31_t) acc1;
|
|
|
|
|
*pOutput++ = (q31_t) acc2;
|
|
|
|
|
*pOutput++ = (q31_t) acc3;
|
|
|
|
|
pSmp = &pSamples[0];
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSmp);
|
|
|
|
|
acc0 = vrmlaldavhq(vecIn0, vecCoeffs0);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 1);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 2);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 3);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 4);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 5);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 6);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6);
|
|
|
|
|
|
|
|
|
|
*arm_fir_partial_accu_ptr++ = acc0;
|
|
|
|
|
|
|
|
|
|
pSmp = &pSamples[1];
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSmp);
|
|
|
|
|
acc1 = vrmlaldavhq(vecIn0, vecCoeffs0);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 1);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 2);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 3);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 4);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 5);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 6);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6);
|
|
|
|
|
|
|
|
|
|
*arm_fir_partial_accu_ptr++ = acc1;
|
|
|
|
|
|
|
|
|
|
pSmp = &pSamples[2];
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSmp);
|
|
|
|
|
acc2 = vrmlaldavhq(vecIn0, vecCoeffs0);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 1);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 2);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 3);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 4);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 5);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 6);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6);
|
|
|
|
|
*arm_fir_partial_accu_ptr++ = acc2;
|
|
|
|
|
|
|
|
|
|
pSmp = &pSamples[3];
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSmp);
|
|
|
|
|
acc3 = vrmlaldavhq(vecIn0, vecCoeffs0);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 1);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 2);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 3);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 4);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 5);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5);
|
|
|
|
|
vecIn0 = vld1q(pSmp + 4 * 6);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6);
|
|
|
|
|
|
|
|
|
|
*arm_fir_partial_accu_ptr++ = acc3;
|
|
|
|
|
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the sample block loop counter
|
|
|
|
|
*/
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint32_t residual = blockSize & 3;
|
|
|
|
|
switch (residual)
|
|
|
|
|
{
|
|
|
|
|
case 3:
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
acc2 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
|
|
|
|
|
/* reminder */
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
|
|
|
|
|
/* load last 4 coef */
|
|
|
|
|
vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
|
|
|
|
|
arm_fir_partial_accu_ptr = (q63_t*)S->pState;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
pSamples = pState + (MAX_VECT_BATCH * 4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[6]);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5_8);
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
acc1 = asrl(acc1, 23);
|
|
|
|
|
acc2 = asrl(acc2, 23);
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0) {
|
|
|
|
|
q31x4_t vecIn0;
|
|
|
|
|
|
|
|
|
|
*pOutput++ = (q31_t) acc0;
|
|
|
|
|
*pOutput++ = (q31_t) acc1;
|
|
|
|
|
*pOutput++ = (q31_t) acc2;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
/* reload intermediate MAC */
|
|
|
|
|
acc0 = *arm_fir_partial_accu_ptr++;
|
|
|
|
|
acc1 = *arm_fir_partial_accu_ptr++;
|
|
|
|
|
acc2 = *arm_fir_partial_accu_ptr++;
|
|
|
|
|
acc3 = *arm_fir_partial_accu_ptr++;
|
|
|
|
|
|
|
|
|
|
case 2:
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[0]);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[1]);
|
|
|
|
|
acc1 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[2]);
|
|
|
|
|
acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[5]);
|
|
|
|
|
acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5_8);
|
|
|
|
|
vecIn0 = vld1q(&pSamples[3]);
|
|
|
|
|
acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
acc1 = asrl(acc1, 23);
|
|
|
|
|
*pOutput++ = asrl(acc0, 23);
|
|
|
|
|
*pOutput++ = asrl(acc1, 23);
|
|
|
|
|
*pOutput++ = asrl(acc2, 23);
|
|
|
|
|
*pOutput++ = asrl(acc3, 23);
|
|
|
|
|
|
|
|
|
|
*pOutput++ = (q31_t) acc0;
|
|
|
|
|
*pOutput++ = (q31_t) acc1;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 1:
|
|
|
|
|
{
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
* Decrement the sample block loop counter
|
|
|
|
|
*/
|
|
|
|
|
vst1q(pStateCur, vld1q(pTempSrc));
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(pSamples);
|
|
|
|
|
acc0 = vrmlaldavhq(vecIn0, vecCoeffs1_4);
|
|
|
|
|
|
|
|
|
|
vecIn0 = vld1q(&pSamples[4]);
|
|
|
|
|
acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5_8);
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
|
|
|
|
|
*pOutput++ = (q31_t) acc0;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy the samples back into the history buffer start
|
|
|
|
|
*/
|
|
|
|
|
pTempSrc = &S->pState[blockSize];
|
|
|
|
|
pTempDest = S->pState;
|
|
|
|
|
pTempSrc = &pState[blockSize];
|
|
|
|
|
pTempDest = pState;
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
vst1q(pTempDest, vld1q(pTempSrc));
|
|
|
|
|
blkCnt = numTaps - 1;
|
|
|
|
|
do {
|
|
|
|
|
mve_pred16_t p = vctp32q(blkCnt);
|
|
|
|
|
|
|
|
|
|
vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
blkCnt = numTaps & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
blkCnt -= 4;
|
|
|
|
|
}
|
|
|
|
|
while (blkCnt > 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void arm_fir_q31(
|
|
|
|
|
const arm_fir_instance_q31 * S,
|
|
|
|
|
const q31_t * pSrc,
|
|
|
|
|
q31_t * pDst,
|
|
|
|
|
uint32_t blockSize)
|
|
|
|
|
{
|
|
|
|
|
q31_t *pState = S->pState; /* State pointer */
|
|
|
|
|
q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
|
|
|
|
|
q31_t *pState = pRefStatePtr; /* State pointer */
|
|
|
|
|
const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
|
|
|
|
|
q31_t *pStateCur; /* Points to the current sample of the state */
|
|
|
|
|
const q31_t *pSamples; /* Temporary pointer to the sample buffer */
|
|
|
|
|
@ -447,12 +656,10 @@ void arm_fir_q31(
|
|
|
|
|
q63_t acc0, acc1, acc2, acc3;
|
|
|
|
|
q31x4_t vecCoeffs;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* [1 to 8 taps] specialized routines
|
|
|
|
|
* [1 to 32 taps] specialized routines
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
if (blockSize >= 8)
|
|
|
|
|
{
|
|
|
|
|
if (numTaps <= 4)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
@ -463,21 +670,47 @@ void arm_fir_q31(
|
|
|
|
|
arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
else if (numTaps <= 12)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
else if (numTaps <= 16)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
else if (numTaps <= 20)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
else if (numTaps <= 24)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
else if (numTaps <= 28)
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
else if ((numTaps <= 32) && (blockSize >= 32))
|
|
|
|
|
{
|
|
|
|
|
arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* pState points to state array which contains previous frame (numTaps - 1) samples
|
|
|
|
|
* pStateCur points to the location where the new input data should be written
|
|
|
|
|
*/
|
|
|
|
|
if (blockSize >= 8)
|
|
|
|
|
{
|
|
|
|
|
pStateCur = &(pState[(numTaps - 1u)]);
|
|
|
|
|
pSamples = pState;
|
|
|
|
|
pTempSrc = pSrc;
|
|
|
|
|
pOutput = pDst;
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
while (blkCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
const q31_t *pCoeffsTmp = pCoeffs;
|
|
|
|
|
const q31_t *pSamplesTmp = pSamples;
|
|
|
|
|
@ -494,9 +727,8 @@ void arm_fir_q31(
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) / 4;
|
|
|
|
|
uint32_t i = tapsBlkCnt ;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
int i = tapsBlkCnt;
|
|
|
|
|
while (i > 0)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* load 4 coefs
|
|
|
|
|
@ -523,29 +755,6 @@ void arm_fir_q31(
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) & 3;
|
|
|
|
|
i = tapsBlkCnt ;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
{
|
|
|
|
|
/*
|
|
|
|
|
* load 4 coefs
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
|
|
|
|
|
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
|
|
|
|
|
acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
|
|
|
|
|
acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
|
|
|
|
|
acc3 += ((q63_t) pSamplesTmp[3] * *pCoeffsTmp) >> 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pSamplesTmp += 1;
|
|
|
|
|
pCoeffsTmp += 1;
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the taps block loop counter
|
|
|
|
|
*/
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* .54-> .31 conversion and store accumulators */
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
acc1 = asrl(acc1, 23);
|
|
|
|
|
@ -559,14 +768,13 @@ void arm_fir_q31(
|
|
|
|
|
|
|
|
|
|
pSamples += 4;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the sample block loop counter
|
|
|
|
|
*/
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint32_t residual = blockSize & 3;
|
|
|
|
|
int32_t residual = blockSize & 3;
|
|
|
|
|
switch (residual)
|
|
|
|
|
{
|
|
|
|
|
case 3:
|
|
|
|
|
@ -581,14 +789,12 @@ void arm_fir_q31(
|
|
|
|
|
/*
|
|
|
|
|
* Save 4 input samples in the history buffer
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
*(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = numTaps / 4;
|
|
|
|
|
uint32_t i = tapsBlkCnt;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
int i = tapsBlkCnt;
|
|
|
|
|
while (i > 0)
|
|
|
|
|
{
|
|
|
|
|
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
|
|
|
|
|
|
|
|
|
|
@ -606,26 +812,6 @@ void arm_fir_q31(
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) & 3;
|
|
|
|
|
|
|
|
|
|
i = tapsBlkCnt ;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
|
|
|
|
|
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
|
|
|
|
|
acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
|
|
|
|
|
acc2 += ((q63_t) pSamplesTmp[2] * *pCoeffsTmp) >> 8;
|
|
|
|
|
|
|
|
|
|
pSamplesTmp += 1;
|
|
|
|
|
pCoeffsTmp += 1;
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the taps block loop counter
|
|
|
|
|
*/
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
acc1 = asrl(acc1, 23);
|
|
|
|
|
acc2 = asrl(acc2, 23);
|
|
|
|
|
@ -651,9 +837,8 @@ void arm_fir_q31(
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) / 4;
|
|
|
|
|
uint32_t i = tapsBlkCnt;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
int i = tapsBlkCnt;
|
|
|
|
|
while (i > 0)
|
|
|
|
|
{
|
|
|
|
|
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
|
|
|
|
|
|
|
|
|
|
@ -668,24 +853,6 @@ void arm_fir_q31(
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) & 3;
|
|
|
|
|
i = tapsBlkCnt ;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
|
|
|
|
|
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
|
|
|
|
|
acc1 += ((q63_t) pSamplesTmp[1] * *pCoeffsTmp) >> 8;
|
|
|
|
|
|
|
|
|
|
pSamplesTmp += 1;
|
|
|
|
|
pCoeffsTmp += 1;
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the taps block loop counter
|
|
|
|
|
*/
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
acc1 = asrl(acc1, 23);
|
|
|
|
|
|
|
|
|
|
@ -708,9 +875,8 @@ void arm_fir_q31(
|
|
|
|
|
pStateCur += 4;
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) / 4;
|
|
|
|
|
uint32_t i = tapsBlkCnt;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
int i = tapsBlkCnt;
|
|
|
|
|
while (i > 0)
|
|
|
|
|
{
|
|
|
|
|
vecCoeffs = *(q31x4_t *) pCoeffsTmp;
|
|
|
|
|
|
|
|
|
|
@ -722,93 +888,29 @@ void arm_fir_q31(
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tapsBlkCnt = (numTaps ) & 3;
|
|
|
|
|
i = tapsBlkCnt ;
|
|
|
|
|
while (i > 0U)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
|
|
|
|
|
acc0 += ((q63_t) *pSamplesTmp * *pCoeffsTmp) >> 8;
|
|
|
|
|
|
|
|
|
|
pSamplesTmp += 1;
|
|
|
|
|
pCoeffsTmp += 1;
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the taps block loop counter
|
|
|
|
|
*/
|
|
|
|
|
i--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
acc0 = asrl(acc0, 23);
|
|
|
|
|
|
|
|
|
|
*pOutput++ = (q31_t) acc0;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
q31_t *pStateCurnt; /* Points to the current sample of the state */
|
|
|
|
|
q31_t *px; /* Temporary pointer for state buffer */
|
|
|
|
|
const q31_t *pb; /* Temporary pointer for coefficient buffer */
|
|
|
|
|
q63_t acc0; /* Accumulator */
|
|
|
|
|
uint32_t i, blkCnt; /* Loop counters */
|
|
|
|
|
pStateCurnt = &(S->pState[(numTaps - 1U)]);
|
|
|
|
|
blkCnt = blockSize;
|
|
|
|
|
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
/* Copy one sample at a time into state buffer */
|
|
|
|
|
*pStateCurnt++ = *pSrc++;
|
|
|
|
|
|
|
|
|
|
/* Set the accumulator to zero */
|
|
|
|
|
acc0 = 0;
|
|
|
|
|
|
|
|
|
|
/* Initialize state pointer */
|
|
|
|
|
px = pState;
|
|
|
|
|
|
|
|
|
|
/* Initialize Coefficient pointer */
|
|
|
|
|
pb = pCoeffs;
|
|
|
|
|
|
|
|
|
|
i = numTaps;
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
|
do
|
|
|
|
|
{
|
|
|
|
|
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
|
|
|
|
|
acc0 += (q63_t) *px++ * *pb++;
|
|
|
|
|
|
|
|
|
|
i--;
|
|
|
|
|
} while (i > 0U);
|
|
|
|
|
|
|
|
|
|
/* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
|
|
|
|
|
*pDst++ = (q31_t) (acc0 >> 31U);
|
|
|
|
|
|
|
|
|
|
/* Advance state pointer by 1 for the next sample */
|
|
|
|
|
pState = pState + 1U;
|
|
|
|
|
|
|
|
|
|
/* Decrement loop counter */
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Copy the samples back into the history buffer start
|
|
|
|
|
*/
|
|
|
|
|
pTempSrc = &S->pState[blockSize];
|
|
|
|
|
pTempDest = S->pState;
|
|
|
|
|
pTempSrc = &pState[blockSize];
|
|
|
|
|
pTempDest = pState;
|
|
|
|
|
|
|
|
|
|
blkCnt = numTaps >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
blkCnt = (numTaps - 1U) >> 2;
|
|
|
|
|
while (blkCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
vst1q(pTempDest, vld1q(pTempSrc));
|
|
|
|
|
pTempSrc += 4;
|
|
|
|
|
pTempDest += 4;
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
blkCnt = numTaps & 3;
|
|
|
|
|
if (blkCnt > 0U)
|
|
|
|
|
blkCnt = (numTaps - 1U) & 3;
|
|
|
|
|
if (blkCnt > 0)
|
|
|
|
|
{
|
|
|
|
|
mve_pred16_t p0 = vctp32q(blkCnt);
|
|
|
|
|
vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
|
|
|
|
|
|