CMSIS-DSP: Added new MVE implementation for FIR Q15 and Q7.

pull/19/head
Christophe Favergeon 5 years ago
parent a108d6763e
commit c4283d209f

@ -114,6 +114,7 @@
- A is blockSize for f32
- A is 8*ceil(blockSize/8) for f16
- A is 8*ceil(blockSize/4) for q31
- A is 0 for other datatypes (q15 and q7)
@par Fixed-Point Behavior

@ -60,7 +60,140 @@
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pSrc, q15_t * pDst, uint32_t blockSize)
/*
 * Compute nbAcc consecutive q15 FIR outputs.
 *
 * For output j (0 <= j < nbAcc) the macro accumulates, over nbVecTaps
 * 8-lane vectors, the dot product of the samples starting at &pSample[j]
 * with vecCoeffs[0 .. nbVecTaps-1], narrows the 64-bit sum with
 * MVE_ASRL_SAT16(acc, 15) and stores it through pOutput.
 *
 * pOutput    modifiable q15_t pointer lvalue; advanced by nbAcc on exit
 * nbAcc      number of outputs to produce
 * nbVecTaps  number of 8-lane coefficient vectors
 * pSample    first sample of the window used for output 0
 * vecCoeffs  array of coefficient vectors
 *
 * The enclosing scope must declare the scratch vector vecIn0.
 * Parameters are parenthesized so expression arguments expand correctly,
 * and the accumulator is a plain scalar (only one is live per output).
 */
#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)          \
    for (int j = 0; j < (nbAcc); j++) {                                       \
        const q15_t *pSmp = &(pSample)[j];                                    \
        q63_t        acc = 0;                                                 \
                                                                              \
        for (int i = 0; i < (nbVecTaps); i++) {                               \
            vecIn0 = vld1q(pSmp + 8 * i);                                     \
            acc = vmlaldavaq(acc, vecIn0, (vecCoeffs)[i]);                    \
        }                                                                     \
        *(pOutput)++ = (q15_t) MVE_ASRL_SAT16(acc, 15);                       \
    }
/*
 * Function body shared by the fixed-size q15 FIR kernels.
 *
 * Must be expanded inside a function that provides S, pSrc, pDst and
 * blockSize, with NBTAPS defined; NBTAPS / 8 coefficient vectors are used.
 *
 * Steps performed:
 *  1. Load NBTAPS / 8 vectors of 8 coefficients from S->pCoeffs.
 *  2. For every group of 4 input samples: append them to the state buffer
 *     at pStateCur, then produce 4 outputs with FIR_Q15_CORE.
 *  3. Handle the remaining (blockSize & 3) samples one by one, then produce
 *     the matching outputs with FIR_Q15_CORE.
 *  4. Copy (numTaps - 1) samples from &pState[blockSize] back to the start
 *     of pState, in vectors of 8 plus a predicated tail.
 *
 * NOTE(review): NBTAPS coefficients and NBTAPS-sample windows are read
 * regardless of S->numTaps, so the coefficient array presumably has to be
 * zero padded up to NBTAPS and the state buffer large enough for the
 * over-read - confirm against the API documentation.
 */
#define FIR_Q15_MAIN_CORE() \
{ \
q15_t *pState = S->pState; /* State pointer */ \
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
q15_t *pStateCur; /* Points to the current sample of the state */ \
const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \
q15_t *pOutput; /* Temporary pointer to the output buffer */ \
const q15_t *pTempSrc; /* Temporary pointer to the source data */ \
q15_t *pTempDest; /* Temporary pointer to the destination buffer */\
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
int32_t blkCnt; \
q15x8_t vecIn0; \
const int32_t nbVecTaps = (NBTAPS / 8); \
\
/* \
* load coefs \
*/ \
q15x8_t vecCoeffs[nbVecTaps]; \
\
for (int i = 0; i < nbVecTaps; i++) \
vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \
\
/* \
* pState points to state array which contains previous frame (numTaps - 1) samples \
* pStateCur points to the location where the new input data should be written \
*/ \
pStateCur = &(pState[(numTaps - 1u)]); \
pTempSrc = pSrc; \
pSamples = pState; \
pOutput = pDst; \
\
blkCnt = blockSize >> 2; \
while (blkCnt > 0) { \
/* \
* Save 4 input samples in the history buffer \
*/ \
vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \
pStateCur += 4; \
pTempSrc += 4; \
\
FIR_Q15_CORE(pOutput, 4, nbVecTaps, pSamples, vecCoeffs); \
pSamples += 4; \
\
blkCnt--; \
} \
\
/* tail */ \
int32_t residual = blockSize & 3; \
\
for (int i = 0; i < residual; i++) \
*pStateCur++ = *pTempSrc++; \
\
FIR_Q15_CORE(pOutput, residual, nbVecTaps, pSamples, vecCoeffs); \
\
/* \
* Copy the samples back into the history buffer start \
*/ \
pTempSrc = &pState[blockSize]; \
pTempDest = pState; \
\
/* current compiler limitation */ \
blkCnt = (numTaps - 1) >> 3; \
while (blkCnt > 0) \
{ \
vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \
pTempSrc += 8; \
pTempDest += 8; \
blkCnt--; \
} \
blkCnt = (numTaps - 1) & 7; \
if (blkCnt > 0) \
{ \
mve_pred16_t p = vctp16q(blkCnt); \
vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \
} \
}
/*
 * q15 FIR kernel instantiated from FIR_Q15_MAIN_CORE with NBTAPS = 32
 * (four 8-lane coefficient vectors).
 *
 * S          filter instance (pState, pCoeffs and numTaps are read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 *
 * pSrc and pDst are declared __restrict: callers must not pass overlapping
 * buffers.
 */
static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,
const q15_t * __restrict pSrc,
q15_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS only exists for the duration of the macro expansion below. */
#define NBTAPS 32
FIR_Q15_MAIN_CORE();
#undef NBTAPS
}
/*
 * q15 FIR kernel instantiated from FIR_Q15_MAIN_CORE with NBTAPS = 24
 * (three 8-lane coefficient vectors).
 *
 * S          filter instance (pState, pCoeffs and numTaps are read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 *
 * pSrc and pDst are declared __restrict: callers must not pass overlapping
 * buffers.
 */
static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,
const q15_t * __restrict pSrc,
q15_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS only exists for the duration of the macro expansion below. */
#define NBTAPS 24
FIR_Q15_MAIN_CORE();
#undef NBTAPS
}
/*
 * q15 FIR kernel instantiated from FIR_Q15_MAIN_CORE with NBTAPS = 16
 * (two 8-lane coefficient vectors).
 *
 * S          filter instance (pState, pCoeffs and numTaps are read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 *
 * pSrc and pDst are declared __restrict: callers must not pass overlapping
 * buffers.
 */
static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,
const q15_t * __restrict pSrc,
q15_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS only exists for the duration of the macro expansion below. */
#define NBTAPS 16
FIR_Q15_MAIN_CORE();
#undef NBTAPS
}
/*
 * q15 FIR kernel instantiated from FIR_Q15_MAIN_CORE with NBTAPS = 8
 * (a single 8-lane coefficient vector).
 *
 * S          filter instance (pState, pCoeffs and numTaps are read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 *
 * pSrc and pDst are declared __restrict: callers must not pass overlapping
 * buffers.
 */
static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,
const q15_t * __restrict pSrc,
q15_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS only exists for the duration of the macro expansion below. */
#define NBTAPS 8
FIR_Q15_MAIN_CORE();
#undef NBTAPS
}
void arm_fir_q15(
const arm_fir_instance_q15 * S,
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
q15_t *pState = S->pState; /* State pointer */
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@ -72,46 +205,81 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
q15x8_t vecIn0;
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffs;
uint32_t tapsBlkCnt = (numTaps + 7) / 8;
q63_t acc0, acc1, acc2, acc3;
int32_t nbTaps = (numTaps + 7) >> 3;
switch(nbTaps) {
case 1:
arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
return;
case 2:
arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
return;
case 3:
arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
return;
case 4:
arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
return;
}
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
*/
pStateCur = &(pState[(numTaps - 1u)]);
pTempSrc = pSrc;
pSamples = pState;
pOutput = pDst;
q63_t acc0, acc1, acc2, acc3;
blkCnt = blockSize >> 2;
pStateCur = &(pState[(numTaps - 1u)]);
pTempSrc = pSrc;
pSamples = pState;
pOutput = pDst;
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
/*
* Save 4 input samples in the history buffer
* Save 8 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavq(vecIn0, vecCoeffs);
int i = tapsBlkCnt;
while (i > 0)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmlaldavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[3]);
acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[3]);
acc3 = vmlaldavq(vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
* Decrement the taps block loop counter
*/
i--;
}
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@ -130,6 +298,7 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
{
case 3:
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
@ -137,20 +306,40 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
acc2 = 0LL;
/*
* Save 4 input samples in the history buffer
* Save 8 input samples in the history buffer
*/
*(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
pStateCur += 8;
pTempSrc += 8;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavq(vecIn0, vecCoeffs);
int i = tapsBlkCnt;
while (i > 0)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmlaldavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[4]);
acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
* Decrement the taps block loop counter
*/
i--;
}
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@ -160,23 +349,39 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
case 2:
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
acc1 = 0LL;
/*
* Save 4 input samples in the history buffer
* Save 8 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavq(vecIn0, vecCoeffs);
int i = tapsBlkCnt;
while (i > 0)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
* Decrement the taps block loop counter
*/
i--;
}
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
@ -185,126 +390,29 @@ static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S, const q15_t * pS
case 1:
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavq(vecIn0, vecCoeffs);
pSamplesTmp += 4;
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
}
break;
}
/*
* Copy the samples back into the history buffer start
*/
pTempSrc = &S->pState[blockSize];
pTempDest = S->pState;
blkCnt = numTaps >> 3;
while (blkCnt > 0U)
{
vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 8;
pTempDest += 8;
blkCnt--;
}
blkCnt = numTaps & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
}
}
void arm_fir_q15(
const arm_fir_instance_q15 * S,
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
q15_t *pState = S->pState; /* State pointer */
const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q15_t *pStateCur; /* Points to the current sample of the state */
const q15_t *pSamples; /* Temporary pointer to the sample buffer */
q15_t *pOutput; /* Temporary pointer to the output buffer */
const q15_t *pTempSrc; /* Temporary pointer to the source data */
q15_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
q15x8_t vecIn0;
uint32_t tapsBlkCnt = (numTaps + 7) / 8;
q63_t acc0, acc1, acc2, acc3;
if (blockSize >= 12)
{
if(numTaps <= 8) {
/* [1 to 8 taps] specialized routine */
arm_fir_q15_1_8_mve(S,pSrc, pDst, blockSize);
return;
}
}
if (blockSize >= 12)
{
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
*/
pStateCur = &(pState[(numTaps - 1u)]);
pTempSrc = pSrc;
pSamples = pState;
pOutput = pDst;
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
/*
* Save 8 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
uint32_t i = tapsBlkCnt;
while (i > 0U)
int i = tapsBlkCnt;
while (i > 0)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[3]);
acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
@ -312,197 +420,17 @@ void arm_fir_q15(
*/
i--;
}
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
pSamples += 4;
/*
* Decrement the sample block loop counter
*/
blkCnt--;
}
uint32_t residual = blockSize & 3;
switch (residual)
{
case 3:
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
/*
* Save 8 input samples in the history buffer
*/
*(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
pStateCur += 8;
pTempSrc += 8;
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
* Decrement the taps block loop counter
*/
i--;
}
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
}
break;
case 2:
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
acc1 = 0LL;
/*
* Save 8 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
* Decrement the taps block loop counter
*/
i--;
}
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
}
break;
case 1:
{
const q15_t *pCoeffsTmp = pCoeffs;
const q15_t *pSamplesTmp = pSamples;
acc0 = 0LL;
/*
* Save 8 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
/*
* load 8 coefs
*/
q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
pSamplesTmp += 8;
pCoeffsTmp += 8;
/*
* Decrement the taps block loop counter
*/
i--;
}
*pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
}
break;
}
}
else
{
q15_t *pStateCurnt; /* Points to the current sample of the state */
q15_t *px; /* Temporary pointer for state buffer */
const q15_t *pb; /* Temporary pointer for coefficient buffer */
q63_t acc0; /* Accumulator */
uint32_t blkCnt,tapCnt; /* Loop counters */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy two samples into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0;
/* Use SIMD to hold states and coefficients */
px = pState;
pb = pCoeffs;
tapCnt = numTaps >> 1U;
while (tapCnt > 0U)
{
acc0 += (q15_t) *px++ * *pb++;
acc0 += (q15_t) *px++ * *pb++;
tapCnt--;
}
/* The result is in 2.30 format. Convert to 1.15 with saturation.
Then store the output in the destination buffer. */
*pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
break;
}
/*
* Copy the samples back into the history buffer start
*/
pTempSrc = &S->pState[blockSize];
pTempDest = S->pState;
pTempSrc = &pState[blockSize];
pTempDest = pState;
blkCnt = numTaps >> 3;
while (blkCnt > 0U)

@ -56,7 +56,115 @@
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t * pDst, uint32_t blockSize)
/*
 * Compute nbAcc consecutive q7 FIR outputs.
 *
 * For output j (0 <= j < nbAcc) the macro accumulates, over nbVecTaps
 * 16-lane vectors, the dot product of the samples starting at &pSample[j]
 * with vecCoeffs[0 .. nbVecTaps-1], shifts the 32-bit sum right by 7,
 * saturates it to 8 bits with __SSAT and stores it through pOutput.
 *
 * pOutput    modifiable q7_t pointer lvalue; advanced by nbAcc on exit
 * nbAcc      number of outputs to produce
 * nbVecTaps  number of 16-lane coefficient vectors
 * pSample    first sample of the window used for output 0
 * vecCoeffs  array of coefficient vectors
 *
 * The enclosing scope must declare the scratch vector vecIn0.
 * Parameters are parenthesized so expression arguments expand correctly,
 * and the accumulator is a plain scalar (only one is live per output).
 */
#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)           \
    for (int j = 0; j < (nbAcc); j++) {                                       \
        const q7_t *pSmp = &(pSample)[j];                                     \
        q31_t       acc = 0;                                                  \
                                                                              \
        for (int i = 0; i < (nbVecTaps); i++) {                               \
            vecIn0 = vld1q(pSmp + 16 * i);                                    \
            acc = vmladavaq(acc, vecIn0, (vecCoeffs)[i]);                     \
        }                                                                     \
        *(pOutput)++ = (q7_t) __SSAT((acc >> 7U), 8);                         \
    }
/*
 * Function body shared by the fixed-size q7 FIR kernels.
 *
 * Must be expanded inside a function that provides S, pSrc, pDst and
 * blockSize, with NBTAPS defined; NBTAPS / 16 coefficient vectors are used.
 *
 * Steps performed:
 *  1. Load NBTAPS / 16 vectors of 16 coefficients from S->pCoeffs.
 *  2. For every group of 4 input samples: append them to the state buffer
 *     at pStateCur, then produce 4 outputs with FIR_Q7_CORE.
 *  3. Handle the remaining (blockSize & 3) samples one by one, then produce
 *     the matching outputs with FIR_Q7_CORE.
 *  4. Copy (numTaps - 1) samples from &pState[blockSize] back to the start
 *     of pState with predicated 16-lane loads/stores; the do/while runs at
 *     least once, the predicate being built from the remaining count.
 *
 * NOTE(review): NBTAPS coefficients and NBTAPS-sample windows are read
 * regardless of S->numTaps, so the coefficient array presumably has to be
 * zero padded up to NBTAPS and the state buffer large enough for the
 * over-read - confirm against the API documentation.
 */
#define FIR_Q7_MAIN_CORE() \
{ \
q7_t *pState = S->pState; /* State pointer */ \
const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
q7_t *pStateCur; /* Points to the current sample of the state */ \
const q7_t *pSamples; /* Temporary pointer to the sample buffer */ \
q7_t *pOutput; /* Temporary pointer to the output buffer */ \
const q7_t *pTempSrc; /* Temporary pointer to the source data */ \
q7_t *pTempDest; /* Temporary pointer to the destination buffer */\
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
int32_t blkCnt; \
q7x16_t vecIn0; \
const int32_t nbVecTaps = (NBTAPS / 16); \
\
/* \
* load coefs \
*/ \
q7x16_t vecCoeffs[nbVecTaps]; \
\
for (int i = 0; i < nbVecTaps; i++) \
vecCoeffs[i] = vldrbq_s8(pCoeffs + 16 * i); \
\
/* \
* pState points to state array which contains previous frame (numTaps - 1) samples \
* pStateCur points to the location where the new input data should be written \
*/ \
pStateCur = &(pState[(numTaps - 1u)]); \
pTempSrc = pSrc; \
pSamples = pState; \
pOutput = pDst; \
\
blkCnt = blockSize >> 2; \
while (blkCnt > 0) { \
/* \
* Save 4 input samples in the history buffer \
*/ \
vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc)); \
pStateCur += 4; \
pTempSrc += 4; \
\
FIR_Q7_CORE(pOutput, 4, nbVecTaps, pSamples, vecCoeffs); \
pSamples += 4; \
\
blkCnt--; \
} \
\
/* tail */ \
int32_t residual = blockSize & 3; \
\
for (int i = 0; i < residual; i++) \
*pStateCur++ = *pTempSrc++; \
\
FIR_Q7_CORE(pOutput, residual, nbVecTaps, pSamples, vecCoeffs); \
\
\
/* \
* Copy the samples back into the history buffer start \
*/ \
pTempSrc = &pState[blockSize]; \
pTempDest = pState; \
blkCnt = numTaps - 1; \
do { \
mve_pred16_t p = vctp8q(blkCnt); \
\
vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p); \
pTempSrc += 16; \
pTempDest += 16; \
blkCnt -= 16; \
} \
while (blkCnt > 0); \
}
/*
 * q7 FIR kernel instantiated from FIR_Q7_MAIN_CORE with NBTAPS = 32
 * (two 16-lane coefficient vectors).
 *
 * S          filter instance (pState, pCoeffs and numTaps are read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 *
 * pSrc and pDst are declared __restrict: callers must not pass overlapping
 * buffers.
 */
static void arm_fir_q7_17_32_mve(const arm_fir_instance_q7 * S,
const q7_t * __restrict pSrc,
q7_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS only exists for the duration of the macro expansion below. */
#define NBTAPS 32
FIR_Q7_MAIN_CORE();
#undef NBTAPS
}
/*
 * q7 FIR kernel instantiated from FIR_Q7_MAIN_CORE with NBTAPS = 16
 * (a single 16-lane coefficient vector).
 *
 * S          filter instance (pState, pCoeffs and numTaps are read by the macro)
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 *
 * pSrc and pDst are declared __restrict: callers must not pass overlapping
 * buffers.
 *
 * NOTE(review): unlike arm_fir_q7_17_32_mve this helper is not declared
 * static - confirm whether external linkage is intended.
 */
void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S,
const q7_t * __restrict pSrc,
q7_t * __restrict pDst, uint32_t blockSize)
{
/* NBTAPS only exists for the duration of the macro expansion below. */
#define NBTAPS 16
FIR_Q7_MAIN_CORE();
#undef NBTAPS
}
void arm_fir_q7(
const arm_fir_instance_q7 * S,
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
q7_t *pState = S->pState; /* State pointer */
const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@ -68,9 +176,27 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
q7x16_t vecIn0;
uint32_t tapsBlkCnt = (numTaps + 15) / 16;
q31_t acc0, acc1, acc2, acc3;
q7x16_t vecCoeffs;
if (numTaps <= 16)
{
/*
* [1 to 16 taps] specialized routine
*/
arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 32)
{
/*
* [17 to 32 taps] specialized routine
*/
arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize);
return;
}
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
@ -82,12 +208,17 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
blkCnt = blockSize >> 2;
/*
* load 16 coefs
* outer samples loop
*/
vecCoeffs = *(q7x16_t *) pCoeffs;
while (blkCnt > 0U)
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
acc1 = 0;
acc2 = 0;
acc3 = 0;
/*
* Save 16 input samples in the history buffer
*/
@ -95,18 +226,36 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
pStateCur += 16;
pTempSrc += 16;
vecIn0 = vld1q(pSamples);
acc0 = vmladavq(vecIn0, vecCoeffs);
/*
* inner coefficients loop
*/
int i = tapsBlkCnt;
while (i > 0)
{
/*
* load 16 coefs
*/
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(&pSamples[1]);;
acc1 = vmladavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamples[2]);;
acc2 = vmladavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamples[3]);
acc3 = vmladavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[3]);
acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
/*
* Decrement the taps block loop counter
*/
i--;
}
/*
* Store the 1.7 format filter output in destination buffer
*/
@ -127,18 +276,37 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
{
case 3:
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
acc1 = 0;
acc2 = 0;
/*
* Save 16 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
vecIn0 = vld1q(pSamples);
acc0 = vmladavq(vecIn0, vecCoeffs);
int i = tapsBlkCnt;
while (i > 0)
{
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[4]);
acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamples[1]);
acc1 = vmladavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[8]);
acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamples[2]);
acc2 = vmladavq(vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
i--;
}
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
@ -148,15 +316,33 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
case 2:
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
acc1 = 0;
/*
* Save 16 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
vecIn0 = vld1q(pSamples);
acc0 = vmladavq(vecIn0, vecCoeffs);
int i = tapsBlkCnt;
while (i > 0)
{
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(&pSamples[1]);
acc1 = vmladavq(vecIn0, vecCoeffs);
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[4]);
acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
i--;
}
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
@ -165,13 +351,29 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
case 1:
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
/*
* Save 16 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
vecIn0 = vld1q(pSamples);
acc0 = vmladavq(vecIn0, vecCoeffs);
int i = tapsBlkCnt;
while (i > 0)
{
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
i--;
}
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
}
break;
@ -198,288 +400,6 @@ void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t
vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
}
}
/*
 * Process blockSize q7 samples through the FIR filter described by S.
 *
 * Paths taken:
 *  - blockSize >= 20 and numTaps <= 16: delegates to arm_fir_q7_1_16_mve()
 *    and returns.
 *  - blockSize >= 20 otherwise: vector path producing 4 outputs per outer
 *    iteration and consuming 16 taps per inner iteration, followed by
 *    1 to 3 residual outputs.
 *  - blockSize < 20: scalar path producing one output per iteration.
 * Both non-delegating paths finish by copying samples from
 * &S->pState[blockSize] to the start of S->pState.
 *
 * S          filter instance; pState, pCoeffs and numTaps are read
 * pSrc       input samples
 * pDst       output samples
 * blockSize  number of samples to process
 */
void arm_fir_q7(
const arm_fir_instance_q7 * S,
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
q7_t *pState = S->pState; /* State pointer */
const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
q7_t *pStateCur; /* Points to the current sample of the state */
const q7_t *pSamples; /* Temporary pointer to the sample buffer */
q7_t *pOutput; /* Temporary pointer to the output buffer */
const q7_t *pTempSrc; /* Temporary pointer to the source data */
q7_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
q7x16_t vecIn0;
/* Number of 16-coefficient blocks, rounded up */
uint32_t tapsBlkCnt = (numTaps + 15) / 16;
q31_t acc0, acc1, acc2, acc3;
q7x16_t vecCoeffs;
if (blockSize >= 20)
{
if (numTaps <= 16)
{
/*
* [1 to 16 taps] specialized routine
*/
arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
return;
}
}
if (blockSize >= 20)
{
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
*/
pStateCur = &(pState[(numTaps - 1u)]);
pSamples = pState;
pTempSrc = pSrc;
pOutput = pDst;
blkCnt = blockSize >> 2;
/*
* outer samples loop
*/
while (blkCnt > 0U)
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
acc1 = 0;
acc2 = 0;
acc3 = 0;
/*
* Save 16 input samples in the history buffer
*/
/*
* NOTE(review): 16 samples are stored and both pointers advance by 16,
* while this iteration produces 4 outputs and pSamples advances by 4 -
* confirm that this stride is intended.
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
/*
* inner coefficients loop
*/
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
/*
* load 16 coefs
*/
/*
* NOTE(review): full 16-coefficient vectors are read for every block,
* so the coefficient array is presumably padded to a multiple of 16 -
* confirm.
*/
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[3]);
acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
/*
* Decrement the taps block loop counter
*/
i--;
}
/*
* Store the 1.7 format filter output in destination buffer
*/
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8);
pSamples += 4;
/*
* Decrement the sample block loop counter
*/
blkCnt--;
}
/* Remaining 1 to 3 outputs after the groups of 4 */
uint32_t residual = blockSize & 3;
switch (residual)
{
case 3:
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
acc1 = 0;
acc2 = 0;
/*
* Save 16 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[2]);
acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
i--;
}
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
}
break;
case 2:
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
acc1 = 0;
/*
* Save 16 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
vecIn0 = vld1q(&pSamplesTmp[1]);
acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
i--;
}
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
*pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
}
break;
case 1:
{
const q7_t *pCoeffsTmp = pCoeffs;
const q7_t *pSamplesTmp = pSamples;
acc0 = 0;
/*
* Save 16 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 16;
pTempSrc += 16;
uint32_t i = tapsBlkCnt;
while (i > 0U)
{
vecCoeffs = *(q7x16_t *) pCoeffsTmp;
vecIn0 = vld1q(pSamplesTmp);
acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);
pSamplesTmp += 16;
pCoeffsTmp += 16;
i--;
}
*pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
}
break;
}
}
else
{
/* Scalar path; acc0 and blkCnt declared here shadow the function-level ones. */
q7_t *pStateCurnt; /* Points to the current sample of the state */
q7_t *px; /* Temporary pointer for state buffer */
const q7_t *pb; /* Temporary pointer for coefficient buffer */
q31_t acc0; /* Accumulator */
uint32_t i,blkCnt; /* Loop counters */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0;
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps;
/* Perform the multiply-accumulates */
while (i > 0U)
{
acc0 += (q15_t) * (px++) * (*(pb++));
i--;
}
/* The result is in 2.14 format. Convert to 1.7
Then store the output in the destination buffer. */
*pDst++ = __SSAT((acc0 >> 7U), 8);
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
}
/*
* Copy the samples back into the history buffer start
*/
/*
* NOTE(review): numTaps samples are copied here (numTaps >> 4 full vectors
* plus a predicated remainder), although the comments above describe
* (numTaps - 1) history samples; the remainder uses a full, unpredicated
* vld1q load - confirm both are intended.
*/
pTempSrc = &S->pState[blockSize];
pTempDest = S->pState;
blkCnt = numTaps >> 4;
while (blkCnt > 0U)
{
vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 16;
pTempDest += 16;
blkCnt--;
}
blkCnt = numTaps & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
}
}
#else
void arm_fir_q7(
const arm_fir_instance_q7 * S,

@ -168,6 +168,7 @@ set (NNSRC
Source/Benchmarks/FIRF32.cpp
Source/Benchmarks/FIRQ31.cpp
Source/Benchmarks/FIRQ15.cpp
Source/Benchmarks/FIRQ7.cpp
Source/Benchmarks/MISCF32.cpp
Source/Benchmarks/MISCQ31.cpp
Source/Benchmarks/MISCQ15.cpp

@ -0,0 +1,33 @@
#include "Test.h"
#include "Pattern.h"
#include "dsp/filtering_functions.h"
// Benchmark suite for the q7 FIR filter (arm_fir_q7).
class FIRQ7:public Client::Suite
{
public:
FIRQ7(Testing::testID_t id);
// Reads the run parameters, loads/creates the patterns and initialises the filter instance.
virtual void setUp(Testing::testID_t,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr);
virtual void tearDown(Testing::testID_t,Client::PatternMgr *mgr);
private:
// Declarations pulled in from FIRQ7_decl.h (contents not visible here).
#include "FIRQ7_decl.h"
// Input patterns
Client::Pattern<q7_t> coefs;
Client::Pattern<q7_t> samples;
Client::Pattern<q7_t> refs;
// Buffers owned by the suite
Client::LocalPattern<q7_t> output;
Client::LocalPattern<q7_t> error;
Client::LocalPattern<q7_t> state;
// Run parameters: number of filter taps and number of samples per call
int nbTaps;
int nbSamples;
// Filter instance passed to arm_fir_q7
arm_fir_instance_q7 instFir;
// Raw pointers used when calling the kernel
const q7_t *pSrc;
const q7_t *pCoefs;
q7_t *pDst;
const q7_t *pRef;
q7_t *pErr;
};

@ -1,6 +1,9 @@
#include "FIRQ15.h"
#include "Error.h"
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) q15_t coeffArray[64];
#endif
void FIRQ15::test_fir_q15()
{
@ -35,10 +38,21 @@
switch(id)
{
case TEST_FIR_Q15_1:
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
*/
memset(coeffArray,0,32*sizeof(q15_t));
q15_t *ptr;
ptr=coefs.ptr();
memcpy(coeffArray,ptr,this->nbTaps*sizeof(q15_t));
this->pCoefs = coeffArray;
#else
this->pCoefs=coefs.ptr();
#endif
arm_fir_init_q15(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
this->pSrc=samples.ptr();
this->pCoefs=coefs.ptr();
this->pDst=output.ptr();
break;

@ -1,7 +1,7 @@
#include "FIRQ31.h"
#include "Error.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) q31_t coeffArray[64];
#endif
@ -39,7 +39,7 @@ static __ALIGNED(8) q31_t coeffArray[64];
switch(id)
{
case TEST_FIR_Q31_1:
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
*/
memset(coeffArray,0,32*sizeof(q31_t));

@ -0,0 +1,60 @@
#include "FIRQ7.h"
#include "Error.h"
/* Coefficient buffer (64 entries) filled in setUp() with a zero-padded copy of
   the coefficients; only compiled when the MVE integer kernels are in use. */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) q7_t coeffArray[64];
#endif
void FIRQ7::test_fir_q7()
{
arm_fir_q7(&instFir, this->pSrc, this->pDst, this->nbSamples);
}
/*
 * Prepare one benchmark run.
 *
 * params holds, in order, the number of taps and the number of samples.
 * The sample and coefficient patterns are reloaded with those sizes, and the
 * state (nbSamples + nbTaps - 1 entries) and output (nbSamples entries)
 * buffers are created.
 *
 * For TEST_FIR_Q7_1 the filter instance is initialised. When the MVE integer
 * kernels are compiled in, the coefficients are first copied into the
 * zero-filled coeffArray and the filter is initialised with that padded copy
 * (this->pCoefs); previously the padded copy was built but the unpadded
 * pattern pointer was passed to arm_fir_init_q7.
 */
void FIRQ7::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
{
    std::vector<Testing::param_t>::iterator it = params.begin();
    this->nbTaps = *it++;
    this->nbSamples = *it;

    samples.reload(FIRQ7::SAMPLES1_Q7_ID,mgr,this->nbSamples);
    coefs.reload(FIRQ7::COEFS1_Q7_ID,mgr,this->nbTaps);

    state.create(this->nbSamples + this->nbTaps - 1,FIRQ7::STATE_Q7_ID,mgr);
    output.create(this->nbSamples,FIRQ7::OUT_SAMPLES_Q7_ID,mgr);

    switch(id)
    {
        case TEST_FIR_Q7_1:
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
            /* Copy coefficients and pad to zero.
               The whole buffer is cleared so every entry past nbTaps is zero.
               NOTE(review): assumes nbTaps does not exceed the 64 entries of
               coeffArray - confirm against the parameter list. */
            memset(coeffArray,0,sizeof(coeffArray));
            memcpy(coeffArray,coefs.ptr(),this->nbTaps*sizeof(q7_t));
            this->pCoefs = coeffArray;
#else
            this->pCoefs=coefs.ptr();
#endif
            /* Use the pointer selected above so the padded copy is what the
               kernel actually reads. */
            arm_fir_init_q7(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);

            this->pSrc=samples.ptr();
            this->pDst=output.ptr();
            break;
    }
}
/* No per-test cleanup is performed; both arguments are intentionally unused. */
void FIRQ7::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)
{
    (void) id;
    (void) mgr;
}

@ -130,10 +130,10 @@ void checkInnerTail(q15_t *b)
ref.reload(FIRQ15::FIRREFS_Q15_ID,mgr);
output.create(ref.nbSamples(),FIRQ15::OUT_Q15_ID,mgr);
/* Max blockSize + numTaps as generated by Python script
/* > Max blockSize + numTaps as generated by Python script
numTaps may be increased by 1 by Python script to force it to even values
*/
state.create(41,FIRQ15::OUT_Q15_ID,mgr);
state.create(3 * 41,FIRQ15::OUT_Q15_ID,mgr);
}
void FIRQ15::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)

@ -499,6 +499,35 @@ group Root {
Normalized LMS Filter:test_lms_norm_q15
} -> PARAM1_ID
}
suite FIR Q7 {
class = FIRQ7
folder = FIRQ7
ParamList {
NumTaps, NB
Summary NumTaps, NB
Names "Number of taps","Number of samples"
Formula "NumTaps * NB"
}
Pattern SAMPLES1_Q7_ID : Samples1_q7.txt
Pattern REFS1_Q7_ID : Refs1_q7.txt
Pattern COEFS1_Q7_ID : Coefs1_q7.txt
Output OUT_SAMPLES_Q7_ID : Output
Output STATE_Q7_ID : State
Output ERR_Q7_ID : Err
Params PARAM1_ID = {
NumTaps = [16,32,64]
NB = [64,128,256]
}
Functions {
FIR Filter:test_fir_q7
} -> PARAM1_ID
}
}
group Convolutions / Correlations {

Loading…
Cancel
Save