CMSIS-DSP: New MVE implementation of the FIR f32

pull/19/head
Christophe Favergeon 5 years ago
parent fa73bb9825
commit 6f229b4f65

2
.gitignore vendored

@ -2,7 +2,7 @@ DSP_Lib_TestSuite/build/
PythonWrapper/build/ PythonWrapper/build/
PythonWrapper/cmsisdsp.cp36-win_amd64.pyd PythonWrapper/cmsisdsp.cp36-win_amd64.pyd
PythonWrapper/rec_2.dat PythonWrapper/rec_2.dat
Output.pickle *.pickle
build_*/ build_*/
Examples/ARM/arm_fft_bin_example/RTE/ Examples/ARM/arm_fft_bin_example/RTE/
Examples/ARM/arm_fft_bin_example/RTE/ Examples/ARM/arm_fft_bin_example/RTE/

@ -143,6 +143,14 @@ this example is not giving better SNR ...
*/ */
#define SNR_THRESHOLD_F32 75.0f #define SNR_THRESHOLD_F32 75.0f
#define BLOCK_SIZE 32 #define BLOCK_SIZE 32
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Must be a multiple of 16 */
#define NUM_TAPS_ARRAY_SIZE 32
#else
#define NUM_TAPS_ARRAY_SIZE 29
#endif
#define NUM_TAPS 29 #define NUM_TAPS 29
/* ------------------------------------------------------------------- /* -------------------------------------------------------------------
@ -162,20 +170,31 @@ static float32_t testOutput[TEST_LENGTH_SAMPLES];
/* ------------------------------------------------------------------- /* -------------------------------------------------------------------
* Declare State buffer of size (numTaps + blockSize - 1) * Declare State buffer of size (numTaps + blockSize - 1)
* ------------------------------------------------------------------- */ * ------------------------------------------------------------------- */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
static float32_t firStateF32[2 * BLOCK_SIZE + NUM_TAPS - 1];
#else
static float32_t firStateF32[BLOCK_SIZE + NUM_TAPS - 1]; static float32_t firStateF32[BLOCK_SIZE + NUM_TAPS - 1];
#endif
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
** FIR Coefficients buffer generated using fir1() MATLAB function. ** FIR Coefficients buffer generated using fir1() MATLAB function.
** fir1(28, 6/24) ** fir1(28, 6/24)
** ------------------------------------------------------------------- */ ** ------------------------------------------------------------------- */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
const float32_t firCoeffs32[NUM_TAPS] = { const float32_t firCoeffs32[NUM_TAPS_ARRAY_SIZE] = {
-0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f,
-0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f,
+0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f,
+0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f, 0.0f,0.0f,0.0f
};
#else
const float32_t firCoeffs32[NUM_TAPS_ARRAY_SIZE] = {
-0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f, -0.0018225230f, -0.0015879294f, +0.0000000000f, +0.0036977508f, +0.0080754303f, +0.0085302217f, -0.0000000000f, -0.0173976984f,
-0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f, -0.0341458607f, -0.0333591565f, +0.0000000000f, +0.0676308395f, +0.1522061835f, +0.2229246956f, +0.2504960933f, +0.2229246956f,
+0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f, +0.1522061835f, +0.0676308395f, +0.0000000000f, -0.0333591565f, -0.0341458607f, -0.0173976984f, -0.0000000000f, +0.0085302217f,
+0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f +0.0080754303f, +0.0036977508f, +0.0000000000f, -0.0015879294f, -0.0018225230f
}; };
#endif
/* ------------------------------------------------------------------ /* ------------------------------------------------------------------
* Global variables for FIR LPF Example * Global variables for FIR LPF Example

@ -104,10 +104,18 @@
the implementation may require to read more coefficients due to the vectorization and the implementation may require to read more coefficients due to the vectorization and
to avoid having to manage too many different cases in the code. to avoid having to manage too many different cases in the code.
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
The first blockSize samples are temporary data.
The remaining samples are the state of the FIR filter.
So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
@par Fixed-Point Behavior @par Fixed-Point Behavior
Care must be taken when using the fixed-point versions of the FIR filter functions. Care must be taken when using the fixed-point versions of the FIR filter functions.
In particular, the overflow and saturation behavior of the accumulator used in each function must be considered. In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
Refer to the function specific documentation below for usage guidelines. Refer to the function specific documentation below for usage guidelines.
*/ */
/** /**
@ -126,578 +134,542 @@
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize) #define FIR_F32_MAX_COEF_BLK 8
#define FIR_F32_CORE(pSamples, c, NB_TAPS) \
vecAcc0 = vdupq_n_f32(0.0f); \
for (int i = 0; i < NB_TAPS; i++) { \
vecIn0 = vld1q(&pSamples[i]); \
vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \
}
static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
const float32_t * __restrict pSrc,
float32_t * __restrict pDst, uint32_t blockSize)
{ {
float32_t *pState = S->pState; /* State pointer */ float32_t *pRefStatePtr = S->pState + blockSize;
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ float32_t *pState = pRefStatePtr; /* State pointer */
float32_t *pStateCur; /* Points to the current sample of the state */ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
const float32_t *pSamples; /* Temporary pointer to the sample buffer */ float32_t *pStateCur; /* Points to the current sample of the state */
float32_t *pOutput; /* Temporary pointer to the output buffer */ const float32_t *pSamples; /* Temporary pointer to the sample buffer */
const float32_t *pTempSrc; /* Temporary pointer to the source data */ float32_t *pOutput; /* Temporary pointer to the output buffer */
float32_t *pTempDest; /* Temporary pointer to the destination buffer */ const float32_t *pTempSrc; /* Temporary pointer to the source data */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ float32_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t blkCnt; uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
f32x4_t vecIn0; int32_t blkCnt;
f32x4_t vecAcc0; float32x4_t vecIn0;
float32_t c0, c1, c2, c3; float32x4_t vecAcc0;
const int NB_TAPS=4;
float32_t c[NB_TAPS];
const float32_t *pCoeffsCur = pCoeffs;
/* /*
* pState points to state array which contains previous frame (numTaps - 1) samples * pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written * pStateCur points to the location where the new input data should be written
*/ */
pStateCur = &(pState[(numTaps - 1u)]); pStateCur = &(pState[(numTaps - 1u)]);
pSamples = pState; pTempSrc = pSrc;
pTempSrc = pSrc;
pOutput = pDst;
if (((numTaps - 1) / 4) == 0)
{
const float32_t *pCoeffsCur = pCoeffs;
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]); pSamples = pState;
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); pOutput = pDst;
vst1q(pOutput, vecAcc0); for (int i = 0; i < NB_TAPS; i++)
c[i] = *pCoeffsCur++;
pOutput += 4; blkCnt = blockSize >> 2;
pSamples += 4; while (blkCnt > 0) {
/*
* Save 4 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
blkCnt--; FIR_F32_CORE(pSamples, c, NB_TAPS);
}
blkCnt = blockSize & 3; vst1q(pOutput, vecAcc0);
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0); pOutput += 4;
pStateCur += blkCnt; pSamples += 4;
pTempSrc += blkCnt;
vecIn0 = vld1q(pSamples); blkCnt--;
vecAcc0 = vmulq(vecIn0, c0); }
vecIn0 = vld1q(&pSamples[1]); blkCnt = blockSize & 3;
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); {
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn0 = vld1q(&pSamples[2]); vst1q(pStateCur, vld1q(pTempSrc));
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); pStateCur += 4;
pTempSrc += 4;
vecIn0 = vld1q(&pSamples[3]); FIR_F32_CORE(pSamples, c, NB_TAPS);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vstrwq_p_f32(pOutput, vecAcc0, p0); vstrwq_p_f32(pOutput, vecAcc0, p0);
}
} }
/* /*
* Copy the samples back into the history buffer start * Copy the samples back into the history buffer start
*/ */
pTempSrc = &S->pState[blockSize]; pTempSrc = &pState[blockSize];
pTempDest = S->pState; pTempDest = pState;
blkCnt = numTaps >> 2; blkCnt = numTaps - 1;
while (blkCnt > 0U) do {
{ mve_pred16_t p = vctp32q(blkCnt);
vst1q(pTempDest, vld1q(pTempSrc));
vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
pTempSrc += 4; pTempSrc += 4;
pTempDest += 4; pTempDest += 4;
blkCnt--; blkCnt -= 4;
}
blkCnt = numTaps & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
} }
while (blkCnt > 0);
} }
static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S, const float32_t * pSrc, float32_t * pDst, uint32_t blockSize)
static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
const float32_t * __restrict pSrc,
float32_t * __restrict pDst, uint32_t blockSize)
{ {
float32_t *pState = S->pState; /* State pointer */ float32_t *pRefStatePtr = S->pState + blockSize;
float32_t *pState = pRefStatePtr; /* State pointer */
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t *pStateCur; /* Points to the current sample of the state */
const float32_t *pSamples; /* Temporary pointer to the sample buffer */ const float32_t *pSamples; /* Temporary pointer to the sample buffer */
float32_t *pOutput; /* Temporary pointer to the output buffer */
const float32_t *pTempSrc; /* Temporary pointer to the source data */ const float32_t *pTempSrc; /* Temporary pointer to the source data */
float32_t *pTempDest; /* Temporary pointer to the destination buffer */ float32_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt; int32_t blkCnt;
f32x4_t vecIn0;
f32x4_t vecAcc0;
float32_t c0, c1, c2, c3; float32_t c0, c1, c2, c3;
float32_t c4, c5, c6, c7; float32_t c4, c5, c6, c7;
const float32_t *pCoeffsCur = pCoeffs;
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
*/
pStateCur = &(pState[(numTaps - 1u)]);
pTempSrc = pSrc; pTempSrc = pSrc;
pTempDest = &(pState[(numTaps - 1u)]);
int cnt = blockSize;
do {
mve_pred16_t p0 = vctp32q(cnt);
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
pTempDest += 4;
pTempSrc += 4;
cnt -= 4;
} while(cnt > 0);
pSamples = pState;
pOutput = pDst;
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
blkCnt = blockSize >> 2; pSamples = pState;
while (blkCnt > 0U) c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
cnt = blockSize >> 2;
while(cnt > 0)
{ {
/* float32x4_t vecAcc0;
* Save 4 input samples in the history buffer float32x4_t vecIn0;
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
vecIn0 = vld1q(pSamples); vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0); vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]); vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]); vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]); vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]); vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]); vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]); vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]); vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
vst1q(pOutput, vecAcc0);
pOutput += 4;
pSamples += 4; pSamples += 4;
vst1q(pDst, vecAcc0);
blkCnt--; cnt--;
pDst += 4;
} }
blkCnt = blockSize & 3; cnt = blockSize & 3;
if (blkCnt > 0U) if (cnt > 0)
{ {
mve_pred16_t p0 = vctp32q(blkCnt); float32x4_t vecAcc0;
float32x4_t vecIn0;
vstrwq_p_f32(pStateCur, vld1q(pTempSrc),p0); mve_pred16_t p0 = vctp32q(cnt);
pStateCur += blkCnt;
pTempSrc += blkCnt;
vecIn0 = vld1q(pSamples); vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0); vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]); vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]); vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]); vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]); vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]); vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]); vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]); vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
vstrwq_p_f32(pDst, vecAcc0,p0);
vstrwq_p_f32(pOutput, vecAcc0, p0);
} }
/* /*
* Copy the samples back into the history buffer start * Copy the samples back into the history buffer start
*/ */
pTempSrc = &S->pState[blockSize]; pTempSrc = &pState[blockSize];
pTempDest = S->pState; pTempDest = pState;
blkCnt = numTaps;
blkCnt = numTaps >> 2; while (blkCnt > 0)
while (blkCnt > 0U)
{ {
vst1q(pTempDest, vld1q(pTempSrc)); *pTempDest++ = *pTempSrc++;
pTempSrc += 4;
pTempDest += 4;
blkCnt--; blkCnt--;
} }
blkCnt = numTaps & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
}
} }
void arm_fir_f32( void arm_fir_f32(
const arm_fir_instance_f32 * S, const arm_fir_instance_f32 * S,
const float32_t * pSrc, const float32_t * pSrc,
float32_t * pDst, float32_t * pDst,
uint32_t blockSize) uint32_t blockSize)
{ {
float32_t *pState = S->pState; /* State pointer */ /*
S->pState is the arm_fir_partial_accu
S->pState + blockSize is the FIR state
*/
float32_t *pRefStatePtr = S->pState + blockSize;
float32_t *pState = pRefStatePtr ; /* State pointer */
const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float32_t *pStateCur; /* Points to the current sample of the state */
const float32_t *pSamples; /* Temporary pointer to the sample buffer */ const float32_t *pSamples; /* Temporary pointer to the sample buffer */
float32_t *pOutput; /* Temporary pointer to the output buffer */ float32_t *pOutput; /* Temporary pointer to the output buffer */
const float32_t *pTempSrc; /* Temporary pointer to the source data */ const float32_t *pTempSrc; /* Temporary pointer to the source data */
float32_t *pTempDest; /* Temporary pointer to the destination buffer */ float32_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt; uint32_t blkCnt;
int32_t numCnt;
f32x4_t vecIn0;
f32x4_t vecAcc0;
float32_t c0, c1, c2, c3; float32_t c0, c1, c2, c3;
float32_t c4, c5, c6, c7; float32_t c4, c5, c6, c7;
/* /*
* [1 to 8 taps] specialized routines * [1 to 8 taps] specialized routines
*/ */
if (blockSize >= 8) if (numTaps <= 4)
{ {
if (numTaps <= 4) arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
{ return;
arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
return;
}
} }
if (blockSize >= 8) else if (numTaps <= 8)
{ {
if (numTaps <= 8) arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
{ return;
arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
return;
}
} }
if (blockSize >= 8) pTempSrc = pSrc;
pTempDest = &(pState[(numTaps - 1u)]);
int cnt = blockSize;
do {
mve_pred16_t p0 = vctp32q(cnt);
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
pTempDest += 4;
pTempSrc += 4;
cnt -= 4;
} while(cnt > 0);
float32_t *partial_accu_ptr = S->pState;
pSamples = pState;
c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
cnt = blockSize >> 2;
while(cnt > 0) {
float32x4_t vecAcc0;
float32x4_t vecIn0;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 4;
vst1q(partial_accu_ptr, vecAcc0);
cnt--;
partial_accu_ptr += 4;
}
cnt = blockSize & 3;
if (cnt > 0)
{ {
/* float32x4_t vecAcc0;
* pState points to state array which contains previous frame (numTaps - 1) samples float32x4_t vecIn0;
* pStateCur points to the location where the new input data should be written
*/
pStateCur = &(pState[(numTaps - 1u)]);
pTempSrc = pSrc;
pSamples = pState;
pOutput = pDst;
blkCnt = blockSize >> 2; mve_pred16_t p0 = vctp32q(cnt);
while (blkCnt > 0U)
{ vecIn0 = vld1q(pSamples);
int32_t i; vecAcc0 = vmulq(vecIn0, c0);
const float32_t *pCoeffsCur = pCoeffs; vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
c0 = *pCoeffsCur++; vecIn0 = vld1q(&pSamples[2]);
c1 = *pCoeffsCur++; vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
c2 = *pCoeffsCur++; vecIn0 = vld1q(&pSamples[3]);
c3 = *pCoeffsCur++; vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
c4 = *pCoeffsCur++; vecIn0 = vld1q(&pSamples[4]);
c5 = *pCoeffsCur++; vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
c6 = *pCoeffsCur++; vecIn0 = vld1q(&pSamples[5]);
c7 = *pCoeffsCur++; vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vst1q(pStateCur, vld1q(pTempSrc)); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
pStateCur += 4; vecIn0 = vld1q(&pSamples[7]);
pTempSrc += 4; vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += cnt;
vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
partial_accu_ptr += cnt;
}
int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
int sample_offset = FIR_F32_MAX_COEF_BLK;
while (localTaps > FIR_F32_MAX_COEF_BLK) {
c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
partial_accu_ptr = S->pState;
pSamples = pState + sample_offset;
int cnt = blockSize >> 2;
while(cnt > 0) {
float32x4_t vecAcc0;
float32x4_t vecIn0;
vecIn0 = vld1q(pSamples); vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0); vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]); vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]); vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]); vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]); vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]); vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]); vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]); vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 4;
vecAcc0 += vld1q_f32(partial_accu_ptr);
vst1q(partial_accu_ptr, vecAcc0);
cnt--;
partial_accu_ptr += 4;
}
pSamples += 8; cnt = blockSize & 3;
if (cnt > 0) {
numCnt = ((int32_t)numTaps - 8) / 8; float32x4_t vecAcc0;
float32x4_t vecIn0;
for (i = 0; i < numCnt; i++)
{
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]); mve_pred16_t p0 = vctp32q(cnt);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 8; vecIn0 = vld1q(pSamples);
} vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += cnt;
vecAcc0 += vld1q_f32(partial_accu_ptr);
vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
partial_accu_ptr += cnt;
}
numCnt = ((int32_t)numTaps - 8) & 7; localTaps -= FIR_F32_MAX_COEF_BLK;
sample_offset += FIR_F32_MAX_COEF_BLK;
}
while (numCnt > 0) pSamples = pState + sample_offset;
{
c0 = *pCoeffsCur++; if (localTaps > 4) {
vecIn0 = vld1q(pSamples); c0 = *pCoeffs++;
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0); c1 = *pCoeffs++;
pSamples ++; c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
pOutput = pDst;
numCnt --; partial_accu_ptr = S->pState;
} cnt = blockSize >> 2;
while(cnt > 0) {
float32x4_t vecAcc0;
float32x4_t vecIn0;
vst1q(pOutput, vecAcc0); vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 4;
float32x4_t pap = vld1q_f32(partial_accu_ptr);
vst1q(pOutput, vecAcc0+pap);
cnt--;
partial_accu_ptr += 4;
pOutput += 4; pOutput += 4;
pSamples = pSamples - numTaps + 4;
blkCnt--;
} }
blkCnt = blockSize & 3; cnt = blockSize & 3;
if (blkCnt > 0U) if (cnt > 0) {
{ float32x4_t vecAcc0;
mve_pred16_t p0 = vctp32q(blkCnt); float32x4_t vecIn0;
int32_t i;
const float32_t *pCoeffsCur = pCoeffs; mve_pred16_t p0 = vctp32q(cnt);
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples); vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0); vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]); vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1); vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]); vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2); vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]); vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3); vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]); vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4); vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]); vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5); vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]); vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6); vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]); vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7); vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += cnt;
pSamples += 8; float32x4_t pap = vld1q_f32(partial_accu_ptr);
vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
numCnt = ((int32_t)numTaps - 8) / 8; partial_accu_ptr += cnt;
pOutput += cnt;
for (i = 0; i < numCnt; i++)
{
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 8;
}
numCnt = ((int32_t)numTaps - 8) & 7;
while (numCnt > 0)
{
c0 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
pSamples ++;
numCnt --;
}
vstrwq_p_f32(pOutput, vecAcc0, p0);
} }
} }
else else {
{ c0 = *pCoeffs++;
float32_t *pStateCurnt; /* Points to the current sample of the state */ c1 = *pCoeffs++;
float32_t *px; /* Temporary pointer for state buffer */ c2 = *pCoeffs++;
const float32_t *pb; /* Temporary pointer for coefficient buffer */ c3 = *pCoeffs++;
float32_t acc0; /* Accumulator */ pOutput = pDst;
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, blkCnt; /* Loop counters */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0.0f;
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps;
/* Perform the multiply-accumulates */
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += *px++ * *pb++;
i--; partial_accu_ptr = S->pState;
} cnt = blockSize >> 2;
while(cnt > 0) {
float32x4_t vecAcc0;
float32x4_t vecIn0;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
pSamples += 4;
float32x4_t pap = vld1q_f32(partial_accu_ptr);
vst1q(pOutput, vecAcc0+pap);
cnt--;
partial_accu_ptr += 4;
pOutput += 4;
}
/* Store result in destination buffer. */ cnt = blockSize & 3;
*pDst++ = acc0; if (cnt > 0) {
float32x4_t vecAcc0;
float32x4_t vecIn0;
/* Advance state pointer by 1 for the next sample */ mve_pred16_t p0 = vctp32q(cnt);
pState = pState + 1U;
/* Decrement loop counter */ vecIn0 = vld1q(pSamples);
blkCnt--; vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
pSamples += cnt;
float32x4_t pap = vld1q_f32(partial_accu_ptr);
vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
partial_accu_ptr += cnt;
pOutput += cnt;
} }
} }
/* /*
* Copy the samples back into the history buffer start * Copy the samples back into the history buffer start
*/ */
pTempSrc = &S->pState[blockSize]; pTempSrc = &pRefStatePtr[blockSize];
pTempDest = S->pState; pTempDest = pRefStatePtr;
blkCnt = numTaps >> 2; blkCnt = numTaps >> 2;
while (blkCnt > 0U) while (blkCnt > 0)
{ {
vst1q(pTempDest, vld1q(pTempSrc)); vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 4; pTempSrc += 4;
pTempDest += 4; pTempDest += 4;
blkCnt--; blkCnt--;
} }
blkCnt = numTaps & 3; blkCnt = numTaps & 3;
if (blkCnt > 0U) if (blkCnt > 0)
{ {
mve_pred16_t p0 = vctp32q(blkCnt); mve_pred16_t p0 = vctp32q(blkCnt);
vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0); vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);

@ -52,8 +52,8 @@
{b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]} {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
</pre> </pre>
@par @par
<code>pState</code> points to the array of state variables. <code>pState</code> points to the array of state variables and some working memory for the Helium version.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>. <code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f32()</code>.
@par Initialization of Helium version @par Initialization of Helium version
For Helium version the array of coefficients must be a multiple of 16 even if less For Helium version the array of coefficients must be a multiple of 16 even if less
then 16 coefficients are used. The additional coefficients must be set to 0. then 16 coefficients are used. The additional coefficients must be set to 0.
@ -62,6 +62,13 @@
the implementation may require to read more coefficients due to the vectorization and the implementation may require to read more coefficients due to the vectorization and
to avoid having to manage too many different cases in the code. to avoid having to manage too many different cases in the code.
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
The first blockSize samples are temporary data.
The remaining samples are the state of the FIR filter.
So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
*/ */
void arm_fir_init_f32( void arm_fir_init_f32(
@ -78,8 +85,11 @@ void arm_fir_init_f32(
S->pCoeffs = pCoeffs; S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is always (blockSize + numTaps - 1) */ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
memset(pState, 0, (numTaps + (blockSize - 1U) + blockSize) * sizeof(float32_t));
#else
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t)); memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
#endif
/* Assign state pointer */ /* Assign state pointer */
S->pState = pState; S->pState = pState;
} }

@ -9,6 +9,8 @@ FullBenchmark/
Output/ Output/
GeneratedInclude/ GeneratedInclude/
GeneratedSource/ GeneratedSource/
GeneratedIncludeBench/
GeneratedSourceBench/
*.db *.db
TestDesc.txt TestDesc.txt
currentConfig.csv currentConfig.csv

@ -85,6 +85,8 @@ option(FLOAT16TESTS "Float16 tests" OFF)
option(MICROBENCH "Micro benchmarks" OFF) option(MICROBENCH "Micro benchmarks" OFF)
option(EXTERNAL "External benchmarks or tests" OFF) option(EXTERNAL "External benchmarks or tests" OFF)
option(DISTINCT "Different generated folder for benchmarking and tests" OFF)
project(Testing) project(Testing)
# Needed to find the config modules # Needed to find the config modules
@ -422,7 +424,12 @@ endif()
target_sources(TestingLib PRIVATE testmain.cpp) target_sources(TestingLib PRIVATE testmain.cpp)
if ((DISTINCT) AND (BENCHMARK))
target_sources(TestingLib PRIVATE GeneratedSourceBench/TestDesc.cpp)
else()
target_sources(TestingLib PRIVATE GeneratedSource/TestDesc.cpp) target_sources(TestingLib PRIVATE GeneratedSource/TestDesc.cpp)
endif()
if (EMBEDDED) if (EMBEDDED)
target_compile_definitions(TestingLib PUBLIC EMBEDDED) target_compile_definitions(TestingLib PUBLIC EMBEDDED)
@ -468,7 +475,12 @@ if(NN)
target_link_libraries(TestingLib PRIVATE CMSISNN) target_link_libraries(TestingLib PRIVATE CMSISNN)
endif() endif()
target_include_directories(TestingLib PRIVATE FrameworkInclude) target_include_directories(TestingLib PRIVATE FrameworkInclude)
if ((DISTINCT) AND (BENCHMARK))
target_include_directories(TestingLib PRIVATE GeneratedIncludeBench)
else()
target_include_directories(TestingLib PRIVATE GeneratedInclude) target_include_directories(TestingLib PRIVATE GeneratedInclude)
endif()
configLib(TestingLib ${ROOT}) configLib(TestingLib ${ROOT})
#configDsp(TestingLib ${ROOT}) #configDsp(TestingLib ${ROOT})
@ -490,7 +502,12 @@ core_includes(FrameworkLib)
add_executable(Testing main.cpp) add_executable(Testing main.cpp)
# To see the file in the scatter load, it must not because # To see the file in the scatter load, it must not because
# linked in a .a archive # linked in a .a archive
if ((DISTINCT) AND (BENCHMARK))
target_include_directories(Testing PRIVATE GeneratedIncludeBench)
else()
target_include_directories(Testing PRIVATE GeneratedInclude) target_include_directories(Testing PRIVATE GeneratedInclude)
endif()
target_sources(Testing PRIVATE patterndata.c) target_sources(Testing PRIVATE patterndata.c)
# With -O2, generated code is crashing on some cycle accurate models. # With -O2, generated code is crashing on some cycle accurate models.
@ -504,7 +521,11 @@ target_link_libraries(Testing PRIVATE FrameworkLib)
if (EXTERNAL) if (EXTERNAL)
target_include_directories(${EXTERNALPROJECT} PRIVATE FrameworkInclude) target_include_directories(${EXTERNALPROJECT} PRIVATE FrameworkInclude)
target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedInclude) if ((DISTINCT) AND (BENCHMARK))
target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedIncludeBench)
else()
target_include_directories(${EXTERNALPROJECT} PRIVATE GeneratedInclude)
endif()
target_link_libraries(TestingLib PRIVATE ${EXTERNALPROJECT}) target_link_libraries(TestingLib PRIVATE ${EXTERNALPROJECT})
endif() endif()

@ -15,8 +15,10 @@ class FIRF32:public Client::Suite
Client::Pattern<float32_t> coefs; Client::Pattern<float32_t> coefs;
Client::Pattern<float32_t> inputs; Client::Pattern<float32_t> inputs;
Client::RefPattern<int16_t> configs; Client::RefPattern<int16_t> configs;
Client::LocalPattern<float32_t> output; Client::LocalPattern<float32_t> output;
Client::LocalPattern<float32_t> state; Client::LocalPattern<float32_t> state;
Client::LocalPattern<float32_t> tmp;
// Reference patterns are not loaded when we are in dump mode // Reference patterns are not loaded when we are in dump mode
Client::RefPattern<float32_t> ref; Client::RefPattern<float32_t> ref;

@ -1,7 +1,10 @@
#include "FIRF32.h" #include "FIRF32.h"
#include "Error.h" #include "Error.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) float32_t coeffArray[64];
#endif
void FIRF32::test_fir_f32() void FIRF32::test_fir_f32()
{ {
arm_fir_f32(&instFir, this->pSrc, this->pDst, this->nbSamples); arm_fir_f32(&instFir, this->pSrc, this->pDst, this->nbSamples);
@ -30,7 +33,7 @@
samples.reload(FIRF32::SAMPLES1_F32_ID,mgr,this->nbSamples); samples.reload(FIRF32::SAMPLES1_F32_ID,mgr,this->nbSamples);
coefs.reload(FIRF32::COEFS1_F32_ID,mgr,this->nbTaps); coefs.reload(FIRF32::COEFS1_F32_ID,mgr,this->nbTaps);
state.create(this->nbSamples + this->nbTaps - 1,FIRF32::STATE_F32_ID,mgr); state.create(this->nbSamples + this->nbSamples + this->nbTaps - 1,FIRF32::STATE_F32_ID,mgr);
output.create(this->nbSamples,FIRF32::OUT_SAMPLES_F32_ID,mgr); output.create(this->nbSamples,FIRF32::OUT_SAMPLES_F32_ID,mgr);
switch(id) switch(id)
@ -38,8 +41,21 @@
case TEST_FIR_F32_1: case TEST_FIR_F32_1:
arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples); arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
this->pSrc=samples.ptr(); #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
*/
memset(coeffArray,0,32*sizeof(float32_t));
float32_t *ptr;
ptr=coefs.ptr();
memcpy(coeffArray,ptr,this->nbTaps*sizeof(float32_t));
this->pCoefs = coeffArray;
#else
this->pCoefs=coefs.ptr(); this->pCoefs=coefs.ptr();
#endif
this->pSrc=samples.ptr();
this->pDst=output.ptr(); this->pDst=output.ptr();
break; break;

@ -61,7 +61,7 @@ void checkInnerTail(float16_t *b)
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero /* Copy coefficients and pad to zero
*/ */
memset(coeffArray,0,32); memset(coeffArray,0,32*sizeof(float16_t));
for(j=0;j < numTaps; j++) for(j=0;j < numTaps; j++)
{ {
coeffArray[j] = orgcoefsp[j]; coeffArray[j] = orgcoefsp[j];

@ -24,6 +24,7 @@ void checkInnerTail(float32_t *b)
ASSERT_TRUE(b[3] == 0.0f); ASSERT_TRUE(b[3] == 0.0f);
} }
void FIRF32::test_fir_f32() void FIRF32::test_fir_f32()
{ {
@ -34,6 +35,7 @@ void checkInnerTail(float32_t *b)
const float32_t *coefsp; const float32_t *coefsp;
const float32_t *inputp = inputs.ptr(); const float32_t *inputp = inputs.ptr();
float32_t *outp = output.ptr(); float32_t *outp = output.ptr();
unsigned long i; unsigned long i;
@ -42,6 +44,7 @@ void checkInnerTail(float32_t *b)
#endif #endif
int blockSize; int blockSize;
int numTaps; int numTaps;
int nb=0;
@ -58,10 +61,12 @@ void checkInnerTail(float32_t *b)
blockSize = configp[0]; blockSize = configp[0];
numTaps = configp[1]; numTaps = configp[1];
nb += 2*blockSize;
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero /* Copy coefficients and pad to zero
*/ */
memset(coeffArray,0,32); memset(coeffArray,0,32*sizeof(float32_t));
for(j=0;j < numTaps; j++) for(j=0;j < numTaps; j++)
{ {
coeffArray[j] = orgcoefsp[j]; coeffArray[j] = orgcoefsp[j];
@ -86,6 +91,7 @@ void checkInnerTail(float32_t *b)
*/ */
inputp = inputs.ptr(); inputp = inputs.ptr();
/* /*
Python script is filtering a 2*blockSize number of samples. Python script is filtering a 2*blockSize number of samples.
@ -94,7 +100,9 @@ void checkInnerTail(float32_t *b)
*/ */
arm_fir_f32(&this->S,inputp,outp,blockSize); arm_fir_f32(&this->S,inputp,outp,blockSize);
outp += blockSize; outp += blockSize;
checkInnerTail(outp); checkInnerTail(outp);
@ -137,8 +145,15 @@ void checkInnerTail(float32_t *b)
ref.reload(FIRF32::FIRREFS_F32_ID,mgr); ref.reload(FIRF32::FIRREFS_F32_ID,mgr);
output.create(ref.nbSamples(),FIRF32::OUT_F32_ID,mgr); output.create(ref.nbSamples(),FIRF32::OUT_F32_ID,mgr);
/* Max blockSize + numTaps - 1 as generated by Python script */ /* Max 2*blockSize + numTaps - 1 as generated by Python script
state.create(47,FIRF32::OUT_F32_ID,mgr); A temp buffer blockSize is used by Helium implementation.
It is at beginning of state buffer and is NOT the state
of the FIR which is in the following part.
*/
state.create(47+47,FIRF32::OUT_F32_ID,mgr);
} }
void FIRF32::tearDown(Testing::testID_t id,Client::PatternMgr *mgr) void FIRF32::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)

@ -53,7 +53,7 @@ void checkInnerTail(q15_t *b)
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero /* Copy coefficients and pad to zero
*/ */
memset(coeffArray,0,32); memset(coeffArray,0,32*sizeof(q15_t));
for(j=0;j < numTaps; j++) for(j=0;j < numTaps; j++)
{ {
coeffArray[j] = orgcoefsp[j]; coeffArray[j] = orgcoefsp[j];

@ -54,7 +54,7 @@ void checkInnerTail(q31_t *b)
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero /* Copy coefficients and pad to zero
*/ */
memset(coeffArray,0,32); memset(coeffArray,0,32*sizeof(q31_t));
for(j=0;j < numTaps; j++) for(j=0;j < numTaps; j++)
{ {
coeffArray[j] = orgcoefsp[j]; coeffArray[j] = orgcoefsp[j];

@ -53,7 +53,7 @@ void checkInnerTail(q7_t *b)
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero /* Copy coefficients and pad to zero
*/ */
memset(coeffArray,0,32); memset(coeffArray,0,32*sizeof(q7_t));
for(j=0;j < numTaps; j++) for(j=0;j < numTaps; j++)
{ {
coeffArray[j] = orgcoefsp[j]; coeffArray[j] = orgcoefsp[j];

@ -832,7 +832,7 @@ class CodeGen:
self._currentPaths = oldPath.copy() self._currentPaths = oldPath.copy()
self._currentParamPaths = oldParamPath.copy() self._currentParamPaths = oldParamPath.copy()
def genCodeForTree(self,root): def genCodeForTree(self,root,benchMode):
""" Generate all files from the trees of tests """ Generate all files from the trees of tests
Args: Args:
@ -845,17 +845,21 @@ class CodeGen:
# Get a list of all suites contained in the tree # Get a list of all suites contained in the tree
suites = self.getSuites(root,[]) suites = self.getSuites(root,[])
src = "GeneratedSource"
header = "GeneratedInclude"
if benchMode:
src += "Bench"
header += "Bench"
# Generate .cpp and .h files neded to run the tests # Generate .cpp and .h files neded to run the tests
with open("GeneratedSource/TestDesc.cpp","w") as sourceFile: with open("%s/TestDesc.cpp" % src,"w") as sourceFile:
with open("GeneratedInclude/TestDesc.h","w") as headerFile: with open("%s/TestDesc.h" % header,"w") as headerFile:
headerFile.write("#include \"Test.h\"\n") headerFile.write("#include \"Test.h\"\n")
headerFile.write("#include \"Pattern.h\"\n") headerFile.write("#include \"Pattern.h\"\n")
sourceFile.write("#include \"Test.h\"\n") sourceFile.write("#include \"Test.h\"\n")
for s in suites: for s in suites:
headerFile.write("#include \"%s.h\"\n" % s) headerFile.write("#include \"%s.h\"\n" % s)
self._genCode(root,"GeneratedInclude",sourceFile,headerFile) self._genCode(root,"%s" % header,sourceFile,headerFile)
# Generate a driver file for semihosting # Generate a driver file for semihosting
# (always generated for debug purpose since it is the reference format) # (always generated for debug purpose since it is the reference format)
@ -868,17 +872,17 @@ class CodeGen:
# Driver file is similar in this case but different from semihosting # Driver file is similar in this case but different from semihosting
# one. # one.
if not self._fpga: if not self._fpga:
with open("GeneratedInclude/TestDrive.h","w") as driverFile: with open("%s/TestDrive.h" % header,"w") as driverFile:
driverFile.write("// Empty driver include in semihosting mode") driverFile.write("// Empty driver include in semihosting mode")
with open("GeneratedInclude/Patterns.h","w") as includeFile: with open("%s/Patterns.h" % header,"w") as includeFile:
includeFile.write("// Empty pattern include in semihosting mode") includeFile.write("// Empty pattern include in semihosting mode")
else: else:
with open("GeneratedInclude/TestDrive.h","w") as driverFile: with open("%s/TestDrive.h" % header,"w") as driverFile:
driverFile.write("#ifndef _DRIVER_H_\n") driverFile.write("#ifndef _DRIVER_H_\n")
driverFile.write("#define _DRIVER_H_\n") driverFile.write("#define _DRIVER_H_\n")
driverFile.write("__ALIGNED(8) const char testDesc[]={\n") driverFile.write("__ALIGNED(8) const char testDesc[]={\n")
self._offset=0 self._offset=0
with open("GeneratedInclude/Patterns.h","w") as includeFile: with open("%s/Patterns.h" % header,"w") as includeFile:
includeFile.write("#ifndef _PATTERNS_H_\n") includeFile.write("#ifndef _PATTERNS_H_\n")
includeFile.write("#define _PATTERNS_H_\n") includeFile.write("#define _PATTERNS_H_\n")
includeFile.write("__ALIGNED(8) const char patterns[]={\n") includeFile.write("__ALIGNED(8) const char patterns[]={\n")

@ -2,3 +2,5 @@ mkdir FullBenchmark
mkdir Output mkdir Output
mkdir GeneratedInclude mkdir GeneratedInclude
mkdir GeneratedSource mkdir GeneratedSource
mkdir GeneratedIncludeBench
mkdir GeneratedSourceBench

@ -5,7 +5,7 @@ import TestScripts.Deprecate as d
parser = argparse.ArgumentParser(description='Parse test description') parser = argparse.ArgumentParser(description='Parse test description')
parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="File path") parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="Pickle path")
parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path") parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path")
parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path") parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path")
@ -16,6 +16,8 @@ parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Para
# So the .h for include files need to be generated. # So the .h for include files need to be generated.
parser.add_argument('-e', action='store_true', help="Embedded test") parser.add_argument('-e', action='store_true', help="Embedded test")
parser.add_argument('-b', action='store_true', help="Benchmark mode to use different generated folders")
parser.add_argument('others', nargs=argparse.REMAINDER) parser.add_argument('others', nargs=argparse.REMAINDER)
args = parser.parse_args() args = parser.parse_args()
@ -32,6 +34,6 @@ if args.f is not None:
d.deprecate(root,args.others) d.deprecate(root,args.others)
#print(root) #print(root)
# Generate code with the tree of tests # Generate code with the tree of tests
c.genCodeForTree(root) c.genCodeForTree(root,args.b)
else: else:
parser.print_help() parser.print_help()
Loading…
Cancel
Save