CMSIS-DSP: Added new MVE implementation of FIR Q31

pull/19/head
Christophe Favergeon 5 years ago
parent 68b219bb1f
commit a108d6763e

@ -40,8 +40,6 @@ extern "C"
#if defined(ARM_FLOAT16_SUPPORTED)
#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
/**
* @brief Instance structure for the floating-point FIR filter.
*/

@ -42,6 +42,7 @@ extern "C"
#define SQ(x) ((x) * (x))
#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
/**

@ -110,9 +110,11 @@
The first A samples are temporary data.
The remaining samples are the state of the FIR filter.
@par
So the state buffer has size <code> numTaps + A * blockSize - 1 </code> :
So the state buffer has size <code> numTaps + A + blockSize - 1 </code> :
- A is blockSize for f32
- A is 8*ceil(blockSize/8) for f16
- A is 8*ceil(blockSize/4) for q31
@par Fixed-Point Behavior
Care must be taken when using the fixed-point versions of the FIR filter functions.
@ -200,6 +202,7 @@ __STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
}
blkCnt = blockSize & 3;
if (blkCnt)
{
mve_pred16_t p0 = vctp32q(blkCnt);

@ -52,7 +52,23 @@
{b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
</pre>
<code>pState</code> points to the array of state variables.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_q31()</code>.
@par Initialization of Helium version
For Helium version the length of the array of coefficients must be a multiple of 16 even if less
than 16 coefficients are used. The additional coefficients must be set to 0.
It does not mean that all the coefficients will be used in the filter (numTaps
is still set to its right value in the init function). It just means that
the implementation may need to read more coefficients because of the vectorization and
to avoid having to manage too many different cases in the code.
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
The first 2*4*ceil(blockSize/4) samples are temporary data.
The remaining samples are the state of the FIR filter.
So the state buffer has size <code> numTaps + 8*ceil(blockSize/4) + blockSize - 1 </code>
*/
void arm_fir_init_q31(
@ -69,7 +85,11 @@ void arm_fir_init_q31(
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is (blockSize + numTaps - 1), plus 2*ROUND_UP(blockSize, 4) extra samples in the MVE (Helium) build */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t));
#else
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
#endif
/* Assign state pointer */
S->pState = pState;

File diff suppressed because it is too large Load Diff

@ -1,6 +1,9 @@
#include "FIRQ31.h"
#include "Error.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) q31_t coeffArray[64];
#endif
void FIRQ31::test_fir_q31()
{
@ -30,16 +33,28 @@
samples.reload(FIRQ31::SAMPLES1_Q31_ID,mgr,this->nbSamples);
coefs.reload(FIRQ31::COEFS1_Q31_ID,mgr,this->nbTaps);
state.create(this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
state.create(2*ROUND_UP(this->nbSamples,4) + this->nbSamples + this->nbTaps - 1,FIRQ31::STATE_Q31_ID,mgr);
output.create(this->nbSamples,FIRQ31::OUT_SAMPLES_Q31_ID,mgr);
switch(id)
{
case TEST_FIR_Q31_1:
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
*/
memset(coeffArray,0,32*sizeof(q31_t));
q31_t *ptr;
ptr=coefs.ptr();
memcpy(coeffArray,ptr,this->nbTaps*sizeof(q31_t));
this->pCoefs = coeffArray;
#else
this->pCoefs=coefs.ptr();
#endif
arm_fir_init_q31(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
this->pSrc=samples.ptr();
this->pCoefs=coefs.ptr();
this->pDst=output.ptr();
break;

@ -37,6 +37,7 @@ void checkInnerTail(q31_t *b)
#endif
int blockSize;
int numTaps;
int nb=1;
/*
@ -98,6 +99,8 @@ void checkInnerTail(q31_t *b)
configp += 2;
orgcoefsp += numTaps;
nb += blockSize + blockSize;
}
@ -129,8 +132,8 @@ void checkInnerTail(q31_t *b)
ref.reload(FIRQ31::FIRREFS_Q31_ID,mgr);
output.create(ref.nbSamples(),FIRQ31::OUT_Q31_ID,mgr);
/* Max blockSize + numTaps - 1 as generated by Python script */
state.create(47,FIRQ31::OUT_Q31_ID,mgr);
/* Larger than the max blockSize + numTaps - 1 generated by the Python script */
state.create(47 + 47+47,FIRQ31::OUT_Q31_ID,mgr);
}
void FIRQ31::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)

Loading…
Cancel
Save