CMSIS-DSP: New MVE implementation of the FIR F16

pull/19/head
Christophe Favergeon 5 years ago
parent 364fdb9a28
commit 68b219bb1f

@ -39,6 +39,8 @@ extern "C"
{
#endif
#define DELTA_Q31 ((q31_t)(0x100))
#define DELTA_Q15 ((q15_t)0x5)

@ -40,6 +40,8 @@ extern "C"
#if defined(ARM_FLOAT16_SUPPORTED)
#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))
/**
* @brief Instance structure for the floating-point FIR filter.
*/

@ -47,6 +47,8 @@
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#define FIR_F32_MAX_COEF_BLK 8
#define FIR_F16_CORE(pSamples, c, NB_TAPS) \
vecAcc0 = vdupq_n_f16(0.0f16); \
for (int i = 0; i < NB_TAPS; i++) { \
@ -54,7 +56,9 @@
vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \
}
static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,
const float16_t * __restrict pSrc,
float16_t * __restrict pDst, uint32_t blockSize)
{
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@ -65,8 +69,8 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
float16_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
int32_t blkCnt;
f16x8_t vecIn0;
f16x8_t vecAcc0;
float16x8_t vecIn0;
float16x8_t vecAcc0;
const int NB_TAPS=4;
float16_t c[NB_TAPS];
@ -107,6 +111,7 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
}
blkCnt = blockSize & 7;
if (blkCnt)
{
mve_pred16_t p0 = vctp16q(blkCnt);
@ -141,7 +146,9 @@ static void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S, const float16_t
}
static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t * pSrc, float16_t * pDst, uint32_t blockSize)
__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,
const float16_t * __restrict pSrc,
float16_t * __restrict pDst, uint32_t blockSize)
{
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
@ -152,8 +159,8 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
float16_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
int32_t blkCnt;
f16x8_t vecIn0;
f16x8_t vecAcc0;
float16x8_t vecIn0;
float16x8_t vecAcc0;
const int NB_TAPS=8;
float16_t c[NB_TAPS];
@ -194,6 +201,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
}
blkCnt = blockSize & 7;
if (blkCnt)
{
mve_pred16_t p0 = vctp16q(blkCnt);
@ -224,6 +232,7 @@ static void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S, const float16_t
mve_pred16_t p0 = vctp16q(blkCnt);
vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
}
}
@ -232,224 +241,299 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
float16_t * pDst,
uint32_t blockSize)
{
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t *pStateCur; /* Points to the current sample of the state */
const float16_t *pSamples; /* Temporary pointer to the sample buffer */
float16_t *pOutput; /* Temporary pointer to the output buffer */
const float16_t *pTempSrc; /* Temporary pointer to the source data */
float16_t *pTempDest; /* Temporary pointer to the destination buffer */
int32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
f16x8_t vecIn0;
f16x8_t vecAcc0;
float16_t c0, c1, c2, c3;
float16_t c4, c5, c6, c7;
float16_t *pRefStatePtr = S->pState + ROUND_UP(blockSize, 8);
float16_t *pState = pRefStatePtr ; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
const float16_t *pSamples; /* Temporary pointer to the sample buffer */
float16_t *pOutput; /* Temporary pointer to the output buffer */
const float16_t *pTempSrc; /* Temporary pointer to the source data */
float16_t *pTempDest; /* Temporary pointer to the destination buffer */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t blkCnt;
float16_t c0, c1, c2, c3;
float16_t c4, c5, c6, c7;
/*
* [1 to 8 taps] specialized routines
*/
if (numTaps <= 4)
{
if (numTaps <= 4) {
arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
return;
}
else if (numTaps <= 8)
{
} else if (numTaps <= 8) {
arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
return;
}
/*
* pState points to state array which contains previous frame (numTaps - 1) samples
* pStateCur points to the location where the new input data should be written
*/
pStateCur = &(pState[(numTaps - 1u)]);
/*
* Copy new data into state so that we obtain a continuous sample buffer
* containing both the tail end of the old data and the new data.
*/
pSamples = pState;
pTempSrc = pSrc;
pOutput = pDst;
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
int i;
const float16_t *pCoeffsCur = pCoeffs;
/*
* Save 8 input samples in the history buffer
*/
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempDest = &(pState[(numTaps - 1u)]);
int cnt = blockSize;
do {
mve_pred16_t p0 = vctp16q(cnt);
vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
pTempDest += 8;
pTempSrc += 8;
cnt -= 8;
} while (cnt > 0);
float16_t *partial_accu_ptr = S->pState;
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
pSamples = pState;
c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
cnt = blockSize >> 3;
while (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 8;
vst1q(partial_accu_ptr, vecAcc0);
cnt--;
partial_accu_ptr += 8;
}
cnt = blockSize & 7;
if (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
mve_pred16_t p0 = vctp16q(cnt);
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
}
pSamples += 8;
int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
int sample_offset = FIR_F32_MAX_COEF_BLK;
while (localTaps > FIR_F32_MAX_COEF_BLK) {
c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
partial_accu_ptr = S->pState;
pSamples = pState + sample_offset;
int cnt = blockSize >> 3;
while (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
for (i = 0; i <= ((numTaps - 9) / 8); i++)
{
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 8;
vecAcc0 += vld1q_f16(partial_accu_ptr);
vst1q(partial_accu_ptr, vecAcc0);
cnt--;
partial_accu_ptr += 8;
}
vst1q(pOutput, vecAcc0);
pOutput += 8;
pSamples = pSamples - (i + 1) * 8 + 8;
blkCnt--;
}
blkCnt = blockSize & 7;
{
mve_pred16_t p0 = vctp16q(blkCnt);
int i;
const float16_t *pCoeffsCur = pCoeffs;
vst1q(pStateCur, vld1q(pTempSrc));
pStateCur += 8;
pTempSrc += 8;
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
cnt = blockSize & 7;
if (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
mve_pred16_t p0 = vctp16q(cnt);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
vecAcc0 += vld1q_f16(partial_accu_ptr);
vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
}
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
localTaps -= FIR_F32_MAX_COEF_BLK;
sample_offset += FIR_F32_MAX_COEF_BLK;
}
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
pSamples = pState + sample_offset;
if (localTaps > 4) {
c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
c4 = *pCoeffs++;
c5 = *pCoeffs++;
c6 = *pCoeffs++;
c7 = *pCoeffs++;
pOutput = pDst;
partial_accu_ptr = S->pState;
cnt = blockSize >> 3;
while (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
pSamples += 8;
float16x8_t pap = vld1q_f16(partial_accu_ptr);
vst1q(pOutput, vecAcc0 + pap);
cnt--;
partial_accu_ptr += 8;
pOutput += 8;
}
pSamples += 8;
cnt = blockSize & 7;
if (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
for (i = 0; i <= ((numTaps - 9) / 8); i++)
{
c0 = *pCoeffsCur++;
c1 = *pCoeffsCur++;
c2 = *pCoeffsCur++;
c3 = *pCoeffsCur++;
c4 = *pCoeffsCur++;
c5 = *pCoeffsCur++;
c6 = *pCoeffsCur++;
c7 = *pCoeffsCur++;
mve_pred16_t p0 = vctp16q(cnt);
vecIn0 = vld1q(pSamples);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c0);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
vecIn0 = vld1q(&pSamples[4]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
vecIn0 = vld1q(&pSamples[5]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
vecIn0 = vld1q(&pSamples[6]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
vecIn0 = vld1q(&pSamples[7]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
float16x8_t pap = vld1q_f16(partial_accu_ptr);
vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
pOutput += cnt;
}
} else {
c0 = *pCoeffs++;
c1 = *pCoeffs++;
c2 = *pCoeffs++;
c3 = *pCoeffs++;
pOutput = pDst;
partial_accu_ptr = S->pState;
cnt = blockSize >> 3;
while (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
pSamples += 8;
float16x8_t pap = vld1q_f16(partial_accu_ptr);
vst1q(pOutput, vecAcc0 + pap);
cnt--;
partial_accu_ptr += 8;
pOutput += 8;
}
vstrhq_p_f16(pOutput, vecAcc0, p0);
cnt = blockSize & 7;
if (cnt > 0) {
float16x8_t vecAcc0;
float16x8_t vecIn0;
mve_pred16_t p0 = vctp16q(cnt);
vecIn0 = vld1q(pSamples);
vecAcc0 = vmulq(vecIn0, c0);
vecIn0 = vld1q(&pSamples[1]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
vecIn0 = vld1q(&pSamples[2]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
vecIn0 = vld1q(&pSamples[3]);
vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
float16x8_t pap = vld1q_f16(partial_accu_ptr);
vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
pOutput += cnt;
}
}
/*
@ -459,17 +543,15 @@ void arm_fir_f16(const arm_fir_instance_f16 * S,
pTempDest = pState;
blkCnt = numTaps >> 3;
while (blkCnt > 0U)
{
while (blkCnt > 0U) {
vst1q(pTempDest, vld1q(pTempSrc));
pTempSrc += 8;
pTempDest += 8;
blkCnt--;
}
blkCnt = numTaps & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
}
}

@ -107,9 +107,12 @@
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
The first blockSize samples are temporary data.
The first A samples are temporary data.
The remaining samples are the state of the FIR filter.
So the state buffer has size <code> numTaps + 2 * blockSize - 1 </code>
@par
So the state buffer has size <code> numTaps + A * blockSize - 1 </code> :
- A is blockSize for f32
- A is 8*ceil(blockSize/8) for f16
@par Fixed-Point Behavior
Care must be taken when using the fixed-point versions of the FIR filter functions.
@ -144,7 +147,7 @@
}
static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
const float32_t * __restrict pSrc,
float32_t * __restrict pDst, uint32_t blockSize)
{
@ -229,7 +232,7 @@ static void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
static void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
const float32_t * __restrict pSrc,
float32_t * __restrict pDst, uint32_t blockSize)
{

@ -52,7 +52,7 @@
</pre>
@par
<code>pState</code> points to the array of state variables.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples, where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
<code>pState</code> is of length <code>numTaps+blockSize-1</code> samples (except for Helium - see below), where <code>blockSize</code> is the number of input samples processed by each call to <code>arm_fir_f16()</code>.
@par Initialization of Helium version
For Helium version the array of coefficients must be a multiple of 16 even if less
then 16 coefficients are used. The additional coefficients must be set to 0.
@ -61,6 +61,13 @@
the implementation may require to read more coefficients due to the vectorization and
to avoid having to manage too many different cases in the code.
@par Helium state buffer
The state buffer must contain some additional temporary data
used during the computation but which is not the state of the FIR.
The first 8*ceil(blockSize/8) samples are temporary data.
The remaining samples are the state of the FIR filter.
So the state buffer has size <code> numTaps + 8*ceil(blockSize/8) + blockSize - 1 </code>
*/
void arm_fir_init_f16(
@ -77,7 +84,11 @@ void arm_fir_init_f16(
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is always (blockSize + numTaps - 1) */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t));
#else
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t));
#endif
/* Assign state pointer */
S->pState = pState;

@ -1,6 +1,9 @@
#include "FIRF16.h"
#include "Error.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
static __ALIGNED(8) float16_t coeffArray[64];
#endif
void FIRF16::test_fir_f16()
{
@ -21,16 +24,28 @@
samples.reload(FIRF16::SAMPLES1_F16_ID,mgr,this->nbSamples);
coefs.reload(FIRF16::COEFS1_F16_ID,mgr,this->nbTaps);
state.create(this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
state.create(ROUND_UP(this->nbSamples,8) + this->nbSamples + this->nbTaps - 1,FIRF16::STATE_F16_ID,mgr);
output.create(this->nbSamples,FIRF16::OUT_SAMPLES_F16_ID,mgr);
switch(id)
{
case TEST_FIR_F16_1:
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
*/
memset(coeffArray,0,32*sizeof(float16_t));
float16_t *ptr;
ptr=coefs.ptr();
memcpy(coeffArray,ptr,this->nbTaps*sizeof(float16_t));
this->pCoefs = coeffArray;
#else
this->pCoefs=coefs.ptr();
#endif
arm_fir_init_f16(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
this->pSrc=samples.ptr();
this->pCoefs=coefs.ptr();
this->pDst=output.ptr();
break;

@ -39,7 +39,6 @@ static __ALIGNED(8) float32_t coeffArray[64];
switch(id)
{
case TEST_FIR_F32_1:
arm_fir_init_f32(&instFir,this->nbTaps,coefs.ptr(),state.ptr(),this->nbSamples);
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Copy coefficients and pad to zero
@ -57,6 +56,8 @@ static __ALIGNED(8) float32_t coeffArray[64];
this->pSrc=samples.ptr();
this->pDst=output.ptr();
arm_fir_init_f32(&instFir,this->nbTaps,this->pCoefs,state.ptr(),this->nbSamples);
break;
case TEST_LMS_F32_2:

@ -137,8 +137,8 @@ void checkInnerTail(float16_t *b)
ref.reload(FIRF16::FIRREFS_F16_ID,mgr);
output.create(ref.nbSamples(),FIRF16::OUT_F16_ID,mgr);
/* Max blockSize + numTaps - 1 as generated by Python script */
state.create(47,FIRF16::OUT_F16_ID,mgr);
/* > Max 8*ceil(blockSize,8) + blockSize + numTaps - 1 as generated by Python script */
state.create(47+47,FIRF16::OUT_F16_ID,mgr);
}
void FIRF16::tearDown(Testing::testID_t id,Client::PatternMgr *mgr)

Loading…
Cancel
Save