CMSIS-DSP: Improved f16 scalar code.

pull/19/head
Christophe Favergeon 6 years ago
parent 4b4d5322c8
commit 078c0b282d

@ -72,7 +72,7 @@ __STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
in = vaddq_f16(tmpVec, in); in = vaddq_f16(tmpVec, in);
tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in); tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in);
in = vaddq_f16(tmpVec, in); in = vaddq_f16(tmpVec, in);
acc = vgetq_lane_f16(in, 0) + vgetq_lane_f16(in, 4); acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4);
return acc; return acc;
} }

@ -67,7 +67,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
const float16_t *pIn = in; const float16_t *pIn = in;
float16_t result; float16_t result;
f16x8_t vsigma; f16x8_t vsigma;
float16_t tmp; _Float16 tmp;
f16x8_t vacc1, vacc2; f16x8_t vacc1, vacc2;
uint32_t index; uint32_t index;
float16_t logclassPriors[S->numberOfClasses]; float16_t logclassPriors[S->numberOfClasses];
@ -81,15 +81,15 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) { for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
pIn = in; pIn = in;
vacc1 = vdupq_n_f16(0); vacc1 = vdupq_n_f16(0.0f16);
vacc2 = vdupq_n_f16(0); vacc2 = vdupq_n_f16(0.0f16);
uint32_t blkCnt =S->vectorDimension >> 3; uint32_t blkCnt =S->vectorDimension >> 3;
while (blkCnt > 0U) { while (blkCnt > 0U) {
f16x8_t vinvSigma, vtmp; f16x8_t vinvSigma, vtmp;
vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon); vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI))); vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));
vinvSigma = vrecip_medprec_f16(vsigma); vinvSigma = vrecip_medprec_f16(vsigma);
@ -112,7 +112,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon); vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
vacc1 = vacc1 =
vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)), p0); vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);
vinvSigma = vrecip_medprec_f16(vsigma); vinvSigma = vrecip_medprec_f16(vsigma);
@ -126,8 +126,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
pSigma += blkCnt; pSigma += blkCnt;
} }
tmp = -0.5f * vecAddAcrossF16Mve(vacc1); tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
tmp -= 0.5f * vecAddAcrossF16Mve(vacc2); tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);
*buffer = tmp + *pLogPrior++; *buffer = tmp + *pLogPrior++;
buffer++; buffer++;
@ -175,13 +175,13 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
pIn = in; pIn = in;
tmp = 0.0; tmp = 0.0f16;
acc1 = 0.0f; acc1 = 0.0f16;
acc2 = 0.0f; acc2 = 0.0f16;
for(nbDim = 0; nbDim < S->vectorDimension; nbDim++) for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
{ {
sigma = *pSigma + S->epsilon; sigma = *pSigma + S->epsilon;
acc1 += logf(2.0f * PI_F * sigma); acc1 += logf(2.0f16 * (_Float16)PI_F * sigma);
acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma; acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
pIn++; pIn++;
@ -189,8 +189,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
pSigma++; pSigma++;
} }
tmp = -0.5f * acc1; tmp = -0.5f16 * acc1;
tmp -= 0.5f * acc2; tmp -= 0.5f16 * acc2;
*buffer = tmp + logf(*pPrior++); *buffer = tmp + logf(*pPrior++);

@ -172,7 +172,7 @@ void arm_cmplx_mag_f16(
uint32_t numSamples) uint32_t numSamples)
{ {
uint32_t blkCnt; /* loop counter */ uint32_t blkCnt; /* loop counter */
float16_t real, imag; /* Temporary variables to hold input values */ _Float16 real, imag; /* Temporary variables to hold input values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -107,7 +107,7 @@ void arm_cmplx_mag_squared_f16(
uint32_t numSamples) uint32_t numSamples)
{ {
uint32_t blkCnt; /* Loop counter */ uint32_t blkCnt; /* Loop counter */
float16_t real, imag; /* Temporary input variables */ _Float16 real, imag; /* Temporary input variables */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -103,7 +103,7 @@ void arm_cmplx_mult_cmplx_f16(
pDst += 8; pDst += 8;
} }
float16_t a, b, c, d; /* Temporary variables to store real and imaginary values */ _Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */
/* Tail */ /* Tail */
blkCnt = (blockSize & 7) >> 1; blkCnt = (blockSize & 7) >> 1;
while (blkCnt > 0) while (blkCnt > 0)
@ -134,7 +134,7 @@ void arm_cmplx_mult_cmplx_f16(
uint32_t numSamples) uint32_t numSamples)
{ {
uint32_t blkCnt; /* Loop counter */ uint32_t blkCnt; /* Loop counter */
float16_t a, b, c, d; /* Temporary variables to store real and imaginary values */ _Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -72,7 +72,7 @@
float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t accumDiff = 0.0f, accumSum = 0.0f; _Float16 accumDiff = 0.0f, accumSum = 0.0f;
uint32_t blkCnt; uint32_t blkCnt;
f16x8_t a, b, c, accumDiffV, accumSumV; f16x8_t a, b, c, accumDiffV, accumSumV;
@ -125,14 +125,14 @@ float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, u
float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t accumDiff=0.0f, accumSum=0.0f, tmpA, tmpB; _Float16 accumDiff=0.0f16, accumSum=0.0f16, tmpA, tmpB;
while(blockSize > 0) while(blockSize > 0)
{ {
tmpA = *pA++; tmpA = *pA++;
tmpB = *pB++; tmpB = *pB++;
accumDiff += fabsf(tmpA - tmpB); accumDiff += (_Float16)fabsf(tmpA - tmpB);
accumSum += fabsf(tmpA + tmpB); accumSum += (_Float16)fabsf(tmpA + tmpB);
blockSize --; blockSize --;
} }
/* /*

@ -70,7 +70,7 @@
float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t accum = 0.0f; _Float16 accum = 0.0f16;
uint32_t blkCnt; uint32_t blkCnt;
f16x8_t a, b, c, accumV; f16x8_t a, b, c, accumV;
@ -141,7 +141,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
#else #else
float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t accum=0.0f, tmpA, tmpB,diff,sum; _Float16 accum=0.0f, tmpA, tmpB,diff,sum;
while(blockSize > 0) while(blockSize > 0)
{ {
@ -150,7 +150,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
diff = fabsf(tmpA - tmpB); diff = fabsf(tmpA - tmpB);
sum = fabsf(tmpA) + fabsf(tmpB); sum = fabsf(tmpA) + fabsf(tmpB);
if ((tmpA != 0.0f) || (tmpB != 0.0f)) if ((tmpA != 0.0f16) || (tmpB != 0.0f16))
{ {
accum += (diff / sum); accum += (diff / sum);
} }

@ -67,7 +67,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
uint32_t blkCnt; /* loop counters */ uint32_t blkCnt; /* loop counters */
f16x8_t vecA, vecB; f16x8_t vecA, vecB;
f16x8_t vecDiff = vdupq_n_f16(0.0); f16x8_t vecDiff = vdupq_n_f16(0.0);
float16_t maxValue = 0.0; float16_t maxValue = 0.0f16;
blkCnt = blockSize >> 3; blkCnt = blockSize >> 3;
@ -111,7 +111,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
#else #else
float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t diff=0.0f, maxVal,tmpA, tmpB; _Float16 diff=0.0f, maxVal,tmpA, tmpB;
tmpA = *pA++; tmpA = *pA++;
tmpB = *pB++; tmpB = *pB++;

@ -102,14 +102,14 @@ float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, ui
#else #else
float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t accum,tmpA, tmpB; _Float16 accum,tmpA, tmpB;
accum = 0.0f; accum = 0.0f16;
while(blockSize > 0) while(blockSize > 0)
{ {
tmpA = *pA++; tmpA = *pA++;
tmpB = *pB++; tmpB = *pB++;
accum += fabsf(tmpA - tmpB); accum += (_Float16)fabsf(tmpA - tmpB);
blockSize --; blockSize --;
} }

@ -105,16 +105,17 @@ float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, ui
#else #else
float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t accum=0.0f,tmp; _Float16 accum=0.0f,tmp;
float16_t result;
while(blockSize > 0) while(blockSize > 0)
{ {
tmp = *pA++ - *pB++; tmp = (_Float16)*pA++ - (_Float16)*pB++;
accum += SQ(tmp); accum += SQ(tmp);
blockSize --; blockSize --;
} }
arm_sqrt_f16(accum,&tmp); arm_sqrt_f16(accum,&result);
return(tmp); return(result);
} }
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

@ -145,16 +145,17 @@ float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB
float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize) float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{ {
float16_t left, right,sum, result, tmp; _Float16 left, right,sum, tmp;
float16_t result;
uint32_t i; uint32_t i;
left = 0.0f; left = 0.0f16;
right = 0.0f; right = 0.0f16;
for(i=0; i < blockSize; i++) for(i=0; i < blockSize; i++)
{ {
tmp = (pA[i] + pB[i]) / 2.0f; tmp = ((_Float16)pA[i] + (_Float16)pB[i]) / 2.0f16;
left += rel_entr(pA[i], tmp); left += (_Float16)rel_entr(pA[i], tmp);
right += rel_entr(pB[i], tmp); right += (_Float16)rel_entr(pB[i], tmp);
} }

@ -111,13 +111,13 @@ float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, in
float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize) float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize)
{ {
float16_t sum; _Float16 sum;
uint32_t i; uint32_t i;
sum = 0.0f; sum = 0.0f;
for(i=0; i < blockSize; i++) for(i=0; i < blockSize; i++)
{ {
sum += powf(fabsf(pA[i] - pB[i]),order); sum += (_Float16)powf(fabsf(pA[i] - pB[i]),order);
} }

@ -64,7 +64,7 @@ void arm_biquad_cascade_df1_f16(
float16_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */ float16_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
float16_t X0, X1, X2, X3; /* temporary input */ float16_t X0, X1, X2, X3; /* temporary input */
float16_t X4, X5, X6, X7; /* temporary input */ float16_t X4, X5, X6, X7; /* temporary input */
float16_t lastX, lastY; /* X,Y history for tail handling */ _Float16 lastX, lastY; /* X,Y history for tail handling */
f16x8_t coeffs; f16x8_t coeffs;
f16x8_t accVec; /* accumultor vector */ f16x8_t accVec; /* accumultor vector */
uint32_t sample, stage = S->numStages; /* loop counters */ uint32_t sample, stage = S->numStages; /* loop counters */
@ -320,10 +320,10 @@ void arm_biquad_cascade_df1_f16(
float16_t *pOut = pDst; /* Destination pointer */ float16_t *pOut = pDst; /* Destination pointer */
float16_t *pState = S->pState; /* pState pointer */ float16_t *pState = S->pState; /* pState pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t acc; /* Accumulator */ _Float16 acc; /* Accumulator */
float16_t b0, b1, b2, a1, a2; /* Filter coefficients */ _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
float16_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */ _Float16 Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
float16_t Xn; /* Temporary input */ _Float16 Xn; /* Temporary input */
uint32_t sample, stage = S->numStages; /* Loop counters */ uint32_t sample, stage = S->numStages; /* Loop counters */
do do

@ -197,10 +197,10 @@ void arm_biquad_cascade_df2T_f16(
float16_t *pOut = pDst; /* Destination pointer */ float16_t *pOut = pDst; /* Destination pointer */
float16_t *pState = S->pState; /* State pointer */ float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t acc1; /* Accumulator */ _Float16 acc1; /* Accumulator */
float16_t b0, b1, b2, a1, a2; /* Filter coefficients */ _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
float16_t Xn1; /* Temporary input */ _Float16 Xn1; /* Temporary input */
float16_t d1, d2; /* State variables */ _Float16 d1, d2; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */ uint32_t sample, stage = S->numStages; /* Loop counters */
do do

@ -198,10 +198,10 @@ void arm_biquad_cascade_stereo_df2T_f16(
float16_t *pOut = pDst; /* Destination pointer */ float16_t *pOut = pDst; /* Destination pointer */
float16_t *pState = S->pState; /* State pointer */ float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t acc1a, acc1b; /* Accumulator */ _Float16 acc1a, acc1b; /* Accumulator */
float16_t b0, b1, b2, a1, a2; /* Filter coefficients */ _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
float16_t Xn1a, Xn1b; /* Temporary input */ _Float16 Xn1a, Xn1b; /* Temporary input */
float16_t d1a, d2a, d1b, d2b; /* State variables */ _Float16 d1a, d2a, d1b, d2b; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */ uint32_t sample, stage = S->numStages; /* Loop counters */
do do

@ -370,8 +370,8 @@ void arm_correlate_f16(
for (i = 0U; i <= block1 - 2; i += 2) for (i = 0U; i <= block1 - 2; i += 2)
{ {
uint32_t count = i + 1; uint32_t count = i + 1;
float16_t acc0; _Float16 acc0;
float16_t acc1; _Float16 acc1;
/* /*
* compute 2 accumulators per loop * compute 2 accumulators per loop
* size is incrementing for second accumulator * size is incrementing for second accumulator
@ -390,7 +390,7 @@ void arm_correlate_f16(
for (; i < block1; i++) for (; i < block1; i++)
{ {
uint32_t count = i + 1; uint32_t count = i + 1;
float16_t acc; _Float16 acc;
pX = pA; pX = pA;
pY = pB; pY = pB;
@ -403,10 +403,10 @@ void arm_correlate_f16(
for (i = 0U; i <= block2 - 4; i += 4) for (i = 0U; i <= block2 - 4; i += 4)
{ {
float16_t acc0; _Float16 acc0;
float16_t acc1; _Float16 acc1;
float16_t acc2; _Float16 acc2;
float16_t acc3; _Float16 acc3;
pX = pA; pX = pA;
pY = pB; pY = pB;
@ -430,8 +430,8 @@ void arm_correlate_f16(
for (; i <= block2 - 2; i += 2) for (; i <= block2 - 2; i += 2)
{ {
float16_t acc0; _Float16 acc0;
float16_t acc1; _Float16 acc1;
pX = pA; pX = pA;
pY = pB; pY = pB;
@ -451,7 +451,7 @@ void arm_correlate_f16(
if (block2 & 1) if (block2 & 1)
{ {
float16_t acc; _Float16 acc;
pX = pA; pX = pA;
pY = pB; pY = pB;
@ -466,8 +466,8 @@ void arm_correlate_f16(
{ {
uint32_t count = (i + 1); uint32_t count = (i + 1);
float16_t acc0; _Float16 acc0;
float16_t acc1; _Float16 acc1;
pX = pA; pX = pA;
pY = pB; pY = pB;
@ -488,7 +488,7 @@ void arm_correlate_f16(
for (; i >= 0; i--) for (; i >= 0; i--)
{ {
uint32_t count = (i + 1); uint32_t count = (i + 1);
float16_t acc; _Float16 acc;
pX = pA; pX = pA;
pY = pB; pY = pB;
@ -517,15 +517,15 @@ void arm_correlate_f16(
const float16_t *px; /* Intermediate inputA pointer */ const float16_t *px; /* Intermediate inputA pointer */
const float16_t *py; /* Intermediate inputB pointer */ const float16_t *py; /* Intermediate inputB pointer */
const float16_t *pSrc1; const float16_t *pSrc1;
float16_t sum; _Float16 sum;
uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
uint32_t j, k, count, blkCnt; /* Loop counters */ uint32_t j, k, count, blkCnt; /* Loop counters */
uint32_t outBlockSize; /* Loop counter */ uint32_t outBlockSize; /* Loop counter */
int32_t inc = 1; /* Destination address modifier */ int32_t inc = 1; /* Destination address modifier */
#if defined (ARM_MATH_LOOPUNROLL) #if defined (ARM_MATH_LOOPUNROLL)
float16_t acc0, acc1, acc2, acc3,c0; /* Accumulators */ _Float16 acc0, acc1, acc2, acc3,c0; /* Accumulators */
float16_t x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */ _Float16 x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */
#endif #endif
/* The algorithm implementation is based on the lengths of the inputs. */ /* The algorithm implementation is based on the lengths of the inputs. */
@ -625,7 +625,7 @@ void arm_correlate_f16(
while (blockSize1 > 0U) while (blockSize1 > 0U)
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.0f; sum = 0.0f16;
#if defined (ARM_MATH_LOOPUNROLL) #if defined (ARM_MATH_LOOPUNROLL)
@ -725,10 +725,10 @@ void arm_correlate_f16(
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* Set all accumulators to zero */ /* Set all accumulators to zero */
acc0 = 0.0f; acc0 = 0.0f16;
acc1 = 0.0f; acc1 = 0.0f16;
acc2 = 0.0f; acc2 = 0.0f16;
acc3 = 0.0f; acc3 = 0.0f16;
/* read x[0], x[1], x[2] samples */ /* read x[0], x[1], x[2] samples */
@ -873,7 +873,7 @@ void arm_correlate_f16(
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.0f; sum = 0.0f16;
#if defined (ARM_MATH_LOOPUNROLL) #if defined (ARM_MATH_LOOPUNROLL)
@ -939,7 +939,7 @@ void arm_correlate_f16(
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.0f; sum = 0.0f16;
/* Loop over srcBLen */ /* Loop over srcBLen */
k = srcBLen; k = srcBLen;
@ -1000,7 +1000,7 @@ void arm_correlate_f16(
while (blockSize3 > 0U) while (blockSize3 > 0U)
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.0f; sum = 0.0f16;
#if defined (ARM_MATH_LOOPUNROLL) #if defined (ARM_MATH_LOOPUNROLL)
@ -1069,7 +1069,7 @@ void arm_correlate_f16(
const float16_t *pIn1 = pSrcA; /* inputA pointer */ const float16_t *pIn1 = pSrcA; /* inputA pointer */
const float16_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */ const float16_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
float16_t sum; /* Accumulator */ _Float16 sum; /* Accumulator */
uint32_t i = 0U, j; /* Loop counters */ uint32_t i = 0U, j; /* Loop counters */
uint32_t inv = 0U; /* Reverse order flag */ uint32_t inv = 0U; /* Reverse order flag */
uint32_t tot = 0U; /* Length */ uint32_t tot = 0U; /* Length */
@ -1127,7 +1127,7 @@ void arm_correlate_f16(
for (i = 0U; i <= tot; i++) for (i = 0U; i <= tot; i++)
{ {
/* Initialize sum with zero to carry out MAC operations */ /* Initialize sum with zero to carry out MAC operations */
sum = 0.0f; sum = 0.0f16;
/* Loop to perform MAC operations according to convolution equation */ /* Loop to perform MAC operations according to convolution equation */
for (j = 0U; j <= i; j++) for (j = 0U; j <= i; j++)

@ -711,13 +711,13 @@ arm_status arm_mat_cmplx_mult_f16(
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
float16_t sumReal, sumImag; /* Accumulator */ _Float16 sumReal, sumImag; /* Accumulator */
float16_t a1, b1, c1, d1; _Float16 a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */ uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */ arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL) #if defined (ARM_MATH_LOOPUNROLL)
float16_t a0, b0, c0, d0; _Float16 a0, b0, c0, d0;
#endif #endif
#ifdef ARM_MATH_MATRIX_CHECK #ifdef ARM_MATH_MATRIX_CHECK
@ -755,8 +755,8 @@ arm_status arm_mat_cmplx_mult_f16(
do do
{ {
/* Set the variable sum, that acts as accumulator, to zero */ /* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0f; sumReal = 0.0f16;
sumImag = 0.0f; sumImag = 0.0f16;
/* Initiate pointer pIn1 to point to starting address of column being processed */ /* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA; pIn1 = pInA;

@ -66,7 +66,7 @@ arm_status arm_mat_inverse_f16(
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t *pTmpA, *pTmpB; float16_t *pTmpA, *pTmpB;
float16_t in = 0.0f; /* Temporary input values */ _Float16 in = 0.0f16; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */ uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */ arm_status status; /* status of matrix inverse */
uint32_t blkCnt; uint32_t blkCnt;
@ -137,20 +137,20 @@ arm_status arm_mat_inverse_f16(
j = numRows - rowCnt; j = numRows - rowCnt;
while (j > 0U) while (j > 0U)
{ {
*pOutT1++ = 0.0f; *pOutT1++ = 0.0f16;
j--; j--;
} }
/* /*
* Writing all ones in the diagonal of the destination matrix * Writing all ones in the diagonal of the destination matrix
*/ */
*pOutT1++ = 1.0f; *pOutT1++ = 1.0f16;
/* /*
* Writing all zeroes in upper triangle of the destination matrix * Writing all zeroes in upper triangle of the destination matrix
*/ */
j = rowCnt - 1U; j = rowCnt - 1U;
while (j > 0U) while (j > 0U)
{ {
*pOutT1++ = 0.0f; *pOutT1++ = 0.0f16;
j--; j--;
} }
/* /*
@ -199,7 +199,7 @@ arm_status arm_mat_inverse_f16(
/* /*
* Check if the pivot element is zero * Check if the pivot element is zero
*/ */
if (*pInT1 == 0.0f) if (*pInT1 == 0.0f16)
{ {
/* /*
* Loop over the number rows present below * Loop over the number rows present below
@ -215,7 +215,7 @@ arm_status arm_mat_inverse_f16(
* Check if there is a non zero pivot element to * Check if there is a non zero pivot element to
* * replace in the rows below * * replace in the rows below
*/ */
if (*pInT2 != 0.0f) if (*pInT2 != 0.0f16)
{ {
f16x8_t vecA, vecB; f16x8_t vecA, vecB;
/* /*
@ -310,7 +310,7 @@ arm_status arm_mat_inverse_f16(
/* /*
* Update the status if the matrix is singular * Update the status if the matrix is singular
*/ */
if ((flag != 1U) && (in == 0.0f)) if ((flag != 1U) && (in == 0.0f16))
{ {
return ARM_MATH_SINGULAR; return ARM_MATH_SINGULAR;
} }
@ -334,7 +334,7 @@ arm_status arm_mat_inverse_f16(
pTmpA = pInT1; pTmpA = pInT1;
f16x8_t invIn = vdupq_n_f16(1.0f / in); f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
blkCnt = (numCols - l) >> 3; blkCnt = (numCols - l) >> 3;
f16x8_t vecA; f16x8_t vecA;
@ -537,12 +537,12 @@ arm_status arm_mat_inverse_f16(
*/ */
status = ARM_MATH_SUCCESS; status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f)) if ((flag != 1U) && (in == 0.0f16))
{ {
pIn = pSrc->pData; pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++) for (i = 0; i < numRows * numCols; i++)
{ {
if (pIn[i] != 0.0f) if (pIn[i] != 0.0f16)
break; break;
} }
@ -568,7 +568,7 @@ arm_status arm_mat_inverse_f16(
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t Xchg, in = 0.0f, in1; /* Temporary input values */ _Float16 Xchg, in = 0.0f16, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */ uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */ arm_status status; /* status of matrix inverse */
@ -636,18 +636,18 @@ arm_status arm_mat_inverse_f16(
j = numRows - rowCnt; j = numRows - rowCnt;
while (j > 0U) while (j > 0U)
{ {
*pOutT1++ = 0.0f; *pOutT1++ = 0.0f16;
j--; j--;
} }
/* Writing all ones in the diagonal of the destination matrix */ /* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f; *pOutT1++ = 1.0f16;
/* Writing all zeroes in upper triangle of the destination matrix */ /* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U; j = rowCnt - 1U;
while (j > 0U) while (j > 0U)
{ {
*pOutT1++ = 0.0f; *pOutT1++ = 0.0f16;
j--; j--;
} }
@ -685,7 +685,7 @@ arm_status arm_mat_inverse_f16(
k = 1U; k = 1U;
/* Check if the pivot element is zero */ /* Check if the pivot element is zero */
if (*pInT1 == 0.0f) if (*pInT1 == 0.0f16)
{ {
/* Loop over the number rows present below */ /* Loop over the number rows present below */
@ -697,7 +697,7 @@ arm_status arm_mat_inverse_f16(
/* Check if there is a non zero pivot element to /* Check if there is a non zero pivot element to
* replace in the rows below */ * replace in the rows below */
if (*pInT2 != 0.0f) if (*pInT2 != 0.0f16)
{ {
/* Loop over number of columns /* Loop over number of columns
* to the right of the pilot element */ * to the right of the pilot element */
@ -743,7 +743,7 @@ arm_status arm_mat_inverse_f16(
} }
/* Update the status if the matrix is singular */ /* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f)) if ((flag != 1U) && (in == 0.0f16))
{ {
return ARM_MATH_SINGULAR; return ARM_MATH_SINGULAR;
} }
@ -877,12 +877,12 @@ arm_status arm_mat_inverse_f16(
/* Set status as ARM_MATH_SUCCESS */ /* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS; status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f)) if ((flag != 1U) && (in == 0.0f16))
{ {
pIn = pSrc->pData; pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++) for (i = 0; i < numRows * numCols; i++)
{ {
if (pIn[i] != 0.0f) if (pIn[i] != 0.0f16)
break; break;
} }

@ -632,7 +632,7 @@ arm_status arm_mat_mult_f16(
float16_t *pInB = pSrcB->pData; /* Input data matrix pointer B */ float16_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float16_t *pOut = pDst->pData; /* Output data matrix pointer */ float16_t *pOut = pDst->pData; /* Output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */ float16_t *px; /* Temporary output data matrix pointer */
float16_t sum; /* Accumulator */ _Float16 sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
@ -671,7 +671,7 @@ arm_status arm_mat_mult_f16(
do do
{ {
/* Set the variable sum, that acts as accumulator, to zero */ /* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f; sum = 0.0f16;
/* Initialize pointer pIn1 to point to starting address of column being processed */ /* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA; pIn1 = pInA;

@ -151,7 +151,7 @@ void arm_svm_linear_predict_f16(
acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++); acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++); acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
sum += vecAddAcrossF16Mve(acc0); sum += (_Float16)vecAddAcrossF16Mve(acc0);
pSrcA += numCols * 4; pSrcA += numCols * 4;
/* /*
@ -221,7 +221,7 @@ void arm_svm_linear_predict_f16(
acc0 = vmulq_n_f16(acc0,*pDualCoef++); acc0 = vmulq_n_f16(acc0,*pDualCoef++);
acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++); acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
sum += vecAddAcrossF16Mve(acc0); sum += (_Float16)vecAddAcrossF16Mve(acc0);
pSrcA += numCols * 2; pSrcA += numCols * 2;
row -= 2; row -= 2;
@ -274,7 +274,7 @@ void arm_svm_linear_predict_f16(
/* /*
* Sum the partial parts * Sum the partial parts
*/ */
sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0); sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
} }

@ -294,7 +294,7 @@ void arm_svm_polynomial_predict_f16(
(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
vctp16q(1)); vctp16q(1));
} }
sum += vecAddAcrossF16Mve(vSum); sum += (_Float16)vecAddAcrossF16Mve(vSum);
*pResult = S->classes[STEP(sum)]; *pResult = S->classes[STEP(sum)];
@ -316,9 +316,9 @@ void arm_svm_polynomial_predict_f16(
dot=0; dot=0;
for(j=0; j < S->vectorDimension; j++) for(j=0; j < S->vectorDimension; j++)
{ {
dot = dot + in[j]* *pSupport++; dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
} }
sum += S->dualCoefficients[i] * arm_exponent_f16(S->gamma * dot + S->coef0, S->degree); sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
} }
*pResult=S->classes[STEP(sum)]; *pResult=S->classes[STEP(sum)];

@ -67,7 +67,7 @@ void arm_svm_rbf_predict_f16(
uint32_t row; uint32_t row;
uint32_t blkCnt; /* loop counters */ uint32_t blkCnt; /* loop counters */
const float16_t *pDualCoef = S->dualCoefficients; const float16_t *pDualCoef = S->dualCoefficients;
float16_t sum = S->intercept; _Float16 sum = S->intercept;
f16x8_t vSum = vdupq_n_f16(0); f16x8_t vSum = vdupq_n_f16(0);
row = numRows; row = numRows;
@ -323,19 +323,19 @@ void arm_svm_rbf_predict_f16(
int32_t * pResult) int32_t * pResult)
{ {
_Float16 sum=S->intercept; _Float16 sum=S->intercept;
_Float16 dot=0; _Float16 dot=00.f16;
uint32_t i,j; uint32_t i,j;
const float16_t *pSupport = S->supportVectors; const float16_t *pSupport = S->supportVectors;
for(i=0; i < S->nbOfSupportVectors; i++) for(i=0; i < S->nbOfSupportVectors; i++)
{ {
dot=0; dot=0.0f16;
for(j=0; j < S->vectorDimension; j++) for(j=0; j < S->vectorDimension; j++)
{ {
dot = dot + SQ(in[j] - *pSupport); dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport);
pSupport++; pSupport++;
} }
sum += S->dualCoefficients[i] * expf(-S->gamma * dot); sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf(-(_Float16)S->gamma * dot);
} }
*pResult=S->classes[STEP(sum)]; *pResult=S->classes[STEP(sum)];
} }

@ -67,7 +67,7 @@ void arm_svm_sigmoid_predict_f16(
uint32_t row; uint32_t row;
uint32_t blkCnt; /* loop counters */ uint32_t blkCnt; /* loop counters */
const float16_t *pDualCoef = S->dualCoefficients; const float16_t *pDualCoef = S->dualCoefficients;
float16_t sum = S->intercept; _Float16 sum = S->intercept;
f16x8_t vSum = vdupq_n_f16(0.0f); f16x8_t vSum = vdupq_n_f16(0.0f);
row = numRows; row = numRows;
@ -305,18 +305,18 @@ void arm_svm_sigmoid_predict_f16(
int32_t * pResult) int32_t * pResult)
{ {
_Float16 sum=S->intercept; _Float16 sum=S->intercept;
_Float16 dot=0; _Float16 dot=0.0f16;
uint32_t i,j; uint32_t i,j;
const float16_t *pSupport = S->supportVectors; const float16_t *pSupport = S->supportVectors;
for(i=0; i < S->nbOfSupportVectors; i++) for(i=0; i < S->nbOfSupportVectors; i++)
{ {
dot=0; dot=0.0f16;
for(j=0; j < S->vectorDimension; j++) for(j=0; j < S->vectorDimension; j++)
{ {
dot = dot + in[j]* *pSupport++; dot = dot + (_Float16)in[j] * (_Float16)*pSupport++;
} }
sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0); sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((_Float16)S->gamma * dot + (_Float16)S->coef0);
} }
*pResult=S->classes[STEP(sum)]; *pResult=S->classes[STEP(sum)];
} }

@ -65,7 +65,7 @@
float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize) float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
{ {
uint32_t blkCnt; uint32_t blkCnt;
float16_t accum=0.0f,p; _Float16 accum=0.0f16,p;
blkCnt = blockSize; blkCnt = blockSize;
@ -110,7 +110,7 @@ float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
{ {
const float16_t *pIn; const float16_t *pIn;
uint32_t blkCnt; uint32_t blkCnt;
float16_t accum, p; _Float16 accum, p;
pIn = pSrcA; pIn = pSrcA;
blkCnt = blockSize; blkCnt = blockSize;

@ -72,12 +72,12 @@
float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize) float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
{ {
uint32_t blkCnt; uint32_t blkCnt;
float16_t accum, pA,pB; _Float16 accum, pA,pB;
blkCnt = blockSize; blkCnt = blockSize;
accum = 0.0f; accum = 0.0f16;
f16x8_t vSum = vdupq_n_f16(0.0f); f16x8_t vSum = vdupq_n_f16(0.0f);
blkCnt = blockSize >> 3; blkCnt = blockSize >> 3;
@ -120,7 +120,7 @@ float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSr
{ {
const float16_t *pInA, *pInB; const float16_t *pInA, *pInB;
uint32_t blkCnt; uint32_t blkCnt;
float16_t accum, pA,pB; _Float16 accum, pA,pB;
pInA = pSrcA; pInA = pSrcA;
pInB = pSrcB; pInB = pSrcB;

@ -72,8 +72,8 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
float16_t maxVal; float16_t maxVal;
const float16_t *pIn; const float16_t *pIn;
int32_t blkCnt; int32_t blkCnt;
float16_t accum=0.0f16; _Float16 accum=0.0f16;
float16_t tmp; _Float16 tmp;
arm_max_no_idx_f16((float16_t *) in, blockSize, &maxVal); arm_max_no_idx_f16((float16_t *) in, blockSize, &maxVal);
@ -122,11 +122,11 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
#else #else
float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize) float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
{ {
float16_t maxVal; _Float16 maxVal;
float16_t tmp; _Float16 tmp;
const float16_t *pIn; const float16_t *pIn;
uint32_t blkCnt; uint32_t blkCnt;
float16_t accum; _Float16 accum;
pIn = in; pIn = in;
blkCnt = blockSize; blkCnt = blockSize;

@ -88,8 +88,8 @@ void arm_power_f16(
float16_t * pResult) float16_t * pResult)
{ {
uint32_t blkCnt; /* Loop counter */ uint32_t blkCnt; /* Loop counter */
float16_t sum = 0.0f; /* Temporary result storage */ _Float16 sum = 0.0f16; /* Temporary result storage */
float16_t in; /* Temporary variable to store input value */ _Float16 in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -83,8 +83,8 @@ void arm_rms_f16(
float16_t * pResult) float16_t * pResult)
{ {
uint32_t blkCnt; /* Loop counter */ uint32_t blkCnt; /* Loop counter */
float16_t sum = 0.0f; /* Temporary result storage */ _Float16 sum = 0.0f16; /* Temporary result storage */
float16_t in; /* Temporary variable to store input value */ _Float16 in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -342,7 +342,7 @@ void assert_close_error(unsigned long nb,float64_t &ref, float64_t &val, double
if (abs(val - ref) > (absthreshold + relthreshold * abs(ref))) if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
{ {
char details[200]; char details[200];
sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold); sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
throw (Error(CLOSE_ERROR,nb,details)); throw (Error(CLOSE_ERROR,nb,details));
} }
}; };
@ -385,7 +385,7 @@ void assert_close_error(unsigned long nb,float32_t &ref, float32_t &val, double
if (abs(val - ref) > (absthreshold + relthreshold * abs(ref))) if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
{ {
char details[200]; char details[200];
sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold); sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
throw (Error(CLOSE_ERROR,nb,details)); throw (Error(CLOSE_ERROR,nb,details));
} }
}; };
@ -429,7 +429,7 @@ void assert_close_error(unsigned long nb,float16_t &ref, float16_t &val, double
if (abs((float)val - (float)ref) > (absthreshold + relthreshold * abs((float)ref))) if (abs((float)val - (float)ref) > (absthreshold + relthreshold * abs((float)ref)))
{ {
char details[200]; char details[200];
sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold); sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
throw (Error(CLOSE_ERROR,nb,details)); throw (Error(CLOSE_ERROR,nb,details));
} }
}; };

@ -11,7 +11,7 @@ NBVECTORS = [4,10,16]
VECDIM = [12,14,20] VECDIM = [12,14,20]
def genWsum(config,nb): def genWsum(config,f,nb):
DIM=50 DIM=50
inputs=[] inputs=[]
weights=[] weights=[]
@ -23,15 +23,15 @@ def genWsum(config,nb):
inputs += list(va) inputs += list(va)
weights += list(vb) weights += list(vb)
nbiters = Tools.loopnb(0,Tools.TAILONLY) nbiters = Tools.loopnb(f,Tools.TAILONLY)
e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters])
output.append(e) output.append(e)
nbiters = Tools.loopnb(0,Tools.BODYONLY) nbiters = Tools.loopnb(f,Tools.BODYONLY)
e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters])
output.append(e) output.append(e)
nbiters = Tools.loopnb(0,Tools.BODYANDTAIL) nbiters = Tools.loopnb(f,Tools.BODYANDTAIL)
e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters])
output.append(e) output.append(e)
@ -92,7 +92,7 @@ def writeTestsF32(config):
# This is for benchmarking the weighted sum and we use only one test pattern # This is for benchmarking the weighted sum and we use only one test pattern
genWsum(config,6) genWsum(config,Tools.F32,6)
def writeTestsF16(config): def writeTestsF16(config):
@ -105,7 +105,7 @@ def writeTestsF16(config):
config.writeInput(11,va,"Samples") config.writeInput(11,va,"Samples")
# This is for benchmarking the weighted sum and we use only one test pattern # This is for benchmarking the weighted sum and we use only one test pattern
genWsum(config,6) genWsum(config,Tools.F16,6)
def writeTestsQ31(config): def writeTestsQ31(config):
NBSAMPLES=256 NBSAMPLES=256

@ -1,102 +1,102 @@
H H
50 50
// 0.549118 // 0.145259
0x3865 0x30a6
// 0.782574 // 0.431775
0x3a43 0x36e9
// 0.537389 // 0.925328
0x384d 0x3b67
// 0.576597 // 0.170239
0x389d 0x3173
// 0.849496 // 0.385773
0x3acc 0x362c
// 0.764850 // 0.998601
0x3a1e 0x3bfd
// 0.539151 // 0.108755
0x3850 0x2ef6
// 0.459163 // 0.479921
0x3759 0x37ae
// 0.352476 // 0.592423
0x35a4 0x38bd
// 0.333343 // 0.431234
0x3555 0x36e6
// 0.362662 // 0.413843
0x35cd 0x369f
// 0.343928 // 0.959014
0x3581 0x3bac
// 0.046084 // 0.665528
0x29e6 0x3953
// 0.583088 // 0.952834
0x38aa 0x3b9f
// 0.464789 // 0.562821
0x3770 0x3881
// 0.659567 // 0.609152
0x3947 0x38e0
// 0.177360 // 0.890135
0x31ad 0x3b1f
// 0.465121 // 0.002624
0x3771 0x1960
// 0.849010 // 0.666853
0x3acb 0x3956
// 0.748383 // 0.813988
0x39fd 0x3a83
// 0.493254 // 0.170387
0x37e4 0x3174
// 0.023579 // 0.264551
0x2609 0x343c
// 0.354457 // 0.699111
0x35ac 0x3998
// 0.880690 // 0.738154
0x3b0c 0x39e8
// 0.029813 // 0.475053
0x27a2 0x379a
// 0.338038 // 0.439282
0x3569 0x3707
// 0.724386 // 0.699736
0x39cc 0x3999
// 0.341454 // 0.488956
0x3577 0x37d3
// 0.676073 // 0.115565
0x3969 0x2f65
// 0.041176 // 0.115452
0x2945 0x2f64
// 0.214019 // 0.586420
0x32d9 0x38b1
// 0.322539 // 0.327648
0x3529 0x353e
// 0.499026 // 0.442207
0x37fc 0x3713
// 0.616038 // 0.752186
0x38ee 0x3a04
// 0.688882 // 0.476363
0x3983 0x379f
// 0.066439 // 0.491094
0x2c41 0x37dc
// 0.094981 // 0.552734
0x2e14 0x386c
// 0.873092 // 0.674894
0x3afc 0x3966
// 0.642707 // 0.705545
0x3924 0x39a5
// 0.537527 // 0.966659
0x384d 0x3bbc
// 0.370953 // 0.528146
0x35ef 0x383a
// 0.450042 // 0.633660
0x3733 0x3912
// 0.434679 // 0.049129
0x36f4 0x2a4a
// 0.676789 // 0.458540
0x396a 0x3756
// 0.303649 // 0.331162
0x34dc 0x354c
// 0.076930 // 0.399161
0x2cec 0x3663
// 0.871305 // 0.015650
0x3af8 0x2402
// 0.855053 // 0.271388
0x3ad7 0x3458
// 0.498935 // 0.912530
0x37fc 0x3b4d
// 0.854980 // 0.655578
0x3ad7 0x393f

@ -1,8 +1,8 @@
H H
3 3
// 0.544937 // 0.445338
0x385c 0x3720
// 0.581009 // 0.569713
0x38a6 0x388f
// 0.504352 // 0.564358
0x3809 0x3884

@ -1,102 +1,102 @@
H H
50 50
// 0.873606 // 0.305271
0x3afd 0x34e2
// 0.005892 // 0.032698
0x1e09 0x282f
// 0.669526 // 0.266409
0x395b 0x3443
// 0.198146 // 0.617540
0x3257 0x38f1
// 0.032904 // 0.152922
0x2836 0x30e5
// 0.829321 // 0.774995
0x3aa2 0x3a33
// 0.044536 // 0.848550
0x29b3 0x3aca
// 0.842708 // 0.523429
0x3abe 0x3830
// 0.981319 // 0.116035
0x3bda 0x2f6d
// 0.274000 // 0.649223
0x3462 0x3932
// 0.509194 // 0.246758
0x3813 0x33e5
// 0.994965 // 0.826661
0x3bf6 0x3a9d
// 0.572766 // 0.967839
0x3895 0x3bbe
// 0.557596 // 0.283239
0x3876 0x3488
// 0.306704 // 0.847203
0x34e8 0x3ac7
// 0.983822 // 0.720735
0x3bdf 0x39c4
// 0.145555 // 0.477014
0x30a8 0x37a2
// 0.539088 // 0.633504
0x3850 0x3911
// 0.527526 // 0.596646
0x3838 0x38c6
// 0.501409 // 0.832505
0x3803 0x3aa9
// 0.249480 // 0.135769
0x33fc 0x3058
// 0.273503 // 0.607454
0x3460 0x38dc
// 0.567660 // 0.914942
0x388b 0x3b52
// 0.741561 // 0.474766
0x39ef 0x3799
// 0.458026 // 0.781271
0x3754 0x3a40
// 0.239162 // 0.084347
0x33a7 0x2d66
// 0.731739 // 0.038805
0x39db 0x28f8
// 0.662458 // 0.055045
0x394d 0x2b0c
// 0.024779 // 0.334330
0x2658 0x3559
// 0.086811 // 0.331283
0x2d8e 0x354d
// 0.660991 // 0.595124
0x394a 0x38c3
// 0.834424 // 0.443636
0x3aad 0x3719
// 0.163672 // 0.612722
0x313d 0x38e7
// 0.120432 // 0.152437
0x2fb5 0x30e1
// 0.593488 // 0.311869
0x38bf 0x34fd
// 0.973784 // 0.177173
0x3bca 0x31ab
// 0.167473 // 0.540216
0x315c 0x3852
// 0.858171 // 0.941623
0x3ade 0x3b88
// 0.986637 // 0.666747
0x3be5 0x3955
// 0.223556 // 0.002608
0x3327 0x1958
// 0.382377 // 0.092911
0x361e 0x2df2
// 0.757667 // 0.129786
0x3a10 0x3027
// 0.032219 // 0.710137
0x2820 0x39ae
// 0.574024 // 0.488728
0x3898 0x37d2
// 0.125286 // 0.478723
0x3002 0x37a9
// 0.946997 // 0.627602
0x3b93 0x3905
// 0.942443 // 0.001795
0x3b8a 0x175b
// 0.152563 // 0.299980
0x30e2 0x34cd
// 0.240567 // 0.636209
0x33b3 0x3917
// 0.160261 // 0.102662
0x3121 0x2e92

@ -10,8 +10,8 @@ Reference patterns are generated with
a double precision computation. a double precision computation.
*/ */
#define REL_ERROR (3.0e-3) #define REL_ERROR (5.0e-2)
#define ABS_ERROR (3.5e-2) #define ABS_ERROR (1.0e-1)
void BIQUADF16::test_biquad_cascade_df1_ref() void BIQUADF16::test_biquad_cascade_df1_ref()
{ {

@ -3,7 +3,7 @@
#include "Error.h" #include "Error.h"
#include "Test.h" #include "Test.h"
#define REL_ERROR (2e-3) #define REL_ERROR (5e-3)
#define REL_JS_ERROR (3e-2) #define REL_JS_ERROR (3e-2)

@ -7,8 +7,8 @@
#define SNR_THRESHOLD 120 #define SNR_THRESHOLD 120
#define REL_ERROR (1.0e-5) #define REL_ERROR (1.0e-5)
#define ABS_WEIGHTEDSUM_ERROR (5.0e-2) #define ABS_WEIGHTEDSUM_ERROR (1.0e-1)
#define REL_WEIGHTEDSUM_ERROR (5.0e-2) #define REL_WEIGHTEDSUM_ERROR (5.0e-3)
#define ABS_ERROR_F32 (1.0e-3) #define ABS_ERROR_F32 (1.0e-3)
#define REL_ERROR_F32 (1.0e-3) #define REL_ERROR_F32 (1.0e-3)
@ -28,7 +28,7 @@ void SupportTestsF16::test_weighted_sum_f16()
*outp=arm_weighted_sum_f16(inp, coefsp,this->nbSamples); *outp=arm_weighted_sum_f16(inp, coefsp,this->nbSamples);
ASSERT_CLOSE_ERROR(*outp,refp[this->offset],ABS_WEIGHTEDSUM_ERROR,REL_WEIGHTEDSUM_ERROR); ASSERT_CLOSE_ERROR(*outp,refp[this->offset],ABS_WEIGHTEDSUM_ERROR,REL_WEIGHTEDSUM_ERROR);
ASSERT_EMPTY_TAIL(output); ASSERT_EMPTY_TAIL(output);

@ -754,7 +754,7 @@ group Root {
} }
Functions { Functions {
Cascaded BiQuad Filter DF2T_f64:test_biquad_cascade_df2T_f64 Cascaded BiQuad Filter DF2T:test_biquad_cascade_df2T_f64
} -> PARAM1_ID } -> PARAM1_ID
} }
} }
@ -1570,8 +1570,8 @@ group Root {
Complex FFT:test_cfft_f32 -> CFFT_PARAM_ID Complex FFT:test_cfft_f32 -> CFFT_PARAM_ID
Real FFT:test_rfft_f32 -> RFFT_PARAM_ID Real FFT:test_rfft_f32 -> RFFT_PARAM_ID
DCT4:test_dct4_f32 -> DCT_PARAM_ID DCT4:test_dct4_f32 -> DCT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID Scalar Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID Scalar Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
} }
} }
@ -1620,8 +1620,8 @@ group Root {
Complex FFT:test_cfft_q31 -> CFFT_PARAM_ID Complex FFT:test_cfft_q31 -> CFFT_PARAM_ID
Real FFT:test_rfft_q31 -> RFFT_PARAM_ID Real FFT:test_rfft_q31 -> RFFT_PARAM_ID
DCT4:test_dct4_q31 -> DCT_PARAM_ID DCT4:test_dct4_q31 -> DCT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID Scalar Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID Scalar Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
} }
} }
@ -1670,8 +1670,8 @@ group Root {
Complex FFT:test_cfft_q15 -> CFFT_PARAM_ID Complex FFT:test_cfft_q15 -> CFFT_PARAM_ID
Real FFT:test_rfft_q15 -> RFFT_PARAM_ID Real FFT:test_rfft_q15 -> RFFT_PARAM_ID
DCT4:test_dct4_q15 -> DCT_PARAM_ID DCT4:test_dct4_q15 -> DCT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID Scalar Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID Scalar Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
} }
} }
} }

@ -584,8 +584,8 @@ group Root {
Functions { Functions {
Complex FFT:test_cfft_f16 -> CFFT_PARAM_ID Complex FFT:test_cfft_f16 -> CFFT_PARAM_ID
Real FFT:test_rfft_f16 -> RFFT_PARAM_ID Real FFT:test_rfft_f16 -> RFFT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID Scalar Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID Scalar Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
} }
} }

Loading…
Cancel
Save