CMSIS-DSP: Improved f16 scalar code.

pull/19/head
Christophe Favergeon 5 years ago
parent 4b4d5322c8
commit 078c0b282d

@ -72,7 +72,7 @@ __STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
in = vaddq_f16(tmpVec, in);
tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in);
in = vaddq_f16(tmpVec, in);
acc = vgetq_lane_f16(in, 0) + vgetq_lane_f16(in, 4);
acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4);
return acc;
}

@ -67,7 +67,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
const float16_t *pIn = in;
float16_t result;
f16x8_t vsigma;
float16_t tmp;
_Float16 tmp;
f16x8_t vacc1, vacc2;
uint32_t index;
float16_t logclassPriors[S->numberOfClasses];
@ -81,15 +81,15 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
pIn = in;
vacc1 = vdupq_n_f16(0);
vacc2 = vdupq_n_f16(0);
vacc1 = vdupq_n_f16(0.0f16);
vacc2 = vdupq_n_f16(0.0f16);
uint32_t blkCnt =S->vectorDimension >> 3;
while (blkCnt > 0U) {
f16x8_t vinvSigma, vtmp;
vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)));
vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));
vinvSigma = vrecip_medprec_f16(vsigma);
@ -112,7 +112,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
vacc1 =
vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)), p0);
vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);
vinvSigma = vrecip_medprec_f16(vsigma);
@ -126,8 +126,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
pSigma += blkCnt;
}
tmp = -0.5f * vecAddAcrossF16Mve(vacc1);
tmp -= 0.5f * vecAddAcrossF16Mve(vacc2);
tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);
*buffer = tmp + *pLogPrior++;
buffer++;
@ -175,13 +175,13 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
pIn = in;
tmp = 0.0;
acc1 = 0.0f;
acc2 = 0.0f;
tmp = 0.0f16;
acc1 = 0.0f16;
acc2 = 0.0f16;
for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
{
sigma = *pSigma + S->epsilon;
acc1 += logf(2.0f * PI_F * sigma);
acc1 += logf(2.0f16 * (_Float16)PI_F * sigma);
acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
pIn++;
@ -189,8 +189,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
pSigma++;
}
tmp = -0.5f * acc1;
tmp -= 0.5f * acc2;
tmp = -0.5f16 * acc1;
tmp -= 0.5f16 * acc2;
*buffer = tmp + logf(*pPrior++);

@ -172,7 +172,7 @@ void arm_cmplx_mag_f16(
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
float16_t real, imag; /* Temporary variables to hold input values */
_Float16 real, imag; /* Temporary variables to hold input values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -107,7 +107,7 @@ void arm_cmplx_mag_squared_f16(
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float16_t real, imag; /* Temporary input variables */
_Float16 real, imag; /* Temporary input variables */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -103,7 +103,7 @@ void arm_cmplx_mult_cmplx_f16(
pDst += 8;
}
float16_t a, b, c, d; /* Temporary variables to store real and imaginary values */
_Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */
/* Tail */
blkCnt = (blockSize & 7) >> 1;
while (blkCnt > 0)
@ -134,7 +134,7 @@ void arm_cmplx_mult_cmplx_f16(
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float16_t a, b, c, d; /* Temporary variables to store real and imaginary values */
_Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -72,7 +72,7 @@
float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t accumDiff = 0.0f, accumSum = 0.0f;
_Float16 accumDiff = 0.0f, accumSum = 0.0f;
uint32_t blkCnt;
f16x8_t a, b, c, accumDiffV, accumSumV;
@ -125,14 +125,14 @@ float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, u
float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t accumDiff=0.0f, accumSum=0.0f, tmpA, tmpB;
_Float16 accumDiff=0.0f16, accumSum=0.0f16, tmpA, tmpB;
while(blockSize > 0)
{
tmpA = *pA++;
tmpB = *pB++;
accumDiff += fabsf(tmpA - tmpB);
accumSum += fabsf(tmpA + tmpB);
accumDiff += (_Float16)fabsf(tmpA - tmpB);
accumSum += (_Float16)fabsf(tmpA + tmpB);
blockSize --;
}
/*

@ -70,7 +70,7 @@
float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t accum = 0.0f;
_Float16 accum = 0.0f16;
uint32_t blkCnt;
f16x8_t a, b, c, accumV;
@ -141,7 +141,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
#else
float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t accum=0.0f, tmpA, tmpB,diff,sum;
_Float16 accum=0.0f, tmpA, tmpB,diff,sum;
while(blockSize > 0)
{
@ -150,7 +150,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
diff = fabsf(tmpA - tmpB);
sum = fabsf(tmpA) + fabsf(tmpB);
if ((tmpA != 0.0f) || (tmpB != 0.0f))
if ((tmpA != 0.0f16) || (tmpB != 0.0f16))
{
accum += (diff / sum);
}

@ -67,7 +67,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
uint32_t blkCnt; /* loop counters */
f16x8_t vecA, vecB;
f16x8_t vecDiff = vdupq_n_f16(0.0);
float16_t maxValue = 0.0;
float16_t maxValue = 0.0f16;
blkCnt = blockSize >> 3;
@ -111,7 +111,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
#else
float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t diff=0.0f, maxVal,tmpA, tmpB;
_Float16 diff=0.0f, maxVal,tmpA, tmpB;
tmpA = *pA++;
tmpB = *pB++;

@ -102,14 +102,14 @@ float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, ui
#else
float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t accum,tmpA, tmpB;
_Float16 accum,tmpA, tmpB;
accum = 0.0f;
accum = 0.0f16;
while(blockSize > 0)
{
tmpA = *pA++;
tmpB = *pB++;
accum += fabsf(tmpA - tmpB);
accum += (_Float16)fabsf(tmpA - tmpB);
blockSize --;
}

@ -105,16 +105,17 @@ float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, ui
#else
float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t accum=0.0f,tmp;
_Float16 accum=0.0f,tmp;
float16_t result;
while(blockSize > 0)
{
tmp = *pA++ - *pB++;
tmp = (_Float16)*pA++ - (_Float16)*pB++;
accum += SQ(tmp);
blockSize --;
}
arm_sqrt_f16(accum,&tmp);
return(tmp);
arm_sqrt_f16(accum,&result);
return(result);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

@ -145,16 +145,17 @@ float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB
float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
{
float16_t left, right,sum, result, tmp;
_Float16 left, right,sum, tmp;
float16_t result;
uint32_t i;
left = 0.0f;
right = 0.0f;
left = 0.0f16;
right = 0.0f16;
for(i=0; i < blockSize; i++)
{
tmp = (pA[i] + pB[i]) / 2.0f;
left += rel_entr(pA[i], tmp);
right += rel_entr(pB[i], tmp);
tmp = ((_Float16)pA[i] + (_Float16)pB[i]) / 2.0f16;
left += (_Float16)rel_entr(pA[i], tmp);
right += (_Float16)rel_entr(pB[i], tmp);
}

@ -111,13 +111,13 @@ float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, in
float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize)
{
float16_t sum;
_Float16 sum;
uint32_t i;
sum = 0.0f;
for(i=0; i < blockSize; i++)
{
sum += powf(fabsf(pA[i] - pB[i]),order);
sum += (_Float16)powf(fabsf(pA[i] - pB[i]),order);
}

@ -64,7 +64,7 @@ void arm_biquad_cascade_df1_f16(
float16_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
float16_t X0, X1, X2, X3; /* temporary input */
float16_t X4, X5, X6, X7; /* temporary input */
float16_t lastX, lastY; /* X,Y history for tail handling */
_Float16 lastX, lastY; /* X,Y history for tail handling */
f16x8_t coeffs;
f16x8_t accVec; /* accumultor vector */
uint32_t sample, stage = S->numStages; /* loop counters */
@ -320,10 +320,10 @@ void arm_biquad_cascade_df1_f16(
float16_t *pOut = pDst; /* Destination pointer */
float16_t *pState = S->pState; /* pState pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t acc; /* Accumulator */
float16_t b0, b1, b2, a1, a2; /* Filter coefficients */
float16_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
float16_t Xn; /* Temporary input */
_Float16 acc; /* Accumulator */
_Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
_Float16 Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
_Float16 Xn; /* Temporary input */
uint32_t sample, stage = S->numStages; /* Loop counters */
do

@ -197,10 +197,10 @@ void arm_biquad_cascade_df2T_f16(
float16_t *pOut = pDst; /* Destination pointer */
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t acc1; /* Accumulator */
float16_t b0, b1, b2, a1, a2; /* Filter coefficients */
float16_t Xn1; /* Temporary input */
float16_t d1, d2; /* State variables */
_Float16 acc1; /* Accumulator */
_Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
_Float16 Xn1; /* Temporary input */
_Float16 d1, d2; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */
do

@ -198,10 +198,10 @@ void arm_biquad_cascade_stereo_df2T_f16(
float16_t *pOut = pDst; /* Destination pointer */
float16_t *pState = S->pState; /* State pointer */
const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float16_t acc1a, acc1b; /* Accumulator */
float16_t b0, b1, b2, a1, a2; /* Filter coefficients */
float16_t Xn1a, Xn1b; /* Temporary input */
float16_t d1a, d2a, d1b, d2b; /* State variables */
_Float16 acc1a, acc1b; /* Accumulator */
_Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
_Float16 Xn1a, Xn1b; /* Temporary input */
_Float16 d1a, d2a, d1b, d2b; /* State variables */
uint32_t sample, stage = S->numStages; /* Loop counters */
do

@ -370,8 +370,8 @@ void arm_correlate_f16(
for (i = 0U; i <= block1 - 2; i += 2)
{
uint32_t count = i + 1;
float16_t acc0;
float16_t acc1;
_Float16 acc0;
_Float16 acc1;
/*
* compute 2 accumulators per loop
* size is incrementing for second accumulator
@ -390,7 +390,7 @@ void arm_correlate_f16(
for (; i < block1; i++)
{
uint32_t count = i + 1;
float16_t acc;
_Float16 acc;
pX = pA;
pY = pB;
@ -403,10 +403,10 @@ void arm_correlate_f16(
for (i = 0U; i <= block2 - 4; i += 4)
{
float16_t acc0;
float16_t acc1;
float16_t acc2;
float16_t acc3;
_Float16 acc0;
_Float16 acc1;
_Float16 acc2;
_Float16 acc3;
pX = pA;
pY = pB;
@ -430,8 +430,8 @@ void arm_correlate_f16(
for (; i <= block2 - 2; i += 2)
{
float16_t acc0;
float16_t acc1;
_Float16 acc0;
_Float16 acc1;
pX = pA;
pY = pB;
@ -451,7 +451,7 @@ void arm_correlate_f16(
if (block2 & 1)
{
float16_t acc;
_Float16 acc;
pX = pA;
pY = pB;
@ -466,8 +466,8 @@ void arm_correlate_f16(
{
uint32_t count = (i + 1);
float16_t acc0;
float16_t acc1;
_Float16 acc0;
_Float16 acc1;
pX = pA;
pY = pB;
@ -488,7 +488,7 @@ void arm_correlate_f16(
for (; i >= 0; i--)
{
uint32_t count = (i + 1);
float16_t acc;
_Float16 acc;
pX = pA;
pY = pB;
@ -517,15 +517,15 @@ void arm_correlate_f16(
const float16_t *px; /* Intermediate inputA pointer */
const float16_t *py; /* Intermediate inputB pointer */
const float16_t *pSrc1;
float16_t sum;
_Float16 sum;
uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
uint32_t j, k, count, blkCnt; /* Loop counters */
uint32_t outBlockSize; /* Loop counter */
int32_t inc = 1; /* Destination address modifier */
#if defined (ARM_MATH_LOOPUNROLL)
float16_t acc0, acc1, acc2, acc3,c0; /* Accumulators */
float16_t x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */
_Float16 acc0, acc1, acc2, acc3,c0; /* Accumulators */
_Float16 x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */
#endif
/* The algorithm implementation is based on the lengths of the inputs. */
@ -625,7 +625,7 @@ void arm_correlate_f16(
while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
sum = 0.0f16;
#if defined (ARM_MATH_LOOPUNROLL)
@ -725,10 +725,10 @@ void arm_correlate_f16(
while (blkCnt > 0U)
{
/* Set all accumulators to zero */
acc0 = 0.0f;
acc1 = 0.0f;
acc2 = 0.0f;
acc3 = 0.0f;
acc0 = 0.0f16;
acc1 = 0.0f16;
acc2 = 0.0f16;
acc3 = 0.0f16;
/* read x[0], x[1], x[2] samples */
@ -873,7 +873,7 @@ void arm_correlate_f16(
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
sum = 0.0f16;
#if defined (ARM_MATH_LOOPUNROLL)
@ -939,7 +939,7 @@ void arm_correlate_f16(
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
sum = 0.0f16;
/* Loop over srcBLen */
k = srcBLen;
@ -1000,7 +1000,7 @@ void arm_correlate_f16(
while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.0f;
sum = 0.0f16;
#if defined (ARM_MATH_LOOPUNROLL)
@ -1069,7 +1069,7 @@ void arm_correlate_f16(
const float16_t *pIn1 = pSrcA; /* inputA pointer */
const float16_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
float16_t sum; /* Accumulator */
_Float16 sum; /* Accumulator */
uint32_t i = 0U, j; /* Loop counters */
uint32_t inv = 0U; /* Reverse order flag */
uint32_t tot = 0U; /* Length */
@ -1127,7 +1127,7 @@ void arm_correlate_f16(
for (i = 0U; i <= tot; i++)
{
/* Initialize sum with zero to carry out MAC operations */
sum = 0.0f;
sum = 0.0f16;
/* Loop to perform MAC operations according to convolution equation */
for (j = 0U; j <= i; j++)

@ -711,13 +711,13 @@ arm_status arm_mat_cmplx_mult_f16(
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
float16_t sumReal, sumImag; /* Accumulator */
float16_t a1, b1, c1, d1;
_Float16 sumReal, sumImag; /* Accumulator */
_Float16 a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
float16_t a0, b0, c0, d0;
_Float16 a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
@ -755,8 +755,8 @@ arm_status arm_mat_cmplx_mult_f16(
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0f;
sumImag = 0.0f;
sumReal = 0.0f16;
sumImag = 0.0f16;
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;

@ -66,7 +66,7 @@ arm_status arm_mat_inverse_f16(
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t *pTmpA, *pTmpB;
float16_t in = 0.0f; /* Temporary input values */
_Float16 in = 0.0f16; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
uint32_t blkCnt;
@ -137,20 +137,20 @@ arm_status arm_mat_inverse_f16(
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0f16;
j--;
}
/*
* Writing all ones in the diagonal of the destination matrix
*/
*pOutT1++ = 1.0f;
*pOutT1++ = 1.0f16;
/*
* Writing all zeroes in upper triangle of the destination matrix
*/
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0f16;
j--;
}
/*
@ -199,7 +199,7 @@ arm_status arm_mat_inverse_f16(
/*
* Check if the pivot element is zero
*/
if (*pInT1 == 0.0f)
if (*pInT1 == 0.0f16)
{
/*
* Loop over the number rows present below
@ -215,7 +215,7 @@ arm_status arm_mat_inverse_f16(
* Check if there is a non zero pivot element to
* * replace in the rows below
*/
if (*pInT2 != 0.0f)
if (*pInT2 != 0.0f16)
{
f16x8_t vecA, vecB;
/*
@ -310,7 +310,7 @@ arm_status arm_mat_inverse_f16(
/*
* Update the status if the matrix is singular
*/
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0f16))
{
return ARM_MATH_SINGULAR;
}
@ -334,7 +334,7 @@ arm_status arm_mat_inverse_f16(
pTmpA = pInT1;
f16x8_t invIn = vdupq_n_f16(1.0f / in);
f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
blkCnt = (numCols - l) >> 3;
f16x8_t vecA;
@ -537,12 +537,12 @@ arm_status arm_mat_inverse_f16(
*/
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0f16))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
if (pIn[i] != 0.0f16)
break;
}
@ -568,7 +568,7 @@ arm_status arm_mat_inverse_f16(
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t Xchg, in = 0.0f, in1; /* Temporary input values */
_Float16 Xchg, in = 0.0f16, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
@ -636,18 +636,18 @@ arm_status arm_mat_inverse_f16(
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0f16;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
*pOutT1++ = 1.0f16;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0f16;
j--;
}
@ -685,7 +685,7 @@ arm_status arm_mat_inverse_f16(
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0f)
if (*pInT1 == 0.0f16)
{
/* Loop over the number rows present below */
@ -697,7 +697,7 @@ arm_status arm_mat_inverse_f16(
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0f)
if (*pInT2 != 0.0f16)
{
/* Loop over number of columns
* to the right of the pilot element */
@ -743,7 +743,7 @@ arm_status arm_mat_inverse_f16(
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0f16))
{
return ARM_MATH_SINGULAR;
}
@ -877,12 +877,12 @@ arm_status arm_mat_inverse_f16(
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0f16))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
if (pIn[i] != 0.0f16)
break;
}

@ -632,7 +632,7 @@ arm_status arm_mat_mult_f16(
float16_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float16_t *pOut = pDst->pData; /* Output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
float16_t sum; /* Accumulator */
_Float16 sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
@ -671,7 +671,7 @@ arm_status arm_mat_mult_f16(
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
sum = 0.0f16;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;

@ -151,7 +151,7 @@ void arm_svm_linear_predict_f16(
acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
sum += vecAddAcrossF16Mve(acc0);
sum += (_Float16)vecAddAcrossF16Mve(acc0);
pSrcA += numCols * 4;
/*
@ -221,7 +221,7 @@ void arm_svm_linear_predict_f16(
acc0 = vmulq_n_f16(acc0,*pDualCoef++);
acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
sum += vecAddAcrossF16Mve(acc0);
sum += (_Float16)vecAddAcrossF16Mve(acc0);
pSrcA += numCols * 2;
row -= 2;
@ -274,7 +274,7 @@ void arm_svm_linear_predict_f16(
/*
* Sum the partial parts
*/
sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
}

@ -294,7 +294,7 @@ void arm_svm_polynomial_predict_f16(
(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
vctp16q(1));
}
sum += vecAddAcrossF16Mve(vSum);
sum += (_Float16)vecAddAcrossF16Mve(vSum);
*pResult = S->classes[STEP(sum)];
@ -316,9 +316,9 @@ void arm_svm_polynomial_predict_f16(
dot=0;
for(j=0; j < S->vectorDimension; j++)
{
dot = dot + in[j]* *pSupport++;
dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
}
sum += S->dualCoefficients[i] * arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
}
*pResult=S->classes[STEP(sum)];

@ -67,7 +67,7 @@ void arm_svm_rbf_predict_f16(
uint32_t row;
uint32_t blkCnt; /* loop counters */
const float16_t *pDualCoef = S->dualCoefficients;
float16_t sum = S->intercept;
_Float16 sum = S->intercept;
f16x8_t vSum = vdupq_n_f16(0);
row = numRows;
@ -323,19 +323,19 @@ void arm_svm_rbf_predict_f16(
int32_t * pResult)
{
_Float16 sum=S->intercept;
_Float16 dot=0;
_Float16 dot=00.f16;
uint32_t i,j;
const float16_t *pSupport = S->supportVectors;
for(i=0; i < S->nbOfSupportVectors; i++)
{
dot=0;
dot=0.0f16;
for(j=0; j < S->vectorDimension; j++)
{
dot = dot + SQ(in[j] - *pSupport);
dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport);
pSupport++;
}
sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf(-(_Float16)S->gamma * dot);
}
*pResult=S->classes[STEP(sum)];
}

@ -67,7 +67,7 @@ void arm_svm_sigmoid_predict_f16(
uint32_t row;
uint32_t blkCnt; /* loop counters */
const float16_t *pDualCoef = S->dualCoefficients;
float16_t sum = S->intercept;
_Float16 sum = S->intercept;
f16x8_t vSum = vdupq_n_f16(0.0f);
row = numRows;
@ -305,18 +305,18 @@ void arm_svm_sigmoid_predict_f16(
int32_t * pResult)
{
_Float16 sum=S->intercept;
_Float16 dot=0;
_Float16 dot=0.0f16;
uint32_t i,j;
const float16_t *pSupport = S->supportVectors;
for(i=0; i < S->nbOfSupportVectors; i++)
{
dot=0;
dot=0.0f16;
for(j=0; j < S->vectorDimension; j++)
{
dot = dot + in[j]* *pSupport++;
dot = dot + (_Float16)in[j] * (_Float16)*pSupport++;
}
sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((_Float16)S->gamma * dot + (_Float16)S->coef0);
}
*pResult=S->classes[STEP(sum)];
}

@ -65,7 +65,7 @@
float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
{
uint32_t blkCnt;
float16_t accum=0.0f,p;
_Float16 accum=0.0f16,p;
blkCnt = blockSize;
@ -110,7 +110,7 @@ float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
{
const float16_t *pIn;
uint32_t blkCnt;
float16_t accum, p;
_Float16 accum, p;
pIn = pSrcA;
blkCnt = blockSize;

@ -72,12 +72,12 @@
float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
{
uint32_t blkCnt;
float16_t accum, pA,pB;
_Float16 accum, pA,pB;
blkCnt = blockSize;
accum = 0.0f;
accum = 0.0f16;
f16x8_t vSum = vdupq_n_f16(0.0f);
blkCnt = blockSize >> 3;
@ -120,7 +120,7 @@ float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSr
{
const float16_t *pInA, *pInB;
uint32_t blkCnt;
float16_t accum, pA,pB;
_Float16 accum, pA,pB;
pInA = pSrcA;
pInB = pSrcB;

@ -72,8 +72,8 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
float16_t maxVal;
const float16_t *pIn;
int32_t blkCnt;
float16_t accum=0.0f16;
float16_t tmp;
_Float16 accum=0.0f16;
_Float16 tmp;
arm_max_no_idx_f16((float16_t *) in, blockSize, &maxVal);
@ -122,11 +122,11 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
#else
float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
{
float16_t maxVal;
float16_t tmp;
_Float16 maxVal;
_Float16 tmp;
const float16_t *pIn;
uint32_t blkCnt;
float16_t accum;
_Float16 accum;
pIn = in;
blkCnt = blockSize;

@ -88,8 +88,8 @@ void arm_power_f16(
float16_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float16_t sum = 0.0f; /* Temporary result storage */
float16_t in; /* Temporary variable to store input value */
_Float16 sum = 0.0f16; /* Temporary result storage */
_Float16 in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -83,8 +83,8 @@ void arm_rms_f16(
float16_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float16_t sum = 0.0f; /* Temporary result storage */
float16_t in; /* Temporary variable to store input value */
_Float16 sum = 0.0f16; /* Temporary result storage */
_Float16 in; /* Temporary variable to store input value */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

@ -342,7 +342,7 @@ void assert_close_error(unsigned long nb,float64_t &ref, float64_t &val, double
if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
{
char details[200];
sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
throw (Error(CLOSE_ERROR,nb,details));
}
};
@ -385,7 +385,7 @@ void assert_close_error(unsigned long nb,float32_t &ref, float32_t &val, double
if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
{
char details[200];
sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
throw (Error(CLOSE_ERROR,nb,details));
}
};
@ -429,7 +429,7 @@ void assert_close_error(unsigned long nb,float16_t &ref, float16_t &val, double
if (abs((float)val - (float)ref) > (absthreshold + relthreshold * abs((float)ref)))
{
char details[200];
sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
throw (Error(CLOSE_ERROR,nb,details));
}
};

@ -11,7 +11,7 @@ NBVECTORS = [4,10,16]
VECDIM = [12,14,20]
def genWsum(config,nb):
def genWsum(config,f,nb):
DIM=50
inputs=[]
weights=[]
@ -23,15 +23,15 @@ def genWsum(config,nb):
inputs += list(va)
weights += list(vb)
nbiters = Tools.loopnb(0,Tools.TAILONLY)
nbiters = Tools.loopnb(f,Tools.TAILONLY)
e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters])
output.append(e)
nbiters = Tools.loopnb(0,Tools.BODYONLY)
nbiters = Tools.loopnb(f,Tools.BODYONLY)
e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters])
output.append(e)
nbiters = Tools.loopnb(0,Tools.BODYANDTAIL)
nbiters = Tools.loopnb(f,Tools.BODYANDTAIL)
e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters])
output.append(e)
@ -92,7 +92,7 @@ def writeTestsF32(config):
# This is for benchmarking the weighted sum and we use only one test pattern
genWsum(config,6)
genWsum(config,Tools.F32,6)
def writeTestsF16(config):
@ -105,7 +105,7 @@ def writeTestsF16(config):
config.writeInput(11,va,"Samples")
# This is for benchmarking the weighted sum and we use only one test pattern
genWsum(config,6)
genWsum(config,Tools.F16,6)
def writeTestsQ31(config):
NBSAMPLES=256

@ -1,102 +1,102 @@
H
50
// 0.549118
0x3865
// 0.782574
0x3a43
// 0.537389
0x384d
// 0.576597
0x389d
// 0.849496
0x3acc
// 0.764850
0x3a1e
// 0.539151
0x3850
// 0.459163
0x3759
// 0.352476
0x35a4
// 0.333343
0x3555
// 0.362662
0x35cd
// 0.343928
0x3581
// 0.046084
0x29e6
// 0.583088
0x38aa
// 0.464789
0x3770
// 0.659567
0x3947
// 0.177360
0x31ad
// 0.465121
0x3771
// 0.849010
0x3acb
// 0.748383
0x39fd
// 0.493254
0x37e4
// 0.023579
0x2609
// 0.354457
0x35ac
// 0.880690
0x3b0c
// 0.029813
0x27a2
// 0.338038
0x3569
// 0.724386
0x39cc
// 0.341454
0x3577
// 0.676073
0x3969
// 0.041176
0x2945
// 0.214019
0x32d9
// 0.322539
0x3529
// 0.499026
0x37fc
// 0.616038
0x38ee
// 0.688882
0x3983
// 0.066439
0x2c41
// 0.094981
0x2e14
// 0.873092
0x3afc
// 0.642707
0x3924
// 0.537527
0x384d
// 0.370953
0x35ef
// 0.450042
0x3733
// 0.434679
0x36f4
// 0.676789
0x396a
// 0.303649
0x34dc
// 0.076930
0x2cec
// 0.871305
0x3af8
// 0.855053
0x3ad7
// 0.498935
0x37fc
// 0.854980
0x3ad7
// 0.145259
0x30a6
// 0.431775
0x36e9
// 0.925328
0x3b67
// 0.170239
0x3173
// 0.385773
0x362c
// 0.998601
0x3bfd
// 0.108755
0x2ef6
// 0.479921
0x37ae
// 0.592423
0x38bd
// 0.431234
0x36e6
// 0.413843
0x369f
// 0.959014
0x3bac
// 0.665528
0x3953
// 0.952834
0x3b9f
// 0.562821
0x3881
// 0.609152
0x38e0
// 0.890135
0x3b1f
// 0.002624
0x1960
// 0.666853
0x3956
// 0.813988
0x3a83
// 0.170387
0x3174
// 0.264551
0x343c
// 0.699111
0x3998
// 0.738154
0x39e8
// 0.475053
0x379a
// 0.439282
0x3707
// 0.699736
0x3999
// 0.488956
0x37d3
// 0.115565
0x2f65
// 0.115452
0x2f64
// 0.586420
0x38b1
// 0.327648
0x353e
// 0.442207
0x3713
// 0.752186
0x3a04
// 0.476363
0x379f
// 0.491094
0x37dc
// 0.552734
0x386c
// 0.674894
0x3966
// 0.705545
0x39a5
// 0.966659
0x3bbc
// 0.528146
0x383a
// 0.633660
0x3912
// 0.049129
0x2a4a
// 0.458540
0x3756
// 0.331162
0x354c
// 0.399161
0x3663
// 0.015650
0x2402
// 0.271388
0x3458
// 0.912530
0x3b4d
// 0.655578
0x393f

@ -1,8 +1,8 @@
H
3
// 0.544937
0x385c
// 0.581009
0x38a6
// 0.504352
0x3809
// 0.445338
0x3720
// 0.569713
0x388f
// 0.564358
0x3884

@ -1,102 +1,102 @@
H
50
// 0.873606
0x3afd
// 0.005892
0x1e09
// 0.669526
0x395b
// 0.198146
0x3257
// 0.032904
0x2836
// 0.829321
0x3aa2
// 0.044536
0x29b3
// 0.842708
0x3abe
// 0.981319
0x3bda
// 0.274000
0x3462
// 0.509194
0x3813
// 0.994965
0x3bf6
// 0.572766
0x3895
// 0.557596
0x3876
// 0.306704
0x34e8
// 0.983822
0x3bdf
// 0.145555
0x30a8
// 0.539088
0x3850
// 0.527526
0x3838
// 0.501409
0x3803
// 0.249480
0x33fc
// 0.273503
0x3460
// 0.567660
0x388b
// 0.741561
0x39ef
// 0.458026
0x3754
// 0.239162
0x33a7
// 0.731739
0x39db
// 0.662458
0x394d
// 0.024779
0x2658
// 0.086811
0x2d8e
// 0.660991
0x394a
// 0.834424
0x3aad
// 0.163672
0x313d
// 0.120432
0x2fb5
// 0.593488
0x38bf
// 0.973784
0x3bca
// 0.167473
0x315c
// 0.858171
0x3ade
// 0.986637
0x3be5
// 0.223556
0x3327
// 0.382377
0x361e
// 0.757667
0x3a10
// 0.032219
0x2820
// 0.574024
0x3898
// 0.125286
0x3002
// 0.946997
0x3b93
// 0.942443
0x3b8a
// 0.152563
0x30e2
// 0.240567
0x33b3
// 0.160261
0x3121
// 0.305271
0x34e2
// 0.032698
0x282f
// 0.266409
0x3443
// 0.617540
0x38f1
// 0.152922
0x30e5
// 0.774995
0x3a33
// 0.848550
0x3aca
// 0.523429
0x3830
// 0.116035
0x2f6d
// 0.649223
0x3932
// 0.246758
0x33e5
// 0.826661
0x3a9d
// 0.967839
0x3bbe
// 0.283239
0x3488
// 0.847203
0x3ac7
// 0.720735
0x39c4
// 0.477014
0x37a2
// 0.633504
0x3911
// 0.596646
0x38c6
// 0.832505
0x3aa9
// 0.135769
0x3058
// 0.607454
0x38dc
// 0.914942
0x3b52
// 0.474766
0x3799
// 0.781271
0x3a40
// 0.084347
0x2d66
// 0.038805
0x28f8
// 0.055045
0x2b0c
// 0.334330
0x3559
// 0.331283
0x354d
// 0.595124
0x38c3
// 0.443636
0x3719
// 0.612722
0x38e7
// 0.152437
0x30e1
// 0.311869
0x34fd
// 0.177173
0x31ab
// 0.540216
0x3852
// 0.941623
0x3b88
// 0.666747
0x3955
// 0.002608
0x1958
// 0.092911
0x2df2
// 0.129786
0x3027
// 0.710137
0x39ae
// 0.488728
0x37d2
// 0.478723
0x37a9
// 0.627602
0x3905
// 0.001795
0x175b
// 0.299980
0x34cd
// 0.636209
0x3917
// 0.102662
0x2e92

@ -10,8 +10,8 @@ Reference patterns are generated with
a double precision computation.
*/
#define REL_ERROR (3.0e-3)
#define ABS_ERROR (3.5e-2)
#define REL_ERROR (5.0e-2)
#define ABS_ERROR (1.0e-1)
void BIQUADF16::test_biquad_cascade_df1_ref()
{

@ -3,7 +3,7 @@
#include "Error.h"
#include "Test.h"
#define REL_ERROR (2e-3)
#define REL_ERROR (5e-3)
#define REL_JS_ERROR (3e-2)

@ -7,8 +7,8 @@
#define SNR_THRESHOLD 120
#define REL_ERROR (1.0e-5)
#define ABS_WEIGHTEDSUM_ERROR (5.0e-2)
#define REL_WEIGHTEDSUM_ERROR (5.0e-2)
#define ABS_WEIGHTEDSUM_ERROR (1.0e-1)
#define REL_WEIGHTEDSUM_ERROR (5.0e-3)
#define ABS_ERROR_F32 (1.0e-3)
#define REL_ERROR_F32 (1.0e-3)
@ -28,7 +28,7 @@ void SupportTestsF16::test_weighted_sum_f16()
*outp=arm_weighted_sum_f16(inp, coefsp,this->nbSamples);
ASSERT_CLOSE_ERROR(*outp,refp[this->offset],ABS_WEIGHTEDSUM_ERROR,REL_WEIGHTEDSUM_ERROR);
ASSERT_EMPTY_TAIL(output);

@ -754,7 +754,7 @@ group Root {
}
Functions {
Cascaded BiQuad Filter DF2T_f64:test_biquad_cascade_df2T_f64
Cascaded BiQuad Filter DF2T:test_biquad_cascade_df2T_f64
} -> PARAM1_ID
}
}
@ -1570,8 +1570,8 @@ group Root {
Complex FFT:test_cfft_f32 -> CFFT_PARAM_ID
Real FFT:test_rfft_f32 -> RFFT_PARAM_ID
DCT4:test_dct4_f32 -> DCT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
Scalar Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
Scalar Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
}
}
@ -1620,8 +1620,8 @@ group Root {
Complex FFT:test_cfft_q31 -> CFFT_PARAM_ID
Real FFT:test_rfft_q31 -> RFFT_PARAM_ID
DCT4:test_dct4_q31 -> DCT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
Scalar Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
Scalar Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
}
}
@ -1670,8 +1670,8 @@ group Root {
Complex FFT:test_cfft_q15 -> CFFT_PARAM_ID
Real FFT:test_rfft_q15 -> RFFT_PARAM_ID
DCT4:test_dct4_q15 -> DCT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
Scalar Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
Scalar Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
}
}
}

@ -584,8 +584,8 @@ group Root {
Functions {
Complex FFT:test_cfft_f16 -> CFFT_PARAM_ID
Real FFT:test_rfft_f16 -> RFFT_PARAM_ID
Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
Scalar Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
Scalar Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
}
}

Loading…
Cancel
Save