CMSIS-DSP: Improved f16 scalar code.

5 years ago · 078c0b282d
parent 4b4d5322c8
commit 078c0b282d
38 changed files with 369 additions and 367 deletions
--- a/Include/arm_helium_utils.h
+++ b/Include/arm_helium_utils.h
@ -72,7 +72,7 @@ __STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
    in = vaddq_f16(tmpVec, in);
    tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in);
    in = vaddq_f16(tmpVec, in);
-    acc = vgetq_lane_f16(in, 0) + vgetq_lane_f16(in, 4);
+    acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4);

    return acc;
 }
--- a/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c
+++ b/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c
@ -67,7 +67,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
    const float16_t *pIn = in;
    float16_t       result;
    f16x8_t         vsigma;
-    float16_t       tmp;
+    _Float16       tmp;
    f16x8_t         vacc1, vacc2;
    uint32_t        index;
    float16_t       logclassPriors[S->numberOfClasses];
@ -81,15 +81,15 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
        pIn = in;

-        vacc1 = vdupq_n_f16(0);
-        vacc2 = vdupq_n_f16(0);
+        vacc1 = vdupq_n_f16(0.0f16);
+        vacc2 = vdupq_n_f16(0.0f16);

        uint32_t         blkCnt =S->vectorDimension >> 3;
        while (blkCnt > 0U) {
            f16x8_t         vinvSigma, vtmp;

            vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
-            vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)));
+            vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));

            vinvSigma = vrecip_medprec_f16(vsigma);

@ -112,7 +112,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins

            vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
            vacc1 =
-                vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)), p0);
+                vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);

            vinvSigma = vrecip_medprec_f16(vsigma);

@ -126,8 +126,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
            pSigma += blkCnt;
        }

-        tmp = -0.5f * vecAddAcrossF16Mve(vacc1);
-        tmp -= 0.5f * vecAddAcrossF16Mve(vacc2);
+        tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
+        tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);

        *buffer = tmp + *pLogPrior++;
        buffer++;
@ -175,13 +175,13 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
        
        pIn = in;

-        tmp = 0.0;
-        acc1 = 0.0f;
-        acc2 = 0.0f;
+        tmp = 0.0f16;
+        acc1 = 0.0f16;
+        acc2 = 0.0f16;
        for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
        {
           sigma = *pSigma + S->epsilon;
-           acc1 += logf(2.0f * PI_F * sigma);
+           acc1 += logf(2.0f16 * (_Float16)PI_F * sigma);
           acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma;

           pIn++;
@ -189,8 +189,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
           pSigma++;
        }

-        tmp = -0.5f * acc1;
-        tmp -= 0.5f * acc2;
+        tmp = -0.5f16 * acc1;
+        tmp -= 0.5f16 * acc2;


        *buffer = tmp + logf(*pPrior++);
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c
@ -172,7 +172,7 @@ void arm_cmplx_mag_f16(
        uint32_t numSamples)
 {
  uint32_t blkCnt;                               /* loop counter */
-  float16_t real, imag;                      /* Temporary variables to hold input values */
+  _Float16 real, imag;                      /* Temporary variables to hold input values */

 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

--- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c
@ -107,7 +107,7 @@ void arm_cmplx_mag_squared_f16(
        uint32_t numSamples)
 {
        uint32_t blkCnt;                               /* Loop counter */
-        float16_t real, imag;                          /* Temporary input variables */
+        _Float16 real, imag;                          /* Temporary input variables */

 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

--- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c
@ -103,7 +103,7 @@ void arm_cmplx_mult_cmplx_f16(
        pDst += 8;
    }

-    float16_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
+    _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */
        /* Tail */
    blkCnt = (blockSize & 7) >> 1;
    while (blkCnt > 0)
@ -134,7 +134,7 @@ void arm_cmplx_mult_cmplx_f16(
        uint32_t numSamples)
 {
    uint32_t blkCnt;                               /* Loop counter */
-    float16_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
+    _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */

 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

--- a/Source/DistanceFunctions/arm_braycurtis_distance_f16.c
+++ b/Source/DistanceFunctions/arm_braycurtis_distance_f16.c
@ -72,7 +72,7 @@

 float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-    float16_t       accumDiff = 0.0f, accumSum = 0.0f;
+    _Float16        accumDiff = 0.0f, accumSum = 0.0f;
    uint32_t        blkCnt;
    f16x8_t         a, b, c, accumDiffV, accumSumV;

@ -125,14 +125,14 @@ float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, u

 float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accumDiff=0.0f, accumSum=0.0f, tmpA, tmpB;
+   _Float16 accumDiff=0.0f16, accumSum=0.0f16, tmpA, tmpB;

   while(blockSize > 0)
   {
      tmpA = *pA++;
      tmpB = *pB++;
-      accumDiff += fabsf(tmpA - tmpB);
-      accumSum += fabsf(tmpA + tmpB);
+      accumDiff += (_Float16)fabsf(tmpA - tmpB);
+      accumSum += (_Float16)fabsf(tmpA + tmpB);
      blockSize --;
   }
   /*
--- a/Source/DistanceFunctions/arm_canberra_distance_f16.c
+++ b/Source/DistanceFunctions/arm_canberra_distance_f16.c
@ -70,7 +70,7 @@

 float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-    float16_t       accum = 0.0f;
+    _Float16       accum = 0.0f16;
    uint32_t         blkCnt;
    f16x8_t         a, b, c, accumV;

@ -141,7 +141,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
 #else
 float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accum=0.0f, tmpA, tmpB,diff,sum;
+   _Float16 accum=0.0f, tmpA, tmpB,diff,sum;

   while(blockSize > 0)
   {
@ -150,7 +150,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin

      diff = fabsf(tmpA - tmpB);
      sum = fabsf(tmpA) + fabsf(tmpB);
-      if ((tmpA != 0.0f) || (tmpB != 0.0f))
+      if ((tmpA != 0.0f16) || (tmpB != 0.0f16))
      {
         accum += (diff / sum);
      }
--- a/Source/DistanceFunctions/arm_chebyshev_distance_f16.c
+++ b/Source/DistanceFunctions/arm_chebyshev_distance_f16.c
@ -67,7 +67,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
    uint32_t        blkCnt;     /* loop counters */
    f16x8_t         vecA, vecB;
    f16x8_t         vecDiff = vdupq_n_f16(0.0);
-    float16_t       maxValue = 0.0;
+    float16_t       maxValue = 0.0f16;


    blkCnt = blockSize >> 3;
@ -111,7 +111,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
 #else
 float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t diff=0.0f,  maxVal,tmpA, tmpB;
+   _Float16 diff=0.0f,  maxVal,tmpA, tmpB;

   tmpA = *pA++;
   tmpB = *pB++;
--- a/Source/DistanceFunctions/arm_cityblock_distance_f16.c
+++ b/Source/DistanceFunctions/arm_cityblock_distance_f16.c
@ -102,14 +102,14 @@ float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, ui
 #else
 float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accum,tmpA, tmpB;
+   _Float16 accum,tmpA, tmpB;

-   accum = 0.0f;
+   accum = 0.0f16;
   while(blockSize > 0)
   {
      tmpA = *pA++;
      tmpB = *pB++;
-      accum  += fabsf(tmpA - tmpB);
+      accum  += (_Float16)fabsf(tmpA - tmpB);
      
      blockSize --;
   }
--- a/Source/DistanceFunctions/arm_euclidean_distance_f16.c
+++ b/Source/DistanceFunctions/arm_euclidean_distance_f16.c
@ -105,16 +105,17 @@ float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, ui
 #else
 float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accum=0.0f,tmp;
+   _Float16 accum=0.0f,tmp;
+   float16_t result;

   while(blockSize > 0)
   {
-      tmp = *pA++ - *pB++;
+      tmp = (_Float16)*pA++ - (_Float16)*pB++;
      accum += SQ(tmp);
      blockSize --;
   }
-   arm_sqrt_f16(accum,&tmp);
-   return(tmp);
+   arm_sqrt_f16(accum,&result);
+   return(result);
 }

 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
--- a/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c
+++ b/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c
@ -145,16 +145,17 @@ float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB

 float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-    float16_t left, right,sum, result, tmp;
+    _Float16 left, right,sum, tmp;
+    float16_t result;
    uint32_t i;

-    left = 0.0f; 
-    right = 0.0f;
+    left = 0.0f16; 
+    right = 0.0f16;
    for(i=0; i < blockSize; i++)
    {
-      tmp = (pA[i] + pB[i]) / 2.0f;
-      left  += rel_entr(pA[i], tmp);
-      right += rel_entr(pB[i], tmp);
+      tmp = ((_Float16)pA[i] + (_Float16)pB[i]) / 2.0f16;
+      left  += (_Float16)rel_entr(pA[i], tmp);
+      right += (_Float16)rel_entr(pB[i], tmp);
    }


--- a/Source/DistanceFunctions/arm_minkowski_distance_f16.c
+++ b/Source/DistanceFunctions/arm_minkowski_distance_f16.c
@ -111,13 +111,13 @@ float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, in

 float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize)
 {
-    float16_t sum;
+    _Float16 sum;
    uint32_t i;

    sum = 0.0f; 
    for(i=0; i < blockSize; i++)
    {
-       sum += powf(fabsf(pA[i] - pB[i]),order);
+       sum += (_Float16)powf(fabsf(pA[i] - pB[i]),order);
    }


--- a/Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c
@ -64,7 +64,7 @@ void arm_biquad_cascade_df1_f16(
    float16_t Xn1, Xn2, Yn1, Yn2;   /*  Filter pState variables   */
    float16_t X0, X1, X2, X3;   /*  temporary input           */
    float16_t X4, X5, X6, X7;   /*  temporary input           */
-    float16_t lastX, lastY;             /*  X,Y history for tail handling */
+    _Float16 lastX, lastY;             /*  X,Y history for tail handling */
    f16x8_t coeffs;
    f16x8_t accVec;           /* accumultor vector */
    uint32_t  sample, stage = S->numStages; /*  loop counters             */
@ -320,10 +320,10 @@ void arm_biquad_cascade_df1_f16(
        float16_t *pOut = pDst;                        /* Destination pointer */
        float16_t *pState = S->pState;                 /* pState pointer */
  const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
-        float16_t acc;                                 /* Accumulator */
-        float16_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
-        float16_t Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
-        float16_t Xn;                                  /* Temporary input */
+        _Float16 acc;                                 /* Accumulator */
+        _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        _Float16 Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
+        _Float16 Xn;                                  /* Temporary input */
        uint32_t sample, stage = S->numStages;         /* Loop counters */

  do
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c
@ -197,10 +197,10 @@ void arm_biquad_cascade_df2T_f16(
        float16_t *pOut = pDst;                        /* Destination pointer */
        float16_t *pState = S->pState;                 /* State pointer */
  const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
-        float16_t acc1;                                /* Accumulator */
-        float16_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
-        float16_t Xn1;                                 /* Temporary input */
-        float16_t d1, d2;                              /* State variables */
+        _Float16 acc1;                                /* Accumulator */
+        _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        _Float16 Xn1;                                 /* Temporary input */
+        _Float16 d1, d2;                              /* State variables */
        uint32_t sample, stage = S->numStages;         /* Loop counters */

  do
--- a/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
@ -198,10 +198,10 @@ void arm_biquad_cascade_stereo_df2T_f16(
        float16_t *pOut = pDst;                        /* Destination pointer */
        float16_t *pState = S->pState;                 /* State pointer */
  const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
-        float16_t acc1a, acc1b;                        /* Accumulator */
-        float16_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
-        float16_t Xn1a, Xn1b;                          /* Temporary input */
-        float16_t d1a, d2a, d1b, d2b;                  /* State variables */
+        _Float16 acc1a, acc1b;                        /* Accumulator */
+        _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        _Float16 Xn1a, Xn1b;                          /* Temporary input */
+        _Float16 d1a, d2a, d1b, d2b;                  /* State variables */
        uint32_t sample, stage = S->numStages;         /* Loop counters */

    do
--- a/Source/FilteringFunctions/arm_correlate_f16.c
+++ b/Source/FilteringFunctions/arm_correlate_f16.c
@ -370,8 +370,8 @@ void arm_correlate_f16(
    for (i = 0U; i <= block1 - 2; i += 2)
    {
        uint32_t  count = i + 1;
-        float16_t acc0;
-        float16_t acc1;
+        _Float16 acc0;
+        _Float16 acc1;
        /*
         * compute 2 accumulators per loop
         * size is incrementing for second accumulator
@ -390,7 +390,7 @@ void arm_correlate_f16(
    for (; i < block1; i++)
    {
        uint32_t  count = i + 1;
-        float16_t acc;
+        _Float16 acc;

        pX = pA;
        pY = pB;
@ -403,10 +403,10 @@ void arm_correlate_f16(

    for (i = 0U; i <= block2 - 4; i += 4)
    {
-        float16_t acc0;
-        float16_t acc1;
-        float16_t acc2;
-        float16_t acc3;
+        _Float16 acc0;
+        _Float16 acc1;
+        _Float16 acc2;
+        _Float16 acc3;

        pX = pA;
        pY = pB;
@ -430,8 +430,8 @@ void arm_correlate_f16(

    for (; i <= block2 - 2; i += 2)
    {
-        float16_t acc0;
-        float16_t acc1;
+        _Float16 acc0;
+        _Float16 acc1;

        pX = pA;
        pY = pB;
@ -451,7 +451,7 @@ void arm_correlate_f16(

    if (block2 & 1)
    {
-        float16_t acc;
+        _Float16 acc;

        pX = pA;
        pY = pB;
@ -466,8 +466,8 @@ void arm_correlate_f16(
    {

        uint32_t  count = (i + 1);
-        float16_t acc0;
-        float16_t acc1;
+        _Float16 acc0;
+        _Float16 acc1;

        pX = pA;
        pY = pB;
@ -488,7 +488,7 @@ void arm_correlate_f16(
    for (; i >= 0; i--)
    {
        uint32_t  count = (i + 1);
-        float16_t acc;
+        _Float16 acc;

        pX = pA;
        pY = pB;
@ -517,15 +517,15 @@ void arm_correlate_f16(
  const float16_t *px;                                 /* Intermediate inputA pointer */
  const float16_t *py;                                 /* Intermediate inputB pointer */
  const float16_t *pSrc1;
-        float16_t sum;
+        _Float16 sum;
        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
        uint32_t j, k, count, blkCnt;                  /* Loop counters */
        uint32_t outBlockSize;                         /* Loop counter */
        int32_t inc = 1;                               /* Destination address modifier */

 #if defined (ARM_MATH_LOOPUNROLL) 
-    float16_t acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
-    float16_t x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
+    _Float16 acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
+    _Float16 x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
 #endif

  /* The algorithm implementation is based on the lengths of the inputs. */
@ -625,7 +625,7 @@ void arm_correlate_f16(
  while (blockSize1 > 0U)
  {
    /* Accumulator is made zero for every iteration */
-    sum = 0.0f;
+    sum = 0.0f16;

 #if defined (ARM_MATH_LOOPUNROLL) 

@ -725,10 +725,10 @@ void arm_correlate_f16(
    while (blkCnt > 0U)
    {
      /* Set all accumulators to zero */
-      acc0 = 0.0f;
-      acc1 = 0.0f;
-      acc2 = 0.0f;
-      acc3 = 0.0f;
+      acc0 = 0.0f16;
+      acc1 = 0.0f16;
+      acc2 = 0.0f16;
+      acc3 = 0.0f16;


      /* read x[0], x[1], x[2] samples */
@ -873,7 +873,7 @@ void arm_correlate_f16(
    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
-      sum = 0.0f;
+      sum = 0.0f16;

 #if defined (ARM_MATH_LOOPUNROLL) 

@ -939,7 +939,7 @@ void arm_correlate_f16(
    while (blkCnt > 0U)
    {
      /* Accumulator is made zero for every iteration */
-      sum = 0.0f;
+      sum = 0.0f16;

      /* Loop over srcBLen */
      k = srcBLen;
@ -1000,7 +1000,7 @@ void arm_correlate_f16(
  while (blockSize3 > 0U)
  {
    /* Accumulator is made zero for every iteration */
-    sum = 0.0f;
+    sum = 0.0f16;

 #if defined (ARM_MATH_LOOPUNROLL) 

@ -1069,7 +1069,7 @@ void arm_correlate_f16(

  const float16_t *pIn1 = pSrcA;                       /* inputA pointer */
  const float16_t *pIn2 = pSrcB + (srcBLen - 1U);      /* inputB pointer */
-        float16_t sum;                                 /* Accumulator */
+        _Float16 sum;                                 /* Accumulator */
        uint32_t i = 0U, j;                            /* Loop counters */
        uint32_t inv = 0U;                             /* Reverse order flag */
        uint32_t tot = 0U;                             /* Length */
@ -1127,7 +1127,7 @@ void arm_correlate_f16(
  for (i = 0U; i <= tot; i++)
  {
    /* Initialize sum with zero to carry out MAC operations */
-    sum = 0.0f;
+    sum = 0.0f16;

    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0U; j <= i; j++)
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
@ -711,13 +711,13 @@ arm_status arm_mat_cmplx_mult_f16(
  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
-  float16_t sumReal, sumImag;                    /* Accumulator */
-  float16_t a1, b1, c1, d1;
+  _Float16 sumReal, sumImag;                    /* Accumulator */
+  _Float16 a1, b1, c1, d1;
  uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
  arm_status status;                             /* status of matrix multiplication */

 #if defined (ARM_MATH_LOOPUNROLL)
-  float16_t a0, b0, c0, d0;
+  _Float16 a0, b0, c0, d0;
 #endif

 #ifdef ARM_MATH_MATRIX_CHECK
@ -755,8 +755,8 @@ arm_status arm_mat_cmplx_mult_f16(
      do
      {
        /* Set the variable sum, that acts as accumulator, to zero */
-        sumReal = 0.0f;
-        sumImag = 0.0f;
+        sumReal = 0.0f16;
+        sumImag = 0.0f16;

        /* Initiate pointer pIn1 to point to starting address of column being processed */
        pIn1 = pInA;
--- a/Source/MatrixFunctions/arm_mat_inverse_f16.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f16.c
@ -66,7 +66,7 @@ arm_status arm_mat_inverse_f16(
    uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
    float16_t *pTmpA, *pTmpB;

-    float16_t in = 0.0f;        /* Temporary input values  */
+    _Float16 in = 0.0f16;        /* Temporary input values  */
    uint32_t  i, rowCnt, flag = 0U, j, loopCnt, k, l;   /* loop counters */
    arm_status status;          /* status of matrix inverse */
    uint32_t  blkCnt;
@ -137,20 +137,20 @@ arm_status arm_mat_inverse_f16(
            j = numRows - rowCnt;
            while (j > 0U)
            {
-                *pOutT1++ = 0.0f;
+                *pOutT1++ = 0.0f16;
                j--;
            }
            /*
             * Writing all ones in the diagonal of the destination matrix
             */
-            *pOutT1++ = 1.0f;
+            *pOutT1++ = 1.0f16;
            /*
             * Writing all zeroes in upper triangle of the destination matrix
             */
            j = rowCnt - 1U;
            while (j > 0U)
            {
-                *pOutT1++ = 0.0f;
+                *pOutT1++ = 0.0f16;
                j--;
            }
            /*
@ -199,7 +199,7 @@ arm_status arm_mat_inverse_f16(
            /*
             * Check if the pivot element is zero
             */
-            if (*pInT1 == 0.0f)
+            if (*pInT1 == 0.0f16)
            {
                /*
                 * Loop over the number rows present below
@ -215,7 +215,7 @@ arm_status arm_mat_inverse_f16(
                     * Check if there is a non zero pivot element to
                     * * replace in the rows below
                     */
-                    if (*pInT2 != 0.0f)
+                    if (*pInT2 != 0.0f16)
                    {
                        f16x8_t vecA, vecB;
                        /*
@ -310,7 +310,7 @@ arm_status arm_mat_inverse_f16(
            /*
             * Update the status if the matrix is singular
             */
-            if ((flag != 1U) && (in == 0.0f))
+            if ((flag != 1U) && (in == 0.0f16))
            {
                return ARM_MATH_SINGULAR;
            }
@ -334,7 +334,7 @@ arm_status arm_mat_inverse_f16(

            pTmpA = pInT1;

-            f16x8_t invIn = vdupq_n_f16(1.0f / in);
+            f16x8_t invIn = vdupq_n_f16(1.0f16 / in);

            blkCnt = (numCols - l) >> 3;
            f16x8_t vecA;
@ -537,12 +537,12 @@ arm_status arm_mat_inverse_f16(
         */
        status = ARM_MATH_SUCCESS;

-        if ((flag != 1U) && (in == 0.0f))
+        if ((flag != 1U) && (in == 0.0f16))
        {
            pIn = pSrc->pData;
            for (i = 0; i < numRows * numCols; i++)
            {
-                if (pIn[i] != 0.0f)
+                if (pIn[i] != 0.0f16)
                    break;
            }

@ -568,7 +568,7 @@ arm_status arm_mat_inverse_f16(
  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */

-  float16_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
+  _Float16 Xchg, in = 0.0f16, in1;                /* Temporary input values  */
  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
  arm_status status;                             /* status of matrix inverse */

@ -636,18 +636,18 @@ arm_status arm_mat_inverse_f16(
      j = numRows - rowCnt;
      while (j > 0U)
      {
-        *pOutT1++ = 0.0f;
+        *pOutT1++ = 0.0f16;
        j--;
      }

      /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0f;
+      *pOutT1++ = 1.0f16;

      /* Writing all zeroes in upper triangle of the destination matrix */
      j = rowCnt - 1U;
      while (j > 0U)
      {
-        *pOutT1++ = 0.0f;
+        *pOutT1++ = 0.0f16;
        j--;
      }

@ -685,7 +685,7 @@ arm_status arm_mat_inverse_f16(
      k = 1U;

      /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0f)
+      if (*pInT1 == 0.0f16)
      {
        /* Loop over the number rows present below */

@ -697,7 +697,7 @@ arm_status arm_mat_inverse_f16(

          /* Check if there is a non zero pivot element to
           * replace in the rows below */
-          if (*pInT2 != 0.0f)
+          if (*pInT2 != 0.0f16)
          {
            /* Loop over number of columns
             * to the right of the pilot element */
@ -743,7 +743,7 @@ arm_status arm_mat_inverse_f16(
      }

      /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0f))
+      if ((flag != 1U) && (in == 0.0f16))
      {
        return ARM_MATH_SINGULAR;
      }
@ -877,12 +877,12 @@ arm_status arm_mat_inverse_f16(
    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;

-    if ((flag != 1U) && (in == 0.0f))
+    if ((flag != 1U) && (in == 0.0f16))
    {
      pIn = pSrc->pData;
      for (i = 0; i < numRows * numCols; i++)
      {
-        if (pIn[i] != 0.0f)
+        if (pIn[i] != 0.0f16)
            break;
      }

--- a/Source/MatrixFunctions/arm_mat_mult_f16.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f16.c
@ -632,7 +632,7 @@ arm_status arm_mat_mult_f16(
  float16_t *pInB = pSrcB->pData;                /* Input data matrix pointer B */
  float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
  float16_t *px;                                 /* Temporary output data matrix pointer */
-  float16_t sum;                                 /* Accumulator */
+  _Float16 sum;                                 /* Accumulator */
  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
@ -671,7 +671,7 @@ arm_status arm_mat_mult_f16(
      do
      {
        /* Set the variable sum, that acts as accumulator, to zero */
-        sum = 0.0f;
+        sum = 0.0f16;

        /* Initialize pointer pIn1 to point to starting address of column being processed */
        pIn1 = pInA;
--- a/Source/SVMFunctions/arm_svm_linear_predict_f16.c
+++ b/Source/SVMFunctions/arm_svm_linear_predict_f16.c
@ -151,7 +151,7 @@ void arm_svm_linear_predict_f16(
        acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
        acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);

-        sum += vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 4;
        /*
@ -221,7 +221,7 @@ void arm_svm_linear_predict_f16(
        acc0 = vmulq_n_f16(acc0,*pDualCoef++);
        acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);

-        sum += vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);

        pSrcA += numCols * 2;
        row -= 2;
@ -274,7 +274,7 @@ void arm_svm_linear_predict_f16(
        /*
         * Sum the partial parts
         */
-        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);

    }

--- a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
+++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@ -294,7 +294,7 @@ void arm_svm_polynomial_predict_f16(
                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 
                             vctp16q(1));
    }
-    sum += vecAddAcrossF16Mve(vSum);
+    sum += (_Float16)vecAddAcrossF16Mve(vSum);

    
    *pResult = S->classes[STEP(sum)];
@ -316,9 +316,9 @@ void arm_svm_polynomial_predict_f16(
        dot=0;
        for(j=0; j < S->vectorDimension; j++)
        {
-            dot = dot + in[j]* *pSupport++;
+            dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
        }
-        sum += S->dualCoefficients[i] * arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
+        sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
    }

    *pResult=S->classes[STEP(sum)];
--- a/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
+++ b/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
@ -67,7 +67,7 @@ void arm_svm_rbf_predict_f16(
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
-    float16_t       sum = S->intercept;
+    _Float16       sum = S->intercept;
    f16x8_t         vSum = vdupq_n_f16(0);

    row = numRows;
@ -323,19 +323,19 @@ void arm_svm_rbf_predict_f16(
    int32_t * pResult)
 {
    _Float16 sum=S->intercept;
-    _Float16 dot=0;
+    _Float16 dot=00.f16;
    uint32_t i,j;
    const float16_t *pSupport = S->supportVectors;

    for(i=0; i < S->nbOfSupportVectors; i++)
    {
-        dot=0;
+        dot=0.0f16;
        for(j=0; j < S->vectorDimension; j++)
        {
-            dot = dot + SQ(in[j] - *pSupport);
+            dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport);
            pSupport++;
        }
-        sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf(-(_Float16)S->gamma * dot);
    }
    *pResult=S->classes[STEP(sum)];
 }
--- a/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
+++ b/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
@ -67,7 +67,7 @@ void arm_svm_sigmoid_predict_f16(
    uint32_t         row;
    uint32_t         blkCnt;     /* loop counters */
    const float16_t *pDualCoef = S->dualCoefficients;
-    float16_t       sum = S->intercept;
+    _Float16       sum = S->intercept;
    f16x8_t         vSum = vdupq_n_f16(0.0f);

    row = numRows;
@ -305,18 +305,18 @@ void arm_svm_sigmoid_predict_f16(
    int32_t * pResult)
 {
    _Float16 sum=S->intercept;
-    _Float16 dot=0;
+    _Float16 dot=0.0f16;
    uint32_t i,j;
    const float16_t *pSupport = S->supportVectors;

    for(i=0; i < S->nbOfSupportVectors; i++)
    {
-        dot=0;
+        dot=0.0f16;
        for(j=0; j < S->vectorDimension; j++)
        {
-            dot = dot + in[j]* *pSupport++;
+            dot = dot + (_Float16)in[j] * (_Float16)*pSupport++;
        }
-        sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((_Float16)S->gamma * dot + (_Float16)S->coef0);
    }
    *pResult=S->classes[STEP(sum)];
 }
--- a/Source/StatisticsFunctions/arm_entropy_f16.c
+++ b/Source/StatisticsFunctions/arm_entropy_f16.c
@ -65,7 +65,7 @@
 float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
 {
    uint32_t        blkCnt;
-    float16_t       accum=0.0f,p;
+    _Float16       accum=0.0f16,p;


    blkCnt = blockSize;
@ -110,7 +110,7 @@ float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
 {
    const float16_t *pIn;
    uint32_t blkCnt;
-    float16_t accum, p;
+    _Float16 accum, p;
 
    pIn = pSrcA;
    blkCnt = blockSize;
--- a/Source/StatisticsFunctions/arm_kullback_leibler_f16.c
+++ b/Source/StatisticsFunctions/arm_kullback_leibler_f16.c
@ -72,12 +72,12 @@
 float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
 {
    uint32_t blkCnt;
-    float16_t accum, pA,pB;
+    _Float16 accum, pA,pB;
 
    
    blkCnt = blockSize;

-    accum = 0.0f;
+    accum = 0.0f16;

    f16x8_t         vSum = vdupq_n_f16(0.0f);
    blkCnt = blockSize >> 3;
@ -120,7 +120,7 @@ float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSr
 {
    const float16_t *pInA, *pInB;
    uint32_t blkCnt;
-    float16_t accum, pA,pB;
+    _Float16 accum, pA,pB;
 
    pInA = pSrcA;
    pInB = pSrcB;
--- a/Source/StatisticsFunctions/arm_logsumexp_f16.c
+++ b/Source/StatisticsFunctions/arm_logsumexp_f16.c
@ -72,8 +72,8 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
    float16_t       maxVal;
    const float16_t *pIn;
    int32_t         blkCnt;
-    float16_t       accum=0.0f16;
-    float16_t       tmp;
+    _Float16       accum=0.0f16;
+    _Float16       tmp;


    arm_max_no_idx_f16((float16_t *) in, blockSize, &maxVal);
@ -122,11 +122,11 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
 #else
 float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
 {
-    float16_t maxVal;
-    float16_t tmp;
+    _Float16 maxVal;
+    _Float16 tmp;
    const float16_t *pIn;
    uint32_t blkCnt;
-    float16_t accum;
+    _Float16 accum;
 
    pIn = in;
    blkCnt = blockSize;
--- a/Source/StatisticsFunctions/arm_power_f16.c
+++ b/Source/StatisticsFunctions/arm_power_f16.c
@ -88,8 +88,8 @@ void arm_power_f16(
        float16_t * pResult)
 {
        uint32_t blkCnt;                               /* Loop counter */
-        float16_t sum = 0.0f;                          /* Temporary result storage */
-        float16_t in;                                  /* Temporary variable to store input value */
+        _Float16 sum = 0.0f16;                          /* Temporary result storage */
+        _Float16 in;                                  /* Temporary variable to store input value */

 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

--- a/Source/StatisticsFunctions/arm_rms_f16.c
+++ b/Source/StatisticsFunctions/arm_rms_f16.c
@ -83,8 +83,8 @@ void arm_rms_f16(
        float16_t * pResult)
 {
        uint32_t blkCnt;                               /* Loop counter */
-        float16_t sum = 0.0f;                          /* Temporary result storage */
-        float16_t in;                                  /* Temporary variable to store input value */
+        _Float16 sum = 0.0f16;                          /* Temporary result storage */
+        _Float16 in;                                  /* Temporary variable to store input value */

 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

--- a/Testing/FrameworkSource/Error.cpp
+++ b/Testing/FrameworkSource/Error.cpp
@ -342,7 +342,7 @@ void assert_close_error(unsigned long nb,float64_t &ref, float64_t &val, double
    if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
    {
        char details[200];
-        sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
+        sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
        throw (Error(CLOSE_ERROR,nb,details));
    }
 };
@ -385,7 +385,7 @@ void assert_close_error(unsigned long nb,float32_t &ref, float32_t &val, double
    if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
    {
        char details[200];
-        sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
+        sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
        throw (Error(CLOSE_ERROR,nb,details));
    }
 };
@ -429,7 +429,7 @@ void assert_close_error(unsigned long nb,float16_t &ref, float16_t &val, double
    if (abs((float)val - (float)ref) > (absthreshold + relthreshold * abs((float)ref)))
    {
        char details[200];
-        sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
+        sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
        throw (Error(CLOSE_ERROR,nb,details));
    }
 };
--- a/Testing/PatternGeneration/Support.py
+++ b/Testing/PatternGeneration/Support.py
@ -11,7 +11,7 @@ NBVECTORS = [4,10,16]

 VECDIM = [12,14,20]

-def genWsum(config,nb):
+def genWsum(config,f,nb):
    DIM=50
    inputs=[] 
    weights=[]
@ -23,15 +23,15 @@ def genWsum(config,nb):
    inputs += list(va)
    weights += list(vb)

-    nbiters = Tools.loopnb(0,Tools.TAILONLY)
+    nbiters = Tools.loopnb(f,Tools.TAILONLY)
    e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) 
    output.append(e)

-    nbiters = Tools.loopnb(0,Tools.BODYONLY)
+    nbiters = Tools.loopnb(f,Tools.BODYONLY)
    e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) 
    output.append(e)

-    nbiters = Tools.loopnb(0,Tools.BODYANDTAIL)
+    nbiters = Tools.loopnb(f,Tools.BODYANDTAIL)
    e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) 
    output.append(e)

@ -92,7 +92,7 @@ def writeTestsF32(config):


    # This is for benchmarking the weighted sum and we use only one test pattern
-    genWsum(config,6)
+    genWsum(config,Tools.F32,6)
    

 def writeTestsF16(config):
@ -105,7 +105,7 @@ def writeTestsF16(config):
    config.writeInput(11,va,"Samples")

    # This is for benchmarking the weighted sum and we use only one test pattern
-    genWsum(config,6)
+    genWsum(config,Tools.F16,6)

 def writeTestsQ31(config):
    NBSAMPLES=256
--- a/Testing/Patterns/DSP/Support/SupportF16/Inputs6_f16.txt
+++ b/Testing/Patterns/DSP/Support/SupportF16/Inputs6_f16.txt
@ -1,102 +1,102 @@
 H
 50
-// 0.549118
-0x3865
-// 0.782574
-0x3a43
-// 0.537389
-0x384d
-// 0.576597
-0x389d
-// 0.849496
-0x3acc
-// 0.764850
-0x3a1e
-// 0.539151
-0x3850
-// 0.459163
-0x3759
-// 0.352476
-0x35a4
-// 0.333343
-0x3555
-// 0.362662
-0x35cd
-// 0.343928
-0x3581
-// 0.046084
-0x29e6
-// 0.583088
-0x38aa
-// 0.464789
-0x3770
-// 0.659567
-0x3947
-// 0.177360
-0x31ad
-// 0.465121
-0x3771
-// 0.849010
-0x3acb
-// 0.748383
-0x39fd
-// 0.493254
-0x37e4
-// 0.023579
-0x2609
-// 0.354457
-0x35ac
-// 0.880690
-0x3b0c
-// 0.029813
-0x27a2
-// 0.338038
-0x3569
-// 0.724386
-0x39cc
-// 0.341454
-0x3577
-// 0.676073
-0x3969
-// 0.041176
-0x2945
-// 0.214019
-0x32d9
-// 0.322539
-0x3529
-// 0.499026
-0x37fc
-// 0.616038
-0x38ee
-// 0.688882
-0x3983
-// 0.066439
-0x2c41
-// 0.094981
-0x2e14
-// 0.873092
-0x3afc
-// 0.642707
-0x3924
-// 0.537527
-0x384d
-// 0.370953
-0x35ef
-// 0.450042
-0x3733
-// 0.434679
-0x36f4
-// 0.676789
-0x396a
-// 0.303649
-0x34dc
-// 0.076930
-0x2cec
-// 0.871305
-0x3af8
-// 0.855053
-0x3ad7
-// 0.498935
-0x37fc
-// 0.854980
-0x3ad7
+// 0.145259
+0x30a6
+// 0.431775
+0x36e9
+// 0.925328
+0x3b67
+// 0.170239
+0x3173
+// 0.385773
+0x362c
+// 0.998601
+0x3bfd
+// 0.108755
+0x2ef6
+// 0.479921
+0x37ae
+// 0.592423
+0x38bd
+// 0.431234
+0x36e6
+// 0.413843
+0x369f
+// 0.959014
+0x3bac
+// 0.665528
+0x3953
+// 0.952834
+0x3b9f
+// 0.562821
+0x3881
+// 0.609152
+0x38e0
+// 0.890135
+0x3b1f
+// 0.002624
+0x1960
+// 0.666853
+0x3956
+// 0.813988
+0x3a83
+// 0.170387
+0x3174
+// 0.264551
+0x343c
+// 0.699111
+0x3998
+// 0.738154
+0x39e8
+// 0.475053
+0x379a
+// 0.439282
+0x3707
+// 0.699736
+0x3999
+// 0.488956
+0x37d3
+// 0.115565
+0x2f65
+// 0.115452
+0x2f64
+// 0.586420
+0x38b1
+// 0.327648
+0x353e
+// 0.442207
+0x3713
+// 0.752186
+0x3a04
+// 0.476363
+0x379f
+// 0.491094
+0x37dc
+// 0.552734
+0x386c
+// 0.674894
+0x3966
+// 0.705545
+0x39a5
+// 0.966659
+0x3bbc
+// 0.528146
+0x383a
+// 0.633660
+0x3912
+// 0.049129
+0x2a4a
+// 0.458540
+0x3756
+// 0.331162
+0x354c
+// 0.399161
+0x3663
+// 0.015650
+0x2402
+// 0.271388
+0x3458
+// 0.912530
+0x3b4d
+// 0.655578
+0x393f
--- a/Testing/Patterns/DSP/Support/SupportF16/Ref6_f16.txt
+++ b/Testing/Patterns/DSP/Support/SupportF16/Ref6_f16.txt
@ -1,8 +1,8 @@
 H
 3
-// 0.544937
-0x385c
-// 0.581009
-0x38a6
-// 0.504352
-0x3809
+// 0.445338
+0x3720
+// 0.569713
+0x388f
+// 0.564358
+0x3884
--- a/Testing/Patterns/DSP/Support/SupportF16/Weights6_f16.txt
+++ b/Testing/Patterns/DSP/Support/SupportF16/Weights6_f16.txt
@ -1,102 +1,102 @@
 H
 50
-// 0.873606
-0x3afd
-// 0.005892
-0x1e09
-// 0.669526
-0x395b
-// 0.198146
-0x3257
-// 0.032904
-0x2836
-// 0.829321
-0x3aa2
-// 0.044536
-0x29b3
-// 0.842708
-0x3abe
-// 0.981319
-0x3bda
-// 0.274000
-0x3462
-// 0.509194
-0x3813
-// 0.994965
-0x3bf6
-// 0.572766
-0x3895
-// 0.557596
-0x3876
-// 0.306704
-0x34e8
-// 0.983822
-0x3bdf
-// 0.145555
-0x30a8
-// 0.539088
-0x3850
-// 0.527526
-0x3838
-// 0.501409
-0x3803
-// 0.249480
-0x33fc
-// 0.273503
-0x3460
-// 0.567660
-0x388b
-// 0.741561
-0x39ef
-// 0.458026
-0x3754
-// 0.239162
-0x33a7
-// 0.731739
-0x39db
-// 0.662458
-0x394d
-// 0.024779
-0x2658
-// 0.086811
-0x2d8e
-// 0.660991
-0x394a
-// 0.834424
-0x3aad
-// 0.163672
-0x313d
-// 0.120432
-0x2fb5
-// 0.593488
-0x38bf
-// 0.973784
-0x3bca
-// 0.167473
-0x315c
-// 0.858171
-0x3ade
-// 0.986637
-0x3be5
-// 0.223556
-0x3327
-// 0.382377
-0x361e
-// 0.757667
-0x3a10
-// 0.032219
-0x2820
-// 0.574024
-0x3898
-// 0.125286
-0x3002
-// 0.946997
-0x3b93
-// 0.942443
-0x3b8a
-// 0.152563
-0x30e2
-// 0.240567
-0x33b3
-// 0.160261
-0x3121
+// 0.305271
+0x34e2
+// 0.032698
+0x282f
+// 0.266409
+0x3443
+// 0.617540
+0x38f1
+// 0.152922
+0x30e5
+// 0.774995
+0x3a33
+// 0.848550
+0x3aca
+// 0.523429
+0x3830
+// 0.116035
+0x2f6d
+// 0.649223
+0x3932
+// 0.246758
+0x33e5
+// 0.826661
+0x3a9d
+// 0.967839
+0x3bbe
+// 0.283239
+0x3488
+// 0.847203
+0x3ac7
+// 0.720735
+0x39c4
+// 0.477014
+0x37a2
+// 0.633504
+0x3911
+// 0.596646
+0x38c6
+// 0.832505
+0x3aa9
+// 0.135769
+0x3058
+// 0.607454
+0x38dc
+// 0.914942
+0x3b52
+// 0.474766
+0x3799
+// 0.781271
+0x3a40
+// 0.084347
+0x2d66
+// 0.038805
+0x28f8
+// 0.055045
+0x2b0c
+// 0.334330
+0x3559
+// 0.331283
+0x354d
+// 0.595124
+0x38c3
+// 0.443636
+0x3719
+// 0.612722
+0x38e7
+// 0.152437
+0x30e1
+// 0.311869
+0x34fd
+// 0.177173
+0x31ab
+// 0.540216
+0x3852
+// 0.941623
+0x3b88
+// 0.666747
+0x3955
+// 0.002608
+0x1958
+// 0.092911
+0x2df2
+// 0.129786
+0x3027
+// 0.710137
+0x39ae
+// 0.488728
+0x37d2
+// 0.478723
+0x37a9
+// 0.627602
+0x3905
+// 0.001795
+0x175b
+// 0.299980
+0x34cd
+// 0.636209
+0x3917
+// 0.102662
+0x2e92
--- a/Testing/Source/Tests/BIQUADF16.cpp
+++ b/Testing/Source/Tests/BIQUADF16.cpp
@ -10,8 +10,8 @@ Reference patterns are generated with
 a double precision computation.

 */
-#define REL_ERROR (3.0e-3)
-#define ABS_ERROR (3.5e-2)
+#define REL_ERROR (5.0e-2)
+#define ABS_ERROR (1.0e-1)

    void BIQUADF16::test_biquad_cascade_df1_ref()
    {
--- a/Testing/Source/Tests/DistanceTestsF16.cpp
+++ b/Testing/Source/Tests/DistanceTestsF16.cpp
@ -3,7 +3,7 @@
 #include "Error.h"
 #include "Test.h"

-#define REL_ERROR (2e-3)
+#define REL_ERROR (5e-3)

 #define REL_JS_ERROR (3e-2)

--- a/Testing/Source/Tests/SupportTestsF16.cpp
+++ b/Testing/Source/Tests/SupportTestsF16.cpp
@ -7,8 +7,8 @@
 #define SNR_THRESHOLD 120
 #define REL_ERROR (1.0e-5)

-#define ABS_WEIGHTEDSUM_ERROR (5.0e-2)
-#define REL_WEIGHTEDSUM_ERROR (5.0e-2)
+#define ABS_WEIGHTEDSUM_ERROR (1.0e-1)
+#define REL_WEIGHTEDSUM_ERROR (5.0e-3)

 #define ABS_ERROR_F32 (1.0e-3)
 #define REL_ERROR_F32 (1.0e-3)
@ -28,7 +28,7 @@ void SupportTestsF16::test_weighted_sum_f16()
 
 
 *outp=arm_weighted_sum_f16(inp, coefsp,this->nbSamples);
- 
+
 ASSERT_CLOSE_ERROR(*outp,refp[this->offset],ABS_WEIGHTEDSUM_ERROR,REL_WEIGHTEDSUM_ERROR);
 ASSERT_EMPTY_TAIL(output);

--- a/Testing/bench.txt
+++ b/Testing/bench.txt
@ -754,7 +754,7 @@ group Root {
                }

                Functions {
-                   Cascaded BiQuad Filter DF2T_f64:test_biquad_cascade_df2T_f64
+                   Cascaded BiQuad Filter DF2T:test_biquad_cascade_df2T_f64
                } -> PARAM1_ID
              }
           }
@ -1570,8 +1570,8 @@ group Root {
                   Complex FFT:test_cfft_f32 -> CFFT_PARAM_ID
                   Real FFT:test_rfft_f32 -> RFFT_PARAM_ID
                   DCT4:test_dct4_f32 -> DCT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
                }
           }

@ -1620,8 +1620,8 @@ group Root {
                   Complex FFT:test_cfft_q31 -> CFFT_PARAM_ID
                   Real FFT:test_rfft_q31 -> RFFT_PARAM_ID
                   DCT4:test_dct4_q31 -> DCT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
                }
           }

@ -1670,8 +1670,8 @@ group Root {
                   Complex FFT:test_cfft_q15 -> CFFT_PARAM_ID
                   Real FFT:test_rfft_q15 -> RFFT_PARAM_ID
                   DCT4:test_dct4_q15 -> DCT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
                }
           }
        }
--- a/Testing/bench_f16.txt
+++ b/Testing/bench_f16.txt
@ -584,8 +584,8 @@ group Root {
                Functions {
                   Complex FFT:test_cfft_f16 -> CFFT_PARAM_ID
                   Real FFT:test_rfft_f16 -> RFFT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
                }
           }