CMSIS-DSP: Some improvements to a few f16 functions

pull/19/head
Christophe Favergeon 5 years ago
parent 3752e622b8
commit 68e5b742ba

@ -63,10 +63,10 @@ __STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
return acc; return acc;
} }
__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in) __STATIC_FORCEINLINE _Float16 vecAddAcrossF16Mve(float16x8_t in)
{ {
float16x8_t tmpVec; float16x8_t tmpVec;
float16_t acc; _Float16 acc;
tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in); tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in);
in = vaddq_f16(tmpVec, in); in = vaddq_f16(tmpVec, in);

@ -126,7 +126,7 @@ void arm_dot_prod_f16(
float16_t * result) float16_t * result)
{ {
uint32_t blkCnt; /* Loop counter */ uint32_t blkCnt; /* Loop counter */
float16_t sum = 0.0f; /* Temporary return variable */ _Float16 sum = 0.0f; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
@ -141,13 +141,13 @@ void arm_dot_prod_f16(
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */ /* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++); sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++); sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++); sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++); sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */ /* Decrement loop counter */
blkCnt--; blkCnt--;
@ -168,7 +168,7 @@ void arm_dot_prod_f16(
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */ /* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++); sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */ /* Decrement loop counter */
blkCnt--; blkCnt--;

@ -50,6 +50,7 @@
#include "arm_helium_utils.h" #include "arm_helium_utils.h"
#include <stdio.h>
void arm_svm_linear_predict_f16( void arm_svm_linear_predict_f16(
const arm_svm_linear_instance_f16 *S, const arm_svm_linear_instance_f16 *S,
const float16_t * in, const float16_t * in,
@ -65,7 +66,7 @@ void arm_svm_linear_predict_f16(
uint32_t row; uint32_t row;
uint32_t blkCnt; /* loop counters */ uint32_t blkCnt; /* loop counters */
const float16_t *pDualCoef = S->dualCoefficients; const float16_t *pDualCoef = S->dualCoefficients;
float16_t sum = S->intercept; _Float16 sum = S->intercept;
row = numRows; row = numRows;
/* /*
@ -145,10 +146,10 @@ void arm_svm_linear_predict_f16(
/* /*
* Sum the partial parts * Sum the partial parts
*/ */
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc0); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc1); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc1);
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc2); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc2);
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc3); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc3);
pSrcA += numCols * 4; pSrcA += numCols * 4;
/* /*
@ -215,8 +216,8 @@ void arm_svm_linear_predict_f16(
/* /*
* Sum the partial parts * Sum the partial parts
*/ */
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc0); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc1); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc1);
pSrcA += numCols * 2; pSrcA += numCols * 2;
row -= 2; row -= 2;
@ -269,7 +270,7 @@ void arm_svm_linear_predict_f16(
/* /*
* Sum the partial parts * Sum the partial parts
*/ */
sum += *pDualCoef++ * vecAddAcrossF16Mve(acc0); sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
} }

@ -67,7 +67,7 @@ void arm_svm_polynomial_predict_f16(
uint32_t row; uint32_t row;
uint32_t blkCnt; /* loop counters */ uint32_t blkCnt; /* loop counters */
const float16_t *pDualCoef = S->dualCoefficients; const float16_t *pDualCoef = S->dualCoefficients;
float16_t sum = S->intercept; _Float16 sum = S->intercept;
f16x8_t vSum = vdupq_n_f16(0.0f); f16x8_t vSum = vdupq_n_f16(0.0f);
row = numRows; row = numRows;

@ -11,7 +11,7 @@ def joinit(iterable, delimiter):
yield x yield x
# To format, in HTML, the cores in the right order. # To format, in HTML, the cores in the right order.
# First we order tje categories # First we order the categories
# Then we order the cores in each category # Then we order the cores in each category
# The final ORDEREDCORES is what is used # The final ORDEREDCORES is what is used
# to order the values # to order the values
@ -33,6 +33,8 @@ for cat in CORTEXCATEGORIES:
quit() quit()
ORDEREDCORES += cores ORDEREDCORES += cores
ORDEREDTYPES=["q7","q15","q31","u32","f16","f32","f64"]
class Markdown: class Markdown:
def __init__(self,output): def __init__(self,output):
self._id=0 self._id=0
@ -485,13 +487,14 @@ def reorder(p,v):
return(result) return(result)
class HTML: class HTML:
def __init__(self,output,regMode): def __init__(self,output,regMode,reorder):
self._id=0 self._id=0
self._sectionID = 0 self._sectionID = 0
self._barID = 0 self._barID = 0
self._histID = 0 self._histID = 0
self._output = output self._output = output
self._regMode = regMode self._regMode = regMode
self._reorder = reorder
def visitBarChart(self,bar): def visitBarChart(self,bar):
data=bar.data data=bar.data
@ -545,7 +548,10 @@ myhist(thehdata%d,"#hi%d");
self._output.write(str(col)) self._output.write(str(col))
self._output.write("</th>\n") self._output.write("</th>\n")
if self._reorder:
perm,restricted=permutation(ORDEREDCORES,table.cores) perm,restricted=permutation(ORDEREDCORES,table.cores)
else:
restricted = table.cores
for col in restricted: for col in restricted:
if firstCore: if firstCore:
@ -570,7 +576,10 @@ myhist(thehdata%d,"#hi%d");
params=row[0:nbParams] params=row[0:nbParams]
values=row[nbParams:] values=row[nbParams:]
if self._reorder:
row = params + reorder(perm,values) row = params + reorder(perm,values)
else:
row = params + values
for elem in row: for elem in row:
if i < nbParams: if i < nbParams:

@ -55,6 +55,7 @@ parser.add_argument('-g', action='store_true', help="Include graphs in regressio
parser.add_argument('-details', action='store_true', help="Details about runids") parser.add_argument('-details', action='store_true', help="Details about runids")
parser.add_argument('-lastid', action='store_true', help="Get last ID") parser.add_argument('-lastid', action='store_true', help="Get last ID")
parser.add_argument('-comments', nargs='?',type = str, default="comments.txt", help="Comment section") parser.add_argument('-comments', nargs='?',type = str, default="comments.txt", help="Comment section")
parser.add_argument('-byd', action='store_true', help="Result oganized by datatype")
# For runid or runid range # For runid or runid range
parser.add_argument('others', nargs=argparse.REMAINDER,help="Run ID") parser.add_argument('others', nargs=argparse.REMAINDER,help="Run ID")
@ -685,7 +686,10 @@ try:
if args.t=="md": if args.t=="md":
document.accept(Markdown(output)) document.accept(Markdown(output))
if args.t=="html": if args.t=="html":
document.accept(HTML(output,args.r)) reorder=True
if args.byc:
reorder=False
document.accept(HTML(output,args.r,reorder))
finally: finally:
c.close() c.close()

Loading…
Cancel
Save