vmlaq in arm_dot_prod_f32 instead of vmulq and vadd

pull/19/head
ClaudioMartino 7 years ago committed by Christophe Favergeon
parent f6c65db1fe
commit ad726aa823

@ -72,7 +72,7 @@ void arm_dot_prod_f32(
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
float32x2_t accum = vdup_n_f32(0);
float32x4_t accum = vdupq_n_f32(0);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
@ -85,8 +85,7 @@ void arm_dot_prod_f32(
/* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
res = vmulq_f32(vec1, vec2);
accum = vadd_f32(accum, vpadd_f32(vget_low_f32(res), vget_high_f32(res)));
accum = vmlaq_f32(accum, vec1, vec2);
/* Increment pointers */
pSrcA += 4;
@ -98,7 +97,12 @@ void arm_dot_prod_f32(
/* Decrement the loop counter */
blkCnt--;
}
sum += accum[0] + accum[1];
#if __aarch64__
sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
#else
sum = (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[0] + (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[1];
#endif
/* Tail */
blkCnt = blockSize & 0x3;

Loading…
Cancel
Save