diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c index ad3219b5..5be48bc0 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c @@ -86,42 +86,92 @@ void arm_cmplx_dot_prod_f16( float16_t * realResult, float16_t * imagResult) { - uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */ - uint32_t blkCnt; - float16_t real_sum, imag_sum; - f16x8_t vecSrcA, vecSrcB; - f16x8_t vec_acc = vdupq_n_f16(0.0f); - - /* Compute 2 complex samples at a time */ - blkCnt = blockSize >> 3U; - - while (blkCnt > 0U) - { - vecSrcA = vld1q(pSrcA); - vecSrcB = vld1q(pSrcB); + int32_t blkCnt; + float16_t real_sum, imag_sum; + f16x8_t vecSrcA, vecSrcB; + f16x8_t vec_acc = vdupq_n_f16(0.0f16); + f16x8_t vecSrcC, vecSrcD; + + blkCnt = (numSamples >> 3); + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q( pSrcA); + vecSrcB = vld1q( pSrcB); + pSrcA += 8; + pSrcB += 8; + while (blkCnt > 0) { + vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 8; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 8; + + vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 8; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + + vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD); + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); /* - * Decrement the blkCnt loop counter - * Advance vector source and destination pointers + * tail */ - pSrcA += 8; - pSrcB += 8; - blkCnt--; - } - - /* Tail */ - blkCnt = (blockSize & 7); - - if (blkCnt > 0U) - { - mve_pred16_t p0 = vctp16q(blkCnt); - vecSrcA = vld1q(pSrcA); - vecSrcB = vld1q(pSrcB); - vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p0); - vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p0); + blkCnt = CMPLX_DIM * (numSamples & 7); + while (blkCnt > 0) { + mve_pred16_t p = vctp16q(blkCnt); + pSrcA += 8; + pSrcB += 8; + + vecSrcA = vldrhq_z_f16(pSrcA, p); + vecSrcB = vldrhq_z_f16(pSrcB, p); + vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + + blkCnt -= 8; + } + } else { + /* small vector */ + blkCnt = numSamples * CMPLX_DIM; + vec_acc = vdupq_n_f16(0.0f16); + + do { + mve_pred16_t p = vctp16q(blkCnt); + + vecSrcA = vldrhq_z_f16(pSrcA, p); + vecSrcB = vldrhq_z_f16(pSrcB, p); + + vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + + /* + * Decrement the blkCnt loop counter + * Advance vector source and destination pointers + */ + pSrcA += 8; + pSrcB += 8; + blkCnt -= 8; + } + while (blkCnt > 0); } /* Sum the partial parts */ diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c index 19185fab..692433d1 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c @@ -83,56 +83,94 @@ void arm_cmplx_dot_prod_f32( float32_t * realResult, float32_t * imagResult) { - uint32_t blockSize = numSamples * 
CMPLX_DIM; /* loop counters */ - uint32_t blkCnt; - float32_t real_sum, imag_sum; - f32x4_t vecSrcA, vecSrcB; - f32x4_t vec_acc = vdupq_n_f32(0.0f); - float32_t a0,b0,c0,d0; - - /* Compute 2 complex samples at a time */ - blkCnt = blockSize >> 2U; - - while (blkCnt > 0U) - { + int32_t blkCnt; + float32_t real_sum, imag_sum; + f32x4_t vecSrcA, vecSrcB; + f32x4_t vec_acc = vdupq_n_f32(0.0f); + f32x4_t vecSrcC, vecSrcD; + + blkCnt = numSamples >> 2; + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ vecSrcA = vld1q(pSrcA); vecSrcB = vld1q(pSrcB); + pSrcA += 4; + pSrcB += 4; + while (blkCnt > 0) { + vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 4; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 4; + + vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 4; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + + vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD); + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); /* - * Decrement the blkCnt loop counter - * Advance vector source and destination pointers + * tail */ - pSrcA += 4; - pSrcB += 4; - blkCnt--; + blkCnt = CMPLX_DIM * (numSamples & 3); + while (blkCnt > 0) { + mve_pred16_t p = vctp32q(blkCnt); + pSrcA += 4; + pSrcB += 4; + vecSrcA = vldrwq_z_f32(pSrcA, p); + vecSrcB = vldrwq_z_f32(pSrcB, p); + vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + blkCnt -= 4; + } + } else { + /* small vector */ + blkCnt = numSamples * CMPLX_DIM; + vec_acc = vdupq_n_f32(0.0f); + + do { + mve_pred16_t p = vctp32q(blkCnt); + + vecSrcA = vldrwq_z_f32(pSrcA, p); + vecSrcB = vldrwq_z_f32(pSrcB, p); + + vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + + /* + * Decrement the blkCnt loop counter + * Advance vector source and destination pointers + */ + pSrcA += 4; + pSrcB += 4; + blkCnt -= 4; + } + while (blkCnt > 0); } - real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2); imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3); - - /* Tail */ - blkCnt = (blockSize & 3) >> 1; - - while (blkCnt > 0U) - { - a0 = *pSrcA++; - b0 = *pSrcA++; - c0 = *pSrcB++; - d0 = *pSrcB++; - - real_sum += a0 * c0; - imag_sum += a0 * d0; - real_sum -= b0 * d0; - imag_sum += b0 * c0; - - /* Decrement loop counter */ - blkCnt--; - } - /* * Store the real and imaginary results in the destination buffers diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c index 84324412..e908fec8 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c +++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c @@ -62,76 +62,98 @@ void arm_cmplx_dot_prod_q15( q31_t * realResult, q31_t * imagResult) { - - uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */ - uint32_t blkCnt; - q15_t a0,b0,c0,d0; - - q63_t accReal = 0LL; q63_t accImag = 0LL; - q15x8_t vecSrcA, vecSrcB; - - - - /* should give more freedom to generate stall free code */ - vecSrcA = vld1q(pSrcA); - vecSrcB = vld1q(pSrcB); - pSrcA += 8; 
- pSrcB += 8; - - /* Compute 4 complex samples at a time */ - blkCnt = blockSize >> 3; - while (blkCnt > 0U) - { - q15x8_t vecSrcC, vecSrcD; - - accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB); - vecSrcC = vld1q(pSrcA); - pSrcA += 8; - - accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB); - vecSrcD = vld1q(pSrcB); - pSrcB += 8; - - accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD); - vecSrcA = vld1q(pSrcA); - pSrcA += 8; - - accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD); - vecSrcB = vld1q(pSrcB); - pSrcB += 8; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - /* Tail */ - pSrcA -= 8; - pSrcB -= 8; - - blkCnt = (blockSize & 7) >> 1; - - while (blkCnt > 0U) - { - a0 = *pSrcA++; - b0 = *pSrcA++; - c0 = *pSrcB++; - d0 = *pSrcB++; - - accReal += (q31_t)a0 * c0; - accImag += (q31_t)a0 * d0; - accReal -= (q31_t)b0 * d0; - accImag += (q31_t)b0 * c0; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store real and imaginary result in 8.24 format */ - /* Convert real data in 34.30 to 8.24 by 6 right shifts */ - *realResult = (q31_t) (accReal >> 6); - /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */ - *imagResult = (q31_t) (accImag >> 6); + int32_t blkCnt; + q63_t accReal = 0LL; + q63_t accImag = 0LL; + q15x8_t vecSrcA, vecSrcB; + q15x8_t vecSrcC, vecSrcD; + + blkCnt = (numSamples >> 3); + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q(pSrcA); + vecSrcB = vld1q(pSrcB); + pSrcA += 8; + pSrcB += 8; + + while (blkCnt > 0) { + + accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 8; + + accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 8; + + accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 8; + + accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ + accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + + accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + + accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + + accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + + /* + * tail + */ + blkCnt = CMPLX_DIM * (numSamples & 7); + do { + mve_pred16_t p = vctp16q(blkCnt); + + pSrcA += 8; + pSrcB += 8; + + vecSrcA = vldrhq_z_s16(pSrcA, p); + vecSrcB = vldrhq_z_s16(pSrcB, p); + + accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p); + accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p); + + blkCnt -= 8; + } + while ((int32_t) blkCnt > 0); + } else { + blkCnt = numSamples * CMPLX_DIM; + while (blkCnt > 0) { + mve_pred16_t p = vctp16q(blkCnt); + + vecSrcA = vldrhq_z_s16(pSrcA, p); + vecSrcB = vldrhq_z_s16(pSrcB, p); + + accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p); + accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p); + + /* + * Decrement the blkCnt loop counter + * Advance vector source and destination pointers + */ + pSrcA += 8; + pSrcB += 8; + blkCnt -= 8; + } + } + *realResult = asrl(accReal, (14 - 8)); + *imagResult = asrl(accImag, (14 - 8)); } #else void arm_cmplx_dot_prod_q15( diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c index 61618ca9..4620503b 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c +++ 
b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c @@ -64,60 +64,99 @@ void arm_cmplx_dot_prod_q31( q63_t * realResult, q63_t * imagResult) { - uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */ - uint32_t blkCnt; - q31x4_t vecSrcA, vecSrcB; - q63_t accReal = 0LL; - q63_t accImag = 0LL; + int32_t blkCnt; + q63_t accReal = 0LL; + q63_t accImag = 0LL; + q31x4_t vecSrcA, vecSrcB; + q31x4_t vecSrcC, vecSrcD; + + blkCnt = numSamples >> 2; + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q(pSrcA); + vecSrcB = vld1q(pSrcB); + pSrcA += 4; + pSrcB += 4; - q31_t a0,b0,c0,d0; + while (blkCnt > 0) { - /* Compute 2 complex samples at a time */ - blkCnt = blockSize >> 2U; + accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 4; - while (blkCnt > 0U) - { + accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 4; - vecSrcA = vld1q(pSrcA); - vecSrcB = vld1q(pSrcB); + accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 4; + accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + + accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + + accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); /* - * Decrement the blkCnt loop counter - * Advance vector source and destination pointers + * tail */ - pSrcA += 4; - pSrcB += 4; - blkCnt --; + blkCnt = CMPLX_DIM * (numSamples & 3); + do { + mve_pred16_t p = vctp32q(blkCnt); + + pSrcA += 4; + pSrcB += 4; + + vecSrcA = vldrwq_z_s32(pSrcA, p); + vecSrcB = vldrwq_z_s32(pSrcB, p); + + accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p); + accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p); + + blkCnt -= 4; + } + while ((int32_t) blkCnt > 0); + } else { + blkCnt = numSamples * CMPLX_DIM; + while (blkCnt > 0) { + mve_pred16_t p = vctp32q(blkCnt); + + vecSrcA = vldrwq_z_s32(pSrcA, p); + vecSrcB = vldrwq_z_s32(pSrcB, p); + + accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p); + accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p); + + /* + * Decrement the blkCnt loop counter + * Advance vector source and destination pointers + */ + pSrcA += 4; + pSrcB += 4; + blkCnt -= 4; + } } + *realResult = asrl(accReal, (14 - 8)); + *imagResult = asrl(accImag, (14 - 8)); - accReal = asrl(accReal, (14 - 8)); - accImag = asrl(accImag, (14 - 8)); - - /* Tail */ - blkCnt = (blockSize & 3) >> 1; - - while (blkCnt > 0U) - { - a0 = *pSrcA++; - b0 = *pSrcA++; - c0 = *pSrcB++; - d0 = *pSrcB++; - - accReal += ((q63_t)a0 * c0) >> 14; - accImag += ((q63_t)a0 * d0) >> 14; - accReal -= ((q63_t)b0 * d0) >> 14; - accImag += ((q63_t)b0 * c0) >> 14; - - /* Decrement loop counter */ - blkCnt--; - } - - /* Store real and imaginary result in destination buffer. 
*/ - *realResult = accReal; - *imagResult = accImag; } #else diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c index 14fde480..9ce7d8a6 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c @@ -78,51 +78,105 @@ void arm_cmplx_mult_cmplx_f16( float16_t * pDst, uint32_t numSamples) { - int32_t blkCnt; /* loop counters */ - int32_t blockSize = numSamples; - f16x8_t vecA; - f16x8_t vecB; - f16x8_t vecDst; - - blkCnt = blockSize * CMPLX_DIM; - blkCnt = blkCnt >> 3; - - while (blkCnt > 0) - { - vecA = vldrhq_f16(pSrcA); - vecB = vldrhq_f16(pSrcB); - /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ - vecDst = vcmulq(vecA, vecB); - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ - vecDst = vcmlaq_rot90(vecDst, vecA, vecB); - vstrhq_f16(pDst, vecDst); - - blkCnt--; + int32_t blkCnt; + f16x8_t vecSrcA, vecSrcB; + f16x8_t vecSrcC, vecSrcD; + f16x8_t vec_acc; + + blkCnt = (numSamples >> 3); + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q(pSrcA); + vecSrcB = vld1q(pSrcB); pSrcA += 8; pSrcB += 8; + + while (blkCnt > 0) { + vec_acc = vcmulq(vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 8; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 8; + vst1q(pDst, vec_acc); + pDst += 8; + + vec_acc = vcmulq(vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 8; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 8; + vst1q(pDst, vec_acc); + pDst += 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ + vec_acc = vcmulq(vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + vst1q(pDst, vec_acc); pDst += 8; - } - _Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */ - /* Tail */ - blkCnt = (blockSize & 7) >> 1; - while (blkCnt > 0) - { - /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */ - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */ - - a = *pSrcA++; - b = *pSrcA++; - c = *pSrcB++; - d = *pSrcB++; - - /* store result in destination buffer. 
*/ - *pDst++ = (a * c) - (b * d); - *pDst++ = (a * d) + (b * c); - - /* Decrement loop counter */ - blkCnt--; + vec_acc = vcmulq(vecSrcC, vecSrcD); + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); + vst1q(pDst, vec_acc); + pDst += 8; + + /* + * tail + */ + blkCnt = CMPLX_DIM * (numSamples & 7); + while (blkCnt > 0) { + mve_pred16_t p = vctp16q(blkCnt); + pSrcA += 8; + pSrcB += 8; + + vecSrcA = vldrhq_z_f16(pSrcA, p); + vecSrcB = vldrhq_z_f16(pSrcB, p); + vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + + vstrhq_p_f16(pDst, vec_acc, p); + pDst += 8; + + blkCnt -= 8; + } + } else { + /* small vector */ + blkCnt = numSamples * CMPLX_DIM; + + do { + mve_pred16_t p = vctp16q(blkCnt); + + vecSrcA = vldrhq_z_f16(pSrcA, p); + vecSrcB = vldrhq_z_f16(pSrcB, p); + + vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + vstrhq_p_f16(pDst, vec_acc, p); + pDst += 8; + + /* + * Decrement the blkCnt loop counter + * Advance vector source and destination pointers + */ + pSrcA += 8; + pSrcB += 8; + blkCnt -= 8; + } + while (blkCnt > 0); } + } diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c index 6f2e2955..e53f82e2 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c @@ -76,54 +76,104 @@ void arm_cmplx_mult_cmplx_f32( float32_t * pDst, uint32_t numSamples) { - uint32_t blkCnt; /* loop counters */ - uint32_t blockSize = numSamples; /* loop counters */ - float32_t a, b, c, d; /* Temporary variables to store real and imaginary values */ - - f32x4x2_t vecA; - f32x4x2_t vecB; - f32x4x2_t vecDst; - - /* Compute 4 complex outputs at a time */ - blkCnt = blockSize >> 2; - while (blkCnt > 0U) - { - vecA = vld2q(pSrcA); // load & separate real/imag pSrcA (de-interleave 2) - vecB = vld2q(pSrcB); // load & separate real/imag pSrcB - pSrcA += 8; - pSrcB += 8; - - /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ - vecDst.val[0] = vmulq(vecA.val[0], vecB.val[0]); - vecDst.val[0] = vfmsq(vecDst.val[0],vecA.val[1], vecB.val[1]); - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ - vecDst.val[1] = vmulq(vecA.val[0], vecB.val[1]); - vecDst.val[1] = vfmaq(vecDst.val[1], vecA.val[1], vecB.val[0]); - - vst2q(pDst, vecDst); - pDst += 8; - + int32_t blkCnt; + f32x4_t vecSrcA, vecSrcB; + f32x4_t vecSrcC, vecSrcD; + f32x4_t vec_acc; + + blkCnt = numSamples >> 2; + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q(pSrcA); + vecSrcB = vld1q(pSrcB); + pSrcA += 4; + pSrcB += 4; + + while (blkCnt > 0) { + vec_acc = vcmulq(vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 4; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 4; + vst1q(pDst, vec_acc); + pDst += 4; + + vec_acc = vcmulq(vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 4; + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 4; + vst1q(pDst, vec_acc); + pDst += 4; + /* + * Decrement the blockSize loop counter + */ blkCnt--; - } - - /* Tail */ - blkCnt = blockSize & 3; - while (blkCnt > 0U) - { - /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */ - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. 
*/ - - a = *pSrcA++; - b = *pSrcA++; - c = *pSrcB++; - d = *pSrcB++; - - /* store result in destination buffer. */ - *pDst++ = (a * c) - (b * d); - *pDst++ = (a * d) + (b * c); - - /* Decrement loop counter */ - blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ + vec_acc = vcmulq(vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + + vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + vst1q(pDst, vec_acc); + pDst += 4; + + vec_acc = vcmulq(vecSrcC, vecSrcD); + vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); + vst1q(pDst, vec_acc); + pDst += 4; + + /* + * tail + */ + blkCnt = CMPLX_DIM * (numSamples & 3); + while (blkCnt > 0) { + mve_pred16_t p = vctp32q(blkCnt); + pSrcA += 4; + pSrcB += 4; + + vecSrcA = vldrwq_z_f32(pSrcA, p); + vecSrcB = vldrwq_z_f32(pSrcB, p); + vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + + vstrwq_p_f32(pDst, vec_acc, p); + pDst += 4; + + blkCnt -= 4; + } + } else { + /* small vector */ + blkCnt = numSamples * CMPLX_DIM; + vec_acc = vdupq_n_f32(0.0f); + + do { + mve_pred16_t p = vctp32q(blkCnt); + + vecSrcA = vldrwq_z_f32(pSrcA, p); + vecSrcB = vldrwq_z_f32(pSrcB, p); + + vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p); + vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); + vstrwq_p_f32(pDst, vec_acc, p); + pDst += 4; + + /* + * Decrement the blkCnt loop counter + * Advance vector source and destination pointers + */ + pSrcA += 4; + pSrcB += 4; + blkCnt -= 4; + } + while (blkCnt > 0); } } diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c index 69c1a457..521180f8 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c @@ -57,54 +57,116 @@ void arm_cmplx_mult_cmplx_q15( q15_t * pDst, uint32_t numSamples) { - uint32_t blkCnt; /* loop counters */ - uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */ - q15_t a, b, c, d; - - q15x8_t vecA; - q15x8_t vecB; - q15x8_t vecDst; - - blkCnt = blockSize >> 3; - while (blkCnt > 0U) - { - vecA = vld1q(pSrcA); - vecB = vld1q(pSrcB); - /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ - vecDst = vqdmlsdhq_s16(vuninitializedq_s16(), vecA, vecB); - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ - vecDst = vqdmladhxq_s16(vecDst, vecA, vecB); - - vecDst = vshrq(vecDst, 2); - - vst1q(pDst, vecDst); - - blkCnt --; - pSrcA += 8; - pSrcB += 8; - pDst += 8; - }; - - /* - * tail - */ - blkCnt = (blockSize & 7) >> 1; - while (blkCnt > 0U) - { - /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */ - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */ - - a = *pSrcA++; - b = *pSrcA++; - c = *pSrcB++; - d = *pSrcB++; - - /* store result in 3.13 format in destination buffer. 
*/ - *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) ); - *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) ); - - /* Decrement loop counter */ - blkCnt--; + int32_t blkCnt; + q15x8_t vecSrcA, vecSrcB; + q15x8_t vecSrcC, vecSrcD; + q15x8_t vecDst; + + blkCnt = (numSamples >> 3); + blkCnt -= 1; + if (blkCnt > 0) + { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q(pSrcA); + vecSrcB = vld1q(pSrcB); + pSrcA += 8; + pSrcB += 8; + + while (blkCnt > 0) + { + + /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ + vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 8; + + /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ + vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 8; + + vstrhq_s16(pDst, vshrq(vecDst, 2)); + pDst += 8; + + vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 8; + + vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 8; + + vstrhq_s16(pDst, vshrq(vecDst, 2)); + pDst += 8; + + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ + vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + + vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + + vstrhq_s16(pDst, vshrq(vecDst, 2)); + pDst += 8; + + vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD); + vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD); + + vstrhq_s16(pDst, vshrq(vecDst, 2)); + pDst += 8; + + /* + * tail + */ + blkCnt = CMPLX_DIM * (numSamples & 7); + do + { + mve_pred16_t p = vctp16q(blkCnt); + + pSrcA += 8; + pSrcB += 8; + + vecSrcA = vldrhq_z_s16(pSrcA, p); + vecSrcB = vldrhq_z_s16(pSrcB, p); + + vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p); + vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p); + + vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p); + vstrhq_p_s16(pDst, vecDst, p); + pDst += 8; + + blkCnt -= 8; + } + while ((int32_t) blkCnt > 0); + } + else + { + blkCnt = numSamples * CMPLX_DIM; + while (blkCnt > 0) { + mve_pred16_t p = vctp16q(blkCnt); + + vecSrcA = vldrhq_z_s16(pSrcA, p); + vecSrcB = vldrhq_z_s16(pSrcB, p); + + vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p); + vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p); + + vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p); + vstrhq_p_s16(pDst, vecDst, p); + + pDst += 8; + pSrcA += 8; + pSrcB += 8; + + blkCnt -= 8; + } } } #else diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c index b2910786..05f57fad 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c @@ -57,52 +57,111 @@ void arm_cmplx_mult_cmplx_q31( q31_t * pDst, uint32_t numSamples) { - - uint32_t blkCnt; /* loop counters */ - uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */ - q31x4_t vecA; - q31x4_t vecB; - q31x4_t vecDst; - q31_t a, b, c, d; /* Temporary variables */ - - /* Compute 2 complex outputs at a time */ - blkCnt = blockSize >> 2; - while (blkCnt > 0U) - { - - vecA = vld1q(pSrcA); - vecB = vld1q(pSrcB); - /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. 
*/ - vecDst = vqdmlsdhq(vuninitializedq_s32(),vecA, vecB); - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ - vecDst = vqdmladhxq(vecDst, vecA, vecB); - - vecDst = vshrq(vecDst, 2); - vst1q(pDst, vecDst); - - blkCnt --; + int32_t blkCnt; + q31x4_t vecSrcA, vecSrcB; + q31x4_t vecSrcC, vecSrcD; + q31x4_t vecDst; + + blkCnt = numSamples >> 2; + blkCnt -= 1; + if (blkCnt > 0) { + /* should give more freedom to generate stall free code */ + vecSrcA = vld1q(pSrcA); + vecSrcB = vld1q(pSrcB); pSrcA += 4; pSrcB += 4; + + while (blkCnt > 0) { + + /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ + vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + pSrcA += 4; + + /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ + vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + pSrcB += 4; + + vst1q(pDst, vshrq(vecDst, 2)); + pDst += 4; + + vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD); + vecSrcA = vld1q(pSrcA); + pSrcA += 4; + + vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD); + vecSrcB = vld1q(pSrcB); + pSrcB += 4; + + vst1q(pDst, vshrq(vecDst, 2)); + pDst += 4; + + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + + /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ + vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB); + vecSrcC = vld1q(pSrcA); + + vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB); + vecSrcD = vld1q(pSrcB); + + vst1q(pDst, vshrq(vecDst, 2)); + pDst += 4; + + vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD); + vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD); + + vst1q(pDst, vshrq(vecDst, 2)); pDst += 4; - }; - - blkCnt = (blockSize & 3) >> 1; - while (blkCnt > 0U) - { - /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */ - /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */ - - a = *pSrcA++; - b = *pSrcA++; - c = *pSrcB++; - d = *pSrcB++; - - /* store result in 3.29 format in destination buffer. 
*/ - *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) ); - *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) ); - - /* Decrement loop counter */ - blkCnt--; + + /* + * tail + */ + blkCnt = CMPLX_DIM * (numSamples & 3); + do { + mve_pred16_t p = vctp32q(blkCnt); + + pSrcA += 4; + pSrcB += 4; + + vecSrcA = vldrwq_z_s32(pSrcA, p); + vecSrcB = vldrwq_z_s32(pSrcB, p); + + vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p); + vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p); + + vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p); + vstrwq_p_s32(pDst, vecDst, p); + pDst += 4; + + blkCnt -= 4; + } + while ((int32_t) blkCnt > 0); + } else { + blkCnt = numSamples * CMPLX_DIM; + while (blkCnt > 0) { + mve_pred16_t p = vctp32q(blkCnt); + + vecSrcA = vldrwq_z_s32(pSrcA, p); + vecSrcB = vldrwq_z_s32(pSrcB, p); + + vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p); + vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p); + + vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p); + vstrwq_p_s32(pDst, vecDst, p); + + pDst += 4; + pSrcA += 4; + pSrcB += 4; + + blkCnt -= 4; + } } } #else diff --git a/Testing/Source/Tests/BIQUADQ31.cpp b/Testing/Source/Tests/BIQUADQ31.cpp index 6249c931..96524c34 100755 --- a/Testing/Source/Tests/BIQUADQ31.cpp +++ b/Testing/Source/Tests/BIQUADQ31.cpp @@ -11,7 +11,7 @@ #define ABS_32x64_ERROR_Q31 ((q31_t)25) -void checkInnerTail(q31_t *b) +static void checkInnerTail(q31_t *b) { ASSERT_TRUE(b[0] == 0); ASSERT_TRUE(b[1] == 0); diff --git a/Testing/Source/Tests/FIRF16.cpp b/Testing/Source/Tests/FIRF16.cpp index 0933c743..a3a847fd 100755 --- a/Testing/Source/Tests/FIRF16.cpp +++ b/Testing/Source/Tests/FIRF16.cpp @@ -16,7 +16,7 @@ a double precision computation. static __ALIGNED(8) float16_t coeffArray[32]; #endif -void checkInnerTail(float16_t *b) +static void checkInnerTail(float16_t *b) { ASSERT_TRUE(b[0] == 0.0f); ASSERT_TRUE(b[1] == 0.0f); diff --git a/Testing/Source/Tests/FIRF32.cpp b/Testing/Source/Tests/FIRF32.cpp index 42960203..8db410e3 100644 --- a/Testing/Source/Tests/FIRF32.cpp +++ b/Testing/Source/Tests/FIRF32.cpp @@ -16,7 +16,7 @@ a double precision computation. static __ALIGNED(8) float32_t coeffArray[32]; #endif -void checkInnerTail(float32_t *b) +static void checkInnerTail(float32_t *b) { ASSERT_TRUE(b[0] == 0.0f); ASSERT_TRUE(b[1] == 0.0f); diff --git a/Testing/Source/Tests/FIRQ15.cpp b/Testing/Source/Tests/FIRQ15.cpp index 91ef8bb6..26123069 100644 --- a/Testing/Source/Tests/FIRQ15.cpp +++ b/Testing/Source/Tests/FIRQ15.cpp @@ -10,7 +10,7 @@ static __ALIGNED(8) q15_t coeffArray[32]; #endif -void checkInnerTail(q15_t *b) +static void checkInnerTail(q15_t *b) { ASSERT_TRUE(b[0] == 0); ASSERT_TRUE(b[1] == 0); diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp index 80f8195e..0070c671 100644 --- a/Testing/Source/Tests/FIRQ31.cpp +++ b/Testing/Source/Tests/FIRQ31.cpp @@ -10,7 +10,7 @@ static __ALIGNED(8) q31_t coeffArray[32]; #endif -void checkInnerTail(q31_t *b) +static void checkInnerTail(q31_t *b) { ASSERT_TRUE(b[0] == 0); ASSERT_TRUE(b[1] == 0); diff --git a/Testing/Source/Tests/FIRQ7.cpp b/Testing/Source/Tests/FIRQ7.cpp index 4b76b630..11d6ec7e 100644 --- a/Testing/Source/Tests/FIRQ7.cpp +++ b/Testing/Source/Tests/FIRQ7.cpp @@ -10,7 +10,7 @@ static __ALIGNED(8) q7_t coeffArray[32]; #endif -void checkInnerTail(q7_t *b) +static void checkInnerTail(q7_t *b) { ASSERT_TRUE(b[0] == 0); ASSERT_TRUE(b[1] == 0);
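
For reference only (not part of the patch): a minimal scalar sketch of what the vectorized `arm_cmplx_dot_prod_*` kernels above compute. It mirrors the per-sample formula from the scalar tail code that the patch removes from `arm_cmplx_dot_prod_f32.c`; in the MVE versions, `vcmlaq`/`vcmlaq_rot90` accumulate the same products per complex lane pair, with real parts in even lanes and imaginary parts in odd lanes (hence the final `vgetq_lane` sums). The function name and plain `float` types here are illustrative, not CMSIS-DSP API.

```c
#include <stdint.h>

/* Illustrative scalar reference for the complex dot product computed by the
   MVE kernels in this patch: interleaved [re, im] inputs, separate real and
   imaginary results. Name and types are illustrative only. */
static void ref_cmplx_dot_prod_f32(const float *pSrcA,
                                   const float *pSrcB,
                                   uint32_t numSamples,
                                   float *realResult,
                                   float *imagResult)
{
    float real_sum = 0.0f;
    float imag_sum = 0.0f;

    for (uint32_t n = 0; n < numSamples; n++) {
        float a = pSrcA[2U * n];        /* A real */
        float b = pSrcA[2U * n + 1U];   /* A imag */
        float c = pSrcB[2U * n];        /* B real */
        float d = pSrcB[2U * n + 1U];   /* B imag */

        real_sum += (a * c) - (b * d);  /* accumulate Re{A * B} */
        imag_sum += (a * d) + (b * c);  /* accumulate Im{A * B} */
    }

    *realResult = real_sum;
    *imagResult = imag_sum;
}
```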
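Likewise, a scalar sketch of the element-wise complex multiplication handled by the `arm_cmplx_mult_cmplx_*` kernels, matching the removed scalar tail of `arm_cmplx_mult_cmplx_f32.c` (the q15/q31 variants additionally down-shift the products into 3.13 and 3.29 output formats, as their removed comments note). Again, the name and types are illustrative, not library API.

```c
#include <stdint.h>

/* Illustrative scalar reference for element-wise complex multiplication of two
   interleaved [re, im] buffers, as performed by the vectorized kernels above.
   Name and types are illustrative only. */
static void ref_cmplx_mult_cmplx_f32(const float *pSrcA,
                                     const float *pSrcB,
                                     float *pDst,
                                     uint32_t numSamples)
{
    for (uint32_t n = 0; n < numSamples; n++) {
        float a = pSrcA[2U * n];        /* A real */
        float b = pSrcA[2U * n + 1U];   /* A imag */
        float c = pSrcB[2U * n];        /* B real */
        float d = pSrcB[2U * n + 1U];   /* B imag */

        pDst[2U * n]      = (a * c) - (b * d);  /* C real */
        pDst[2U * n + 1U] = (a * d) + (b * c);  /* C imag */
    }
}
```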