CMSIS-DSP: New MVE implementations of the complex dot product and vector product.

pull/19/head
Christophe Favergeon 5 years ago
parent c4283d209f
commit 8fff9ebe29

@@ -86,42 +86,92 @@ void arm_cmplx_dot_prod_f16(
    float16_t * realResult,
    float16_t * imagResult)
{
    int32_t blkCnt;
    float16_t real_sum, imag_sum;
    f16x8_t vecSrcA, vecSrcB;
    f16x8_t vec_acc = vdupq_n_f16(0.0f16);
    f16x8_t vecSrcC, vecSrcD;

    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0) {
            vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;

            vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        while (blkCnt > 0) {
            mve_pred16_t p = vctp16q(blkCnt);
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            blkCnt -= 8;
        }
    } else {
        /* small vector */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f16(0.0f16);

        do {
            mve_pred16_t p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
        while (blkCnt > 0);
    }

    /* Sum the partial parts */

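Note on the intrinsics: with interleaved re/im data, vcmlaq accumulates a_re * b_re into even lanes and a_re * b_im into odd lanes, while vcmlaq_rot90 adds -a_im * b_im and a_im * b_re, so the pair performs one full complex multiply-accumulate per lane pair. A scalar model of the value the f16/f32 kernels accumulate (the helper name is made up for illustration):

/* Scalar model of the vectorized accumulation (illustration only). */
static void cmplx_dot_prod_ref(const float *pSrcA, const float *pSrcB,
                               unsigned numSamples,
                               float *realResult, float *imagResult)
{
    float re = 0.0f, im = 0.0f;
    for (unsigned i = 0; i < numSamples; i++) {
        float a = pSrcA[2 * i];      /* Re(A[i]) */
        float b = pSrcA[2 * i + 1];  /* Im(A[i]) */
        float c = pSrcB[2 * i];      /* Re(B[i]) */
        float d = pSrcB[2 * i + 1];  /* Im(B[i]) */
        re += a * c - b * d;         /* vcmlaq + vcmlaq_rot90, even lanes */
        im += a * d + b * c;         /* vcmlaq + vcmlaq_rot90, odd lanes  */
    }
    *realResult = re;
    *imagResult = im;
}
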
@@ -83,56 +83,94 @@ void arm_cmplx_dot_prod_f32(
    float32_t * realResult,
    float32_t * imagResult)
{
    int32_t blkCnt;
    float32_t real_sum, imag_sum;
    f32x4_t vecSrcA, vecSrcB;
    f32x4_t vec_acc = vdupq_n_f32(0.0f);
    f32x4_t vecSrcC, vecSrcD;

    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            blkCnt -= 4;
        }
    } else {
        /* small vector */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

    real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2);
    imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3);

    /*
     * Store the real and imaginary results in the destination buffers

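The blkCnt -= 1 before each main loop is the software-pipelining device used by every kernel in this commit: one vector pair is loaded in a prologue, each iteration issues the loads for the next pair between the complex MACs on the current one, and the peeled last pair is drained after the loop (the "process last elements out of the loop" block) so armclang keeps the schedule. A stripped-down sketch of the same shape applied to a plain f32 sum (illustrative only, not library code):

#include "arm_mve.h"

/* Prologue load, overlapped loop, epilogue, then a predicated tail. */
float pipelined_sum_f32(const float *pSrc, uint32_t n)
{
    f32x4_t acc = vdupq_n_f32(0.0f);
    int32_t blkCnt = ((int32_t)n >> 2) - 1;   /* peel one block for the epilogue */

    if (blkCnt > 0) {
        f32x4_t cur = vld1q(pSrc);            /* prologue: preload first block */
        pSrc += 4;
        while (blkCnt > 0) {
            f32x4_t nxt = vld1q(pSrc);        /* fetch the next block ...      */
            pSrc += 4;
            acc = vaddq(acc, cur);            /* ... while consuming this one  */
            cur = nxt;
            blkCnt--;
        }
        acc = vaddq(acc, cur);                /* epilogue: drain the last block */
        blkCnt = n & 3;
    } else {
        blkCnt = n;                           /* short input: predicated only  */
    }
    while (blkCnt > 0) {                      /* tail, zero-padded by predication */
        mve_pred16_t p = vctp32q(blkCnt);
        acc = vaddq_m(acc, acc, vldrwq_z_f32(pSrc, p), p);
        pSrc += 4;
        blkCnt -= 4;
    }
    return vgetq_lane(acc, 0) + vgetq_lane(acc, 1)
         + vgetq_lane(acc, 2) + vgetq_lane(acc, 3);
}
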
@@ -62,76 +62,98 @@ void arm_cmplx_dot_prod_q15(
    q31_t * realResult,
    q31_t * imagResult)
{
    int32_t blkCnt;
    q63_t accReal = 0LL;
    q63_t accImag = 0LL;
    q15x8_t vecSrcA, vecSrcB;
    q15x8_t vecSrcC, vecSrcD;

    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0) {
            accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;

            accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
        vecSrcA = vld1q(pSrcA);

        accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
        vecSrcB = vld1q(pSrcB);

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        do {
            mve_pred16_t p = vctp16q(blkCnt);

            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
            accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);

            blkCnt -= 8;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
            accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
    }
    *realResult = asrl(accReal, (14 - 8));
    *imagResult = asrl(accImag, (14 - 8));
}
#else
void arm_cmplx_dot_prod_q15(

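Here the whole-vector reductions replace the per-lane complex MACs: vmlsldavaq accumulates the sum of a[2i]*b[2i] - a[2i+1]*b[2i+1] and vmlaldavaxq ("exchange") the sum of a[2i]*b[2i+1] + a[2i+1]*b[2i], both into 64-bit scalars. The 2.30 products accumulate in 34.30 format, and the final shift by 6 yields the 8.24 result format documented for arm_cmplx_dot_prod_q15. A scalar model, mirroring the replaced scalar tail (helper name is illustrative):

#include "arm_math.h"

/* Scalar model of the q15 accumulation (illustration only). */
static void cmplx_dot_prod_q15_ref(const q15_t *pSrcA, const q15_t *pSrcB,
                                   uint32_t numSamples,
                                   q31_t *realResult, q31_t *imagResult)
{
    q63_t accReal = 0, accImag = 0;
    for (uint32_t i = 0; i < numSamples; i++) {
        q15_t a = pSrcA[2 * i], b = pSrcA[2 * i + 1];
        q15_t c = pSrcB[2 * i], d = pSrcB[2 * i + 1];
        accReal += (q31_t)a * c - (q31_t)b * d;   /* vmlsldavaq  */
        accImag += (q31_t)a * d + (q31_t)b * c;   /* vmlaldavaxq */
    }
    /* 34.30 accumulator -> 8.24 result, as the asrl by 6 above */
    *realResult = (q31_t)(accReal >> 6);
    *imagResult = (q31_t)(accImag >> 6);
}
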
@@ -64,60 +64,99 @@ void arm_cmplx_dot_prod_q31(
    q63_t * realResult,
    q63_t * imagResult)
{
    int32_t blkCnt;
    q63_t accReal = 0LL;
    q63_t accImag = 0LL;
    q31x4_t vecSrcA, vecSrcB;
    q31x4_t vecSrcC, vecSrcD;

    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
        vecSrcA = vld1q(pSrcA);

        accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
        vecSrcB = vld1q(pSrcB);

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        do {
            mve_pred16_t p = vctp32q(blkCnt);

            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
            accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);

            blkCnt -= 4;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
            accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
    }
    *realResult = asrl(accReal, (14 - 8));
    *imagResult = asrl(accImag, (14 - 8));
}
#else

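For q31, the rounding "high" variants vrmlsldavhaq / vrmlaldavhaxq accumulate the 32x32-bit dual products into a wide accumulator and retain only the top 64 bits, which already amounts to a shift by 8; the remaining asrl(acc, 14 - 8) completes the shift by 14 that brings the 2.62 products into the 16.48 result format documented for arm_cmplx_dot_prod_q31. A scalar model, taken from the replaced scalar tail (helper name is illustrative):

#include "arm_math.h"

/* Scalar model of the q31 accumulation (illustration only). */
static void cmplx_dot_prod_q31_ref(const q31_t *pSrcA, const q31_t *pSrcB,
                                   uint32_t numSamples,
                                   q63_t *realResult, q63_t *imagResult)
{
    q63_t accReal = 0, accImag = 0;
    for (uint32_t i = 0; i < numSamples; i++) {
        q31_t a = pSrcA[2 * i], b = pSrcA[2 * i + 1];
        q31_t c = pSrcB[2 * i], d = pSrcB[2 * i + 1];
        /* 2.62 products pre-shifted by 14 -> 16.48 accumulation */
        accReal += ((q63_t)a * c) >> 14;
        accReal -= ((q63_t)b * d) >> 14;
        accImag += ((q63_t)a * d) >> 14;
        accImag += ((q63_t)b * c) >> 14;
    }
    *realResult = accReal;
    *imagResult = accImag;
}
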
@@ -78,51 +78,105 @@ void arm_cmplx_mult_cmplx_f16(
    float16_t * pDst,
    uint32_t numSamples)
{
    int32_t blkCnt;
    f16x8_t vecSrcA, vecSrcB;
    f16x8_t vecSrcC, vecSrcD;
    f16x8_t vec_acc;

    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0) {
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;

            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 8;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 8;

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        while (blkCnt > 0) {
            mve_pred16_t p = vctp16q(blkCnt);
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f16(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;
            blkCnt -= 8;
        }
    } else {
        /* small vector */
        blkCnt = numSamples * CMPLX_DIM;

        do {
            mve_pred16_t p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f16(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;
            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
        while (blkCnt > 0);
    }
}

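The multiply kernels need no neutral accumulator in the predicated tail: vcmulq_m / vcmlaq_rot90_m fully overwrite every active lane, so vuninitializedq_f16() is a legal base, and the predicated store leaves the inactive destination lanes untouched. A minimal sketch of that predication pattern on its own (illustrative helper, not library code):

#include "arm_mve.h"

/* vctp16q(n) sets one predicate bit per remaining 16-bit lane;
   zeroing loads and predicated stores make the extra lanes harmless. */
void tail_copy_f16(const float16_t *pSrc, float16_t *pDst, int32_t n)
{
    while (n > 0) {
        mve_pred16_t p = vctp16q(n);        /* active lanes = min(n, 8)   */
        f16x8_t v = vldrhq_z_f16(pSrc, p);  /* inactive lanes read as 0   */
        vstrhq_p_f16(pDst, v, p);           /* inactive lanes not written */
        pSrc += 8;
        pDst += 8;
        n -= 8;
    }
}
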
@@ -76,54 +76,104 @@ void arm_cmplx_mult_cmplx_f32(
    float32_t * pDst,
    uint32_t numSamples)
{
    int32_t blkCnt;
    f32x4_t vecSrcA, vecSrcB;
    f32x4_t vecSrcC, vecSrcD;
    f32x4_t vec_acc;

    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;

            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 4;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 4;

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;
            blkCnt -= 4;
        }
    } else {
        /* small vector */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;
            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }
}

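For reference, the scalar definition these multiply kernels vectorize, as spelled out in the C[2*i] / C[2*i+1] comments of the fixed-point variants below (helper name is illustrative):

/* Scalar reference for the interleaved complex multiply (illustration only). */
static void cmplx_mult_cmplx_ref(const float *pSrcA, const float *pSrcB,
                                 float *pDst, unsigned numSamples)
{
    for (unsigned i = 0; i < numSamples; i++) {
        float a = pSrcA[2 * i], b = pSrcA[2 * i + 1];
        float c = pSrcB[2 * i], d = pSrcB[2 * i + 1];
        pDst[2 * i]     = a * c - b * d;  /* vcmulq + vcmlaq_rot90, even lanes */
        pDst[2 * i + 1] = a * d + b * c;  /* vcmulq + vcmlaq_rot90, odd lanes  */
    }
}
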
@@ -57,54 +57,116 @@ void arm_cmplx_mult_cmplx_q15(
    q15_t * pDst,
    uint32_t numSamples)
{
    int32_t blkCnt;
    q15x8_t vecSrcA, vecSrcB;
    q15x8_t vecSrcC, vecSrcD;
    q15x8_t vecDst;

    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0) {
            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;

            vstrhq_s16(pDst, vshrq(vecDst, 2));
            pDst += 8;

            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;

            vstrhq_s16(pDst, vshrq(vecDst, 2));
            pDst += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vstrhq_s16(pDst, vshrq(vecDst, 2));
        pDst += 8;

        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
        vstrhq_s16(pDst, vshrq(vecDst, 2));
        pDst += 8;

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        do {
            mve_pred16_t p = vctp16q(blkCnt);
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
            vstrhq_p_s16(pDst, vecDst, p);
            pDst += 8;

            blkCnt -= 8;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
            vstrhq_p_s16(pDst, vecDst, p);

            pDst += 8;
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
    }
}
#else

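vqdmlsdhq / vqdmladhxq are the saturating doubling multiply dual intrinsics (subtracting, and adding with exchange) that return the high halves of the doubled products: doubling plus high-half retrieval behaves like a shift right by 15 on each 32-bit product, and the extra vshrq(..., 2) makes it 17, producing the 3.13 output format noted in the replaced scalar tail. A scalar model (helper name is illustrative):

#include "arm_math.h"

/* Scalar model of the q15 multiply path (illustration only). */
static void cmplx_mult_cmplx_q15_ref(const q15_t *pSrcA, const q15_t *pSrcB,
                                     q15_t *pDst, uint32_t numSamples)
{
    for (uint32_t i = 0; i < numSamples; i++) {
        q15_t a = pSrcA[2 * i], b = pSrcA[2 * i + 1];
        q15_t c = pSrcB[2 * i], d = pSrcB[2 * i + 1];
        /* >> 17 overall: >> 15 from the doubling high-half, >> 2 from vshrq */
        pDst[2 * i]     = (q15_t)((((q31_t)a * c) >> 17) - (((q31_t)b * d) >> 17));
        pDst[2 * i + 1] = (q15_t)((((q31_t)a * d) >> 17) + (((q31_t)b * c) >> 17));
    }
}
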
@@ -57,52 +57,111 @@ void arm_cmplx_mult_cmplx_q31(
    q31_t * pDst,
    uint32_t numSamples)
{
    int32_t blkCnt;
    q31x4_t vecSrcA, vecSrcB;
    q31x4_t vecSrcC, vecSrcD;
    q31x4_t vecDst;

    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;

            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        do {
            mve_pred16_t p = vctp32q(blkCnt);
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);
            pDst += 4;

            blkCnt -= 4;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);

            pDst += 4;
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
    }
}
#else

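The q31 variant is the same construction one level up: the doubling high-half retrieval behaves like a shift right by 31 on each 64-bit product, and vshrq(..., 2) brings it to 33, producing the 3.29 output format noted in the replaced scalar tail. Scalar model (helper name is illustrative):

#include "arm_math.h"

/* Scalar model of the q31 multiply path (illustration only). */
static void cmplx_mult_cmplx_q31_ref(const q31_t *pSrcA, const q31_t *pSrcB,
                                     q31_t *pDst, uint32_t numSamples)
{
    for (uint32_t i = 0; i < numSamples; i++) {
        q31_t a = pSrcA[2 * i], b = pSrcA[2 * i + 1];
        q31_t c = pSrcB[2 * i], d = pSrcB[2 * i + 1];
        /* >> 33 overall: >> 31 from the doubling high-half, >> 2 from vshrq */
        pDst[2 * i]     = (q31_t)((((q63_t)a * c) >> 33) - (((q63_t)b * d) >> 33));
        pDst[2 * i + 1] = (q31_t)((((q63_t)a * d) >> 33) + (((q63_t)b * c) >> 33));
    }
}
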
@@ -11,7 +11,7 @@
#define ABS_32x64_ERROR_Q31 ((q31_t)25)

static void checkInnerTail(q31_t *b)
{
    ASSERT_TRUE(b[0] == 0);
    ASSERT_TRUE(b[1] == 0);

@@ -16,7 +16,7 @@ a double precision computation.
static __ALIGNED(8) float16_t coeffArray[32];
#endif

static void checkInnerTail(float16_t *b)
{
    ASSERT_TRUE(b[0] == 0.0f);
    ASSERT_TRUE(b[1] == 0.0f);

@@ -16,7 +16,7 @@ a double precision computation.
static __ALIGNED(8) float32_t coeffArray[32];
#endif

static void checkInnerTail(float32_t *b)
{
    ASSERT_TRUE(b[0] == 0.0f);
    ASSERT_TRUE(b[1] == 0.0f);

@@ -10,7 +10,7 @@
static __ALIGNED(8) q15_t coeffArray[32];
#endif

static void checkInnerTail(q15_t *b)
{
    ASSERT_TRUE(b[0] == 0);
    ASSERT_TRUE(b[1] == 0);

@@ -10,7 +10,7 @@
static __ALIGNED(8) q31_t coeffArray[32];
#endif

static void checkInnerTail(q31_t *b)
{
    ASSERT_TRUE(b[0] == 0);
    ASSERT_TRUE(b[1] == 0);

@@ -10,7 +10,7 @@
static __ALIGNED(8) q7_t coeffArray[32];
#endif

static void checkInnerTail(q7_t *b)
{
    ASSERT_TRUE(b[0] == 0);
    ASSERT_TRUE(b[1] == 0);
