|
|
|
|
@ -57,52 +57,111 @@ void arm_cmplx_mult_cmplx_q31(
|
|
|
|
|
q31_t * pDst,
|
|
|
|
|
uint32_t numSamples)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
uint32_t blkCnt; /* loop counters */
|
|
|
|
|
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
|
|
|
|
|
q31x4_t vecA;
|
|
|
|
|
q31x4_t vecB;
|
|
|
|
|
q31x4_t vecDst;
|
|
|
|
|
q31_t a, b, c, d; /* Temporary variables */
|
|
|
|
|
|
|
|
|
|
/* Compute 2 complex outputs at a time */
|
|
|
|
|
blkCnt = blockSize >> 2;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
vecA = vld1q(pSrcA);
|
|
|
|
|
vecB = vld1q(pSrcB);
|
|
|
|
|
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
|
|
|
|
|
vecDst = vqdmlsdhq(vuninitializedq_s32(),vecA, vecB);
|
|
|
|
|
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
|
|
|
|
|
vecDst = vqdmladhxq(vecDst, vecA, vecB);
|
|
|
|
|
|
|
|
|
|
vecDst = vshrq(vecDst, 2);
|
|
|
|
|
vst1q(pDst, vecDst);
|
|
|
|
|
|
|
|
|
|
blkCnt --;
|
|
|
|
|
int32_t blkCnt;
|
|
|
|
|
q31x4_t vecSrcA, vecSrcB;
|
|
|
|
|
q31x4_t vecSrcC, vecSrcD;
|
|
|
|
|
q31x4_t vecDst;
|
|
|
|
|
|
|
|
|
|
blkCnt = numSamples >> 2;
|
|
|
|
|
blkCnt -= 1;
|
|
|
|
|
if (blkCnt > 0) {
|
|
|
|
|
/* should give more freedom to generate stall free code */
|
|
|
|
|
vecSrcA = vld1q(pSrcA);
|
|
|
|
|
vecSrcB = vld1q(pSrcB);
|
|
|
|
|
pSrcA += 4;
|
|
|
|
|
pSrcB += 4;
|
|
|
|
|
|
|
|
|
|
while (blkCnt > 0) {
|
|
|
|
|
|
|
|
|
|
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
|
|
|
|
|
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
|
|
|
|
|
vecSrcC = vld1q(pSrcA);
|
|
|
|
|
pSrcA += 4;
|
|
|
|
|
|
|
|
|
|
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
|
|
|
|
|
vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
|
|
|
|
|
vecSrcD = vld1q(pSrcB);
|
|
|
|
|
pSrcB += 4;
|
|
|
|
|
|
|
|
|
|
vst1q(pDst, vshrq(vecDst, 2));
|
|
|
|
|
pDst += 4;
|
|
|
|
|
|
|
|
|
|
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
|
|
|
|
|
vecSrcA = vld1q(pSrcA);
|
|
|
|
|
pSrcA += 4;
|
|
|
|
|
|
|
|
|
|
vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
|
|
|
|
|
vecSrcB = vld1q(pSrcB);
|
|
|
|
|
pSrcB += 4;
|
|
|
|
|
|
|
|
|
|
vst1q(pDst, vshrq(vecDst, 2));
|
|
|
|
|
pDst += 4;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Decrement the blockSize loop counter
|
|
|
|
|
*/
|
|
|
|
|
blkCnt--;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
|
|
|
|
|
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
|
|
|
|
|
vecSrcC = vld1q(pSrcA);
|
|
|
|
|
|
|
|
|
|
vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
|
|
|
|
|
vecSrcD = vld1q(pSrcB);
|
|
|
|
|
|
|
|
|
|
vst1q(pDst, vshrq(vecDst, 2));
|
|
|
|
|
pDst += 4;
|
|
|
|
|
|
|
|
|
|
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
|
|
|
|
|
vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
|
|
|
|
|
|
|
|
|
|
vst1q(pDst, vshrq(vecDst, 2));
|
|
|
|
|
pDst += 4;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
blkCnt = (blockSize & 3) >> 1;
|
|
|
|
|
while (blkCnt > 0U)
|
|
|
|
|
{
|
|
|
|
|
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
|
|
|
|
|
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
|
|
|
|
|
|
|
|
|
|
a = *pSrcA++;
|
|
|
|
|
b = *pSrcA++;
|
|
|
|
|
c = *pSrcB++;
|
|
|
|
|
d = *pSrcB++;
|
|
|
|
|
|
|
|
|
|
/* store result in 3.29 format in destination buffer. */
|
|
|
|
|
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
|
|
|
|
|
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
|
|
|
|
|
|
|
|
|
|
/* Decrement loop counter */
|
|
|
|
|
blkCnt--;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* tail
|
|
|
|
|
*/
|
|
|
|
|
blkCnt = CMPLX_DIM * (numSamples & 3);
|
|
|
|
|
do {
|
|
|
|
|
mve_pred16_t p = vctp32q(blkCnt);
|
|
|
|
|
|
|
|
|
|
pSrcA += 4;
|
|
|
|
|
pSrcB += 4;
|
|
|
|
|
|
|
|
|
|
vecSrcA = vldrwq_z_s32(pSrcA, p);
|
|
|
|
|
vecSrcB = vldrwq_z_s32(pSrcB, p);
|
|
|
|
|
|
|
|
|
|
vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
|
|
|
|
|
vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
|
|
|
|
|
|
|
|
|
|
vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
|
|
|
|
|
vstrwq_p_s32(pDst, vecDst, p);
|
|
|
|
|
pDst += 4;
|
|
|
|
|
|
|
|
|
|
blkCnt -= 4;
|
|
|
|
|
}
|
|
|
|
|
while ((int32_t) blkCnt > 0);
|
|
|
|
|
} else {
|
|
|
|
|
blkCnt = numSamples * CMPLX_DIM;
|
|
|
|
|
while (blkCnt > 0) {
|
|
|
|
|
mve_pred16_t p = vctp32q(blkCnt);
|
|
|
|
|
|
|
|
|
|
vecSrcA = vldrwq_z_s32(pSrcA, p);
|
|
|
|
|
vecSrcB = vldrwq_z_s32(pSrcB, p);
|
|
|
|
|
|
|
|
|
|
vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
|
|
|
|
|
vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
|
|
|
|
|
|
|
|
|
|
vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
|
|
|
|
|
vstrwq_p_s32(pDst, vecDst, p);
|
|
|
|
|
|
|
|
|
|
pDst += 4;
|
|
|
|
|
pSrcA += 4;
|
|
|
|
|
pSrcB += 4;
|
|
|
|
|
|
|
|
|
|
blkCnt -= 4;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
|