CMSIS-DSP: Use __PKHBT intrinsics for FFTs

This makes the FFTs a few percent faster when built with GCC.

Change-Id: I9620370aa5098bb10e04d84a7fed2167e7734aeb
pull/19/head
Richard Allen 6 years ago committed by Christophe Favergeon
parent fe79fc5a00
commit 9e4b2a9358

@ -794,7 +794,7 @@ void arm_cfft_radix4by2_q15(
out2 = __SMUAD(coeff, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) );
}
#else /* #if defined (ARM_MATH_DSP) */
@ -893,7 +893,7 @@ void arm_cfft_radix4by2_inverse_q15(
out2 = __SMUSD(__QSUB(0, coeff), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
write_q15x2_ia (&pSl, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
}
#else /* #if defined (ARM_MATH_DSP) */

@ -260,7 +260,7 @@ void arm_radix4_butterfly_q15(
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
/* Butterfly calculations */
/* U = packed(yd, xd) */
@ -300,7 +300,7 @@ void arm_radix4_butterfly_q15(
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xb', yb') in little endian format */
write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
/* co3 & si3 are read from SIMD Coefficient pointer */
C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
@ -319,7 +319,7 @@ void arm_radix4_butterfly_q15(
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xd', yd') in little endian format */
write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
@ -417,7 +417,7 @@ void arm_radix4_butterfly_q15(
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
pSi1 += 2 * n1;
/* Butterfly calculations */
@ -454,7 +454,7 @@ void arm_radix4_butterfly_q15(
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
pSi2 += 2 * n1;
/* Butterfly process for the i0+3fftLen/4 sample */
@ -469,7 +469,7 @@ void arm_radix4_butterfly_q15(
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
pSi3 += 2 * n1;
}
}
@ -1126,7 +1126,7 @@ void arm_radix4_butterfly_inverse_q15(
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
/* Butterfly calculations */
/* U = packed(yd, xd) */
@ -1166,7 +1166,7 @@ void arm_radix4_butterfly_inverse_q15(
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xb', yb') in little endian format */
write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
/* co3 & si3 are read from SIMD Coefficient pointer */
C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
@ -1185,7 +1185,7 @@ void arm_radix4_butterfly_inverse_q15(
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xd', yd') in little endian format */
write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
@ -1283,7 +1283,7 @@ void arm_radix4_butterfly_inverse_q15(
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
pSi1 += 2 * n1;
/* Butterfly calculations */
@ -1318,7 +1318,7 @@ void arm_radix4_butterfly_inverse_q15(
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
pSi2 += 2 * n1;
/* Butterfly process for the i0+3fftLen/4 sample */
@ -1333,7 +1333,7 @@ void arm_radix4_butterfly_inverse_q15(
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
pSi3 += 2 * n1;
}
}

Loading…
Cancel
Save