CMSIS-DSP: Improved Helium CFFT Radix4 butterflies

pull/19/head
FabKlein 4 years ago committed by Christophe Favergeon
parent 0f4ac797f1
commit 168d055297

@@ -108,28 +108,26 @@ static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float1
n2 >>= 2u;
for (int k = fftLen / 4u; k > 1; k >>= 2)
{
float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float16_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float16_t const *pW1, *pW2, *pW3;
float16_t *inA = pSrc + CMPLX_DIM * i * n1;
float16_t *inB = inA + n2 * CMPLX_DIM;
float16_t *inC = inB + n2 * CMPLX_DIM;
float16_t *inD = inC + n2 * CMPLX_DIM;
f16x8_t vecW;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
float16_t *inA = pBase;
float16_t *inB = inA + n2 * CMPLX_DIM;
float16_t *inC = inB + n2 * CMPLX_DIM;
float16_t *inD = inC + n2 * CMPLX_DIM;
float16_t const *pW1 = p_rearranged_twiddle_tab_stride1;
float16_t const *pW2 = p_rearranged_twiddle_tab_stride2;
float16_t const *pW3 = p_rearranged_twiddle_tab_stride3;
f16x8_t vecW;
blkCnt = n2 / 4;
/*
@@ -198,6 +196,7 @@ static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float1
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;
@@ -300,7 +299,6 @@ static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 *
f16x8_t vecTmp0, vecTmp1;
f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f16x8_t vecA, vecB, vecC, vecD;
f16x8_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
@@ -317,26 +315,27 @@ static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 *
n2 >>= 2u;
for (int k = fftLen / 4; k > 1; k >>= 2)
{
float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float16_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float16_t const *pW1, *pW2, *pW3;
float16_t *inA = pSrc + CMPLX_DIM * i * n1;
float16_t *inB = inA + n2 * CMPLX_DIM;
float16_t *inC = inB + n2 * CMPLX_DIM;
float16_t *inD = inC + n2 * CMPLX_DIM;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
float16_t *inA = pBase;
float16_t *inB = inA + n2 * CMPLX_DIM;
float16_t *inC = inB + n2 * CMPLX_DIM;
float16_t *inD = inC + n2 * CMPLX_DIM;
float16_t const *pW1 = p_rearranged_twiddle_tab_stride1;
float16_t const *pW2 = p_rearranged_twiddle_tab_stride2;
float16_t const *pW3 = p_rearranged_twiddle_tab_stride3;
f16x8_t vecW;
blkCnt = n2 / 4;
/*
@@ -404,6 +403,7 @@ static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 *
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;

@@ -91,13 +91,13 @@ static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
f32x4_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
int32_t iter = 1;
f32x4_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
int32_t iter = 1;
static const int32_t strides[4] = {
(0 - 16) * (int32_t)sizeof(q31_t *),
(1 - 16) * (int32_t)sizeof(q31_t *),
@@ -110,29 +110,28 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
n2 >>= 2u;
for (int k = fftLen / 4u; k > 1; k >>= 2)
{
float32_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
float32_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t const *pW1, *pW2, *pW3;
float32_t *inA = pSrc + CMPLX_DIM * i * n1;
float32_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM;
float32_t *inA = pBase;
float32_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM;
float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
f32x4_t vecW;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 2;
/*
* load 2 f32 complex pair
@@ -200,6 +199,7 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;
@@ -302,7 +302,6 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
f32x4_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD;
f32x4_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
@@ -319,26 +318,27 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
n2 >>= 2u;
for (int k = fftLen / 4; k > 1; k >>= 2)
{
float32_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
float32_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t const *pW1, *pW2, *pW3;
float32_t *inA = pSrc + CMPLX_DIM * i * n1;
float32_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
float32_t *inA = pBase;
float32_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM;
float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
f32x4_t vecW;
blkCnt = n2 / 2;
/*
@@ -406,6 +406,7 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;

@@ -41,7 +41,6 @@ static void _arm_radix4_butterfly_q15_mve(
q15x8_t vecTmp0, vecTmp1;
q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
q15x8_t vecA, vecB, vecC, vecD;
q15x8_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
@@ -61,25 +60,26 @@ static void _arm_radix4_butterfly_q15_mve(
for (int k = fftLen / 4u; k > 1; k >>= 2u)
{
q15_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q15_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
q15_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q15_t const *pW1, *pW2, *pW3;
q15_t *inA = pSrc + CMPLX_DIM * i * n1;
q15_t *inA = pBase;
q15_t *inB = inA + n2 * CMPLX_DIM;
q15_t *inC = inB + n2 * CMPLX_DIM;
q15_t *inD = inC + n2 * CMPLX_DIM;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
q15_t const *pW1 = p_rearranged_twiddle_tab_stride1;
q15_t const *pW2 = p_rearranged_twiddle_tab_stride2;
q15_t const *pW3 = p_rearranged_twiddle_tab_stride3;
q15x8_t vecW;
blkCnt = n2 / 4;
/*
@@ -147,6 +147,7 @@ static void _arm_radix4_butterfly_q15_mve(
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;
@@ -276,7 +277,6 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
q15x8_t vecTmp0, vecTmp1;
q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
q15x8_t vecA, vecB, vecC, vecD;
q15x8_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
@@ -297,25 +297,27 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
for (int k = fftLen / 4u; k > 1; k >>= 2u)
{
q15_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q15_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
q15_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q15_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q15_t const *pW1, *pW2, *pW3;
q15_t *inA = pSrc + CMPLX_DIM * i * n1;
q15_t *inA = pBase;
q15_t *inB = inA + n2 * CMPLX_DIM;
q15_t *inC = inB + n2 * CMPLX_DIM;
q15_t *inD = inC + n2 * CMPLX_DIM;
q15_t const *pW1 = p_rearranged_twiddle_tab_stride1;
q15_t const *pW2 = p_rearranged_twiddle_tab_stride2;
q15_t const *pW3 = p_rearranged_twiddle_tab_stride3;
q15x8_t vecW;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 4;
/*
@@ -382,6 +384,7 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;

@@ -43,7 +43,6 @@ static void _arm_radix4_butterfly_q31_mve(
q31x4_t vecTmp0, vecTmp1;
q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
q31x4_t vecA, vecB, vecC, vecD;
q31x4_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
@@ -64,25 +63,27 @@ static void _arm_radix4_butterfly_q31_mve(
for (int k = fftLen / 4u; k > 1; k >>= 2u)
{
q31_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q31_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
q31_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q31_t const *pW1, *pW2, *pW3;
q31_t *inA = pSrc + CMPLX_DIM * i * n1;
q31_t *inA = pBase;
q31_t *inB = inA + n2 * CMPLX_DIM;
q31_t *inC = inB + n2 * CMPLX_DIM;
q31_t *inD = inC + n2 * CMPLX_DIM;
q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
q31x4_t vecW;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 2;
/*
@@ -149,6 +150,7 @@ static void _arm_radix4_butterfly_q31_mve(
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;
@@ -293,7 +295,6 @@ static void _arm_radix4_butterfly_inverse_q31_mve(
q31x4_t vecTmp0, vecTmp1;
q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
q31x4_t vecA, vecB, vecC, vecD;
q31x4_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
@@ -313,26 +314,26 @@ static void _arm_radix4_butterfly_inverse_q31_mve(
for (int k = fftLen / 4u; k > 1; k >>= 2u)
{
q31_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q31_t * pBase = pSrc;
for (int i = 0; i < iter; i++)
{
q31_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
q31_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
q31_t const *pW1, *pW2, *pW3;
q31_t *inA = pSrc + CMPLX_DIM * i * n1;
q31_t *inA = pBase;
q31_t *inB = inA + n2 * CMPLX_DIM;
q31_t *inC = inB + n2 * CMPLX_DIM;
q31_t *inD = inC + n2 * CMPLX_DIM;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
q31x4_t vecW;
blkCnt = n2 / 2;
/*
@@ -399,6 +400,7 @@ static void _arm_radix4_butterfly_inverse_q31_mve(
blkCnt--;
}
pBase += CMPLX_DIM * n1;
}
n1 = n2;
n2 >>= 2u;

Loading…
Cancel
Save