CMSIS-DSP: Add preliminary cfft 16 MVE code.

pull/19/head
Christophe Favergeon 6 years ago
parent b543f5c61c
commit 86a272902a

@ -441,9 +441,12 @@ extern "C"
/** /**
* @brief 16-bit floating-point type definition. * @brief 16-bit floating-point type definition.
* This is already defined in arm_mve.h
*/ */
#if !defined (ARM_MATH_HELIUM) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_MVEI)
typedef __fp16 float16_t; typedef __fp16 float16_t;
#endif
/** /**
* @brief 32-bit floating-point type definition. * @brief 32-bit floating-point type definition.

@ -4,8 +4,7 @@
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc * Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
* used for MVE implementation only * used for MVE implementation only
* *
* $Date: 08. January 2020 * $Date: 14. April 2020
* $Revision: V1.7.0
* *
* Target Processor: Cortex-M cores * Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */ * -------------------------------------------------------------------- */
@ -98,6 +97,67 @@ extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
/*
 * Float16 rearranged twiddle tables.
 *
 * Only declared when ARM_MATH_MVEF is defined and ARM_MATH_AUTOVECTORIZE is
 * not. One group of six arrays is declared per table length
 * (16, 64, 256, 1024, 4096):
 *   - rearranged_twiddle_tab_stride{1,2,3}_arr_<N>_f16 : uint32_t arrays
 *   - rearranged_twiddle_stride{1,2,3}_<N>_f16         : float16_t arrays
 *
 * Each group's guard accepts the ARM_TABLE_TWIDDLECOEF_F16_<N> macro and the
 * one for the next length up (2*N), so a single group can be enabled by either.
 *
 * NOTE(review): these are declarations only; the definitions are presumably in
 * the matching generated .c file - confirm the array sizes stay in sync.
 */
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
/* Length 16 tables (also enabled by the length-32 table macro). */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f16[2];
extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f16[2];
extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f16[2];
extern float16_t rearranged_twiddle_stride1_16_f16[8];
extern float16_t rearranged_twiddle_stride2_16_f16[8];
extern float16_t rearranged_twiddle_stride3_16_f16[8];
#endif
/* Length 64 tables (also enabled by the length-128 table macro). */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f16[3];
extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f16[3];
extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f16[3];
extern float16_t rearranged_twiddle_stride1_64_f16[40];
extern float16_t rearranged_twiddle_stride2_64_f16[40];
extern float16_t rearranged_twiddle_stride3_64_f16[40];
#endif
/* Length 256 tables (also enabled by the length-512 table macro). */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f16[4];
extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f16[4];
extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f16[4];
extern float16_t rearranged_twiddle_stride1_256_f16[168];
extern float16_t rearranged_twiddle_stride2_256_f16[168];
extern float16_t rearranged_twiddle_stride3_256_f16[168];
#endif
/* Length 1024 tables (also enabled by the length-2048 table macro). */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f16[5];
extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f16[5];
extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f16[5];
extern float16_t rearranged_twiddle_stride1_1024_f16[680];
extern float16_t rearranged_twiddle_stride2_1024_f16[680];
extern float16_t rearranged_twiddle_stride3_1024_f16[680];
#endif
/* Length 4096 tables (also enabled by the length-8192 table macro). */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) || defined(ARM_TABLE_TWIDDLECOEF_F16_8192)
extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f16[6];
extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f16[6];
extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f16[6];
extern float16_t rearranged_twiddle_stride1_4096_f16[2728];
extern float16_t rearranged_twiddle_stride2_4096_f16[2728];
extern float16_t rearranged_twiddle_stride3_4096_f16[2728];
#endif
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
#if defined(ARM_MATH_MVEI) #if defined(ARM_MATH_MVEI)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)

@ -19,9 +19,10 @@ condition="""#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES)
""" """
F32 = 1 F32 = 1
Q31 = 2 F16 = 2
Q15 = 3 Q31 = 3
Q7 = 4 Q15 = 4
Q7 = 5
def printCUInt32Array(f,name,arr): def printCUInt32Array(f,name,arr):
nb = 0 nb = 0
@ -51,6 +52,20 @@ def printCFloat32Array(f,name,arr):
print("};\n",file=f) print("};\n",file=f)
def printCFloat16Array(f,name,arr):
    """Write ``arr`` to ``f`` as the C definition of a ``float16_t`` array.

    The output is ``float16_t <name>[<len(arr)>]={`` on its own line, then
    every element formatted as ``(float16_t)<value>f,`` with 20 decimal
    digits, then ``};`` followed by an empty line.

    f    -- writable text stream, used as ``print(..., file=f)``
    name -- C identifier for the array
    arr  -- sized iterable of numbers (``len()`` and ``%f`` formatting are
            applied to it)

    COLLIM is not defined in this function; it is read from module scope and
    acts as the column budget for one line of initialisers.
    """
    # Width, in characters, of the initialiser line currently being written.
    nb = 0
    print("float16_t %s[%d]={" % (name,len(arr)),file=f)
    for d in arr:
        val = "(float16_t)%.20ff," % d
        nb = nb + len(val)
        if nb > COLLIM:
            # Budget exceeded: break the line and restart the count with the
            # value that is about to be written.
            print("",file=f)
            nb = len(val)
        print(val,end="",file=f)
    print("};\n",file=f)
def printCQ31Array(f,name,arr): def printCQ31Array(f,name,arr):
nb = 0 nb = 0
print("q31_t %s[%d]={" % (name,len(arr)),file=f) print("q31_t %s[%d]={" % (name,len(arr)),file=f)
@ -99,6 +114,9 @@ def printHUInt32Array(f,name,arr):
def printHFloat32Array(f,name,arr): def printHFloat32Array(f,name,arr):
print("extern float32_t %s[%d];" % (name,len(arr)),file=f) print("extern float32_t %s[%d];" % (name,len(arr)),file=f)
def printHFloat16Array(f,name,arr):
    """Write the ``extern`` declaration of a ``float16_t`` array to ``f``.

    Emits a single line ``extern float16_t <name>[<len(arr)>];``. Only the
    length of ``arr`` is used; its elements are not written.

    f    -- writable text stream, used as ``print(..., file=f)``
    name -- C identifier for the array
    arr  -- sized container whose ``len()`` gives the declared array size
    """
    print("extern float16_t %s[%d];" % (name,len(arr)),file=f)
def printHQ31Array(f,name,arr): def printHQ31Array(f,name,arr):
print("extern q31_t %s[%d];" % (name,len(arr)),file=f) print("extern q31_t %s[%d];" % (name,len(arr)),file=f)
@ -225,6 +243,30 @@ def reorderTwiddle(theType,conjugate,f,h,n):
print("#endif\n",file=f) print("#endif\n",file=f)
print("#endif\n",file=h) print("#endif\n",file=h)
# F16 SECTION FOR THIS FFT LENGTH
if theType == F16:
print(condition % ("F16",n, "F16",n << 1),file=f)
print(condition % ("F16",n, "F16",n << 1),file=h)
printCUInt32Array(f,"rearranged_twiddle_tab_stride1_arr_%d_f16" % n,list(tab1Offset))
printHUInt32Array(h,"rearranged_twiddle_tab_stride1_arr_%d_f16" % n,list(tab1Offset))
printCUInt32Array(f,"rearranged_twiddle_tab_stride2_arr_%d_f16" % n,list(tab2Offset))
printHUInt32Array(h,"rearranged_twiddle_tab_stride2_arr_%d_f16" % n,list(tab2Offset))
printCUInt32Array(f,"rearranged_twiddle_tab_stride3_arr_%d_f16" % n,list(tab3Offset))
printHUInt32Array(h,"rearranged_twiddle_tab_stride3_arr_%d_f16" % n,list(tab3Offset))
printCFloat16Array(f,"rearranged_twiddle_stride1_%d_f16" % n,list(tab1))
printHFloat16Array(h,"rearranged_twiddle_stride1_%d_f16" % n,list(tab1))
printCFloat16Array(f,"rearranged_twiddle_stride2_%d_f16" % n,list(tab2))
printHFloat16Array(h,"rearranged_twiddle_stride2_%d_f16" % n,list(tab2))
printCFloat16Array(f,"rearranged_twiddle_stride3_%d_f16" % n,list(tab3))
printHFloat16Array(h,"rearranged_twiddle_stride3_%d_f16" % n,list(tab3))
print("#endif\n",file=f)
print("#endif\n",file=h)
# Q31 SECTION FOR THIS FFT LENGTH # Q31 SECTION FOR THIS FFT LENGTH
if theType == Q31: if theType == Q31:
print(condition % ("Q31",n, "Q31",n << 1),file=f) print(condition % ("Q31",n, "Q31",n << 1),file=f)
@ -285,8 +327,7 @@ cheader="""/* ------------------------------------------------------------------
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc * Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
* used for MVE implementation only * used for MVE implementation only
* *
* $Date: 08. January 2020 * $Date: 14. April 2020
* $Revision: V1.7.0
* *
* Target Processor: Cortex-M cores * Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */ * -------------------------------------------------------------------- */
@ -342,8 +383,7 @@ hheader="""/* ------------------------------------------------------------------
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc * Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
* used for MVE implementation only * used for MVE implementation only
* *
* $Date: 08. January 2020 * $Date: 14. April 2020
* $Revision: V1.7.0
* *
* Target Processor: Cortex-M cores * Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */ * -------------------------------------------------------------------- */
@ -423,6 +463,16 @@ with open(args.f,'w') as f:
print(cfooterMVEF,file=f) print(cfooterMVEF,file=f)
print(hfooterMVEF,file=h) print(hfooterMVEF,file=h)
print(cifdeMVEF,file=f)
print(hifdefMVEF,file=h)
reorderTwiddle(F16,False,f,h,16)
reorderTwiddle(F16,False,f,h,64)
reorderTwiddle(F16,False,f,h,256)
reorderTwiddle(F16,False,f,h,1024)
reorderTwiddle(F16,False,f,h,4096)
print(cfooterMVEF,file=f)
print(hfooterMVEF,file=h)
print(cifdeMVEI,file=f) print(cifdeMVEI,file=f)
print(hifdefMVEI,file=h) print(hifdefMVEI,file=h)
reorderTwiddle(Q31,True,f,h,16) reorderTwiddle(Q31,True,f,h,16)

File diff suppressed because it is too large Load Diff

@ -29,54 +29,54 @@
#include "arm_math.h" #include "arm_math.h"
#include "arm_common_tables.h" #include "arm_common_tables.h"
//#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if 0
#include "arm_helium_utils.h" #include "arm_helium_utils.h"
#include "arm_vec_fft.h" #include "arm_vec_fft.h"
#include "arm_mve_tables.h" #include "arm_mve_tables.h"
static float32_t arm_inverse_fft_length_f32(uint16_t fftLen) static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
{ {
float32_t retValue=1.0; float16_t retValue=1.0;
switch (fftLen) switch (fftLen)
{ {
case 4096U: case 4096U:
retValue = 0.000244140625; retValue = (float16_t)0.000244140625;
break; break;
case 2048U: case 2048U:
retValue = 0.00048828125; retValue = (float16_t)0.00048828125;
break; break;
case 1024U: case 1024U:
retValue = 0.0009765625f; retValue = (float16_t)0.0009765625f;
break; break;
case 512U: case 512U:
retValue = 0.001953125; retValue = (float16_t)0.001953125;
break; break;
case 256U: case 256U:
retValue = 0.00390625f; retValue = (float16_t)0.00390625f;
break; break;
case 128U: case 128U:
retValue = 0.0078125; retValue = (float16_t)0.0078125;
break; break;
case 64U: case 64U:
retValue = 0.015625f; retValue = (float16_t)0.015625f;
break; break;
case 32U: case 32U:
retValue = 0.03125; retValue = (float16_t)0.03125;
break; break;
case 16U: case 16U:
retValue = 0.0625f; retValue = (float16_t)0.0625f;
break; break;
@ -87,54 +87,80 @@ static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
} }
static void arm_bitreversal_32_inpl_mve( static void arm_bitreversal_f16_inpl_mve(
uint32_t *pSrc, uint16_t *pSrc,
const uint16_t bitRevLen, const uint16_t bitRevLen,
const uint16_t *pBitRevTab) const uint16_t *pBitRevTab)
{ {
uint64_t *src = (uint64_t *) pSrc; uint32_t *src = (uint32_t *)pSrc;
uint32_t blkCnt; /* loop counters */ uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff; uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1); uint16x8_t one = vdupq_n_u16(1);
blkCnt = (bitRevLen / 2) / 2; blkCnt = (bitRevLen / 2) / 4;
while (blkCnt > 0U) { while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u32(pBitRevTab); bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 4; pBitRevTab += 8;
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one); uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one); uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1); bitRevOff1 = bitRevOff1 >> 3;
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2); bitRevOff2 = bitRevOff2 >> 3;
vstrdq_scatter_offset_u64(src, bitRevOff1, in2); uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
vstrdq_scatter_offset_u64(src, bitRevOff2, in1); uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
/* /*
* Decrement the blockSize loop counter * Decrement the blockSize loop counter
*/ */
blkCnt--; blkCnt--;
} }
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
} }
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen) static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
{ {
f32x4_t vecTmp0, vecTmp1; f16x8_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1; f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD; f16x8_t vecA, vecB, vecC, vecD;
uint32_t blkCnt; uint32_t blkCnt;
uint32_t n1, n2; uint32_t n1, n2;
uint32_t stage = 0; uint32_t stage = 0;
int32_t iter = 1; int32_t iter = 1;
static const uint32_t strides[4] = { static const uint32_t strides[4] =
(0 - 16) * sizeof(q31_t *), {(0 - 16) * sizeof(float16_t *)
(1 - 16) * sizeof(q31_t *), , (4 - 16) * sizeof(float16_t *)
(8 - 16) * sizeof(q31_t *), , (8 - 16) * sizeof(float16_t *)
(9 - 16) * sizeof(q31_t *) , (12 - 16) * sizeof(float16_t *)};
};
n2 = fftLen; n2 = fftLen;
n1 = n2; n1 = n2;
@ -143,37 +169,37 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
{ {
for (int i = 0; i < iter; i++) for (int i = 0; i < iter; i++)
{ {
float32_t const *p_rearranged_twiddle_tab_stride1 = float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[ &S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]]; S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 = float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[ &S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]]; S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 = float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[ &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]]; S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t const *pW1, *pW2, *pW3; float16_t const *pW1, *pW2, *pW3;
float32_t *inA = pSrc + CMPLX_DIM * i * n1; float16_t *inA = pSrc + CMPLX_DIM * i * n1;
float32_t *inB = inA + n2 * CMPLX_DIM; float16_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM; float16_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM; float16_t *inD = inC + n2 * CMPLX_DIM;
f32x4_t vecW; f16x8_t vecW;
pW1 = p_rearranged_twiddle_tab_stride1; pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2; pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3; pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 2; blkCnt = n2 / 4;
/* /*
* load 2 f32 complex pair * load 2 f16 complex pair
*/ */
vecA = vldrwq_f32(inA); vecA = vldrhq_f16(inA);
vecC = vldrwq_f32(inC); vecC = vldrhq_f16(inC);
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
vecB = vldrwq_f32(inB); vecB = vldrhq_f16(inB);
vecD = vldrwq_f32(inD); vecD = vldrhq_f16(inD);
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
@ -185,7 +211,7 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
*/ */
vecTmp0 = vecSum0 + vecSum1; vecTmp0 = vecSum0 + vecSum1;
vst1q(inA, vecTmp0); vst1q(inA, vecTmp0);
inA += 4; inA += 8;
/* /*
* [ 1 -1 1 -1 ] * [ A B C D ]' * [ 1 -1 1 -1 ] * [ A B C D ]'
@ -195,10 +221,10 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
* [ 1 -1 1 -1 ] * [ A B C D ]'.* W2 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
*/ */
vecW = vld1q(pW2); vecW = vld1q(pW2);
pW2 += 4; pW2 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
vst1q(inB, vecTmp1); vst1q(inB, vecTmp1);
inB += 4; inB += 8;
/* /*
* [ 1 -i -1 +i ] * [ A B C D ]' * [ 1 -i -1 +i ] * [ A B C D ]'
@ -208,10 +234,10 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
* [ 1 -i -1 +i ] * [ A B C D ]'.* W1 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
*/ */
vecW = vld1q(pW1); vecW = vld1q(pW1);
pW1 +=4; pW1 +=8;
vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
vst1q(inC, vecTmp1); vst1q(inC, vecTmp1);
inC += 4; inC += 8;
/* /*
* [ 1 +i -1 -i ] * [ A B C D ]' * [ 1 +i -1 -i ] * [ A B C D ]'
@ -221,13 +247,13 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
* [ 1 +i -1 -i ] * [ A B C D ]'.* W3 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
*/ */
vecW = vld1q(pW3); vecW = vld1q(pW3);
pW3 += 4; pW3 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0); vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
vst1q(inD, vecTmp1); vst1q(inD, vecTmp1);
inD += 4; inD += 8;
vecA = vldrwq_f32(inA); vecA = vldrhq_f16(inA);
vecC = vldrwq_f32(inC); vecC = vldrhq_f16(inC);
blkCnt--; blkCnt--;
} }
@ -246,35 +272,35 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
/* load scheduling */ /* load scheduling */
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
blkCnt = (fftLen >> 3); blkCnt = (fftLen >> 4);
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
vecB = vldrwq_gather_base_f32(vecScGathAddr, 8); vecB = vldrwq_gather_base_f32(vecScGathAddr, 4);
vecD = vldrwq_gather_base_f32(vecScGathAddr, 24); vecD = vldrwq_gather_base_f32(vecScGathAddr, 12);
vecSum1 = vecB + vecD; vecSum1 = vecB + vecD;
vecDiff1 = vecB - vecD; vecDiff1 = vecB - vecD;
/* pre-load for next iteration */ /* pre-load for next iteration */
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
vecTmp0 = vecSum0 + vecSum1; vecTmp0 = vecSum0 + vecSum1;
vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
vecTmp0 = vecSum0 - vecSum1; vecTmp0 = vecSum0 - vecSum1;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, vecTmp0);
vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, vecTmp0);
blkCnt--; blkCnt--;
} }
@ -284,15 +310,15 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
*/ */
} }
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen) static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{ {
float32_t const *pCoefVec; float16_t const *pCoefVec;
float32_t const *pCoef = S->pTwiddle; float16_t const *pCoef = S->pTwiddle;
float32_t *pIn0, *pIn1; float16_t *pIn0, *pIn1;
uint32_t n2; uint32_t n2;
uint32_t blkCnt; uint32_t blkCnt;
f32x4_t vecIn0, vecIn1, vecSum, vecDiff; f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
f32x4_t vecCmplxTmp, vecTw; f16x8_t vecCmplxTmp, vecTw;
n2 = fftLen >> 1; n2 = fftLen >> 1;
@ -300,49 +326,49 @@ static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_
pIn1 = pSrc + fftLen; pIn1 = pSrc + fftLen;
pCoefVec = pCoef; pCoefVec = pCoef;
blkCnt = n2 / 2; blkCnt = n2 / 4;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
vecIn0 = *(f32x4_t *) pIn0; vecIn0 = *(f16x8_t *) pIn0;
vecIn1 = *(f32x4_t *) pIn1; vecIn1 = *(f16x8_t *) pIn1;
vecTw = vld1q(pCoefVec); vecTw = vld1q(pCoefVec);
pCoefVec += 4; pCoefVec += 8;
vecSum = vecIn0 + vecIn1; vecSum = vaddq(vecIn0, vecIn1);
vecDiff = vecIn0 - vecIn1; vecDiff = vsubq(vecIn0, vecIn1);
vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff); vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
vst1q(pIn0, vecSum); vst1q(pIn0, vecSum);
pIn0 += 4; pIn0 += 8;
vst1q(pIn1, vecCmplxTmp); vst1q(pIn1, vecCmplxTmp);
pIn1 += 4; pIn1 += 8;
blkCnt--; blkCnt--;
} }
_arm_radix4_butterfly_f32_mve(S, pSrc, n2); _arm_radix4_butterfly_f16_mve(S, pSrc, n2);
_arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2); _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
pIn0 = pSrc; pIn0 = pSrc;
} }
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen) static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
{ {
f32x4_t vecTmp0, vecTmp1; f16x8_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1; f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD; f16x8_t vecA, vecB, vecC, vecD;
f32x4_t vecW; f16x8_t vecW;
uint32_t blkCnt; uint32_t blkCnt;
uint32_t n1, n2; uint32_t n1, n2;
uint32_t stage = 0; uint32_t stage = 0;
int32_t iter = 1; int32_t iter = 1;
static const uint32_t strides[4] = { static const uint32_t strides[4] = {
(0 - 16) * sizeof(q31_t *), (0 - 16) * sizeof(q31_t *),
(1 - 16) * sizeof(q31_t *), (4 - 16) * sizeof(q31_t *),
(8 - 16) * sizeof(q31_t *), (8 - 16) * sizeof(q31_t *),
(9 - 16) * sizeof(q31_t *) (12 - 16) * sizeof(q31_t *)
}; };
n2 = fftLen; n2 = fftLen;
@ -352,35 +378,35 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
{ {
for (int i = 0; i < iter; i++) for (int i = 0; i < iter; i++)
{ {
float32_t const *p_rearranged_twiddle_tab_stride1 = float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[ &S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]]; S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 = float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[ &S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]]; S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 = float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[ &S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]]; S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t const *pW1, *pW2, *pW3; float16_t const *pW1, *pW2, *pW3;
float32_t *inA = pSrc + CMPLX_DIM * i * n1; float16_t *inA = pSrc + CMPLX_DIM * i * n1;
float32_t *inB = inA + n2 * CMPLX_DIM; float16_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM; float16_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM; float16_t *inD = inC + n2 * CMPLX_DIM;
pW1 = p_rearranged_twiddle_tab_stride1; pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2; pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3; pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 2; blkCnt = n2 / 4;
/* /*
* load 2 f32 complex pair * load 2 f32 complex pair
*/ */
vecA = vldrwq_f32(inA); vecA = vldrhq_f16(inA);
vecC = vldrwq_f32(inC); vecC = vldrhq_f16(inC);
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
vecB = vldrwq_f32(inB); vecB = vldrhq_f16(inB);
vecD = vldrwq_f32(inD); vecD = vldrhq_f16(inD);
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
@ -392,7 +418,7 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
*/ */
vecTmp0 = vecSum0 + vecSum1; vecTmp0 = vecSum0 + vecSum1;
vst1q(inA, vecTmp0); vst1q(inA, vecTmp0);
inA += 4; inA += 8;
/* /*
* [ 1 -1 1 -1 ] * [ A B C D ]' * [ 1 -1 1 -1 ] * [ A B C D ]'
*/ */
@ -401,10 +427,10 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* [ 1 -1 1 -1 ] * [ A B C D ]'.* W1 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
*/ */
vecW = vld1q(pW2); vecW = vld1q(pW2);
pW2 += 4; pW2 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
vst1q(inB, vecTmp1); vst1q(inB, vecTmp1);
inB += 4; inB += 8;
/* /*
* [ 1 -i -1 +i ] * [ A B C D ]' * [ 1 -i -1 +i ] * [ A B C D ]'
@ -414,10 +440,10 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* [ 1 -i -1 +i ] * [ A B C D ]'.* W2 * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
*/ */
vecW = vld1q(pW1); vecW = vld1q(pW1);
pW1 += 4; pW1 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
vst1q(inC, vecTmp1); vst1q(inC, vecTmp1);
inC += 4; inC += 8;
/* /*
* [ 1 +i -1 -i ] * [ A B C D ]' * [ 1 +i -1 -i ] * [ A B C D ]'
@ -427,13 +453,13 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* [ 1 +i -1 -i ] * [ A B C D ]'.* W3 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
*/ */
vecW = vld1q(pW3); vecW = vld1q(pW3);
pW3 += 4; pW3 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0); vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
vst1q(inD, vecTmp1); vst1q(inD, vecTmp1);
inD += 4; inD += 8;
vecA = vldrwq_f32(inA); vecA = vldrhq_f16(inA);
vecC = vldrwq_f32(inC); vecC = vldrhq_f16(inC);
blkCnt--; blkCnt--;
} }
@ -454,7 +480,7 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* load scheduling * load scheduling
*/ */
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
blkCnt = (fftLen >> 3); blkCnt = (fftLen >> 3);
while (blkCnt > 0U) while (blkCnt > 0U)
@ -462,14 +488,14 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
vecB = vldrwq_gather_base_f32(vecScGathAddr, 8); vecB = vldrwq_gather_base_f32(vecScGathAddr, 4);
vecD = vldrwq_gather_base_f32(vecScGathAddr, 24); vecD = vldrwq_gather_base_f32(vecScGathAddr, 12);
vecSum1 = vecB + vecD; vecSum1 = vecB + vecD;
vecDiff1 = vecB - vecD; vecDiff1 = vecB - vecD;
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16); vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
vecTmp0 = vecSum0 + vecSum1; vecTmp0 = vecSum0 + vecSum1;
vecTmp0 = vecTmp0 * onebyfftLen; vecTmp0 = vecTmp0 * onebyfftLen;
@ -477,15 +503,15 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
vecTmp0 = vecSum0 - vecSum1; vecTmp0 = vecSum0 - vecSum1;
vecTmp0 = vecTmp0 * onebyfftLen; vecTmp0 = vecTmp0 * onebyfftLen;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, vecTmp0);
vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
vecTmp0 = vecTmp0 * onebyfftLen; vecTmp0 = vecTmp0 * onebyfftLen;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
vecTmp0 = vecTmp0 * onebyfftLen; vecTmp0 = vecTmp0 * onebyfftLen;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0); vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, vecTmp0);
blkCnt--; blkCnt--;
} }
@ -495,16 +521,16 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
*/ */
} }
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen) static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
{ {
float32_t const *pCoefVec; float16_t const *pCoefVec;
float32_t const *pCoef = S->pTwiddle; float16_t const *pCoef = S->pTwiddle;
float32_t *pIn0, *pIn1; float16_t *pIn0, *pIn1;
uint32_t n2; uint32_t n2;
float32_t onebyfftLen = arm_inverse_fft_length_f32(fftLen); float16_t onebyfftLen = arm_inverse_fft_length_f16(fftLen);
uint32_t blkCnt; uint32_t blkCnt;
f32x4_t vecIn0, vecIn1, vecSum, vecDiff; f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
f32x4_t vecCmplxTmp, vecTw; f16x8_t vecCmplxTmp, vecTw;
n2 = fftLen >> 1; n2 = fftLen >> 1;
@ -512,13 +538,13 @@ static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,f
pIn1 = pSrc + fftLen; pIn1 = pSrc + fftLen;
pCoefVec = pCoef; pCoefVec = pCoef;
blkCnt = n2 / 2; blkCnt = n2 / 4;
while (blkCnt > 0U) while (blkCnt > 0U)
{ {
vecIn0 = *(f32x4_t *) pIn0; vecIn0 = *(f16x8_t *) pIn0;
vecIn1 = *(f32x4_t *) pIn1; vecIn1 = *(f16x8_t *) pIn1;
vecTw = vld1q(pCoefVec); vecTw = vld1q(pCoefVec);
pCoefVec += 4; pCoefVec += 8;
vecSum = vecIn0 + vecIn1; vecSum = vecIn0 + vecIn1;
vecDiff = vecIn0 - vecIn1; vecDiff = vecIn0 - vecIn1;
@ -526,16 +552,16 @@ static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,f
vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff); vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
vst1q(pIn0, vecSum); vst1q(pIn0, vecSum);
pIn0 += 4; pIn0 += 8;
vst1q(pIn1, vecCmplxTmp); vst1q(pIn1, vecCmplxTmp);
pIn1 += 4; pIn1 += 8;
blkCnt--; blkCnt--;
} }
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen); _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen); _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
} }
@ -558,9 +584,9 @@ static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,f
*/ */
void arm_cfft_f32( void arm_cfft_f16(
const arm_cfft_instance_f32 * S, const arm_cfft_instance_f16 * S,
float32_t * pSrc, float16_t * pSrc,
uint8_t ifftFlag, uint8_t ifftFlag,
uint8_t bitReverseFlag) uint8_t bitReverseFlag)
{ {
@ -574,14 +600,14 @@ void arm_cfft_f32(
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen)); _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen); arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
break; break;
} }
} else { } else {
@ -591,14 +617,14 @@ void arm_cfft_f32(
case 256: case 256:
case 1024: case 1024:
case 4096: case 4096:
_arm_radix4_butterfly_f32_mve(S, pSrc, fftLen); _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
break; break;
case 32: case 32:
case 128: case 128:
case 512: case 512:
case 2048: case 2048:
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen); arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
break; break;
} }
} }
@ -607,12 +633,11 @@ void arm_cfft_f32(
if (bitReverseFlag) if (bitReverseFlag)
{ {
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable); arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
} }
} }
#else #else
extern void arm_bitreversal_16( extern void arm_bitreversal_16(

@ -54,8 +54,8 @@
#include "arm_const_structs.h" #include "arm_const_structs.h"
//#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if 0
#include "arm_vec_fft.h" #include "arm_vec_fft.h"
#include "arm_mve_tables.h" #include "arm_mve_tables.h"
@ -163,7 +163,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */ /* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
S->pTwiddle = (float32_t *)twiddleCoef_4096; S->pTwiddle = (float16_t *)twiddleCoef_4096;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break; break;
#endif #endif
@ -174,7 +174,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */ /* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
S->pTwiddle = (float32_t *)twiddleCoef_2048; S->pTwiddle = (float16_t *)twiddleCoef_2048;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break; break;
#endif #endif
@ -185,7 +185,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */ /* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
S->pTwiddle = (float32_t *)twiddleCoef_1024; S->pTwiddle = (float16_t *)twiddleCoef_1024;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break; break;
#endif #endif
@ -196,7 +196,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */ /* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
S->pTwiddle = (float32_t *)twiddleCoef_512; S->pTwiddle = (float16_t *)twiddleCoef_512;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break; break;
#endif #endif
@ -205,7 +205,7 @@ arm_status arm_cfft_init_f16(
case 256U: case 256U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
S->pTwiddle = (float32_t *)twiddleCoef_256; S->pTwiddle = (float16_t *)twiddleCoef_256;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break; break;
#endif #endif
@ -214,7 +214,7 @@ arm_status arm_cfft_init_f16(
case 128U: case 128U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
S->pTwiddle = (float32_t *)twiddleCoef_128; S->pTwiddle = (float16_t *)twiddleCoef_128;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break; break;
#endif #endif
@ -223,7 +223,7 @@ arm_status arm_cfft_init_f16(
case 64U: case 64U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
S->pTwiddle = (float32_t *)twiddleCoef_64; S->pTwiddle = (float16_t *)twiddleCoef_64;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break; break;
#endif #endif
@ -232,7 +232,7 @@ arm_status arm_cfft_init_f16(
case 32U: case 32U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
S->pTwiddle = (float32_t *)twiddleCoef_32; S->pTwiddle = (float16_t *)twiddleCoef_32;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break; break;
#endif #endif
@ -242,7 +242,7 @@ arm_status arm_cfft_init_f16(
/* Initializations of structure parameters for 16 point FFT */ /* Initializations of structure parameters for 16 point FFT */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH; S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16; S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
S->pTwiddle = (float32_t *)twiddleCoef_16; S->pTwiddle = (float16_t *)twiddleCoef_16;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1); status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break; break;
#endif #endif

@ -3,7 +3,7 @@
#include "Error.h" #include "Error.h"
#define SNR_THRESHOLD 62 #define SNR_THRESHOLD 62
#define SNR_DOTPROD_THRESHOLD 50 #define SNR_DOTPROD_THRESHOLD 40
/* /*

@ -52,8 +52,8 @@ int testmain()
// An IO runner is driven by some IO // An IO runner is driven by some IO
// In future one may have a client/server runner driven // In future one may have a client/server runner driven
// by a server running on a host. // by a server running on a host.
//Client::IORunner runner(&io,&mgr,Testing::kTestAndDump); Client::IORunner runner(&io,&mgr,Testing::kTestAndDump);
Client::IORunner runner(&io,&mgr,Testing::kTestOnly); //Client::IORunner runner(&io,&mgr,Testing::kTestOnly);
// Root object containing all the tests // Root object containing all the tests

@ -24,7 +24,7 @@ include(configCore)
function(configLib project cmsisRoot) function(configLib project cmsisRoot)
configcore(${project} ${cmsisRoot} TRUE) configcore(${project} ${cmsisRoot})
#configplatformForLib(${project} ${cmsisRoot}) #configplatformForLib(${project} ${cmsisRoot})
SET(COREID ${COREID} PARENT_SCOPE) SET(COREID ${COREID} PARENT_SCOPE)
endfunction() endfunction()

Loading…
Cancel
Save