CMSIS-DSP: Add preliminary cfft 16 MVE code.

pull/19/head
Christophe Favergeon 6 years ago
parent b543f5c61c
commit 86a272902a

@ -441,9 +441,12 @@ extern "C"
/**
* @brief 16-bit floating-point type definition.
* This is already defined in arm_mve.h
*/
#if !defined (ARM_MATH_HELIUM) && !defined(ARM_MATH_MVEF) && !defined(ARM_MATH_MVEI)
typedef __fp16 float16_t;
#endif
/**
* @brief 32-bit floating-point type definition.

@ -4,8 +4,7 @@
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
* used for MVE implementation only
*
* $Date: 08. January 2020
* $Revision: V1.7.0
* $Date: 14. April 2020
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@ -98,6 +97,67 @@ extern float32_t rearranged_twiddle_stride3_4096_f32[2728];
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
extern uint32_t rearranged_twiddle_tab_stride1_arr_16_f16[2];
extern uint32_t rearranged_twiddle_tab_stride2_arr_16_f16[2];
extern uint32_t rearranged_twiddle_tab_stride3_arr_16_f16[2];
extern float16_t rearranged_twiddle_stride1_16_f16[8];
extern float16_t rearranged_twiddle_stride2_16_f16[8];
extern float16_t rearranged_twiddle_stride3_16_f16[8];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
extern uint32_t rearranged_twiddle_tab_stride1_arr_64_f16[3];
extern uint32_t rearranged_twiddle_tab_stride2_arr_64_f16[3];
extern uint32_t rearranged_twiddle_tab_stride3_arr_64_f16[3];
extern float16_t rearranged_twiddle_stride1_64_f16[40];
extern float16_t rearranged_twiddle_stride2_64_f16[40];
extern float16_t rearranged_twiddle_stride3_64_f16[40];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
extern uint32_t rearranged_twiddle_tab_stride1_arr_256_f16[4];
extern uint32_t rearranged_twiddle_tab_stride2_arr_256_f16[4];
extern uint32_t rearranged_twiddle_tab_stride3_arr_256_f16[4];
extern float16_t rearranged_twiddle_stride1_256_f16[168];
extern float16_t rearranged_twiddle_stride2_256_f16[168];
extern float16_t rearranged_twiddle_stride3_256_f16[168];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
extern uint32_t rearranged_twiddle_tab_stride1_arr_1024_f16[5];
extern uint32_t rearranged_twiddle_tab_stride2_arr_1024_f16[5];
extern uint32_t rearranged_twiddle_tab_stride3_arr_1024_f16[5];
extern float16_t rearranged_twiddle_stride1_1024_f16[680];
extern float16_t rearranged_twiddle_stride2_1024_f16[680];
extern float16_t rearranged_twiddle_stride3_1024_f16[680];
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096) || defined(ARM_TABLE_TWIDDLECOEF_F16_8192)
extern uint32_t rearranged_twiddle_tab_stride1_arr_4096_f16[6];
extern uint32_t rearranged_twiddle_tab_stride2_arr_4096_f16[6];
extern uint32_t rearranged_twiddle_tab_stride3_arr_4096_f16[6];
extern float16_t rearranged_twiddle_stride1_4096_f16[2728];
extern float16_t rearranged_twiddle_stride2_4096_f16[2728];
extern float16_t rearranged_twiddle_stride3_4096_f16[2728];
#endif
#endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
#if defined(ARM_MATH_MVEI)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)

@ -19,9 +19,10 @@ condition="""#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES)
"""
F32 = 1
Q31 = 2
Q15 = 3
Q7 = 4
F16 = 2
Q31 = 3
Q15 = 4
Q7 = 5
def printCUInt32Array(f,name,arr):
nb = 0
@ -51,6 +52,20 @@ def printCFloat32Array(f,name,arr):
print("};\n",file=f)
def printCFloat16Array(f,name,arr):
    """Write `arr` to `f` as the C definition of a float16_t array named `name`.

    Every element is emitted as a ``(float16_t)<value>f,`` literal with 20
    digits after the decimal point. Literals are packed onto the current
    output line; a line break is inserted before any literal that would
    push the running line width past the module-level COLLIM.
    """
    print("float16_t %s[%d]={" % (name, len(arr)), file=f)
    lineWidth = 0
    for coef in arr:
        literal = "(float16_t)%.20ff," % coef
        # Wrap first, so the literal that overflows starts the new line.
        if lineWidth + len(literal) > COLLIM:
            print("", file=f)
            lineWidth = 0
        lineWidth += len(literal)
        print(literal, end="", file=f)
    print("};\n", file=f)
def printCQ31Array(f,name,arr):
nb = 0
print("q31_t %s[%d]={" % (name,len(arr)),file=f)
@ -99,6 +114,9 @@ def printHUInt32Array(f,name,arr):
def printHFloat32Array(f,name,arr):
    """Write to `f` the header-side `extern float32_t` declaration of array `name`.

    Only the length of `arr` is used; it becomes the declared array size.
    """
    count = len(arr)
    declaration = "extern float32_t %s[%d];" % (name, count)
    print(declaration, file=f)
def printHFloat16Array(f,name,arr):
    """Write to `f` the header-side `extern float16_t` declaration of array `name`.

    Only the length of `arr` is used; it becomes the declared array size.
    """
    count = len(arr)
    declaration = "extern float16_t %s[%d];" % (name, count)
    print(declaration, file=f)
def printHQ31Array(f,name,arr):
    """Write to `f` the header-side `extern q31_t` declaration of array `name`.

    Only the length of `arr` is used; it becomes the declared array size.
    """
    count = len(arr)
    declaration = "extern q31_t %s[%d];" % (name, count)
    print(declaration, file=f)
@ -225,6 +243,30 @@ def reorderTwiddle(theType,conjugate,f,h,n):
print("#endif\n",file=f)
print("#endif\n",file=h)
# F16 SECTION FOR THIS FFT LENGTH
if theType == F16:
print(condition % ("F16",n, "F16",n << 1),file=f)
print(condition % ("F16",n, "F16",n << 1),file=h)
printCUInt32Array(f,"rearranged_twiddle_tab_stride1_arr_%d_f16" % n,list(tab1Offset))
printHUInt32Array(h,"rearranged_twiddle_tab_stride1_arr_%d_f16" % n,list(tab1Offset))
printCUInt32Array(f,"rearranged_twiddle_tab_stride2_arr_%d_f16" % n,list(tab2Offset))
printHUInt32Array(h,"rearranged_twiddle_tab_stride2_arr_%d_f16" % n,list(tab2Offset))
printCUInt32Array(f,"rearranged_twiddle_tab_stride3_arr_%d_f16" % n,list(tab3Offset))
printHUInt32Array(h,"rearranged_twiddle_tab_stride3_arr_%d_f16" % n,list(tab3Offset))
printCFloat16Array(f,"rearranged_twiddle_stride1_%d_f16" % n,list(tab1))
printHFloat16Array(h,"rearranged_twiddle_stride1_%d_f16" % n,list(tab1))
printCFloat16Array(f,"rearranged_twiddle_stride2_%d_f16" % n,list(tab2))
printHFloat16Array(h,"rearranged_twiddle_stride2_%d_f16" % n,list(tab2))
printCFloat16Array(f,"rearranged_twiddle_stride3_%d_f16" % n,list(tab3))
printHFloat16Array(h,"rearranged_twiddle_stride3_%d_f16" % n,list(tab3))
print("#endif\n",file=f)
print("#endif\n",file=h)
# Q31 SECTION FOR THIS FFT LENGTH
if theType == Q31:
print(condition % ("Q31",n, "Q31",n << 1),file=f)
@ -285,8 +327,7 @@ cheader="""/* ------------------------------------------------------------------
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
* used for MVE implementation only
*
* $Date: 08. January 2020
* $Revision: V1.7.0
* $Date: 14. April 2020
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@ -342,8 +383,7 @@ hheader="""/* ------------------------------------------------------------------
* Description: common tables like fft twiddle factors, Bitreverse, reciprocal etc
* used for MVE implementation only
*
* $Date: 08. January 2020
* $Revision: V1.7.0
* $Date: 14. April 2020
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
@ -423,6 +463,16 @@ with open(args.f,'w') as f:
print(cfooterMVEF,file=f)
print(hfooterMVEF,file=h)
print(cifdeMVEF,file=f)
print(hifdefMVEF,file=h)
reorderTwiddle(F16,False,f,h,16)
reorderTwiddle(F16,False,f,h,64)
reorderTwiddle(F16,False,f,h,256)
reorderTwiddle(F16,False,f,h,1024)
reorderTwiddle(F16,False,f,h,4096)
print(cfooterMVEF,file=f)
print(hfooterMVEF,file=h)
print(cifdeMVEI,file=f)
print(hifdefMVEI,file=h)
reorderTwiddle(Q31,True,f,h,16)

File diff suppressed because it is too large Load Diff

@ -29,54 +29,54 @@
#include "arm_math.h"
#include "arm_common_tables.h"
//#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if 0
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
#include "arm_mve_tables.h"
static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
{
float32_t retValue=1.0;
float16_t retValue=1.0;
switch (fftLen)
{
case 4096U:
retValue = 0.000244140625;
retValue = (float16_t)0.000244140625;
break;
case 2048U:
retValue = 0.00048828125;
retValue = (float16_t)0.00048828125;
break;
case 1024U:
retValue = 0.0009765625f;
retValue = (float16_t)0.0009765625f;
break;
case 512U:
retValue = 0.001953125;
retValue = (float16_t)0.001953125;
break;
case 256U:
retValue = 0.00390625f;
retValue = (float16_t)0.00390625f;
break;
case 128U:
retValue = 0.0078125;
retValue = (float16_t)0.0078125;
break;
case 64U:
retValue = 0.015625f;
retValue = (float16_t)0.015625f;
break;
case 32U:
retValue = 0.03125;
retValue = (float16_t)0.03125;
break;
case 16U:
retValue = 0.0625f;
retValue = (float16_t)0.0625f;
break;
@ -87,54 +87,80 @@ static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
}
static void arm_bitreversal_32_inpl_mve(
uint32_t *pSrc,
const uint16_t bitRevLen,
static void arm_bitreversal_f16_inpl_mve(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
uint64_t *src = (uint64_t *) pSrc;
uint32_t *src = (uint32_t *)pSrc;
uint32_t blkCnt; /* loop counters */
uint32x4_t bitRevTabOff;
uint32x4_t one = vdupq_n_u32(1);
uint16x8_t one = vdupq_n_u16(1);
blkCnt = (bitRevLen / 2) / 2;
blkCnt = (bitRevLen / 2) / 4;
while (blkCnt > 0U) {
bitRevTabOff = vldrhq_u32(pBitRevTab);
pBitRevTab += 4;
bitRevTabOff = vldrhq_u16(pBitRevTab);
pBitRevTab += 8;
uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = bitRevLen & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
bitRevOff1 = bitRevOff1 >> 3;
bitRevOff2 = bitRevOff2 >> 3;
uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
}
}
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
{
f32x4_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD;
f16x8_t vecTmp0, vecTmp1;
f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f16x8_t vecA, vecB, vecC, vecD;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
int32_t iter = 1;
static const uint32_t strides[4] = {
(0 - 16) * sizeof(q31_t *),
(1 - 16) * sizeof(q31_t *),
(8 - 16) * sizeof(q31_t *),
(9 - 16) * sizeof(q31_t *)
};
static const uint32_t strides[4] =
{(0 - 16) * sizeof(float16_t *)
, (4 - 16) * sizeof(float16_t *)
, (8 - 16) * sizeof(float16_t *)
, (12 - 16) * sizeof(float16_t *)};
n2 = fftLen;
n1 = n2;
@ -143,37 +169,37 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
{
for (int i = 0; i < iter; i++)
{
float32_t const *p_rearranged_twiddle_tab_stride1 =
float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 =
float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 =
float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t const *pW1, *pW2, *pW3;
float32_t *inA = pSrc + CMPLX_DIM * i * n1;
float32_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM;
f32x4_t vecW;
float16_t const *pW1, *pW2, *pW3;
float16_t *inA = pSrc + CMPLX_DIM * i * n1;
float16_t *inB = inA + n2 * CMPLX_DIM;
float16_t *inC = inB + n2 * CMPLX_DIM;
float16_t *inD = inC + n2 * CMPLX_DIM;
f16x8_t vecW;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 2;
blkCnt = n2 / 4;
/*
* load 2 f32 complex pair
* load 4 f16 complex values (one f16x8_t vector) per input pointer
*/
vecA = vldrwq_f32(inA);
vecC = vldrwq_f32(inC);
vecA = vldrhq_f16(inA);
vecC = vldrhq_f16(inC);
while (blkCnt > 0U)
{
vecB = vldrwq_f32(inB);
vecD = vldrwq_f32(inD);
vecB = vldrhq_f16(inB);
vecD = vldrhq_f16(inD);
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
@ -185,7 +211,7 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
*/
vecTmp0 = vecSum0 + vecSum1;
vst1q(inA, vecTmp0);
inA += 4;
inA += 8;
/*
* [ 1 -1 1 -1 ] * [ A B C D ]'
@ -195,10 +221,10 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
* [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
*/
vecW = vld1q(pW2);
pW2 += 4;
pW2 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
vst1q(inB, vecTmp1);
inB += 4;
inB += 8;
/*
* [ 1 -i -1 +i ] * [ A B C D ]'
@ -208,10 +234,10 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
* [ 1 -i -1 +i ] * [ A B C D ]'.* W1
*/
vecW = vld1q(pW1);
pW1 +=4;
pW1 +=8;
vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
vst1q(inC, vecTmp1);
inC += 4;
inC += 8;
/*
* [ 1 +i -1 -i ] * [ A B C D ]'
@ -221,13 +247,13 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
* [ 1 +i -1 -i ] * [ A B C D ]'.* W3
*/
vecW = vld1q(pW3);
pW3 += 4;
pW3 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
vst1q(inD, vecTmp1);
inD += 4;
inD += 8;
vecA = vldrwq_f32(inA);
vecC = vldrwq_f32(inC);
vecA = vldrhq_f16(inA);
vecC = vldrhq_f16(inC);
blkCnt--;
}
@ -246,35 +272,35 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
/* load scheduling */
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
blkCnt = (fftLen >> 3);
blkCnt = (fftLen >> 4);
while (blkCnt > 0U)
{
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
vecB = vldrwq_gather_base_f32(vecScGathAddr, 4);
vecD = vldrwq_gather_base_f32(vecScGathAddr, 12);
vecSum1 = vecB + vecD;
vecDiff1 = vecB - vecD;
/* pre-load for next iteration */
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
vecTmp0 = vecSum0 + vecSum1;
vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
vecTmp0 = vecSum0 - vecSum1;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, vecTmp0);
vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, vecTmp0);
blkCnt--;
}
@ -284,15 +310,15 @@ static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float3
*/
}
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
float32_t const *pCoefVec;
float32_t const *pCoef = S->pTwiddle;
float32_t *pIn0, *pIn1;
float16_t const *pCoefVec;
float16_t const *pCoef = S->pTwiddle;
float16_t *pIn0, *pIn1;
uint32_t n2;
uint32_t blkCnt;
f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
f32x4_t vecCmplxTmp, vecTw;
f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
f16x8_t vecCmplxTmp, vecTw;
n2 = fftLen >> 1;
@ -300,49 +326,49 @@ static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_
pIn1 = pSrc + fftLen;
pCoefVec = pCoef;
blkCnt = n2 / 2;
blkCnt = n2 / 4;
while (blkCnt > 0U)
{
vecIn0 = *(f32x4_t *) pIn0;
vecIn1 = *(f32x4_t *) pIn1;
vecIn0 = *(f16x8_t *) pIn0;
vecIn1 = *(f16x8_t *) pIn1;
vecTw = vld1q(pCoefVec);
pCoefVec += 4;
pCoefVec += 8;
vecSum = vecIn0 + vecIn1;
vecDiff = vecIn0 - vecIn1;
vecSum = vaddq(vecIn0, vecIn1);
vecDiff = vsubq(vecIn0, vecIn1);
vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
vst1q(pIn0, vecSum);
pIn0 += 4;
pIn0 += 8;
vst1q(pIn1, vecCmplxTmp);
pIn1 += 4;
pIn1 += 8;
blkCnt--;
}
_arm_radix4_butterfly_f32_mve(S, pSrc, n2);
_arm_radix4_butterfly_f16_mve(S, pSrc, n2);
_arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
_arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
pIn0 = pSrc;
}
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
{
f32x4_t vecTmp0, vecTmp1;
f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f32x4_t vecA, vecB, vecC, vecD;
f32x4_t vecW;
f16x8_t vecTmp0, vecTmp1;
f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
f16x8_t vecA, vecB, vecC, vecD;
f16x8_t vecW;
uint32_t blkCnt;
uint32_t n1, n2;
uint32_t stage = 0;
int32_t iter = 1;
static const uint32_t strides[4] = {
(0 - 16) * sizeof(q31_t *),
(1 - 16) * sizeof(q31_t *),
(4 - 16) * sizeof(q31_t *),
(8 - 16) * sizeof(q31_t *),
(9 - 16) * sizeof(q31_t *)
(12 - 16) * sizeof(q31_t *)
};
n2 = fftLen;
@ -352,35 +378,35 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
{
for (int i = 0; i < iter; i++)
{
float32_t const *p_rearranged_twiddle_tab_stride1 =
float16_t const *p_rearranged_twiddle_tab_stride1 =
&S->rearranged_twiddle_stride1[
S->rearranged_twiddle_tab_stride1_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride2 =
float16_t const *p_rearranged_twiddle_tab_stride2 =
&S->rearranged_twiddle_stride2[
S->rearranged_twiddle_tab_stride2_arr[stage]];
float32_t const *p_rearranged_twiddle_tab_stride3 =
float16_t const *p_rearranged_twiddle_tab_stride3 =
&S->rearranged_twiddle_stride3[
S->rearranged_twiddle_tab_stride3_arr[stage]];
float32_t const *pW1, *pW2, *pW3;
float32_t *inA = pSrc + CMPLX_DIM * i * n1;
float32_t *inB = inA + n2 * CMPLX_DIM;
float32_t *inC = inB + n2 * CMPLX_DIM;
float32_t *inD = inC + n2 * CMPLX_DIM;
float16_t const *pW1, *pW2, *pW3;
float16_t *inA = pSrc + CMPLX_DIM * i * n1;
float16_t *inB = inA + n2 * CMPLX_DIM;
float16_t *inC = inB + n2 * CMPLX_DIM;
float16_t *inD = inC + n2 * CMPLX_DIM;
pW1 = p_rearranged_twiddle_tab_stride1;
pW2 = p_rearranged_twiddle_tab_stride2;
pW3 = p_rearranged_twiddle_tab_stride3;
blkCnt = n2 / 2;
blkCnt = n2 / 4;
/*
* load 4 f16 complex values (one f16x8_t vector) per input pointer
*/
vecA = vldrwq_f32(inA);
vecC = vldrwq_f32(inC);
vecA = vldrhq_f16(inA);
vecC = vldrhq_f16(inC);
while (blkCnt > 0U)
{
vecB = vldrwq_f32(inB);
vecD = vldrwq_f32(inD);
vecB = vldrhq_f16(inB);
vecD = vldrhq_f16(inD);
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
@ -392,7 +418,7 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
*/
vecTmp0 = vecSum0 + vecSum1;
vst1q(inA, vecTmp0);
inA += 4;
inA += 8;
/*
* [ 1 -1 1 -1 ] * [ A B C D ]'
*/
@ -401,10 +427,10 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
*/
vecW = vld1q(pW2);
pW2 += 4;
pW2 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
vst1q(inB, vecTmp1);
inB += 4;
inB += 8;
/*
* [ 1 -i -1 +i ] * [ A B C D ]'
@ -414,10 +440,10 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* [ 1 -i -1 +i ] * [ A B C D ]'.* W2
*/
vecW = vld1q(pW1);
pW1 += 4;
pW1 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
vst1q(inC, vecTmp1);
inC += 4;
inC += 8;
/*
* [ 1 +i -1 -i ] * [ A B C D ]'
@ -427,13 +453,13 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* [ 1 +i -1 -i ] * [ A B C D ]'.* W3
*/
vecW = vld1q(pW3);
pW3 += 4;
pW3 += 8;
vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
vst1q(inD, vecTmp1);
inD += 4;
inD += 8;
vecA = vldrwq_f32(inA);
vecC = vldrwq_f32(inC);
vecA = vldrhq_f16(inA);
vecC = vldrhq_f16(inC);
blkCnt--;
}
@ -454,7 +480,7 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
* load scheduling
*/
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
blkCnt = (fftLen >> 3);
while (blkCnt > 0U)
@ -462,14 +488,14 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
vecB = vldrwq_gather_base_f32(vecScGathAddr, 4);
vecD = vldrwq_gather_base_f32(vecScGathAddr, 12);
vecSum1 = vecB + vecD;
vecDiff1 = vecB - vecD;
vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
vecC = vldrwq_gather_base_f32(vecScGathAddr, 8);
vecTmp0 = vecSum0 + vecSum1;
vecTmp0 = vecTmp0 * onebyfftLen;
@ -477,15 +503,15 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
vecTmp0 = vecSum0 - vecSum1;
vecTmp0 = vecTmp0 * onebyfftLen;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, vecTmp0);
vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
vecTmp0 = vecTmp0 * onebyfftLen;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
vecTmp0 = vecTmp0 * onebyfftLen;
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, vecTmp0);
blkCnt--;
}
@ -495,16 +521,16 @@ static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 *
*/
}
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
{
float32_t const *pCoefVec;
float32_t const *pCoef = S->pTwiddle;
float32_t *pIn0, *pIn1;
float16_t const *pCoefVec;
float16_t const *pCoef = S->pTwiddle;
float16_t *pIn0, *pIn1;
uint32_t n2;
float32_t onebyfftLen = arm_inverse_fft_length_f32(fftLen);
float16_t onebyfftLen = arm_inverse_fft_length_f16(fftLen);
uint32_t blkCnt;
f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
f32x4_t vecCmplxTmp, vecTw;
f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
f16x8_t vecCmplxTmp, vecTw;
n2 = fftLen >> 1;
@ -512,13 +538,13 @@ static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,f
pIn1 = pSrc + fftLen;
pCoefVec = pCoef;
blkCnt = n2 / 2;
blkCnt = n2 / 4;
while (blkCnt > 0U)
{
vecIn0 = *(f32x4_t *) pIn0;
vecIn1 = *(f32x4_t *) pIn1;
vecIn0 = *(f16x8_t *) pIn0;
vecIn1 = *(f16x8_t *) pIn1;
vecTw = vld1q(pCoefVec);
pCoefVec += 4;
pCoefVec += 8;
vecSum = vecIn0 + vecIn1;
vecDiff = vecIn0 - vecIn1;
@ -526,16 +552,16 @@ static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,f
vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
vst1q(pIn0, vecSum);
pIn0 += 4;
pIn0 += 8;
vst1q(pIn1, vecCmplxTmp);
pIn1 += 4;
pIn1 += 8;
blkCnt--;
}
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
}
@ -558,9 +584,9 @@ static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,f
*/
void arm_cfft_f32(
const arm_cfft_instance_f32 * S,
float32_t * pSrc,
void arm_cfft_f16(
const arm_cfft_instance_f16 * S,
float16_t * pSrc,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
{
@ -574,14 +600,14 @@ void arm_cfft_f32(
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
_arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
break;
}
} else {
@ -591,14 +617,14 @@ void arm_cfft_f32(
case 256:
case 1024:
case 4096:
_arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
_arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
break;
}
}
@ -607,12 +633,11 @@ void arm_cfft_f32(
if (bitReverseFlag)
{
arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
}
}
#else
extern void arm_bitreversal_16(

@ -54,8 +54,8 @@
#include "arm_const_structs.h"
//#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#if 0
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_vec_fft.h"
#include "arm_mve_tables.h"
@ -163,7 +163,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
S->pTwiddle = (float32_t *)twiddleCoef_4096;
S->pTwiddle = (float16_t *)twiddleCoef_4096;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break;
#endif
@ -174,7 +174,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
S->pTwiddle = (float32_t *)twiddleCoef_2048;
S->pTwiddle = (float16_t *)twiddleCoef_2048;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break;
#endif
@ -185,7 +185,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
S->pTwiddle = (float32_t *)twiddleCoef_1024;
S->pTwiddle = (float16_t *)twiddleCoef_1024;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break;
#endif
@ -196,7 +196,7 @@ arm_status arm_cfft_init_f16(
/* Initialise the bit reversal table modifier */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
S->pTwiddle = (float32_t *)twiddleCoef_512;
S->pTwiddle = (float16_t *)twiddleCoef_512;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break;
#endif
@ -205,7 +205,7 @@ arm_status arm_cfft_init_f16(
case 256U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
S->pTwiddle = (float32_t *)twiddleCoef_256;
S->pTwiddle = (float16_t *)twiddleCoef_256;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break;
#endif
@ -214,7 +214,7 @@ arm_status arm_cfft_init_f16(
case 128U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
S->pTwiddle = (float32_t *)twiddleCoef_128;
S->pTwiddle = (float16_t *)twiddleCoef_128;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break;
#endif
@ -223,7 +223,7 @@ arm_status arm_cfft_init_f16(
case 64U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
S->pTwiddle = (float32_t *)twiddleCoef_64;
S->pTwiddle = (float16_t *)twiddleCoef_64;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break;
#endif
@ -232,7 +232,7 @@ arm_status arm_cfft_init_f16(
case 32U:
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
S->pTwiddle = (float32_t *)twiddleCoef_32;
S->pTwiddle = (float16_t *)twiddleCoef_32;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
break;
#endif
@ -242,7 +242,7 @@ arm_status arm_cfft_init_f16(
/* Initializations of structure parameters for 16 point FFT */
S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
S->pTwiddle = (float32_t *)twiddleCoef_16;
S->pTwiddle = (float16_t *)twiddleCoef_16;
status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
break;
#endif

@ -3,7 +3,7 @@
#include "Error.h"
#define SNR_THRESHOLD 62
#define SNR_DOTPROD_THRESHOLD 50
#define SNR_DOTPROD_THRESHOLD 40
/*

@ -52,8 +52,8 @@ int testmain()
// An IO runner is driven by some IO
// In future one may have a client/server runner driven
// by a server running on a host.
//Client::IORunner runner(&io,&mgr,Testing::kTestAndDump);
Client::IORunner runner(&io,&mgr,Testing::kTestOnly);
Client::IORunner runner(&io,&mgr,Testing::kTestAndDump);
//Client::IORunner runner(&io,&mgr,Testing::kTestOnly);
// Root object containing all the tests

@ -24,7 +24,7 @@ include(configCore)
function(configLib project cmsisRoot)
configcore(${project} ${cmsisRoot} TRUE)
configcore(${project} ${cmsisRoot})
#configplatformForLib(${project} ${cmsisRoot})
SET(COREID ${COREID} PARENT_SCOPE)
endfunction()

Loading…
Cancel
Save