CMSIS-DSP: Improvements for building with gcc on M55.

Solves most of the f16 issues, but some build issues
still remain with gcc10q4.

Two functions revert to their scalar versions when built with gcc on M55
(since the Helium versions of those functions do not build).
pull/19/head
Christophe Favergeon 5 years ago
parent 502fb88f3d
commit 9a254bc926

@ -122,7 +122,7 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
* re0+re1 | im0+im1 | re0+re1 | im0+im1
* re2+re3 | im2+im3 | re2+re3 | im2+im3
*/
vecTmp = vaddq(vecTmp, vecIn);
vecTmp = vaddq_f16(vecTmp, vecIn);
vecOut = vecTmp;
/*
* shift left, random tmp insertion in bottom
@ -133,7 +133,7 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
* DONTCARE | DONTCARE | re0+re1+re0+re1 |im0+im1+im0+im1
* re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3
*/
vecOut = vaddq(vecOut, vecTmp);
vecOut = vaddq_f16(vecOut, vecTmp);
/*
* Cmplx sum is in 4th & 5th f16 elt
* return full vector

@ -49,6 +49,14 @@
* The library has generally separate functions for operating on 8-bit integers, 16-bit integers,
* 32-bit integer and 32-bit floating-point values.
*
* The library provides vectorized versions of most algorithms for Helium
* and of most f32 algorithms for Neon.
*
* When using a vectorized version, provide a little bit of padding after the end of
* a buffer (3 words) because the vectorized code may read a little bit after the end
* of a buffer. You don't have to modify your buffers but just ensure that the
* end of buffer + padding is not outside of a memory region.
*
* \section using Using the Library
*
* The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.

@ -110,10 +110,7 @@ extern "C"
#define ARM_MATH_MVEF
#endif
#if !defined(ARM_MATH_MVE_FLOAT16)
/* HW Float16 not yet well supported on gcc for M55 */
#if !defined(__CMSIS_GCC_H)
#define ARM_MATH_MVE_FLOAT16
#endif
#endif
#endif
@ -130,10 +127,7 @@ extern "C"
#endif
#if !defined(ARM_MATH_MVE_FLOAT16)
/* HW Float16 not yet well supported on gcc for M55 */
#if !defined(__CMSIS_GCC_H)
#define ARM_MATH_MVE_FLOAT16
#endif
#endif
#endif

@ -11,11 +11,8 @@ includes = [os.path.join(ROOT,"Include"),os.path.join(ROOT,"PrivateInclude"),os.
if sys.platform == 'win32':
cflags = ["-DWIN",config.cflags,"-DUNALIGNED_SUPPORT_DISABLE"]
# Custom because a customized arm_math.h is required to build on windows
# since the visual compiler and the win platform are
# not supported by default in arm_math.h
else:
cflags = ["-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
cflags = ["-Wno-attributes","-Wno-unused-function","-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
transform = glob.glob(os.path.join(ROOT,"Source","TransformFunctions","*.c"))
#transform.remove(os.path.join(ROOT,"Source","TransformFunctions","arm_dct4_init_q15.c"))
@ -69,18 +66,18 @@ allsrcs = support + fastmath + filtering + matrix + statistics + complexf + basi
allsrcs = allsrcs + controller + transform + modulesrc + common+ interpolation
def notf16(number):
if re.match(r'^.*_f16.c$',number):
if re.search(r'f16',number):
return(False)
else:
return(True)
if re.search(r'F16',number):
return(False)
return(True)
# If there are too many files, the linker command in failing on Windows.
# If there are too many files, the linker command is failing on Windows.
# So f16 functions are removed since they are not currently available in the wrapper.
# A next version will have to structure this wrapper more cleanly so that the
# build can work even with more functions
srcs = list(filter(notf16, allsrcs))
module1 = Extension(config.extensionName,
sources = (srcs
)

@ -56,7 +56,8 @@
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"

@ -46,7 +46,12 @@
@param[in] blockSize number of samples to process
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
#pragma GCC warning "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
#endif
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
void arm_biquad_cascade_stereo_df2T_f16(
const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
const float16_t * pSrc,

@ -50,7 +50,12 @@
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
#endif
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
#include "arm_helium_utils.h"
@ -382,7 +387,7 @@ arm_status arm_mat_cmplx_mult_f16(
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint16x8_t vecOffs, vecColBOffs;
uint32_t blkCnt,rowCnt; /* loop counters */
@ -466,7 +471,6 @@ if ((pSrcA->numCols != pSrcB->numRows) ||
/*
* Matrix A columns number of MAC operations are to be performed
*/
colCnt = numColsA;
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
float16_t const *pInA0 = pInA;
@ -612,7 +616,6 @@ if ((pSrcA->numCols != pSrcB->numRows) ||
/*
* Matrix A columns number of MAC operations are to be performed
*/
colCnt = numColsA;
float16_t const *pSrcA0Vec;
float16_t const *pInA0 = pInA;

@ -87,7 +87,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
/*
* move to 2nd column of matrix A
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
/*
* load {a01 a01 a11 a11 x x x x}
*/
@ -95,7 +95,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
/*
* move to next B row
*/
vecOffsB = vaddq(vecOffsB, (uint16_t) 2);
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
/*
* load {b10, b11, b10, b11, x x x x }
*/
@ -157,7 +157,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
/*
* move to 2nd column of matrix A
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
/*
* load {a01 a01 a01 a11 a11 a11 a21 a21}
*/
@ -165,7 +165,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
/*
* move to next B row
*/
vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
/*
* load {b10, b11, b12, b10, b11, b12, b10, b11}
*/
@ -179,7 +179,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
/*
* move to 3rd column of matrix A
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
/*
* load {a02 a02 a02 a12 a12 a12 a22 a22}
*/
@ -187,7 +187,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
/*
* move to next B row
*/
vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
/*
* load {b20, b21, b22, b20, b21, b22, b20, b21}
*/
@ -253,7 +253,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* jump 2 x A rows (2nd half of matrix)
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a20 a20 a20 a20 a30 a30 a30 a30}
*/
@ -274,7 +274,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* move to next B row
*/
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
/*
* load {b10, b11, b12, b13, b10, b11, b12, b13}
*/
@ -287,7 +287,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* jump 2 x A rows (2nd half of matrix)
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a21 a21 a21 a21 a31 a31 a31 a31}
*/
@ -309,7 +309,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* move to next B row
*/
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
/*
* load {b20, b21, b22, b23, b20, b21, b22, b23}
*/
@ -322,7 +322,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* jump 2 x A rows
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a22 a22 a22 a22 a32 a32 a32 a32}
@ -345,7 +345,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* move to next B row
*/
vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
/*
* load {b30, b31, b32, b33, b30, b31, b32, b33}
*/
@ -358,7 +358,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
/*
* jump 2 x A rows
*/
vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
/*
* load {a23 a23 a23 a23 a33 a33 a33 a33}
*/

@ -275,7 +275,7 @@ arm_status arm_cfft_init_q15(
/* Initializations of Instance structure depending on the FFT length */
switch (S->fftLen) {
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
/* Initializations of structure parameters for 4096 point FFT */
case 4096U:
/* Initialise the bit reversal table modifier */
@ -283,7 +283,7 @@ arm_status arm_cfft_init_q15(
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
/* Initializations of structure parameters for 2048 point FFT */
case 2048U:
/* Initialise the bit reversal table modifier */
@ -292,7 +292,7 @@ arm_status arm_cfft_init_q15(
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
/* Initializations of structure parameters for 1024 point FFT */
case 1024U:
/* Initialise the bit reversal table modifier */
@ -301,7 +301,7 @@ arm_status arm_cfft_init_q15(
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
/* Initializations of structure parameters for 512 point FFT */
case 512U:
/* Initialise the bit reversal table modifier */
@ -309,31 +309,31 @@ arm_status arm_cfft_init_q15(
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
case 256U:
FFTINIT(q15,256);
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
case 128U:
FFTINIT(q15,128);
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
case 64U:
FFTINIT(q15,64);
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
case 32U:
FFTINIT(q15,32);
break;
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
case 16U:
/* Initializations of structure parameters for 16 point FFT */
FFTINIT(q15,16);

@ -3,7 +3,7 @@ import struct
import numpy as np
def normalize(a):
return(a/max(np.abs(a)))
return(a/np.max(np.abs(a)))
TAILONLY = 1
BODYONLY = 2

@ -43,6 +43,7 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT)
# Need to add other gcc config for other cortex-m cores
if (ARM_CPU STREQUAL "cortex-m55" )
target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8.1-m.main+mve.fp+fp.dp")
target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
endif()

@ -6,26 +6,25 @@ SET(CMAKE_SYSTEM_PROCESSOR arm)
#SET(tools "C:/PROGRA~2/GNUTOO~1/82018-~1")
#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++.exe")
#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc")
#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++")
#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc")
find_program(CMAKE_C_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
find_program(CMAKE_CXX_COMPILER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
find_program(CMAKE_ASM_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
#SET(CMAKE_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
SET(CMAKE_AR "${tools}/bin/ar")
SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/ar")
SET(CMAKE_C_COMPILER_AR "${tools}/bin/ar")
#find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
#find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
#find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
#SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
#SET(CMAKE_C_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++.exe")
#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++")
find_program(CMAKE_LINKER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
SET(CMAKE_C_LINK_EXECUTABLE "<CMAKE_LINKER> <LINK_FLAGS> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
@ -48,10 +47,17 @@ if(NOT ARM_CPU)
)
endif(NOT ARM_CPU)
if (ARM_CPU STREQUAL "cortex-m55")
SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
SET(CMAKE_ASM_FLAGS "-march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "ASM compiler common flags")
SET(CMAKE_EXE_LINKER_FLAGS "-fno-use-linker-plugin -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "linker flags")
else()
SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
SET(CMAKE_ASM_FLAGS "-mcpu=${ARM_CPU}" CACHE INTERNAL "ASM compiler common flags")
SET(CMAKE_EXE_LINKER_FLAGS "-mcpu=${ARM_CPU}" CACHE INTERNAL "linker flags")
endif()
get_property(IS_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE)
if(IS_IN_TRY_COMPILE)

Loading…
Cancel
Save