CMSIS-DSP: Improvements for building with gcc on M55.

Solves most of the f16 issues, but some build issues remain with gcc10q4: two functions revert to their scalar versions when built with gcc on M55, because the Helium versions of those functions do not build.
5 years ago · 9a254bc926
parent 502fb88f3d
commit 9a254bc926
12 changed files with 72 additions and 57 deletions
--- a/Include/arm_helium_utils.h
+++ b/Include/arm_helium_utils.h
@ -122,7 +122,7 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
     *  re0+re1 | im0+im1 | re0+re1 | im0+im1
     *  re2+re3 | im2+im3 | re2+re3 | im2+im3
     */
-    vecTmp = vaddq(vecTmp, vecIn);
+    vecTmp = vaddq_f16(vecTmp, vecIn);
    vecOut = vecTmp;
    /*
     * shift left, random tmp insertion in bottom
@ -133,7 +133,7 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
     *    DONTCARE     |    DONTCARE     | re0+re1+re0+re1 |im0+im1+im0+im1
     * re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3
     */
-    vecOut = vaddq(vecOut, vecTmp);
+    vecOut = vaddq_f16(vecOut, vecTmp);
    /*
     * Cmplx sum is in 4rd & 5th f16 elt
     * return full vector
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@ -49,6 +49,14 @@
   * The library has generally separate functions for operating on 8-bit integers, 16-bit integers,
   * 32-bit integer and 32-bit floating-point values.
   *
+   * The library provides vectorized versions of most algorithms for Helium
+   * and of most f32 algorithms for Neon.
+   *
+   * When using a vectorized version, leave a small amount of padding (3 words) after
+   * the end of each buffer, because the vectorized code may read slightly past the
+   * end of a buffer. You do not have to modify your buffers; just ensure that the
+   * end of the buffer plus the padding still lies inside a valid memory region.
+   *
   * \section using Using the Library
   *
   * The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
--- a/Include/arm_math_types.h
+++ b/Include/arm_math_types.h
@ -110,10 +110,7 @@ extern "C"
    #define ARM_MATH_MVEF
  #endif
  #if !defined(ARM_MATH_MVE_FLOAT16)
-  /* HW Float16 not yet well supported on gcc for M55 */
-    #if !defined(__CMSIS_GCC_H)
       #define ARM_MATH_MVE_FLOAT16
-    #endif
  #endif
 #endif

@ -130,10 +127,7 @@ extern "C"
  #endif

  #if !defined(ARM_MATH_MVE_FLOAT16)
-    /* HW Float16 not yet well supported on gcc for M55 */
-    #if !defined(__CMSIS_GCC_H)
       #define ARM_MATH_MVE_FLOAT16
-    #endif
  #endif
 #endif

--- a/PythonWrapper/setup.py
+++ b/PythonWrapper/setup.py
@ -11,11 +11,8 @@ includes = [os.path.join(ROOT,"Include"),os.path.join(ROOT,"PrivateInclude"),os.

 if sys.platform == 'win32':
  cflags = ["-DWIN",config.cflags,"-DUNALIGNED_SUPPORT_DISABLE"] 
-  # Custom because a customized arm_math.h is required to build on windows
-  # since the visual compiler and the win platform are
-  # not supported by default in arm_math.h
 else:
-  cflags = ["-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
+  cflags = ["-Wno-attributes","-Wno-unused-function","-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]

 transform = glob.glob(os.path.join(ROOT,"Source","TransformFunctions","*.c"))
 #transform.remove(os.path.join(ROOT,"Source","TransformFunctions","arm_dct4_init_q15.c"))
@ -69,18 +66,18 @@ allsrcs = support + fastmath + filtering + matrix + statistics + complexf + basi
 allsrcs = allsrcs + controller + transform + modulesrc + common+ interpolation

 def notf16(number):
-  if re.match(r'^.*_f16.c$',number):
+  if re.search(r'f16',number):
     return(False)
-  else:
-     return(True)
+  if re.search(r'F16',number):
+     return(False)
+  return(True)

-# If there are too many files, the linker command in failing on Windows.
+# If there are too many files, the linker command is failing on Windows.
 # So f16 functions are removed since they are not currently available in the wrapper.
 # A next version will have to structure this wrapper more cleanly so that the
 # build can work even with more functions
 srcs = list(filter(notf16, allsrcs))

-
 module1 = Extension(config.extensionName,
                    sources = (srcs
                              )
--- a/Source/BasicMathFunctions/arm_mult_f16.c
+++ b/Source/BasicMathFunctions/arm_mult_f16.c
@ -56,7 +56,8 @@
  @return        none
 */

-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 

 #include "arm_helium_utils.h"

--- a/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
@ -46,7 +46,12 @@
  @param[in]     blockSize number of samples to process
  @return        none
 */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
 void arm_biquad_cascade_stereo_df2T_f16(
  const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
  const float16_t * pSrc,
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
@ -50,7 +50,12 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)

 #include "arm_helium_utils.h"

@ -382,7 +387,7 @@ arm_status arm_mat_cmplx_mult_f16(
    uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
    uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
    uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t  col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
    arm_status status;          /* status of matrix multiplication */
    uint16x8_t vecOffs, vecColBOffs;
    uint32_t  blkCnt,rowCnt;           /* loop counters */
@ -466,7 +471,6 @@ if ((pSrcA->numCols != pSrcB->numRows) ||
            /*
             * Matrix A columns number of MAC operations are to be performed
             */
-            colCnt = numColsA;

            float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
            float16_t const *pInA0 = pInA;
@ -612,7 +616,6 @@ if ((pSrcA->numCols != pSrcB->numRows) ||
            /*
             * Matrix A columns number of MAC operations are to be performed
             */
-            colCnt = numColsA;

            float16_t const *pSrcA0Vec;
            float16_t const *pInA0 = pInA;
--- a/Source/MatrixFunctions/arm_mat_mult_f16.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f16.c
@ -87,7 +87,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
    /*
     * move to 2nd column of matrix A
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
    /*
     * load {a01 a01 a11 a11 x x x x}
     */
@ -95,7 +95,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
    /*
     * move to next B row
     */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 2);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
    /*
     * load {b10, b11, b10, b11, x x x x }
     */
@ -157,7 +157,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
    /*
     * move to 2nd column of matrix A
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
    /*
     * load {a01 a01 a01 a11 a11 a11 a21 a21}
     */
@ -165,7 +165,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
    /*
     * move to next B row
     */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
    /*
     * load {b10, b11, b12, b10, b11, b12, b10, b11}
     */
@ -179,7 +179,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
    /*
     * move to 3rd column of matrix A
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
    /*
     * load {a02 a02 a02 a12 a12 a12 a22 a22}
     */
@ -187,7 +187,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
    /*
     * move to next B row
     */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
    /*
     * load {b20, b21, b22, b20, b21, b22, b20, b21}
     */
@ -253,7 +253,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * jump 2 x A rows (2nd half of matrix)
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a20 a20 a20 a20 a30 a30 a30 a30}
     */
@ -274,7 +274,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * move to next B row
     */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
    /*
     * load {b10, b11, b12, b13, b10, b11, b12, b13}
     */
@ -287,7 +287,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * jump 2 x A rows (2nd half of matrix)
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a21 a21 a21 a21 a31 a31 a31 a31}
     */
@ -309,7 +309,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * move to next B row
     */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
    /*
     * load {b20, b21, b22, b23, b20, b21, b22, b23}
     */
@ -322,7 +322,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * jump 2 x A rows
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);

    /*
     * load {a22 a22 a22 a22 a32 a32 a32 a32}
@ -345,7 +345,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * move to next B row
     */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
    /*
     * load {b30, b31, b32, b33, b30, b31, b32, b33}
     */
@ -358,7 +358,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    /*
     * jump 2 x A rows
     */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a23 a23 a23 a23 a33 a33 a33 a33}
     */
--- a/Source/TransformFunctions/arm_cfft_init_q15.c
+++ b/Source/TransformFunctions/arm_cfft_init_q15.c
@ -275,7 +275,7 @@ arm_status arm_cfft_init_q15(

        /*  Initializations of Instance structure depending on the FFT length */
        switch (S->fftLen) {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
            /*  Initializations of structure parameters for 4096 point FFT */
        case 4096U:
            /*  Initialise the bit reversal table modifier */
@ -283,7 +283,7 @@ arm_status arm_cfft_init_q15(
            break;
 #endif

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
            /*  Initializations of structure parameters for 2048 point FFT */
        case 2048U:
            /*  Initialise the bit reversal table modifier */
@ -292,7 +292,7 @@ arm_status arm_cfft_init_q15(
            break;
 #endif

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
            /*  Initializations of structure parameters for 1024 point FFT */
        case 1024U:
            /*  Initialise the bit reversal table modifier */
@ -301,7 +301,7 @@ arm_status arm_cfft_init_q15(
            break;
 #endif

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
            /*  Initializations of structure parameters for 512 point FFT */
        case 512U:
            /*  Initialise the bit reversal table modifier */
@ -309,31 +309,31 @@ arm_status arm_cfft_init_q15(
            break;
 #endif

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
        case 256U:
            FFTINIT(q15,256);
            break;
 #endif

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
        case 128U:
            FFTINIT(q15,128);
            break;
 #endif 

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
        case 64U:
            FFTINIT(q15,64);
            break;
 #endif 

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
        case 32U:
            FFTINIT(q15,32);
            break;
 #endif 

-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
        case 16U:
            /*  Initializations of structure parameters for 16 point FFT */
            FFTINIT(q15,16);
--- a/Testing/PatternGeneration/Tools.py
+++ b/Testing/PatternGeneration/Tools.py
@ -3,7 +3,7 @@ import struct
 import numpy as np

 def normalize(a):
-  return(a/max(np.abs(a)))
+  return(a/np.max(np.abs(a)))

 TAILONLY = 1
 BODYONLY = 2
--- a/Toolchain/GCC.cmake
+++ b/Toolchain/GCC.cmake
@ -43,6 +43,7 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT)

  # Need to add other gcc config for other cortex-m cores
  if (ARM_CPU STREQUAL "cortex-m55" )
+     target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8.1-m.main+mve.fp+fp.dp")
     target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
     target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
  endif()
--- a/gcc.cmake
+++ b/gcc.cmake
@ -6,26 +6,25 @@ SET(CMAKE_SYSTEM_PROCESSOR arm)



-#SET(tools "C:/PROGRA~2/GNUTOO~1/82018-~1")

-#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
-#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++.exe")
-#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
+#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc")
+#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++")
+#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc")

 find_program(CMAKE_C_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
 find_program(CMAKE_CXX_COMPILER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
 find_program(CMAKE_ASM_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)

-#SET(CMAKE_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
-find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
-find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
+SET(CMAKE_AR "${tools}/bin/ar")
+SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/ar")
+SET(CMAKE_C_COMPILER_AR "${tools}/bin/ar")

+#find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
+#find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
+#find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)

-#SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-#SET(CMAKE_C_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")

-#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++.exe")
+#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++")
 find_program(CMAKE_LINKER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)

 SET(CMAKE_C_LINK_EXECUTABLE "<CMAKE_LINKER> <LINK_FLAGS> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
@ -48,10 +47,17 @@ if(NOT ARM_CPU)
    )
 endif(NOT ARM_CPU)

+if (ARM_CPU STREQUAL "cortex-m55")
+SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_ASM_FLAGS "-march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "ASM compiler common flags")
+SET(CMAKE_EXE_LINKER_FLAGS "-fno-use-linker-plugin -march=armv8.1-m.main+mve.fp+fp.dp"  CACHE INTERNAL "linker flags")
+else()
 SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
 SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
 SET(CMAKE_ASM_FLAGS "-mcpu=${ARM_CPU}" CACHE INTERNAL "ASM compiler common flags")
 SET(CMAKE_EXE_LINKER_FLAGS "-mcpu=${ARM_CPU}"  CACHE INTERNAL "linker flags")
+endif()

 get_property(IS_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE)
 if(IS_IN_TRY_COMPILE)