From 9a254bc9269bd2b2cee594c691d94d66b5e3656e Mon Sep 17 00:00:00 2001
From: Christophe Favergeon <Christophe.Favergeon@arm.com>
Date: Wed, 20 Jan 2021 13:26:22 +0100
Subject: [PATCH] CMSIS-DSP: Improvements for building with gcc on M55.

Solve most of the f16 issues, but there are still some remaining
build issues with gcc10q4.

Two functions revert to their scalar versions when built with gcc on M55
(since the Helium versions of those functions do not build).
---
 Include/arm_helium_utils.h                    |  4 +--
 Include/arm_math.h                            |  8 ++++++
 Include/arm_math_types.h                      |  6 ----
 PythonWrapper/setup.py                        | 15 ++++------
 Source/BasicMathFunctions/arm_mult_f16.c      |  3 +-
 .../arm_biquad_cascade_stereo_df2T_f16.c      |  7 ++++-
 .../MatrixFunctions/arm_mat_cmplx_mult_f16.c  | 11 +++++---
 Source/MatrixFunctions/arm_mat_mult_f16.c     | 26 ++++++++---------
 Source/TransformFunctions/arm_cfft_init_q15.c | 18 ++++++------
 Testing/PatternGeneration/Tools.py            |  2 +-
 Toolchain/GCC.cmake                           |  1 +
 gcc.cmake                                     | 28 +++++++++++--------
 12 files changed, 72 insertions(+), 57 deletions(-)
diff --git a/Include/arm_helium_utils.h b/Include/arm_helium_utils.h
index df497345..93645a3c 100755
--- a/Include/arm_helium_utils.h
+++ b/Include/arm_helium_utils.h
@@ -122,7 +122,7 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
      *  re0+re1 | im0+im1 | re0+re1 | im0+im1
      *  re2+re3 | im2+im3 | re2+re3 | im2+im3
      */
-    vecTmp = vaddq(vecTmp, vecIn);
+    vecTmp = vaddq_f16(vecTmp, vecIn);
     vecOut = vecTmp;
     /*
      * shift left, random tmp insertion in bottom
@@ -133,7 +133,7 @@ __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
      *    DONTCARE     |    DONTCARE     | re0+re1+re0+re1 |im0+im1+im0+im1
      * re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3
      */
-    vecOut = vaddq(vecOut, vecTmp);
+    vecOut = vaddq_f16(vecOut, vecTmp);
     /*
      * Cmplx sum is in 4rd & 5th f16 elt
      * return full vector
diff --git a/Include/arm_math.h b/Include/arm_math.h
index 404ee91c..98bc9f91 100644
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@@ -49,6 +49,14 @@
    * The library has generally separate functions for operating on 8-bit integers, 16-bit integers,
    * 32-bit integer and 32-bit floating-point values.
    *
+   * The library provides vectorized versions of most algorithms for Helium
+   * and of most f32 algorithms for Neon.
+   *
+   * When using a vectorized version, provide a little bit of padding after the end of
+   * a buffer (3 words) because the vectorized code may read a little bit after the end
+   * of a buffer. You don't have to modify your buffers but just ensure that the
+   * end of buffer + padding is not outside of a memory region.
+   *
    * \section using Using the Library
    *
    * The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
diff --git a/Include/arm_math_types.h b/Include/arm_math_types.h
index 7c87b36f..01e18a75 100755
--- a/Include/arm_math_types.h
+++ b/Include/arm_math_types.h
@@ -110,10 +110,7 @@ extern "C"
     #define ARM_MATH_MVEF
   #endif
   #if !defined(ARM_MATH_MVE_FLOAT16)
-  /* HW Float16 not yet well supported on gcc for M55 */
-    #if !defined(__CMSIS_GCC_H)
        #define ARM_MATH_MVE_FLOAT16
-    #endif
   #endif
 #endif
 
@@ -130,10 +127,7 @@ extern "C"
   #endif
 
   #if !defined(ARM_MATH_MVE_FLOAT16)
-    /* HW Float16 not yet well supported on gcc for M55 */
-    #if !defined(__CMSIS_GCC_H)
        #define ARM_MATH_MVE_FLOAT16
-    #endif
   #endif
 #endif
 
diff --git a/PythonWrapper/setup.py b/PythonWrapper/setup.py
index defb900c..2d6b8ccf 100644
--- a/PythonWrapper/setup.py
+++ b/PythonWrapper/setup.py
@@ -11,11 +11,8 @@ includes = [os.path.join(ROOT,"Include"),os.path.join(ROOT,"PrivateInclude"),os.
 
 if sys.platform == 'win32':
   cflags = ["-DWIN",config.cflags,"-DUNALIGNED_SUPPORT_DISABLE"] 
-  # Custom because a customized arm_math.h is required to build on windows
-  # since the visual compiler and the win platform are
-  # not supported by default in arm_math.h
 else:
-  cflags = ["-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
+  cflags = ["-Wno-attributes","-Wno-unused-function","-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
 
 transform = glob.glob(os.path.join(ROOT,"Source","TransformFunctions","*.c"))
 #transform.remove(os.path.join(ROOT,"Source","TransformFunctions","arm_dct4_init_q15.c"))
@@ -69,18 +66,18 @@ allsrcs = support + fastmath + filtering + matrix + statistics + complexf + basi
 allsrcs = allsrcs + controller + transform + modulesrc + common+ interpolation
 
 def notf16(number):
-  if re.match(r'^.*_f16.c$',number):
+  if re.search(r'f16',number):
      return(False)
-  else:
-     return(True)
+  if re.search(r'F16',number):
+     return(False)
+  return(True)
 
-# If there are too many files, the linker command in failing on Windows.
+# If there are too many files, the linker command is failing on Windows.
 # So f16 functions are removed since they are not currently available in the wrapper.
 # A next version will have to structure this wrapper more cleanly so that the
 # build can work even with more functions
 srcs = list(filter(notf16, allsrcs))
 
-
 module1 = Extension(config.extensionName,
                     sources = (srcs
                               )
diff --git a/Source/BasicMathFunctions/arm_mult_f16.c b/Source/BasicMathFunctions/arm_mult_f16.c
index e4df864d..cb521804 100755
--- a/Source/BasicMathFunctions/arm_mult_f16.c
+++ b/Source/BasicMathFunctions/arm_mult_f16.c
@@ -56,7 +56,8 @@
   @return        none
  */
 
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 
 
 #include "arm_helium_utils.h"
 
diff --git a/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c b/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
index 11f4e6ef..0c3da123 100755
--- a/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
@@ -46,7 +46,12 @@
   @param[in]     blockSize number of samples to process
   @return        none
  */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
 void arm_biquad_cascade_stereo_df2T_f16(
   const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
   const float16_t * pSrc,
diff --git a/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c b/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
index 977cba7a..358dde36 100755
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
@@ -50,7 +50,12 @@
                    - \ref ARM_MATH_SUCCESS       : Operation successful
                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
 
 #include "arm_helium_utils.h"
 
@@ -382,7 +387,7 @@ arm_status arm_mat_cmplx_mult_f16(
     uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
     uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
     uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t  col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
     arm_status status;          /* status of matrix multiplication */
     uint16x8_t vecOffs, vecColBOffs;
     uint32_t  blkCnt,rowCnt;           /* loop counters */
@@ -466,7 +471,6 @@ if ((pSrcA->numCols != pSrcB->numRows) ||
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
             float16_t const *pInA0 = pInA;
@@ -612,7 +616,6 @@ if ((pSrcA->numCols != pSrcB->numRows) ||
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             float16_t const *pSrcA0Vec;
             float16_t const *pInA0 = pInA;
diff --git a/Source/MatrixFunctions/arm_mat_mult_f16.c b/Source/MatrixFunctions/arm_mat_mult_f16.c
index e15a6b32..1530e791 100755
--- a/Source/MatrixFunctions/arm_mat_mult_f16.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f16.c
@@ -87,7 +87,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
     /*
      * move to 2nd column of matrix A
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
     /*
      * load {a01 a01 a11 a11 x x x x}
      */
@@ -95,7 +95,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 2);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
     /*
      * load {b10, b11, b10, b11, x x x x }
      */
@@ -157,7 +157,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
     /*
      * move to 2nd column of matrix A
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
     /*
      * load {a01 a01 a01 a11 a11 a11 a21 a21}
      */
@@ -165,7 +165,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
     /*
      * load {b10, b11, b12, b10, b11, b12, b10, b11}
      */
@@ -179,7 +179,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
     /*
      * move to 3rd column of matrix A
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
     /*
      * load {a02 a02 a02 a12 a12 a12 a22 a22}
      */
@@ -187,7 +187,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
     /*
      * load {b20, b21, b22, b20, b21, b22, b20, b21}
      */
@@ -253,7 +253,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * jump 2 x A rows (2nd half of matrix)
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
     /*
      * load {a20 a20 a20 a20 a30 a30 a30 a30}
      */
@@ -274,7 +274,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
     /*
      * load {b10, b11, b12, b13, b10, b11, b12, b13}
      */
@@ -287,7 +287,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * jump 2 x A rows (2nd half of matrix)
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
     /*
      * load {a21 a21 a21 a21 a31 a31 a31 a31}
      */
@@ -309,7 +309,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
     /*
      * load {b20, b21, b22, b23, b20, b21, b22, b23}
      */
@@ -322,7 +322,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * jump 2 x A rows
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
 
     /*
      * load {a22 a22 a22 a22 a32 a32 a32 a32}
@@ -345,7 +345,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
     /*
      * load {b30, b31, b32, b33, b30, b31, b32, b33}
      */
@@ -358,7 +358,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
     /*
      * jump 2 x A rows
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
     /*
      * load {a23 a23 a23 a23 a33 a33 a33 a33}
      */
diff --git a/Source/TransformFunctions/arm_cfft_init_q15.c b/Source/TransformFunctions/arm_cfft_init_q15.c
index 7dcce7fa..738cd430 100755
--- a/Source/TransformFunctions/arm_cfft_init_q15.c
+++ b/Source/TransformFunctions/arm_cfft_init_q15.c
@@ -275,7 +275,7 @@ arm_status arm_cfft_init_q15(
 
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
             /*  Initializations of structure parameters for 4096 point FFT */
         case 4096U:
             /*  Initialise the bit reversal table modifier */
@@ -283,7 +283,7 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
             /*  Initializations of structure parameters for 2048 point FFT */
         case 2048U:
             /*  Initialise the bit reversal table modifier */
@@ -292,7 +292,7 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
             /*  Initializations of structure parameters for 1024 point FFT */
         case 1024U:
             /*  Initialise the bit reversal table modifier */
@@ -301,7 +301,7 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
             /*  Initializations of structure parameters for 512 point FFT */
         case 512U:
             /*  Initialise the bit reversal table modifier */
@@ -309,31 +309,31 @@ arm_status arm_cfft_init_q15(
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
         case 256U:
             FFTINIT(q15,256);
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
         case 128U:
             FFTINIT(q15,128);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
         case 64U:
             FFTINIT(q15,64);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
         case 32U:
             FFTINIT(q15,32);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
         case 16U:
             /*  Initializations of structure parameters for 16 point FFT */
             FFTINIT(q15,16);
diff --git a/Testing/PatternGeneration/Tools.py b/Testing/PatternGeneration/Tools.py
index abda8d41..70e6b2a6 100755
--- a/Testing/PatternGeneration/Tools.py
+++ b/Testing/PatternGeneration/Tools.py
@@ -3,7 +3,7 @@ import struct
 import numpy as np
 
 def normalize(a):
-  return(a/max(np.abs(a)))
+  return(a/np.max(np.abs(a)))
 
 TAILONLY = 1
 BODYONLY = 2
diff --git a/Toolchain/GCC.cmake b/Toolchain/GCC.cmake
index 4e0dcf15..c0e051d4 100644
--- a/Toolchain/GCC.cmake
+++ b/Toolchain/GCC.cmake
@@ -43,6 +43,7 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT)
 
   # Need to add other gcc config for other cortex-m cores
   if (ARM_CPU STREQUAL "cortex-m55" )
+     target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8.1-m.main+mve.fp+fp.dp")
      target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
      target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
   endif()
diff --git a/gcc.cmake b/gcc.cmake
index 3f7b6e66..9cb2f42b 100644
--- a/gcc.cmake
+++ b/gcc.cmake
@@ -6,26 +6,25 @@ SET(CMAKE_SYSTEM_PROCESSOR arm)
 
 
 
-#SET(tools "C:/PROGRA~2/GNUTOO~1/82018-~1")
 
-#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
-#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++.exe")
-#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
+#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc")
+#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++")
+#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc")
 
 find_program(CMAKE_C_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
 find_program(CMAKE_CXX_COMPILER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
 find_program(CMAKE_ASM_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
 
-#SET(CMAKE_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
-find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
-find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
+find_program(CMAKE_AR NAMES arm-none-eabi-ar arm-none-eabi-ar.exe) # plain binutils ar (not the gcc-ar wrapper), found on PATH like the compilers
+find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-ar arm-none-eabi-ar.exe)
+find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-ar arm-none-eabi-ar.exe)
 
+#find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
+#find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
+#find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
 
-#SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-#SET(CMAKE_C_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
 
-#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++.exe")
+#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++")
 find_program(CMAKE_LINKER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
 
 SET(CMAKE_C_LINK_EXECUTABLE "<CMAKE_LINKER> <LINK_FLAGS> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
@@ -48,10 +47,17 @@ if(NOT ARM_CPU)
     )
 endif(NOT ARM_CPU)
 
+if (ARM_CPU STREQUAL "cortex-m55")
+SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_ASM_FLAGS "-march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "ASM compiler common flags")
+SET(CMAKE_EXE_LINKER_FLAGS "-fno-use-linker-plugin -march=armv8.1-m.main+mve.fp+fp.dp"  CACHE INTERNAL "linker flags")
+else()
 SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
 SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
 SET(CMAKE_ASM_FLAGS "-mcpu=${ARM_CPU}" CACHE INTERNAL "ASM compiler common flags")
 SET(CMAKE_EXE_LINKER_FLAGS "-mcpu=${ARM_CPU}"  CACHE INTERNAL "linker flags")
+endif()
 
 get_property(IS_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE)
 if(IS_IN_TRY_COMPILE)