CMSIS-DSP: More consistent use of the flag ARM_MATH_AUTOVECTORIZE

5 years ago · 0bcb1384f2
parent 22a3e4a048
commit 0bcb1384f2
134 changed files with 198 additions and 177 deletions
--- a/Include/arm_common_tables.h
+++ b/Include/arm_common_tables.h
@ -498,13 +498,13 @@ extern "C"
    extern const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1];
  #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */

-  #if defined(ARM_MATH_MVEI)
+  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
       extern const q31_t sqrtTable_Q31[256];
     #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
  #endif

-  #if defined(ARM_MATH_MVEI)
+  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
       extern const q15_t sqrtTable_Q15[256];
     #endif /* !defined(ARM_DSP_CONFIG_TABLES) defined(ARM_ALL_FAST_TABLES) */
@ -517,7 +517,7 @@ extern "C"
       extern const float32_t __logf_lut_f32[8];
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
 extern const unsigned char hwLUT[256];
 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

--- a/Include/arm_helium_utils.h
+++ b/Include/arm_helium_utils.h
@ -39,7 +39,7 @@ extern "C"
 Definitions available for MVEF and MVEI

 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)

 #define INACTIVELANE            0 /* inactive lane content */

@ -51,7 +51,7 @@ Definitions available for MVEF and MVEI
 Definitions available for MVEF only

 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF))  && !defined(ARM_MATH_AUTOVECTORIZE)

 __STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
 {
@ -103,7 +103,7 @@ __STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
 Definitions available for f16 datatype with HW acceleration only

 ***************************************/
-#if defined (ARM_MATH_MVE_FLOAT16)
+#if defined (ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
 __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16(
    float16x8_t   vecIn)
 {
@ -181,7 +181,7 @@ __STATIC_FORCEINLINE void mve_cmplx_sum_intra_vec_f16(
 Definitions available for MVEI and MVEF only

 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)
 /* Following functions are used to transpose matrix in f32 and q31 cases */
 __STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
    uint32_t * pDataSrc,
@ -596,7 +596,7 @@ __STATIC_INLINE arm_status arm_mat_cmplx_trans_16bit(
 Definitions available for MVEI only

 ***************************************/
-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_common_tables.h"

--- a/Include/arm_math_types.h
+++ b/Include/arm_math_types.h
@ -294,7 +294,7 @@ extern "C"
  /**
   * @brief vector types
   */
-#if defined(ARM_MATH_NEON) || defined (ARM_MATH_MVEI)
+#if defined(ARM_MATH_NEON) || (defined (ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE))
  /**
   * @brief 64-bit fractional 128-bit vector data type in 1.63 format
   */
@ -378,7 +378,7 @@ extern "C"

 #endif

-#if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF) /* floating point vector*/
+#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF)  && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/
  /**
   * @brief 32-bit floating-point 128-bit vector type
   */
--- a/Include/arm_math_types_f16.h
+++ b/Include/arm_math_types_f16.h
@ -60,7 +60,7 @@ won't be built.
  #endif
 #endif

-#if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF) /* floating point vector*/
+#if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF)  && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/
  
 #if defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_NEON_FLOAT16)
  /**
--- a/Include/arm_mve_tables.h
+++ b/Include/arm_mve_tables.h
@ -100,7 +100,7 @@ extern float32_t rearranged_twiddle_stride3_4096_f32[2728];



-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)

 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)

@ -161,7 +161,7 @@ extern q31_t rearranged_twiddle_stride3_4096_q31[2728];



-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)

 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)

--- a/Include/arm_vec_math.h
+++ b/Include/arm_vec_math.h
@ -295,7 +295,7 @@ __STATIC_INLINE f32x4_t vpowq_f32(

 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/

-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
--- a/Include/dsp/transform_functions.h
+++ b/Include/dsp/transform_functions.h
@ -217,7 +217,7 @@ extern "C"
    const q15_t *pTwiddle;             /**< points to the Twiddle factor table. */
    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
          uint16_t bitRevLength;             /**< bit reversal table length. */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
@ -246,7 +246,7 @@ void arm_cfft_q15(
    const q31_t *pTwiddle;             /**< points to the Twiddle factor table. */
    const uint16_t *pBitRevTable;      /**< points to the bit reversal table. */
          uint16_t bitRevLength;             /**< bit reversal table length. */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
   const uint32_t *rearranged_twiddle_tab_stride1_arr;        /**< Per stage reordered twiddle pointer (offset 1) */                                                       \
   const uint32_t *rearranged_twiddle_tab_stride2_arr;        /**< Per stage reordered twiddle pointer (offset 2) */                                                       \
   const uint32_t *rearranged_twiddle_tab_stride3_arr;        /**< Per stage reordered twiddle pointer (offset 3) */                                                       \
@ -330,7 +330,7 @@ void arm_cfft_q31(
          uint32_t twidCoefRModifier;               /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
    const q15_t *pTwiddleAReal;                     /**< points to the real twiddle factor table. */
    const q15_t *pTwiddleBReal;                     /**< points to the imag twiddle factor table. */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    arm_cfft_instance_q15 cfftInst;
 #else
    const arm_cfft_instance_q15 *pCfft;       /**< points to the complex FFT instance. */
@ -359,7 +359,7 @@ void arm_cfft_q31(
          uint32_t twidCoefRModifier;                 /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
    const q31_t *pTwiddleAReal;                       /**< points to the real twiddle factor table. */
    const q31_t *pTwiddleBReal;                       /**< points to the imag twiddle factor table. */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    arm_cfft_instance_q31 cfftInst;
 #else
    const arm_cfft_instance_q31 *pCfft;         /**< points to the complex FFT instance. */
--- a/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/Source/BasicMathFunctions/arm_abs_q15.c
@ -49,7 +49,7 @@
                   The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/Source/BasicMathFunctions/arm_abs_q31.c
@ -49,7 +49,7 @@
                   The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/Source/BasicMathFunctions/arm_abs_q7.c
@ -51,7 +51,7 @@
                   The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_add_q15.c
+++ b/Source/BasicMathFunctions/arm_add_q15.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_add_q31.c
+++ b/Source/BasicMathFunctions/arm_add_q31.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_add_q7.c
+++ b/Source/BasicMathFunctions/arm_add_q7.c
@ -51,7 +51,7 @@
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_q15.c
@ -52,7 +52,7 @@
                   there is no risk of overflow.
                   The return result is in 34.30 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_q31.c
@ -54,7 +54,7 @@
                   The return result is in 16.48 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_q7.c
@ -53,7 +53,7 @@
                   The return result is in 18.14 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/Source/BasicMathFunctions/arm_mult_q15.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/Source/BasicMathFunctions/arm_mult_q31.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/Source/BasicMathFunctions/arm_mult_q7.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/Source/BasicMathFunctions/arm_negate_q15.c
@ -50,7 +50,7 @@
                   The function uses saturating arithmetic.
                   The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/Source/BasicMathFunctions/arm_negate_q31.c
@ -49,7 +49,7 @@
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/Source/BasicMathFunctions/arm_negate_q7.c
@ -48,7 +48,7 @@
                   The function uses saturating arithmetic.
                   The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/Source/BasicMathFunctions/arm_offset_q15.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/Source/BasicMathFunctions/arm_offset_q31.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/Source/BasicMathFunctions/arm_offset_q7.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/Source/BasicMathFunctions/arm_scale_q15.c
@ -51,7 +51,7 @@
                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/Source/BasicMathFunctions/arm_scale_q31.c
@ -51,7 +51,7 @@
                   These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/Source/BasicMathFunctions/arm_scale_q7.c
@ -51,7 +51,7 @@
                   These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/Source/BasicMathFunctions/arm_shift_q15.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/Source/BasicMathFunctions/arm_shift_q31.c
@ -67,7 +67,7 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/Source/BasicMathFunctions/arm_shift_q7.c
@ -52,7 +52,7 @@
                   Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/Source/BasicMathFunctions/arm_sub_q15.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/Source/BasicMathFunctions/arm_sub_q31.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/Source/BasicMathFunctions/arm_sub_q7.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/CommonTables/arm_common_tables.c
+++ b/Source/CommonTables/arm_common_tables.c
@ -70383,7 +70383,7 @@ const q15_t sinTable_q15[FAST_MATH_TABLE_SIZE + 1] = {
 };
 #endif /* defined(ARM_ALL_FAST_TABLES) */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
 const q31_t sqrtTable_Q31[256] = {
    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
@ -70535,7 +70535,7 @@ const float32_t __logf_lut_f32[8] = {

 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

-#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) 
+#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))  && !defined(ARM_MATH_AUTOVECTORIZE)

 /* Hamming weight LUT for bytes */
 #define B2(n) n, n + 1, n + 1, n + 2
--- a/Source/CommonTables/arm_const_structs.c
+++ b/Source/CommonTables/arm_const_structs.c
@ -154,7 +154,7 @@ const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {

 /* Fixed-point structs */

-#if !defined(ARM_MATH_MVEI)
+#if !defined(ARM_MATH_MVEI) || defined(ARM_MATH_AUTOVECTORIZE)

 /* 

@ -417,7 +417,7 @@ const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
 /* Fixed-point structs */
 /* q31_t */

-#if !defined(ARM_MATH_MVEI)
+#if !defined(ARM_MATH_MVEI) || defined(ARM_MATH_AUTOVECTORIZE)

 /* 

--- a/Source/CommonTables/arm_mve_tables.c
+++ b/Source/CommonTables/arm_mve_tables.c
@ -3764,7 +3764,7 @@ float32_t rearranged_twiddle_stride3_4096_f32[2728]={
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */


-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)

 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)

@ -5429,7 +5429,7 @@ q31_t rearranged_twiddle_stride3_4096_q31[2728]={
 #endif /* defined(ARM_MATH_MVEI)  */


-#if defined(ARM_MATH_MVEI) 
+#if defined(ARM_MATH_MVEI)  && !defined(ARM_MATH_AUTOVECTORIZE)

 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)

--- a/Source/ComplexMathFunctions/ComplexMathFunctions.c
+++ b/Source/ComplexMathFunctions/ComplexMathFunctions.c
@ -34,7 +34,7 @@
 #include "arm_cmplx_dot_prod_q31.c"
 #include "arm_cmplx_mag_f32.c"

-#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)
+#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI))  && !defined(ARM_MATH_AUTOVECTORIZE)
  #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
  #include "arm_cmplx_mag_q15.c"
  #endif 
--- a/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@ -50,7 +50,7 @@
 */


-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_conj_q15(
  const q15_t * pSrc,
        q15_t * pDst,
--- a/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@ -49,7 +49,7 @@
                   The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_cmplx_conj_q31(
  const q31_t * pSrc,
--- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@ -54,7 +54,7 @@
                   The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_dot_prod_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
--- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@ -55,7 +55,7 @@
                   Input down scaling is not required.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_cmplx_dot_prod_q31(
  const q31_t * pSrcA,
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@ -47,7 +47,7 @@
  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@ -49,7 +49,7 @@
                   Input down scaling is not required.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@ -48,7 +48,7 @@
                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_cmplx_mag_squared_q15(
  const q15_t * pSrc,
--- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@ -49,7 +49,7 @@
                   Input down scaling is not required.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_cmplx_mag_squared_q31(
  const q31_t * pSrc,
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@ -49,7 +49,7 @@
                   The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@ -50,7 +50,7 @@
                   Input down scaling is not required.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@ -49,7 +49,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_cmplx_mult_real_q15(
  const q15_t * pSrcCmplx,
--- a/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@ -50,7 +50,7 @@
                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_cmplx_mult_real_q31(
  const q31_t * pSrcCmplx,
  const q31_t * pSrcReal,
--- a/Source/DistanceFunctions/arm_boolean_distance_template.h
+++ b/Source/DistanceFunctions/arm_boolean_distance_template.h
@ -62,7 +62,7 @@
 *
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_common_tables.h"

--- a/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
@ -172,7 +172,7 @@
                   - \ref arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"
 void arm_biquad_cas_df1_32x64_q31(
--- a/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@ -56,7 +56,7 @@
                   Refer to \ref arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_biquad_cascade_df1_q15(
  const arm_biquad_casd_df1_inst_q15 * S,
--- a/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@ -55,7 +55,7 @@
  @remark
                   Refer to \ref arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_biquad_cascade_df1_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
--- a/Source/FilteringFunctions/arm_conv_q15.c
+++ b/Source/FilteringFunctions/arm_conv_q15.c
@ -58,7 +58,7 @@
  @remark
                   Refer to \ref arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"

--- a/Source/FilteringFunctions/arm_conv_q31.c
+++ b/Source/FilteringFunctions/arm_conv_q31.c
@ -59,7 +59,7 @@
  @remark
                   Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"

--- a/Source/FilteringFunctions/arm_conv_q7.c
+++ b/Source/FilteringFunctions/arm_conv_q7.c
@ -55,7 +55,7 @@
  @remark
                   Refer to \ref arm_conv_opt_q7() for a faster implementation of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"

 #include "arm_vec_filtering.h"
--- a/Source/FilteringFunctions/arm_correlate_q15.c
+++ b/Source/FilteringFunctions/arm_correlate_q15.c
@ -58,7 +58,7 @@
  @remark
                   Refer to \ref arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"

--- a/Source/FilteringFunctions/arm_correlate_q31.c
+++ b/Source/FilteringFunctions/arm_correlate_q31.c
@ -59,7 +59,7 @@
  @remark
                   Refer to \ref arm_correlate_fast_q31() for a faster but less precise implementation of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"
 #include "arm_vec_filtering.h"
 void arm_correlate_q31(
--- a/Source/FilteringFunctions/arm_correlate_q7.c
+++ b/Source/FilteringFunctions/arm_correlate_q7.c
@ -56,7 +56,7 @@
 @remark
                   Refer to \ref arm_correlate_opt_q7() for a faster implementation of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 #include "arm_helium_utils.h"

 #include "arm_vec_filtering.h"
@ -884,7 +884,10 @@ void arm_correlate_q7(
    k = count;

 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
+/* Temporary fix for bug in clang */
+#if defined(ARM_MATH_MVEF) && defined(ARM_MATH_AUTOVECTORIZE)
+        #pragma clang loop vectorize(disable)
+#endif
    while (k > 0U)
    {
      /* Perform the multiply-accumulate */
--- a/Source/FilteringFunctions/arm_fir_decimate_q15.c
+++ b/Source/FilteringFunctions/arm_fir_decimate_q15.c
@ -57,7 +57,7 @@
                   Refer to \ref arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/FilteringFunctions/arm_fir_decimate_q31.c
+++ b/Source/FilteringFunctions/arm_fir_decimate_q31.c
@ -56,7 +56,7 @@
                   Refer to \ref arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/FilteringFunctions/arm_fir_interpolate_q15.c
+++ b/Source/FilteringFunctions/arm_fir_interpolate_q15.c
@ -54,7 +54,7 @@
                   Lastly, the accumulator is saturated to yield a result in 1.15 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"
 void arm_fir_interpolate_q15(
--- a/Source/FilteringFunctions/arm_fir_interpolate_q31.c
+++ b/Source/FilteringFunctions/arm_fir_interpolate_q31.c
@ -54,7 +54,7 @@
                   After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"
 void arm_fir_interpolate_q31(
--- a/Source/FilteringFunctions/arm_fir_q15.c
+++ b/Source/FilteringFunctions/arm_fir_q15.c
@ -56,7 +56,7 @@
  @remark
                   Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)

--- a/Source/FilteringFunctions/arm_fir_q31.c
+++ b/Source/FilteringFunctions/arm_fir_q31.c
@ -55,7 +55,7 @@
 @remark
                   Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"
                                        
--- a/Source/FilteringFunctions/arm_fir_q7.c
+++ b/Source/FilteringFunctions/arm_fir_q7.c
@ -54,7 +54,7 @@
                   Finally, the result is truncated to 1.7 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S, const q7_t * pSrc, q7_t * pDst, uint32_t blockSize)
 {
--- a/Source/MatrixFunctions/arm_mat_add_q15.c
+++ b/Source/MatrixFunctions/arm_mat_add_q15.c
@ -50,7 +50,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 arm_status arm_mat_add_q15(
  const arm_matrix_instance_q15 * pSrcA,
--- a/Source/MatrixFunctions/arm_mat_add_q31.c
+++ b/Source/MatrixFunctions/arm_mat_add_q31.c
@ -50,7 +50,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_add_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
@ -57,7 +57,7 @@
                   This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
                   truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)

--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
@ -56,7 +56,7 @@
                   to avoid overflows, as a total of numColsA additions are performed internally.
                   The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c
@ -45,7 +45,7 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c
@ -47,7 +47,7 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/MatrixFunctions/arm_mat_mult_q15.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q15.c
@ -57,7 +57,7 @@
  @par
                   Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)

--- a/Source/MatrixFunctions/arm_mat_mult_q31.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q31.c
@ -58,7 +58,7 @@
  @remark
                   Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #define MATRIX_DIM2 2
 #define MATRIX_DIM3 3
--- a/Source/MatrixFunctions/arm_mat_mult_q7.c
+++ b/Source/MatrixFunctions/arm_mat_mult_q7.c
@ -53,7 +53,7 @@
 *
 *
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
    const arm_matrix_instance_q7 * pSrcA,
    const arm_matrix_instance_q7 * pSrcB,
--- a/Source/MatrixFunctions/arm_mat_scale_q15.c
+++ b/Source/MatrixFunctions/arm_mat_scale_q15.c
@ -51,7 +51,7 @@
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
                   These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_scale_q15(
  const arm_matrix_instance_q15 * pSrc,
        q15_t                     scaleFract,
--- a/Source/MatrixFunctions/arm_mat_scale_q31.c
+++ b/Source/MatrixFunctions/arm_mat_scale_q31.c
@ -51,7 +51,7 @@
                   The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
                   These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_scale_q31(
  const arm_matrix_instance_q31 * pSrc,
        q31_t                     scaleFract,
--- a/Source/MatrixFunctions/arm_mat_sub_q15.c
+++ b/Source/MatrixFunctions/arm_mat_sub_q15.c
@ -50,7 +50,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 arm_status arm_mat_sub_q15(
  const arm_matrix_instance_q15 * pSrcA,
--- a/Source/MatrixFunctions/arm_mat_sub_q31.c
+++ b/Source/MatrixFunctions/arm_mat_sub_q31.c
@ -50,7 +50,7 @@
                   The function uses saturating arithmetic.
                   Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_sub_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
--- a/Source/MatrixFunctions/arm_mat_trans_q15.c
+++ b/Source/MatrixFunctions/arm_mat_trans_q15.c
@ -46,7 +46,7 @@
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
 
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/MatrixFunctions/arm_mat_trans_q31.c
+++ b/Source/MatrixFunctions/arm_mat_trans_q31.c
@ -45,7 +45,7 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/MatrixFunctions/arm_mat_trans_q7.c
+++ b/Source/MatrixFunctions/arm_mat_trans_q7.c
@ -45,7 +45,7 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
 {

--- a/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
@ -27,6 +27,7 @@

 #include "dsp/matrix_functions.h"

+
 /**
 * @ingroup groupMatrix
 */
@ -283,6 +284,7 @@ void arm_mat_vec_mult_f32(
    }
 }
 #else
+
 void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_t *pVec, float32_t *pDst)
 {
    uint32_t numRows = pSrcMat->numRows;
@ -365,7 +367,6 @@ void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_
        pInA1 = pSrcA + i;

        colCnt = numCols >> 1;
-
        while (colCnt > 0) {
            vecData = *(pInVec)++;
            vecData2 = *(pInVec)++;
@ -377,6 +378,11 @@ void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_
        }
        // process remainder of row
        colCnt = numCols & 1u;
+
+/* Temporary fix for bug in clang */
+#if defined(ARM_MATH_MVEF) && defined(ARM_MATH_AUTOVECTORIZE)
+        #pragma clang loop vectorize(disable)
+#endif
        while (colCnt > 0) {
            sum += *pInA1++ * *pInVec++;
            colCnt--;
--- a/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
@ -44,7 +44,7 @@
 * @param[in]       *pVec points to input vector
 * @param[out]      *pDst points to output vector
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
@ -44,7 +44,7 @@
 * @param[in]       *pVec points to the input vector
 * @param[out]      *pDst points to the output vector
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_mat_vec_mult_q31(
    const arm_matrix_instance_q31 * pSrcMat,
    const q31_t     *pSrcVec,
--- a/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
+++ b/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
@ -44,7 +44,7 @@
 * @param[in]       *pVec points to the input vector
 * @param[out]      *pDst points to the output vector
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

@ -295,6 +295,8 @@ void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec
    i = 0u;
    px = pDst;

+
+
    /* The following loop performs the dot-product of each row in pSrcA with the vector */
    while (row > 0) {
        /* For every row wise process, the pInVec pointer is set
@ -318,6 +320,7 @@ void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec


        // Inner loop: matrix-vector multiplication
+
        while (colCnt > 0u) {
            // Read 4 values from vector
            vecData = read_q7x4_ia ((q7_t **) &pInVec);
@ -350,7 +353,11 @@ void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec
        }

        /* process any remaining columns */
+
        colCnt = numCols & 3u;
+#if defined(ARM_MATH_MVEI) && defined(ARM_MATH_AUTOVECTORIZE)
+        #pragma clang loop vectorize(disable)
+#endif
        while (colCnt > 0) {
            vecData = *pInVec++;
            sum1 += *pInA1++ * vecData;
--- a/Source/StatisticsFunctions/arm_max_q15.c
+++ b/Source/StatisticsFunctions/arm_max_q15.c
@ -45,7 +45,7 @@
  @param[out]    pIndex     index of maximum value returned here
  @return        none
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/StatisticsFunctions/arm_max_q31.c
+++ b/Source/StatisticsFunctions/arm_max_q31.c
@ -45,7 +45,7 @@
  @param[out]    pIndex     index of maximum value returned here
  @return        none
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/StatisticsFunctions/arm_max_q7.c
+++ b/Source/StatisticsFunctions/arm_max_q7.c
@ -45,7 +45,7 @@
  @param[out]    pIndex     index of maximum value returned here
  @return        none
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/StatisticsFunctions/arm_mean_q15.c
+++ b/Source/StatisticsFunctions/arm_mean_q15.c
@ -53,7 +53,7 @@
                   Finally, the accumulator is truncated to yield a result of 1.15 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_mean_q15(
  const q15_t * pSrc,
        uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_mean_q31.c
+++ b/Source/StatisticsFunctions/arm_mean_q31.c
@ -52,7 +52,7 @@
                   full precision of intermediate result is preserved.
                   Finally, the accumulator is truncated to yield a result of 1.31 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_mean_q31(
  const q31_t * pSrc,
        uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_mean_q7.c
+++ b/Source/StatisticsFunctions/arm_mean_q7.c
@ -53,7 +53,7 @@
                   Finally, the accumulator is truncated to yield a result of 1.7 format.
 */

-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_mean_q7(
  const q7_t * pSrc,
--- a/Source/StatisticsFunctions/arm_min_q15.c
+++ b/Source/StatisticsFunctions/arm_min_q15.c
@ -46,7 +46,7 @@
  @param[out]    pIndex     index of minimum value returned here
  @return        none
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/StatisticsFunctions/arm_min_q31.c
+++ b/Source/StatisticsFunctions/arm_min_q31.c
@ -46,7 +46,7 @@
  @param[out]    pIndex     index of minimum value returned here
  @return        none
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/StatisticsFunctions/arm_min_q7.c
+++ b/Source/StatisticsFunctions/arm_min_q7.c
@ -46,7 +46,7 @@
  @param[out]    pIndex     index of minimum value returned here
  @return        none
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 #include "arm_helium_utils.h"

--- a/Source/StatisticsFunctions/arm_power_q15.c
+++ b/Source/StatisticsFunctions/arm_power_q15.c
@ -53,7 +53,7 @@
                   full precision of the intermediate multiplication is preserved.
                   Finally, the return result is in 34.30 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

 void arm_power_q15(
  const q15_t * pSrc,
--- a/Source/StatisticsFunctions/arm_power_q31.c
+++ b/Source/StatisticsFunctions/arm_power_q31.c
@ -54,7 +54,7 @@
                   full precision of the intermediate multiplication is preserved.
                   Finally, the return result is in 16.48 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_power_q31(
  const q31_t * pSrc,
        uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_power_q7.c
+++ b/Source/StatisticsFunctions/arm_power_q7.c
@ -53,7 +53,7 @@
                   full precision of the intermediate multiplication is preserved.
                   Finally, the return result is in 18.14 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_power_q7(
  const q7_t * pSrc,
        uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_rms_q15.c
+++ b/Source/StatisticsFunctions/arm_rms_q15.c
@ -54,7 +54,7 @@
                   Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
                   15 bits, and then saturated to yield a result in 1.15 format.
 */
-#if defined(ARM_MATH_MVEI)
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_rms_q15(
  const q15_t * pSrc,
        uint32_t blockSize,
--- a/Show More
+++ b/Show More