diff --git a/Include/arm_helium_utils.h b/Include/arm_helium_utils.h index ae207fdc..df497345 100755 --- a/Include/arm_helium_utils.h +++ b/Include/arm_helium_utils.h @@ -63,19 +63,7 @@ __STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in) return acc; } -__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in) -{ - float16x8_t tmpVec; - _Float16 acc; - - tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in); - in = vaddq_f16(tmpVec, in); - tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in); - in = vaddq_f16(tmpVec, in); - acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4); - return acc; -} /* newton initial guess */ @@ -103,7 +91,23 @@ __STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in) Definitions available for f16 datatype with HW acceleration only ***************************************/ +#if defined(ARM_FLOAT16_SUPPORTED) #if defined (ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in) +{ + float16x8_t tmpVec; + _Float16 acc; + + tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in); + in = vaddq_f16(tmpVec, in); + tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in); + in = vaddq_f16(tmpVec, in); + acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4); + + return acc; +} + __STATIC_FORCEINLINE float16x8_t __mve_cmplx_sum_intra_vec_f16( float16x8_t vecIn) { @@ -175,6 +179,7 @@ __STATIC_FORCEINLINE void mve_cmplx_sum_intra_vec_f16( } #endif +#endif /*************************************** diff --git a/Include/arm_math_types_f16.h b/Include/arm_math_types_f16.h index f43271dd..648645a8 100755 --- a/Include/arm_math_types_f16.h +++ b/Include/arm_math_types_f16.h @@ -63,6 +63,7 @@ won't be built. #if defined(ARM_MATH_NEON) || (defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)) /* floating point vector*/ #if defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_NEON_FLOAT16) + /** * @brief 16-bit floating-point 128-bit vector data type */ diff --git a/Include/arm_vec_math_f16.h b/Include/arm_vec_math_f16.h index 9abab4e9..de2086c6 100755 --- a/Include/arm_vec_math_f16.h +++ b/Include/arm_vec_math_f16.h @@ -35,7 +35,7 @@ extern "C" #if defined(ARM_FLOAT16_SUPPORTED) -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) static const float16_t __logf_rng_f16=0.693147180f16; @@ -296,7 +296,7 @@ __STATIC_INLINE f16x8_t vtanhq_f16( return tanh; } -#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ +#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/ diff --git a/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c b/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c index c15bd83d..b9ee9af3 100755 --- a/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c +++ b/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c @@ -51,7 +51,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/CommonTables/arm_mve_tables_f16.c b/Source/CommonTables/arm_mve_tables_f16.c index ad831c5c..35756de7 100755 --- a/Source/CommonTables/arm_mve_tables_f16.c +++ b/Source/CommonTables/arm_mve_tables_f16.c @@ -30,7 +30,7 @@ #if defined(ARM_FLOAT16_SUPPORTED) -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) @@ -5568,6 +5568,6 @@ float16_t rearranged_twiddle_stride3_4096_f16[2728]={ #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES) */ -#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ +#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */ #endif /* if defined(ARM_FLOAT16_SUPPORTED) */ diff --git a/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c b/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c index 86da920b..0dd6690b 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_conj_f16.c @@ -67,7 +67,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_cmplx_conj_f16( const float16_t * pSrc, diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c index 5be48bc0..b740a73d 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f16.c @@ -75,7 +75,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c b/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c index f13ebcdc..e151a1a4 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mag_f16.c @@ -68,7 +68,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c index 03e3696f..357ac608 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c @@ -69,7 +69,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_cmplx_mag_squared_f16( const float16_t * pSrc, diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c index 9ce7d8a6..99dab98d 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c @@ -70,7 +70,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_cmplx_mult_cmplx_f16( const float16_t * pSrcA, diff --git a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c index bff331a0..053d3efd 100755 --- a/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c +++ b/Source/ComplexMathFunctions/arm_cmplx_mult_real_f16.c @@ -69,7 +69,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_cmplx_mult_real_f16( const float16_t * pSrcCmplx, diff --git a/Source/DistanceFunctions/arm_braycurtis_distance_f16.c b/Source/DistanceFunctions/arm_braycurtis_distance_f16.c index 964c55af..fee393af 100755 --- a/Source/DistanceFunctions/arm_braycurtis_distance_f16.c +++ b/Source/DistanceFunctions/arm_braycurtis_distance_f16.c @@ -66,7 +66,7 @@ * @return distance * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/DistanceFunctions/arm_canberra_distance_f16.c b/Source/DistanceFunctions/arm_canberra_distance_f16.c index 228ede4c..27ac1a0b 100755 --- a/Source/DistanceFunctions/arm_canberra_distance_f16.c +++ b/Source/DistanceFunctions/arm_canberra_distance_f16.c @@ -63,7 +63,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/DistanceFunctions/arm_chebyshev_distance_f16.c b/Source/DistanceFunctions/arm_chebyshev_distance_f16.c index d97e255e..5e813cb4 100755 --- a/Source/DistanceFunctions/arm_chebyshev_distance_f16.c +++ b/Source/DistanceFunctions/arm_chebyshev_distance_f16.c @@ -57,7 +57,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math.h" diff --git a/Source/DistanceFunctions/arm_cityblock_distance_f16.c b/Source/DistanceFunctions/arm_cityblock_distance_f16.c index 6f961388..dcd8b544 100755 --- a/Source/DistanceFunctions/arm_cityblock_distance_f16.c +++ b/Source/DistanceFunctions/arm_cityblock_distance_f16.c @@ -56,7 +56,7 @@ * @return distance * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math.h" diff --git a/Source/DistanceFunctions/arm_euclidean_distance_f16.c b/Source/DistanceFunctions/arm_euclidean_distance_f16.c index 7a71fd51..00c7bbc8 100755 --- a/Source/DistanceFunctions/arm_euclidean_distance_f16.c +++ b/Source/DistanceFunctions/arm_euclidean_distance_f16.c @@ -58,7 +58,7 @@ * @return distance * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math.h" diff --git a/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c b/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c index 4e4ce434..0c812d49 100755 --- a/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c +++ b/Source/DistanceFunctions/arm_jensenshannon_distance_f16.c @@ -48,7 +48,7 @@ @{ */ -#if !defined(ARM_MATH_MVEF) || defined(ARM_MATH_AUTOVECTORIZE) +#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE) /// @private __STATIC_INLINE float16_t rel_entr(float16_t x, float16_t y) { @@ -57,7 +57,7 @@ __STATIC_INLINE float16_t rel_entr(float16_t x, float16_t y) #endif -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/DistanceFunctions/arm_minkowski_distance_f16.c b/Source/DistanceFunctions/arm_minkowski_distance_f16.c index def1569a..bd773289 100755 --- a/Source/DistanceFunctions/arm_minkowski_distance_f16.c +++ b/Source/DistanceFunctions/arm_minkowski_distance_f16.c @@ -59,7 +59,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/FastMathFunctions/arm_vexp_f16.c b/Source/FastMathFunctions/arm_vexp_f16.c index abfaf741..e2ca884d 100755 --- a/Source/FastMathFunctions/arm_vexp_f16.c +++ b/Source/FastMathFunctions/arm_vexp_f16.c @@ -32,9 +32,7 @@ #include "arm_common_tables.h" -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM) || defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_vec_math_f16.h" -#endif void arm_vexp_f16( @@ -44,7 +42,7 @@ void arm_vexp_f16( { uint32_t blkCnt; -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) f16x8_t src; f16x8_t dst; diff --git a/Source/FastMathFunctions/arm_vinverse_f16.c b/Source/FastMathFunctions/arm_vinverse_f16.c index 8eb6488d..5e11dbf2 100755 --- a/Source/FastMathFunctions/arm_vinverse_f16.c +++ b/Source/FastMathFunctions/arm_vinverse_f16.c @@ -32,9 +32,7 @@ #include "arm_common_tables.h" -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM) || defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_vec_math_f16.h" -#endif void arm_vinverse_f16( const float16_t * pSrc, @@ -43,7 +41,7 @@ void arm_vinverse_f16( { uint32_t blkCnt; -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) f16x8_t src; f16x8_t dst; diff --git a/Source/FastMathFunctions/arm_vlog_f16.c b/Source/FastMathFunctions/arm_vlog_f16.c index 88aa4c17..3a2c490f 100755 --- a/Source/FastMathFunctions/arm_vlog_f16.c +++ b/Source/FastMathFunctions/arm_vlog_f16.c @@ -31,10 +31,7 @@ #if defined(ARM_FLOAT16_SUPPORTED) #include "arm_common_tables.h" - -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM) || defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_vec_math_f16.h" -#endif void arm_vlog_f16( const float16_t * pSrc, @@ -43,7 +40,7 @@ void arm_vlog_f16( { uint32_t blkCnt; -#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) f16x8_t src; f16x8_t dst; diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c b/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c index be829f33..bc5550b8 100755 --- a/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f16.c @@ -94,7 +94,7 @@ void arm_biquad_cascade_df1_init_f16( S->pState = pState; } -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) static void generateCoefsFastBiquadF16(float16_t b0, float16_t b1, float16_t b2, float16_t a1, float16_t a2, arm_biquad_mod_coef_f16 * newCoef) diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c index 856463ae..4c6e5bde 100755 --- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c @@ -47,7 +47,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_biquad_cascade_df2T_f16( const arm_biquad_cascade_df2T_instance_f16 * S, const float16_t * pSrc, diff --git a/Source/FilteringFunctions/arm_correlate_f16.c b/Source/FilteringFunctions/arm_correlate_f16.c index d9cb61cb..960577de 100755 --- a/Source/FilteringFunctions/arm_correlate_f16.c +++ b/Source/FilteringFunctions/arm_correlate_f16.c @@ -94,7 +94,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_filtering.h" diff --git a/Source/FilteringFunctions/arm_fir_f16.c b/Source/FilteringFunctions/arm_fir_f16.c index 0cc55843..f3d31853 100755 --- a/Source/FilteringFunctions/arm_fir_f16.c +++ b/Source/FilteringFunctions/arm_fir_f16.c @@ -45,7 +45,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #define FIR_F32_MAX_COEF_BLK 8 diff --git a/Source/FilteringFunctions/arm_fir_init_f16.c b/Source/FilteringFunctions/arm_fir_init_f16.c index 8b63796b..35df2738 100755 --- a/Source/FilteringFunctions/arm_fir_init_f16.c +++ b/Source/FilteringFunctions/arm_fir_init_f16.c @@ -84,7 +84,7 @@ void arm_fir_init_f16( S->pCoeffs = pCoeffs; /* Clear state buffer. The size is always (blockSize + numTaps - 1) */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t)); #else memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t)); diff --git a/Source/MatrixFunctions/arm_mat_add_f16.c b/Source/MatrixFunctions/arm_mat_add_f16.c index 5f63adba..6f9c16bb 100755 --- a/Source/MatrixFunctions/arm_mat_add_f16.c +++ b/Source/MatrixFunctions/arm_mat_add_f16.c @@ -52,7 +52,7 @@ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) arm_status arm_mat_add_f16( const arm_matrix_instance_f16 * pSrcA, diff --git a/Source/MatrixFunctions/arm_mat_cholesky_f16.c b/Source/MatrixFunctions/arm_mat_cholesky_f16.c index e1526de8..61bc8ef3 100755 --- a/Source/MatrixFunctions/arm_mat_cholesky_f16.c +++ b/Source/MatrixFunctions/arm_mat_cholesky_f16.c @@ -51,7 +51,7 @@ * The decomposition of A is returning a lower triangular matrix U such that A = U U^t */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c b/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c index 8f749207..977cba7a 100755 --- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c +++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c @@ -50,7 +50,7 @@ - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c b/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c index 449e4bd0..e57b11f7 100644 --- a/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c +++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c @@ -171,16 +171,16 @@ arm_status arm_mat_cmplx_mult_q15( pSrcAVec += 8; vecB = vldrhq_gather_shifted_offset(pInB, vecOffs); - acc0 = vmlsldavaq(acc0, vecA, vecB); - acc1 = vmlaldavaxq(acc1, vecA, vecB); + acc0 = vmlsldavaq_s16(acc0, vecA, vecB); + acc1 = vmlaldavaxq_s16(acc1, vecA, vecB); vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs); /* * move Matrix B read offsets, 4 rows down */ - vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM)); + vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM)); - acc2 = vmlsldavaq(acc2, vecA, vecB2); - acc3 = vmlaldavaxq(acc3, vecA, vecB2); + acc2 = vmlsldavaq_s16(acc2, vecA, vecB2); + acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2); blkCnt--; } @@ -196,17 +196,17 @@ arm_status arm_mat_cmplx_mult_q15( vecA = vldrhq_z_s16(pSrcAVec, p0); - acc0 = vmlsldavaq(acc0, vecA, vecB); - acc1 = vmlaldavaxq(acc1, vecA, vecB); + acc0 = vmlsldavaq_s16(acc0, vecA, vecB); + acc1 = vmlaldavaxq_s16(acc1, vecA, vecB); vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs); /* * move Matrix B read offsets, 4 rows down */ - vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM)); + vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM)); - acc2 = vmlsldavaq(acc2, vecA, vecB2); - acc3 = vmlaldavaxq(acc3, vecA, vecB2); + acc2 = vmlsldavaq_s16(acc2, vecA, vecB2); + acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2); } /* @@ -264,12 +264,12 @@ arm_status arm_mat_cmplx_mult_q15( pSrcAVec += 8; vecB = vldrhq_gather_shifted_offset(pInB, vecOffs); - acc0 = vmlsldavaq(acc0, vecA, vecB); - acc1 = vmlaldavaxq(acc1, vecA, vecB); + acc0 = vmlsldavaq_s16(acc0, vecA, vecB); + acc1 = vmlaldavaxq_s16(acc1, vecA, vecB); /* * move Matrix B read offsets, 4 rows down */ - vecOffs = vaddq(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM)); + vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM)); blkCnt--; } @@ -284,8 +284,8 @@ arm_status arm_mat_cmplx_mult_q15( vecB = vldrhq_gather_shifted_offset(pInB, vecOffs); vecA = vldrhq_z_s16(pSrcAVec, p0); - acc0 = vmlsldavaq(acc0, vecA, vecB); - acc1 = vmlaldavaxq(acc1, vecA, vecB); + acc0 = vmlsldavaq_s16(acc0, vecA, vecB); + acc1 = vmlaldavaxq_s16(acc1, vecA, vecB); } /* diff --git a/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c b/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c index c078598e..4ca83015 100755 --- a/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c +++ b/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c @@ -49,7 +49,7 @@ - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/MatrixFunctions/arm_mat_inverse_f16.c b/Source/MatrixFunctions/arm_mat_inverse_f16.c index 85adf6f4..3c8aba1b 100755 --- a/Source/MatrixFunctions/arm_mat_inverse_f16.c +++ b/Source/MatrixFunctions/arm_mat_inverse_f16.c @@ -50,7 +50,7 @@ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed - \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible) */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) arm_status arm_mat_inverse_f16( const arm_matrix_instance_f16 * pSrc, diff --git a/Source/MatrixFunctions/arm_mat_mult_f16.c b/Source/MatrixFunctions/arm_mat_mult_f16.c index 657d6188..e15a6b32 100755 --- a/Source/MatrixFunctions/arm_mat_mult_f16.c +++ b/Source/MatrixFunctions/arm_mat_mult_f16.c @@ -50,7 +50,7 @@ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) __STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve( const arm_matrix_instance_f16 *pSrcA, diff --git a/Source/MatrixFunctions/arm_mat_mult_q7.c b/Source/MatrixFunctions/arm_mat_mult_q7.c index 58708896..ba2267ec 100755 --- a/Source/MatrixFunctions/arm_mat_mult_q7.c +++ b/Source/MatrixFunctions/arm_mat_mult_q7.c @@ -79,8 +79,8 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve( vecA0 = vldrbq_s8(pInA0); vecA1 = vldrbq_s8(pInA1); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -91,8 +91,8 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve( vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -108,7 +108,7 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve( const arm_matrix_instance_q7 * pSrcB, arm_matrix_instance_q7 * pDst) { - const uint32_t MATRIX_DIM = 3; + const uint8_t MATRIX_DIM = 3; q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */ q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */ q7_t *pOut = pDst->pData; /* output data matrix pointer */ @@ -131,9 +131,9 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve( vecA1 = vldrbq_s8(pInA1); vecA2 = vldrbq_s8(pInA2); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -145,9 +145,9 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve( vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -159,9 +159,9 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve( vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -202,10 +202,10 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve( vecA2 = vldrbq_s8(pInA2); vecA3 = vldrbq_s8(pInA3); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); - acc3 = vmladavq(vecA3, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); + acc3 = vmladavq_s8(vecA3, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -218,10 +218,10 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve( vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); - acc3 = vmladavq(vecA3, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); + acc3 = vmladavq_s8(vecA3, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -234,10 +234,10 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve( vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); - acc3 = vmladavq(vecA3, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); + acc3 = vmladavq_s8(vecA3, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -250,10 +250,10 @@ __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve( vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0); - acc0 = vmladavq(vecA0, vecB); - acc1 = vmladavq(vecA1, vecB); - acc2 = vmladavq(vecA2, vecB); - acc3 = vmladavq(vecA3, vecB); + acc0 = vmladavq_s8(vecA0, vecB); + acc1 = vmladavq_s8(vecA1, vecB); + acc2 = vmladavq_s8(vecA2, vecB); + acc3 = vmladavq_s8(vecA3, vecB); pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8); pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8); @@ -385,16 +385,16 @@ arm_status arm_mat_mult_q7( { vecB = vld1q(pSrcBVec); pSrcBVec += 16; - acc0 = vmladavaq(acc0, vecA, vecB); + acc0 = vmladavaq_s8(acc0, vecA, vecB); vecA2 = vld1q(pSrcA2Vec); pSrcA2Vec += 16; - acc1 = vmladavaq(acc1, vecA2, vecB); + acc1 = vmladavaq_s8(acc1, vecA2, vecB); vecB2 = vld1q(pSrcB2Vec); pSrcB2Vec += 16; - acc2 = vmladavaq(acc2, vecA, vecB2); + acc2 = vmladavaq_s8(acc2, vecA, vecB2); vecA = vld1q(pSrcAVec); pSrcAVec += 16; - acc3 = vmladavaq(acc3, vecA2, vecB2); + acc3 = vmladavaq_s8(acc3, vecA2, vecB2); blkCnt--; } @@ -407,13 +407,13 @@ arm_status arm_mat_mult_q7( { mve_pred16_t p0 = vctp8q(blkCnt); vecB = vld1q(pSrcBVec); - acc0 = vmladavaq_p(acc0, vecA, vecB, p0); + acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0); vecA2 = vld1q(pSrcA2Vec); - acc1 = vmladavaq_p(acc1, vecA2, vecB, p0); + acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0); vecB2 = vld1q(pSrcB2Vec); - acc2 = vmladavaq_p(acc2, vecA, vecB2, p0); + acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0); vecA = vld1q(pSrcAVec); - acc3 = vmladavaq_p(acc3, vecA2, vecB2, p0); + acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0); } *px++ = (q7_t) __SSAT(acc0 >> 7, 8); @@ -473,7 +473,7 @@ arm_status arm_mat_mult_q7( pSrcAVec += 16; vecB = vld1q(pSrcBVec); pSrcBVec += 16; - acc0 = vmladavaq(acc0, vecA, vecB); + acc0 = vmladavaq_s8(acc0, vecA, vecB); blkCnt--; } @@ -487,7 +487,7 @@ arm_status arm_mat_mult_q7( mve_pred16_t p0 = vctp8q(blkCnt); vecA = vld1q(pSrcAVec); vecB = vld1q(pSrcBVec); - acc0 = vmladavaq_p(acc0, vecA, vecB, p0); + acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0); } *px = (q7_t) __SSAT(acc0 >> 7, 8); @@ -539,7 +539,7 @@ arm_status arm_mat_mult_q7( pSrcAVec += 16; vecB = vld1q(pSrcBVec); pSrcBVec += 16; - acc0 = vmladavaq(acc0, vecA, vecB); + acc0 = vmladavaq_s8(acc0, vecA, vecB); blkCnt--; } @@ -553,7 +553,7 @@ arm_status arm_mat_mult_q7( mve_pred16_t p0 = vctp8q(blkCnt); vecA = vld1q(pSrcAVec); vecB = vld1q(pSrcBVec); - acc0 = vmladavaq_p(acc0, vecA, vecB, p0); + acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0); } *px++ = (q7_t) __SSAT(acc0 >> 7, 8); diff --git a/Source/MatrixFunctions/arm_mat_scale_f16.c b/Source/MatrixFunctions/arm_mat_scale_f16.c index 73b94be1..8ca8dc72 100755 --- a/Source/MatrixFunctions/arm_mat_scale_f16.c +++ b/Source/MatrixFunctions/arm_mat_scale_f16.c @@ -50,7 +50,7 @@ - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) arm_status arm_mat_scale_f16( const arm_matrix_instance_f16 * pSrc, diff --git a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c index 9c02c477..7696be6b 100755 --- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c +++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c @@ -46,7 +46,7 @@ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c index f63d605c..4decb90e 100755 --- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c +++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c @@ -48,7 +48,7 @@ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/MatrixFunctions/arm_mat_sub_f16.c b/Source/MatrixFunctions/arm_mat_sub_f16.c index 2a4ad1bb..927a1388 100755 --- a/Source/MatrixFunctions/arm_mat_sub_f16.c +++ b/Source/MatrixFunctions/arm_mat_sub_f16.c @@ -50,7 +50,7 @@ - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) arm_status arm_mat_sub_f16( const arm_matrix_instance_f16 * pSrcA, diff --git a/Source/MatrixFunctions/arm_mat_trans_f16.c b/Source/MatrixFunctions/arm_mat_trans_f16.c index 1c17cd27..a3658495 100755 --- a/Source/MatrixFunctions/arm_mat_trans_f16.c +++ b/Source/MatrixFunctions/arm_mat_trans_f16.c @@ -48,7 +48,7 @@ - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/MatrixFunctions/arm_mat_vec_mult_f16.c b/Source/MatrixFunctions/arm_mat_vec_mult_f16.c index 8560c629..0c5e8b70 100755 --- a/Source/MatrixFunctions/arm_mat_vec_mult_f16.c +++ b/Source/MatrixFunctions/arm_mat_vec_mult_f16.c @@ -46,7 +46,7 @@ * @param[in] *pVec points to input vector * @param[out] *pDst points to output vector */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/SVMFunctions/arm_svm_linear_predict_f16.c b/Source/SVMFunctions/arm_svm_linear_predict_f16.c index ff590c09..0b4bae3d 100755 --- a/Source/SVMFunctions/arm_svm_linear_predict_f16.c +++ b/Source/SVMFunctions/arm_svm_linear_predict_f16.c @@ -46,7 +46,7 @@ * @return none. * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c index 52715095..cfa3ba1b 100755 --- a/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c +++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c @@ -47,7 +47,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/SVMFunctions/arm_svm_rbf_predict_f16.c b/Source/SVMFunctions/arm_svm_rbf_predict_f16.c index 831c972b..c4e2ef0f 100755 --- a/Source/SVMFunctions/arm_svm_rbf_predict_f16.c +++ b/Source/SVMFunctions/arm_svm_rbf_predict_f16.c @@ -47,7 +47,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c b/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c index 4840ca34..df80cc4a 100755 --- a/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c +++ b/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c @@ -47,7 +47,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/StatisticsFunctions/arm_entropy_f16.c b/Source/StatisticsFunctions/arm_entropy_f16.c index 2b7cd141..3cfdfe65 100755 --- a/Source/StatisticsFunctions/arm_entropy_f16.c +++ b/Source/StatisticsFunctions/arm_entropy_f16.c @@ -57,7 +57,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/StatisticsFunctions/arm_kullback_leibler_f16.c b/Source/StatisticsFunctions/arm_kullback_leibler_f16.c index 2252a821..a0505c0b 100755 --- a/Source/StatisticsFunctions/arm_kullback_leibler_f16.c +++ b/Source/StatisticsFunctions/arm_kullback_leibler_f16.c @@ -64,7 +64,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/StatisticsFunctions/arm_logsumexp_f16.c b/Source/StatisticsFunctions/arm_logsumexp_f16.c index dba2671f..bad77062 100755 --- a/Source/StatisticsFunctions/arm_logsumexp_f16.c +++ b/Source/StatisticsFunctions/arm_logsumexp_f16.c @@ -62,7 +62,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" #include "arm_vec_math_f16.h" diff --git a/Source/StatisticsFunctions/arm_max_f16.c b/Source/StatisticsFunctions/arm_max_f16.c index 099f6960..c753ddc5 100755 --- a/Source/StatisticsFunctions/arm_max_f16.c +++ b/Source/StatisticsFunctions/arm_max_f16.c @@ -53,7 +53,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_max_f16( const float16_t * pSrc, diff --git a/Source/StatisticsFunctions/arm_max_no_idx_f16.c b/Source/StatisticsFunctions/arm_max_no_idx_f16.c index 88a6beec..583838ed 100755 --- a/Source/StatisticsFunctions/arm_max_no_idx_f16.c +++ b/Source/StatisticsFunctions/arm_max_no_idx_f16.c @@ -52,7 +52,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_max_no_idx_f16( const float16_t *pSrc, diff --git a/Source/StatisticsFunctions/arm_mean_f16.c b/Source/StatisticsFunctions/arm_mean_f16.c index a32f8712..2f8d5fd8 100755 --- a/Source/StatisticsFunctions/arm_mean_f16.c +++ b/Source/StatisticsFunctions/arm_mean_f16.c @@ -60,7 +60,7 @@ @param[out] pResult mean value returned here. @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/StatisticsFunctions/arm_min_f16.c b/Source/StatisticsFunctions/arm_min_f16.c index cb53cabb..70e808e9 100755 --- a/Source/StatisticsFunctions/arm_min_f16.c +++ b/Source/StatisticsFunctions/arm_min_f16.c @@ -54,7 +54,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_min_f16( const float16_t * pSrc, diff --git a/Source/StatisticsFunctions/arm_power_f16.c b/Source/StatisticsFunctions/arm_power_f16.c index 1d1a9959..39f3a025 100755 --- a/Source/StatisticsFunctions/arm_power_f16.c +++ b/Source/StatisticsFunctions/arm_power_f16.c @@ -49,7 +49,7 @@ @param[out] pResult sum of the squares value returned here @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/StatisticsFunctions/arm_rms_f16.c b/Source/StatisticsFunctions/arm_rms_f16.c index 63d5dcf5..d025f49e 100755 --- a/Source/StatisticsFunctions/arm_rms_f16.c +++ b/Source/StatisticsFunctions/arm_rms_f16.c @@ -61,7 +61,7 @@ @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_rms_f16( const float16_t * pSrc, diff --git a/Source/StatisticsFunctions/arm_var_f16.c b/Source/StatisticsFunctions/arm_var_f16.c index e3da09c3..f4c20820 100755 --- a/Source/StatisticsFunctions/arm_var_f16.c +++ b/Source/StatisticsFunctions/arm_var_f16.c @@ -49,7 +49,7 @@ @param[out] pResult variance value returned here @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/StatisticsFunctions/arm_var_q15.c b/Source/StatisticsFunctions/arm_var_q15.c index 903fe93b..742becca 100644 --- a/Source/StatisticsFunctions/arm_var_q15.c +++ b/Source/StatisticsFunctions/arm_var_q15.c @@ -81,8 +81,8 @@ void arm_var_q15( /* Compute Sum of squares of the input samples * and then store the result in a temporary variable, sumOfSquares. */ - sumOfSquares = vmlaldavaq(sumOfSquares, vecSrc, vecSrc); - sum = vaddvaq(sum, vecSrc); + sumOfSquares = vmlaldavaq_s16(sumOfSquares, vecSrc, vecSrc); + sum = vaddvaq_s16(sum, vecSrc); blkCnt --; pSrc += 8; diff --git a/Source/SupportFunctions/arm_barycenter_f16.c b/Source/SupportFunctions/arm_barycenter_f16.c index 14bcc864..9aac1261 100755 --- a/Source/SupportFunctions/arm_barycenter_f16.c +++ b/Source/SupportFunctions/arm_barycenter_f16.c @@ -60,7 +60,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_barycenter_f16(const float16_t *in, const float16_t *weights, diff --git a/Source/SupportFunctions/arm_copy_f16.c b/Source/SupportFunctions/arm_copy_f16.c index d5a02990..b36aec38 100755 --- a/Source/SupportFunctions/arm_copy_f16.c +++ b/Source/SupportFunctions/arm_copy_f16.c @@ -48,7 +48,7 @@ @param[in] blockSize number of samples in each vector @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_copy_f16( const float16_t * pSrc, diff --git a/Source/SupportFunctions/arm_f16_to_float.c b/Source/SupportFunctions/arm_f16_to_float.c index 8fec5ff6..29d32351 100755 --- a/Source/SupportFunctions/arm_f16_to_float.c +++ b/Source/SupportFunctions/arm_f16_to_float.c @@ -53,7 +53,7 @@ */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_f16_to_float( const float16_t * pSrc, diff --git a/Source/SupportFunctions/arm_f16_to_q15.c b/Source/SupportFunctions/arm_f16_to_q15.c index 2c04b65a..39b7f3e3 100755 --- a/Source/SupportFunctions/arm_f16_to_q15.c +++ b/Source/SupportFunctions/arm_f16_to_q15.c @@ -62,7 +62,7 @@ defined in the preprocessor section of project options. */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_f16_to_q15( const float16_t * pSrc, diff --git a/Source/SupportFunctions/arm_fill_f16.c b/Source/SupportFunctions/arm_fill_f16.c index 4f14785e..32750354 100755 --- a/Source/SupportFunctions/arm_fill_f16.c +++ b/Source/SupportFunctions/arm_fill_f16.c @@ -48,7 +48,7 @@ @param[in] blockSize number of samples in each vector @return none */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_fill_f16( float16_t value, diff --git a/Source/SupportFunctions/arm_float_to_f16.c b/Source/SupportFunctions/arm_float_to_f16.c index 170e8430..601cfff7 100755 --- a/Source/SupportFunctions/arm_float_to_f16.c +++ b/Source/SupportFunctions/arm_float_to_f16.c @@ -49,7 +49,7 @@ */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_float_to_f16( const float32_t * pSrc, diff --git a/Source/SupportFunctions/arm_q15_to_f16.c b/Source/SupportFunctions/arm_q15_to_f16.c index 8a8760a3..1fc46bbf 100755 --- a/Source/SupportFunctions/arm_q15_to_f16.c +++ b/Source/SupportFunctions/arm_q15_to_f16.c @@ -58,7 +58,7 @@ */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void arm_q15_to_f16( const q15_t * pSrc, diff --git a/Source/SupportFunctions/arm_weighted_sum_f16.c b/Source/SupportFunctions/arm_weighted_sum_f16.c index 2a9924b0..96025087 100755 --- a/Source/SupportFunctions/arm_weighted_sum_f16.c +++ b/Source/SupportFunctions/arm_weighted_sum_f16.c @@ -59,7 +59,7 @@ * */ -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) #include "arm_helium_utils.h" diff --git a/Source/TransformFunctions/arm_cfft_f16.c b/Source/TransformFunctions/arm_cfft_f16.c index 4250f749..cc3fefc2 100755 --- a/Source/TransformFunctions/arm_cfft_f16.c +++ b/Source/TransformFunctions/arm_cfft_f16.c @@ -272,8 +272,8 @@ static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float1 vecScGathAddr = vecScGathAddr + (uint32_t) pSrc; /* load scheduling */ - vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); - vecC = vldrwq_gather_base_f32(vecScGathAddr, 8); + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); blkCnt = (fftLen >> 4); while (blkCnt > 0U) @@ -281,27 +281,27 @@ static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float1 vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ - vecB = vldrwq_gather_base_f32(vecScGathAddr, 4); - vecD = vldrwq_gather_base_f32(vecScGathAddr, 12); + vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4); + vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12); vecSum1 = vecB + vecD; vecDiff1 = vecB - vecD; /* pre-load for next iteration */ - vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); - vecC = vldrwq_gather_base_f32(vecScGathAddr, 8); + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); vecTmp0 = vecSum0 + vecSum1; - vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0); vecTmp0 = vecSum0 - vecSum1; - vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0); vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); - vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0); vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); - vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0); blkCnt--; } @@ -480,8 +480,8 @@ static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * /* * load scheduling */ - vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); - vecC = vldrwq_gather_base_f32(vecScGathAddr, 8); + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); blkCnt = (fftLen >> 4); while (blkCnt > 0U) @@ -489,30 +489,30 @@ static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */ - vecB = vldrwq_gather_base_f32(vecScGathAddr, 4); - vecD = vldrwq_gather_base_f32(vecScGathAddr, 12); + vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4); + vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12); vecSum1 = vecB + vecD; vecDiff1 = vecB - vecD; - vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); - vecC = vldrwq_gather_base_f32(vecScGathAddr, 8); + vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64); + vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8); vecTmp0 = vecSum0 + vecSum1; vecTmp0 = vecTmp0 * onebyfftLen; - vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0); vecTmp0 = vecSum0 - vecSum1; vecTmp0 = vecTmp0 * onebyfftLen; - vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0); vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1); vecTmp0 = vecTmp0 * onebyfftLen; - vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0); vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1); vecTmp0 = vecTmp0 * onebyfftLen; - vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, vecTmp0); + vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0); blkCnt--; } diff --git a/Source/TransformFunctions/arm_rfft_fast_f16.c b/Source/TransformFunctions/arm_rfft_fast_f16.c index 155b0298..e66e9836 100755 --- a/Source/TransformFunctions/arm_rfft_fast_f16.c +++ b/Source/TransformFunctions/arm_rfft_fast_f16.c @@ -30,7 +30,7 @@ #if defined(ARM_FLOAT16_SUPPORTED) -#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) void stage_rfft_f16( const arm_rfft_fast_instance_f16 * S, diff --git a/Source/TransformFunctions/arm_rfft_q15.c b/Source/TransformFunctions/arm_rfft_q15.c index d2d34c16..3a610be6 100644 --- a/Source/TransformFunctions/arm_rfft_q15.c +++ b/Source/TransformFunctions/arm_rfft_q15.c @@ -138,6 +138,12 @@ void arm_rfft_q15( #include "arm_helium_utils.h" #include "arm_vec_fft.h" +#if defined(__CMSIS_GCC_H) +#define MVE_CMPLX_MULT_FX_AxB_S16(A,B) vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B) +#define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B) vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B) + +#endif + void arm_split_rfft_q15( q15_t * pSrc, uint32_t fftLen, @@ -157,7 +163,7 @@ void arm_split_rfft_q15( 0, 1, 0, 1, 0, 1, 0, 1 }; - offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8); + offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8); offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8)); /* Init coefficient pointers */ @@ -173,15 +179,19 @@ void arm_split_rfft_q15( i = fftLen - 1U; i = i / 4 + 1; while (i > 0U) { - q15x8_t in1 = vld1q(pIn1); + q15x8_t in1 = vld1q_s16(pIn1); q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn); q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef); q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef); - q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA), +#if defined(__CMSIS_GCC_H) + q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA), + MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2)); +#else + q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA), MVE_CMPLX_MULT_FX_AxConjB(coefB, in2)); - - vst1q(pOut1, out); +#endif + vst1q_s16(pOut1, out); pOut1 += 8; offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8); @@ -379,7 +389,7 @@ void arm_split_rifft_q15( 0, 1, 0, 1, 0, 1, 0, 1 }; - offsetCoef = vmulq(vld1q(offsetCoefArr), modifier) + vld1q(offsetCoefArr + 8); + offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8); offset = vaddq_n_u16(offset, (2 * fftLen - 6)); @@ -397,16 +407,16 @@ void arm_split_rifft_q15( i = i / 4; while (i > 0U) { - q15x8_t in1 = vld1q(pIn1); + q15x8_t in1 = vld1q_s16(pIn1); q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset); q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef); q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef); /* can we avoid the conjugate here ? */ - q15x8_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA), + q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA), vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB))); - vst1q(pDst, out); + vst1q_s16(pDst, out); pDst += 8; offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8); diff --git a/Source/TransformFunctions/arm_rfft_q31.c b/Source/TransformFunctions/arm_rfft_q31.c index a79d7e0a..c421d799 100644 --- a/Source/TransformFunctions/arm_rfft_q31.c +++ b/Source/TransformFunctions/arm_rfft_q31.c @@ -136,6 +136,13 @@ void arm_rfft_q31( #include "arm_helium_utils.h" #include "arm_vec_fft.h" +#if defined(__CMSIS_GCC_H) + +#define MVE_CMPLX_MULT_FX_AxB_S32(A,B) vqdmladhxq_s32(vqdmlsdhq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B) +#define MVE_CMPLX_MULT_FX_AxConjB_S32(A,B) vqdmladhq_s32(vqdmlsdhxq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B) + +#endif + void arm_split_rfft_q31( q31_t *pSrc, uint32_t fftLen, @@ -167,13 +174,15 @@ void arm_split_rfft_q31( i = fftLen - 1U; i = i / 2 + 1; while (i > 0U) { - q31x4_t in1 = vld1q(pIn1); + q31x4_t in1 = vld1q_s32(pIn1); q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset); q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef); q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef); - - q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2)); - +#if defined(__CMSIS_GCC_H) + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB_S32(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB_S32(coefB, in2)); +#else + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2)); +#endif vst1q(pOut1, out); pOut1 += 4; @@ -321,16 +330,20 @@ void arm_split_rifft_q31( i = fftLen; i = i >> 1; while (i > 0U) { - q31x4_t in1 = vld1q(pIn1); + q31x4_t in1 = vld1q_s32(pIn1); q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset); q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef); q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef); /* can we avoid the conjugate here ? */ - q31x4_t out = vhaddq(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA), - vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB))); - - vst1q(pDst, out); +#if defined(__CMSIS_GCC_H) + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB_S32(in1, coefA), + vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB_S32(in2, coefB))); +#else + q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA), + vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB))); +#endif + vst1q_s32(pDst, out); pDst += 4; offsetCoef += modifier * 4; diff --git a/Testing/Include/Tests/MISCF16.h b/Testing/Include/Tests/MISCF16.h index 4567c521..54ca86a5 100755 --- a/Testing/Include/Tests/MISCF16.h +++ b/Testing/Include/Tests/MISCF16.h @@ -1,8 +1,8 @@ +#include "dsp/filtering_functions_f16.h" + #include "Test.h" #include "Pattern.h" -#include "dsp/filtering_functions_f16.h" - class MISCF16:public Client::Suite { public: diff --git a/Testing/Source/Tests/FastMathF16.cpp b/Testing/Source/Tests/FastMathF16.cpp index ba91fc0f..b1993d27 100755 --- a/Testing/Source/Tests/FastMathF16.cpp +++ b/Testing/Source/Tests/FastMathF16.cpp @@ -1,7 +1,6 @@ #include "FastMathF16.h" #include #include "Error.h" -#include "arm_vec_math_f16.h" #include "Test.h" diff --git a/Testing/Source/Tests/FastMathF32.cpp b/Testing/Source/Tests/FastMathF32.cpp index b91b7463..e7dc4b7b 100755 --- a/Testing/Source/Tests/FastMathF32.cpp +++ b/Testing/Source/Tests/FastMathF32.cpp @@ -1,7 +1,9 @@ +#include "arm_vec_math.h" + #include "FastMathF32.h" #include + #include "Error.h" -#include "arm_vec_math.h" #include "Test.h" diff --git a/Testing/Source/Tests/MISCF16.cpp b/Testing/Source/Tests/MISCF16.cpp index 1c0773cc..65ead971 100755 --- a/Testing/Source/Tests/MISCF16.cpp +++ b/Testing/Source/Tests/MISCF16.cpp @@ -1,7 +1,6 @@ #include "MISCF16.h" #include #include "Error.h" -#include "arm_vec_math.h" #include "Test.h" #define SNR_THRESHOLD 60 diff --git a/Testing/Source/Tests/MISCF32.cpp b/Testing/Source/Tests/MISCF32.cpp index 2bf5b060..4039e093 100755 --- a/Testing/Source/Tests/MISCF32.cpp +++ b/Testing/Source/Tests/MISCF32.cpp @@ -1,7 +1,8 @@ +#include "arm_vec_math.h" + #include "MISCF32.h" #include #include "Error.h" -#include "arm_vec_math.h" #include "Test.h" #define SNR_THRESHOLD 120 diff --git a/Testing/Source/Tests/MISCQ15.cpp b/Testing/Source/Tests/MISCQ15.cpp index 19124ef0..d67e2ae4 100755 --- a/Testing/Source/Tests/MISCQ15.cpp +++ b/Testing/Source/Tests/MISCQ15.cpp @@ -1,7 +1,8 @@ +#include "arm_vec_math.h" + #include "MISCQ15.h" #include #include "Error.h" -#include "arm_vec_math.h" #include "Test.h" #define SNR_THRESHOLD 70 diff --git a/Testing/Source/Tests/MISCQ31.cpp b/Testing/Source/Tests/MISCQ31.cpp index a7cd54d9..70ab68a4 100755 --- a/Testing/Source/Tests/MISCQ31.cpp +++ b/Testing/Source/Tests/MISCQ31.cpp @@ -1,7 +1,8 @@ +#include "arm_vec_math.h" + #include "MISCQ31.h" #include #include "Error.h" -#include "arm_vec_math.h" #include "Test.h" #define SNR_THRESHOLD 100 diff --git a/Testing/Source/Tests/MISCQ7.cpp b/Testing/Source/Tests/MISCQ7.cpp index 71d0ebb6..f8fde510 100755 --- a/Testing/Source/Tests/MISCQ7.cpp +++ b/Testing/Source/Tests/MISCQ7.cpp @@ -1,7 +1,8 @@ +#include "arm_vec_math.h" + #include "MISCQ7.h" #include #include "Error.h" -#include "arm_vec_math.h" #include "Test.h" #define SNR_THRESHOLD 15