Enable f64 Neon optimisations only for aarch64

3 years ago · 3bf0f6e814
parent fceb49ed5d
commit 3bf0f6e814
24 changed files with 30 additions and 30 deletions
--- a/Source/BasicMathFunctions/arm_dot_prod_f64.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f64.c
@@ -45,7 +45,7 @@
  @param[out]    result     output result returned here.
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_dot_prod_f64(
    const float64_t * pSrcA,
    const float64_t * pSrcB,
--- a/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
+++ b/Source/DistanceFunctions/arm_chebyshev_distance_f64.c
@@ -52,7 +52,7 @@ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, ui
    float64_t diff=0.,  maxVal,tmpA, tmpB;
    uint32_t blkCnt;
    maxVal = F64_MIN;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t diffV , tmpAV , tmpBV , maxValV ;
    maxValV = vdupq_n_f64(maxVal);
    blkCnt = blockSize >> 1U ;
--- a/Source/DistanceFunctions/arm_cityblock_distance_f64.c
+++ b/Source/DistanceFunctions/arm_cityblock_distance_f64.c
@@ -51,7 +51,7 @@ float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, ui
    uint32_t blkCnt;
    accum = 0.;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t tmpAV, tmpBV,accumV , subV;
    accumV = vdupq_n_f64(0.0f);
    blkCnt = blockSize >> 1U;
--- a/Source/DistanceFunctions/arm_euclidean_distance_f64.c
+++ b/Source/DistanceFunctions/arm_euclidean_distance_f64.c
@@ -51,7 +51,7 @@ float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, ui
 {
    float64_t accum=0.,tmp;
    uint32_t blkCnt;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t accumV,tmpV , pAV ,pBV;
    accumV = vdupq_n_f64(0.0f);
    blkCnt = blockSize >> 1U;
--- a/Source/FastMathFunctions/arm_vexp_f64.c
+++ b/Source/FastMathFunctions/arm_vexp_f64.c
@@ -38,7 +38,7 @@ void arm_vexp_f64(
    uint32_t blockSize)
 {
    uint32_t blkCnt;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t src;
--- a/Source/FastMathFunctions/arm_vlog_f64.c
+++ b/Source/FastMathFunctions/arm_vlog_f64.c
@@ -39,7 +39,7 @@ void arm_vlog_f64(
    uint32_t blockSize)
 {
    uint32_t blkCnt;
-#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
    float64x2_t src;
    float64x2_t dst;
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
@@ -134,7 +134,7 @@
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_biquad_cascade_df2T_f64(
    const arm_biquad_cascade_df2T_instance_f64 * S,
    const float64_t * pSrc,
--- a/Source/FilteringFunctions/arm_correlate_f64.c
+++ b/Source/FilteringFunctions/arm_correlate_f64.c
@@ -65,7 +65,7 @@ void arm_correlate_f64(
    uint32_t j, k, count, blkCnt;                  /* Loop counters */
    uint32_t outBlockSize;                         /* Loop counter */
    int32_t inc = 1;                               /* Destination address modifier */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t sumV,pxV,pyV ;
 #endif
@@ -167,7 +167,7 @@ void arm_correlate_f64(
    {
        /* Accumulator is made zero for every iteration */
        sum = 0.;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
        sumV = vdupq_n_f64(0.0f);
        k = count >> 1U ;
@@ -249,7 +249,7 @@ void arm_correlate_f64(
        {
            /* Accumulator is made zero for every iteration */
            sum = 0.;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
            sumV = vdupq_n_f64(0.0f);
            k = srcBLen >> 1U ;
            while(k > 0U)
@@ -305,7 +305,7 @@ void arm_correlate_f64(
        {
            /* Accumulator is made zero for every iteration */
            sum = 0.;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
            sumV = vdupq_n_f64(0.0f);
            k = srcBLen >> 1U ;
            while(k > 0U)
@@ -382,7 +382,7 @@ void arm_correlate_f64(
    {
        /* Accumulator is made zero for every iteration */
        sum = 0.;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
        sumV = vdupq_n_f64(0.0f);
        k = count >> 1U ;
--- a/Source/FilteringFunctions/arm_fir_f64.c
+++ b/Source/FilteringFunctions/arm_fir_f64.c
@@ -45,7 +45,7 @@
  @param[in]     blockSize  number of samples to process
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_fir_f64(
    const arm_fir_instance_f64 * S,
    const float64_t * pSrc,
--- a/Source/MatrixFunctions/arm_mat_cholesky_f64.c
+++ b/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -52,7 +52,7 @@
   * The decomposition of A is returning a lower triangular matrix L such that A = L L^t
   */
-#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
 arm_status arm_mat_cholesky_f64(
    const arm_matrix_instance_f64 * pSrc,
--- a/Source/MatrixFunctions/arm_mat_mult_f64.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f64.c
@@ -27,7 +27,7 @@
 */
 #include "dsp/matrix_functions.h"
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 #define GROUPOFROWS 8
 #endif
@@ -66,7 +66,7 @@
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 arm_status arm_mat_mult_f64(
  const arm_matrix_instance_f64 * pSrcA,
  const arm_matrix_instance_f64 * pSrcB,
--- a/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@@ -47,7 +47,7 @@
   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
   */
-#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
 arm_status arm_mat_solve_lower_triangular_f64(
    const arm_matrix_instance_f64 * lt,
    const arm_matrix_instance_f64 * a,
--- a/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
+++ b/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@@ -47,7 +47,7 @@
   * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
  */
-#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
 arm_status arm_mat_solve_upper_triangular_f64(
    const arm_matrix_instance_f64 * ut,
    const arm_matrix_instance_f64 * a,
--- a/Source/MatrixFunctions/arm_mat_trans_f64.c
+++ b/Source/MatrixFunctions/arm_mat_trans_f64.c
@@ -54,7 +54,7 @@
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 arm_status arm_mat_trans_f64(
    const arm_matrix_instance_f64 * pSrc,
--- a/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmax_no_idx_f64.c
@@ -47,7 +47,7 @@
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_absmax_no_idx_f64(
    const float64_t * pSrc,
    uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_absmin_no_idx_f64.c
@@ -45,7 +45,7 @@
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_absmin_no_idx_f64(
    const float64_t * pSrc,
    uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_entropy_f64.c
+++ b/Source/StatisticsFunctions/arm_entropy_f64.c
@@ -29,7 +29,7 @@
 #include "dsp/statistics_functions.h"
 #include <limits.h>
 #include <math.h>
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 #include "arm_vec_math.h"
 #endif
@@ -57,7 +57,7 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
    accum = 0.0;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t sumV ,pInV ;
    sumV = vdupq_n_f64(0.0f);
    blkCnt = blockSize >> 1U ;
--- a/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
+++ b/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
@@ -44,7 +44,7 @@
 * @return Kullback-Leibler divergence D(A || B)
 *
 */
-#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
 #include "NEMath.h"
--- a/Source/StatisticsFunctions/arm_max_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_max_no_idx_f64.c
@@ -45,7 +45,7 @@
  @param[out]    pResult    maximum value returned here
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_max_no_idx_f64(
    const float64_t * pSrc,
    uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_mean_f64.c
+++ b/Source/StatisticsFunctions/arm_mean_f64.c
@@ -46,7 +46,7 @@
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_mean_f64(
    const float64_t * pSrc,
--- a/Source/StatisticsFunctions/arm_min_no_idx_f64.c
+++ b/Source/StatisticsFunctions/arm_min_no_idx_f64.c
@@ -53,7 +53,7 @@ void arm_min_no_idx_f64(
    float64_t   minValue = F64_MAX;
    float64_t   newVal;
    uint32_t blkCnt ;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t minValueV , newValV ;
    minValueV = vdupq_n_f64(F64_MAX);
    blkCnt = blockSize >> 1U;
--- a/Source/StatisticsFunctions/arm_mse_f64.c
+++ b/Source/StatisticsFunctions/arm_mse_f64.c
@@ -57,7 +57,7 @@ void arm_mse_f64(
    uint32_t blkCnt;                               /* Loop counter */
    float64_t inA, inB;
    float64_t sum = 0.0;
-#if defined (ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t inAV , inBV , subV, sumV;
    sumV = vdupq_n_f64(0.0f);
--- a/Source/StatisticsFunctions/arm_power_f64.c
+++ b/Source/StatisticsFunctions/arm_power_f64.c
@@ -44,7 +44,7 @@
  @param[out]    pResult    sum of the squares value returned here
  @return        none
 */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
 void arm_power_f64(
    const float64_t * pSrc,
    uint32_t blockSize,
--- a/Source/StatisticsFunctions/arm_var_f64.c
+++ b/Source/StatisticsFunctions/arm_var_f64.c
@@ -62,7 +62,7 @@ void arm_var_f64(
        return;
    }
    arm_mean_f64(pInput, blockSize, &fMean);
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && defined(__aarch64__)
    float64x2_t fValueV ,fsumV , pInputV , fMeanV;
    fsumV = vdupq_n_f64(0.0f);
    fMeanV = vdupq_n_f64(fMean);