Enable f64 Neon optimisations only for aarch64

pull/39/head
Jean-Baptiste Rolland 3 years ago
parent fceb49ed5d
commit 3bf0f6e814

@ -45,7 +45,7 @@
@param[out] result output result returned here. @param[out] result output result returned here.
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_dot_prod_f64( void arm_dot_prod_f64(
const float64_t * pSrcA, const float64_t * pSrcA,
const float64_t * pSrcB, const float64_t * pSrcB,

@ -52,7 +52,7 @@ float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, ui
float64_t diff=0., maxVal,tmpA, tmpB; float64_t diff=0., maxVal,tmpA, tmpB;
uint32_t blkCnt; uint32_t blkCnt;
maxVal = F64_MIN; maxVal = F64_MIN;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t diffV , tmpAV , tmpBV , maxValV ; float64x2_t diffV , tmpAV , tmpBV , maxValV ;
maxValV = vdupq_n_f64(maxVal); maxValV = vdupq_n_f64(maxVal);
blkCnt = blockSize >> 1U ; blkCnt = blockSize >> 1U ;

@ -51,7 +51,7 @@ float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, ui
uint32_t blkCnt; uint32_t blkCnt;
accum = 0.; accum = 0.;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t tmpAV, tmpBV,accumV , subV; float64x2_t tmpAV, tmpBV,accumV , subV;
accumV = vdupq_n_f64(0.0f); accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U; blkCnt = blockSize >> 1U;

@ -51,7 +51,7 @@ float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, ui
{ {
float64_t accum=0.,tmp; float64_t accum=0.,tmp;
uint32_t blkCnt; uint32_t blkCnt;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t accumV,tmpV , pAV ,pBV; float64x2_t accumV,tmpV , pAV ,pBV;
accumV = vdupq_n_f64(0.0f); accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U; blkCnt = blockSize >> 1U;

@ -38,7 +38,7 @@ void arm_vexp_f64(
uint32_t blockSize) uint32_t blockSize)
{ {
uint32_t blkCnt; uint32_t blkCnt;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t src; float64x2_t src;

@ -39,7 +39,7 @@ void arm_vlog_f64(
uint32_t blockSize) uint32_t blockSize)
{ {
uint32_t blkCnt; uint32_t blkCnt;
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
float64x2_t src; float64x2_t src;
float64x2_t dst; float64x2_t dst;

@ -134,7 +134,7 @@
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_biquad_cascade_df2T_f64( void arm_biquad_cascade_df2T_f64(
const arm_biquad_cascade_df2T_instance_f64 * S, const arm_biquad_cascade_df2T_instance_f64 * S,
const float64_t * pSrc, const float64_t * pSrc,

@ -65,7 +65,7 @@ void arm_correlate_f64(
uint32_t j, k, count, blkCnt; /* Loop counters */ uint32_t j, k, count, blkCnt; /* Loop counters */
uint32_t outBlockSize; /* Loop counter */ uint32_t outBlockSize; /* Loop counter */
int32_t inc = 1; /* Destination address modifier */ int32_t inc = 1; /* Destination address modifier */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t sumV,pxV,pyV ; float64x2_t sumV,pxV,pyV ;
#endif #endif
@ -167,7 +167,7 @@ void arm_correlate_f64(
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.; sum = 0.;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
sumV = vdupq_n_f64(0.0f); sumV = vdupq_n_f64(0.0f);
k = count >> 1U ; k = count >> 1U ;
@ -249,7 +249,7 @@ void arm_correlate_f64(
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.; sum = 0.;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
sumV = vdupq_n_f64(0.0f); sumV = vdupq_n_f64(0.0f);
k = srcBLen >> 1U ; k = srcBLen >> 1U ;
while(k > 0U) while(k > 0U)
@ -305,7 +305,7 @@ void arm_correlate_f64(
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.; sum = 0.;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
sumV = vdupq_n_f64(0.0f); sumV = vdupq_n_f64(0.0f);
k = srcBLen >> 1U ; k = srcBLen >> 1U ;
while(k > 0U) while(k > 0U)
@ -382,7 +382,7 @@ void arm_correlate_f64(
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0.; sum = 0.;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
sumV = vdupq_n_f64(0.0f); sumV = vdupq_n_f64(0.0f);
k = count >> 1U ; k = count >> 1U ;

@ -45,7 +45,7 @@
@param[in] blockSize number of samples to process @param[in] blockSize number of samples to process
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_fir_f64( void arm_fir_f64(
const arm_fir_instance_f64 * S, const arm_fir_instance_f64 * S,
const float64_t * pSrc, const float64_t * pSrc,

@ -52,7 +52,7 @@
* The decomposition of A is returning a lower triangular matrix L such that A = L L^t * The decomposition of A is returning a lower triangular matrix L such that A = L L^t
*/ */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
arm_status arm_mat_cholesky_f64( arm_status arm_mat_cholesky_f64(
const arm_matrix_instance_f64 * pSrc, const arm_matrix_instance_f64 * pSrc,

@ -27,7 +27,7 @@
*/ */
#include "dsp/matrix_functions.h" #include "dsp/matrix_functions.h"
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
#define GROUPOFROWS 8 #define GROUPOFROWS 8
#endif #endif
@ -66,7 +66,7 @@
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
arm_status arm_mat_mult_f64( arm_status arm_mat_mult_f64(
const arm_matrix_instance_f64 * pSrcA, const arm_matrix_instance_f64 * pSrcA,
const arm_matrix_instance_f64 * pSrcB, const arm_matrix_instance_f64 * pSrcB,

@ -47,7 +47,7 @@
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/ */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
arm_status arm_mat_solve_lower_triangular_f64( arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt, const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a, const arm_matrix_instance_f64 * a,

@ -47,7 +47,7 @@
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved. * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/ */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
arm_status arm_mat_solve_upper_triangular_f64( arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut, const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a, const arm_matrix_instance_f64 * a,

@ -54,7 +54,7 @@
- \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
arm_status arm_mat_trans_f64( arm_status arm_mat_trans_f64(
const arm_matrix_instance_f64 * pSrc, const arm_matrix_instance_f64 * pSrc,

@ -47,7 +47,7 @@
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_absmax_no_idx_f64( void arm_absmax_no_idx_f64(
const float64_t * pSrc, const float64_t * pSrc,
uint32_t blockSize, uint32_t blockSize,

@ -45,7 +45,7 @@
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_absmin_no_idx_f64( void arm_absmin_no_idx_f64(
const float64_t * pSrc, const float64_t * pSrc,
uint32_t blockSize, uint32_t blockSize,

@ -29,7 +29,7 @@
#include "dsp/statistics_functions.h" #include "dsp/statistics_functions.h"
#include <limits.h> #include <limits.h>
#include <math.h> #include <math.h>
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
#include "arm_vec_math.h" #include "arm_vec_math.h"
#endif #endif
@ -57,7 +57,7 @@ float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
accum = 0.0; accum = 0.0;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t sumV ,pInV ; float64x2_t sumV ,pInV ;
sumV = vdupq_n_f64(0.0f); sumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U ; blkCnt = blockSize >> 1U ;

@ -44,7 +44,7 @@
* @return Kullback-Leibler divergence D(A || B) * @return Kullback-Leibler divergence D(A || B)
* *
*/ */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__aarch64__)
#include "NEMath.h" #include "NEMath.h"

@ -45,7 +45,7 @@
@param[out] pResult maximum value returned here @param[out] pResult maximum value returned here
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_max_no_idx_f64( void arm_max_no_idx_f64(
const float64_t * pSrc, const float64_t * pSrc,
uint32_t blockSize, uint32_t blockSize,

@ -46,7 +46,7 @@
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_mean_f64( void arm_mean_f64(
const float64_t * pSrc, const float64_t * pSrc,

@ -53,7 +53,7 @@ void arm_min_no_idx_f64(
float64_t minValue = F64_MAX; float64_t minValue = F64_MAX;
float64_t newVal; float64_t newVal;
uint32_t blkCnt ; uint32_t blkCnt ;
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t minValueV , newValV ; float64x2_t minValueV , newValV ;
minValueV = vdupq_n_f64(F64_MAX); minValueV = vdupq_n_f64(F64_MAX);
blkCnt = blockSize >> 1U; blkCnt = blockSize >> 1U;

@ -57,7 +57,7 @@ void arm_mse_f64(
uint32_t blkCnt; /* Loop counter */ uint32_t blkCnt; /* Loop counter */
float64_t inA, inB; float64_t inA, inB;
float64_t sum = 0.0; float64_t sum = 0.0;
#if defined (ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t inAV , inBV , subV, sumV; float64x2_t inAV , inBV , subV, sumV;
sumV = vdupq_n_f64(0.0f); sumV = vdupq_n_f64(0.0f);

@ -44,7 +44,7 @@
@param[out] pResult sum of the squares value returned here @param[out] pResult sum of the squares value returned here
@return none @return none
*/ */
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
void arm_power_f64( void arm_power_f64(
const float64_t * pSrc, const float64_t * pSrc,
uint32_t blockSize, uint32_t blockSize,

@ -62,7 +62,7 @@ void arm_var_f64(
return; return;
} }
arm_mean_f64(pInput, blockSize, &fMean); arm_mean_f64(pInput, blockSize, &fMean);
#if defined(ARM_MATH_NEON) #if defined(ARM_MATH_NEON) && defined(__aarch64__)
float64x2_t fValueV ,fsumV , pInputV , fMeanV; float64x2_t fValueV ,fsumV , pInputV , fMeanV;
fsumV = vdupq_n_f64(0.0f); fsumV = vdupq_n_f64(0.0f);
fMeanV = vdupq_n_f64(fMean); fMeanV = vdupq_n_f64(fMean);

Loading…
Cancel
Save