diff --git a/ComputeLibrary/Include/NEMath.h b/ComputeLibrary/Include/NEMath.h index 13ae8c4d..1eae4a0e 100755 --- a/ComputeLibrary/Include/NEMath.h +++ b/ComputeLibrary/Include/NEMath.h @@ -69,11 +69,11 @@ static inline float32x4_t vinvq_f32(float32x4_t x); /** Perform a 7th degree polynomial approximation using Estrin's method. * * @param[in] x Input vector value in F32 format. - * @param[in] coeffs Polynomial coefficients table. + * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors) * * @return The calculated approximation. */ -static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32x4_t *coeffs); +static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs); /** Calculate exponential * @@ -180,21 +180,21 @@ static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ /** Exponent polynomial coefficients */ -extern const float32x4_t exp_tab[8]; +extern const float32_t exp_tab[4*8]; /** Logarithm polynomial coefficients */ -extern const float32x4_t log_tab[8]; +extern const float32_t log_tab[4*8]; #ifndef DOXYGEN_SKIP_THIS inline float32x4_t vfloorq_f32(float32x4_t val) { - static const float32x4_t CONST_1 = {1.f,1.f,1.f,1.f}; + static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f}; const int32x4_t z = vcvtq_s32_f32(val); const float32x4_t r = vcvtq_f32_s32(z); - return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r); + return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, vld1q_f32(CONST_1)), r); } inline float32x2_t vinvsqrt_f32(float32x2_t x) @@ -231,12 +231,12 @@ inline float32x4_t vinvq_f32(float32x4_t x) return recip; } -inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32x4_t *coeffs) +inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs) { - float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x); - float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x); - float32x4_t C = vmlaq_f32(coeffs[1], coeffs[5], x); - float32x4_t D = vmlaq_f32(coeffs[3], coeffs[7], x); + float32x4_t A = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x); + float32x4_t B = vmlaq_f32(vld1q_f32(&coeffs[4*2]), vld1q_f32(&coeffs[4*6]), x); + float32x4_t C = vmlaq_f32(vld1q_f32(&coeffs[4*1]), vld1q_f32(&coeffs[4*5]), x); + float32x4_t D = vmlaq_f32(vld1q_f32(&coeffs[4*3]), vld1q_f32(&coeffs[4*7]), x); float32x4_t x2 = vmulq_f32(x, x); float32x4_t x4 = vmulq_f32(x2, x2); float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4); @@ -245,54 +245,54 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32x4_t *coeffs) inline float32x4_t vexpq_f32(float32x4_t x) { - static const float32x4_t CONST_LN2 = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2) - static const float32x4_t CONST_INV_LN2 = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2) - static const float32x4_t CONST_0 = {0.f,0.f,0.f,0.f}; - static const int32x4_t CONST_NEGATIVE_126 = {-126,-126,-126,-126}; + static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2) + static const float32_t CONST_INV_LN2[4] = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2) + static const float32_t CONST_0[4] = {0.f,0.f,0.f,0.f}; + static const int32_t CONST_NEGATIVE_126[4] = {-126,-126,-126,-126}; // Perform range reduction [-log(2),log(2)] - int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2)); - float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2); + int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(CONST_INV_LN2))); + float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2)); // Polynomial Approximation float32x4_t poly = vtaylor_polyq_f32(val, exp_tab); // Reconstruct poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23))); - poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); + poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly); return poly; } inline float32x4_t vlogq_f32(float32x4_t x) { - static const int32x4_t CONST_127 = {127,127,127,127}; // 127 - static const float32x4_t CONST_LN2 = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2) + static const int32_t CONST_127[4] = {127,127,127,127}; // 127 + static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2) // Extract exponent - int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127); + int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), vld1q_s32(CONST_127)); float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23))); // Polynomial Approximation float32x4_t poly = vtaylor_polyq_f32(val, log_tab); // Reconstruct - poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2); + poly = vmlaq_f32(poly, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2)); return poly; } inline float32x4_t vtanhq_f32(float32x4_t val) { - static const float32x4_t CONST_1 = {1.f,1.f,1.f,1.f}; - static const float32x4_t CONST_2 = {2.f,2.f,2.f,2.f}; - static const float32x4_t CONST_MIN_TANH = {-10.f,-10.f,-10.f,-10.f}; - static const float32x4_t CONST_MAX_TANH = {10.f,10.f,10.f,10.f}; - - float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH); - float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x)); - float32x4_t num = vsubq_f32(exp2x, CONST_1); - float32x4_t den = vaddq_f32(exp2x, CONST_1); + static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f}; + static const float32_t CONST_2[4] = {2.f,2.f,2.f,2.f}; + static const float32_t CONST_MIN_TANH[4] = {-10.f,-10.f,-10.f,-10.f}; + static const float32_t CONST_MAX_TANH[4] = {10.f,10.f,10.f,10.f}; + + float32x4_t x = vminq_f32(vmaxq_f32(val, vld1q_f32(CONST_MIN_TANH)), vld1q_f32(CONST_MAX_TANH)); + float32x4_t exp2x = vexpq_f32(vmulq_f32(vld1q_f32(CONST_2), x)); + float32x4_t num = vsubq_f32(exp2x, vld1q_f32(CONST_1)); + float32x4_t den = vaddq_f32(exp2x, vld1q_f32(CONST_1)); float32x4_t tanh = vmulq_f32(num, vinvq_f32(den)); return tanh; } @@ -309,12 +309,12 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) #ifndef DOXYGEN_SKIP_THIS inline float16x8_t vfloorq_f16(float16x8_t val) { - static const float16x8_t CONST_1 = {1.f,1.f,1.f,1.f}; + static const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f}; const int16x8_t z = vcvtq_s16_f16(val); const float16x8_t r = vcvtq_f16_s16(z); - return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r); + return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, vld1q_f16(CONST_1)), r); } inline float16x4_t vinvsqrt_f16(float16x4_t x) { @@ -350,15 +350,15 @@ inline float16x8_t vinvq_f16(float16x8_t x) inline float16x8_t vtanhq_f16(float16x8_t val) { - const float16x8_t CONST_1 = {1.f,1.f,1.f,1.f}; - const float16x8_t CONST_2 = {2.f,2.f,2.f,2.f}; - const float16x8_t CONST_MIN_TANH = {-10.f,-10.f,-10.f,-10.f}; - const float16x8_t CONST_MAX_TANH = {10.f,10.f,10.f,10.f}; - - const float16x8_t x = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH); - const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x)); - const float16x8_t num = vsubq_f16(exp2x, CONST_1); - const float16x8_t den = vaddq_f16(exp2x, CONST_1); + const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f}; + const float16_t CONST_2[8] = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f}; + const float16_t CONST_MIN_TANH[8] = {-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f}; + const float16_t CONST_MAX_TANH[8] = {10.f,10.f,10.f,10.f,10.f,10.f,10.f,10.f}; + + const float16x8_t x = vminq_f16(vmaxq_f16(val, vld1q_f16(CONST_MIN_TANH)), vld1q_f16(CONST_MAX_TANH)); + const float16x8_t exp2x = vexpq_f16(vmulq_f16(vld1q_f16(CONST_2), x)); + const float16x8_t num = vsubq_f16(exp2x, vld1q_f16(CONST_1)); + const float16x8_t den = vaddq_f16(exp2x, vld1q_f16(CONST_1)); const float16x8_t tanh = vmulq_f16(num, vinvq_f16(den)); return tanh; } @@ -411,4 +411,4 @@ inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n) #endif /* DOXYGEN_SKIP_THIS */ #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ #endif -#endif /* __ARM_COMPUTE_NEMATH_H__ */ \ No newline at end of file +#endif /* __ARM_COMPUTE_NEMATH_H__ */ diff --git a/ComputeLibrary/Source/arm_cl_tables.c b/ComputeLibrary/Source/arm_cl_tables.c index 53d548b4..bdd39a6b 100755 --- a/ComputeLibrary/Source/arm_cl_tables.c +++ b/ComputeLibrary/Source/arm_cl_tables.c @@ -27,29 +27,29 @@ #if defined(ARM_MATH_NEON) /** Exponent polynomial coefficients */ -const float32x4_t exp_tab[8] = +const float32_t exp_tab[4*8] = { - {1.f,1.f,1.f,1.f}, - {0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f}, - {0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f}, - {0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f}, - {1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f}, - {0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f}, - {0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f}, - {0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f} + 1.f,1.f,1.f,1.f, + 0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f, + 0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f, + 0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f, + 1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f, + 0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f, + 0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f, + 0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f }; /** Logarithm polynomial coefficients */ -const float32x4_t log_tab[8] = +const float32_t log_tab[4*8] = { - {-2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f}, - {-2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f}, - {-5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f}, - {-0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f}, - {5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f}, - {0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f}, - {4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f}, - {0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f}, + -2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f, + -2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f, + -5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f, + -0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f, + 5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f, + 0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f, + 4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f, + 0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f }; -#endif \ No newline at end of file +#endif diff --git a/Include/arm_math.h b/Include/arm_math.h index 50fa170a..ffd79a9e 100644 --- a/Include/arm_math.h +++ b/Include/arm_math.h @@ -93,8 +93,10 @@ * Toolchain Support * ------------ * - * The library has been developed and tested with MDK version 5.14.0.0 - * The library is being tested in GCC and IAR toolchains and updates on this activity will be made available shortly. + * The library is now tested on Fast Models building with cmake. + * Core M0, M7, A5 are tested. + * + * * * Building the Library * ------------ @@ -140,6 +142,23 @@ * of some DSP functions. Experimental Neon versions currently do not have better * performances than the scalar versions. * + * - ARM_MATH_HELIUM: + * + * It implies the flags ARM_MATH_MVEF and ARM_MATH_MVEI and ARM_MATH_FLOAT16. + * + * - ARM_MATH_MVEF: + * + * Select Helium versions of the f32 algorithms. + * It implies ARM_MATH_FLOAT16 and ARM_MATH_MVEI. + * + * - ARM_MATH_MVEI: + * + * Select Helium versions of the int and fixed point algorithms. + * + * - ARM_MATH_FLOAT16: + * + * Float16 implementations of some algorithms (Requires MVE extension). + * *
* CMSIS-DSP in ARM::CMSIS Pack * ----------------------------- diff --git a/Platforms/FVP/ARMCA5/LinkScripts/GCC/mem_ARMCA5.h b/Platforms/FVP/ARMCA5/LinkScripts/GCC/mem_ARMCA5.h index 8feba2ba..44a1b313 100644 --- a/Platforms/FVP/ARMCA5/LinkScripts/GCC/mem_ARMCA5.h +++ b/Platforms/FVP/ARMCA5/LinkScripts/GCC/mem_ARMCA5.h @@ -44,7 +44,7 @@ // *----------------------------------------------------------------------------*/ #define __ROM_BASE 0x80000000 -#define __ROM_SIZE 0x00200000 +#define __ROM_SIZE 0x00400000 /*--------------------- RAM Configuration ----------------------------------- // RAM Configuration @@ -68,7 +68,7 @@ // // *----------------------------------------------------------------------------*/ -#define __RAM_BASE 0x80200000 +#define __RAM_BASE 0x80400000 #define __RAM_SIZE 0x00300000 #define __RW_DATA_SIZE 0x00100000 @@ -94,7 +94,7 @@ // TTB Size (in Bytes) <0x0-0xFFFFFFFF:8> // *----------------------------------------------------------------------------*/ -#define __TTB_BASE 0x80500000 +#define __TTB_BASE 0x80800000 #define __TTB_SIZE 0x00005000 #endif /* __MEM_ARMCA5_H */ diff --git a/Platforms/FVP/ARMCA5/Startup/AC5/startup_ARMCA5.c b/Platforms/FVP/ARMCA5/Startup/AC5/startup_ARMCA5.c index 535a2005..84a122b4 100755 --- a/Platforms/FVP/ARMCA5/Startup/AC5/startup_ARMCA5.c +++ b/Platforms/FVP/ARMCA5/Startup/AC5/startup_ARMCA5.c @@ -41,8 +41,8 @@ /*---------------------------------------------------------------------------- Internal References *----------------------------------------------------------------------------*/ -void Vectors (void) __attribute__ ((naked, section("RESET"))); -void Reset_Handler (void) __attribute__ ((naked)); +void Vectors (void) __attribute__ ((section("RESET"))); +void Reset_Handler (void); /*---------------------------------------------------------------------------- Exception / Interrupt Handler @@ -59,14 +59,14 @@ void FIQ_Handler (void) __attribute__ ((weak, alias("Default_Handler"))); *----------------------------------------------------------------------------*/ void Vectors(void) { __ASM volatile( - "LDR PC, =Reset_Handler \n" - "LDR PC, =Undef_Handler \n" - "LDR PC, =SVC_Handler \n" - "LDR PC, =PAbt_Handler \n" - "LDR PC, =DAbt_Handler \n" + "LDR __current_pc, =Reset_Handler \n" + "LDR __current_pc, =Undef_Handler \n" + "LDR __current_pc, =SVC_Handler \n" + "LDR __current_pc, =PAbt_Handler \n" + "LDR __current_pc, =DAbt_Handler \n" "NOP \n" - "LDR PC, =IRQ_Handler \n" - "LDR PC, =FIQ_Handler \n" + "LDR __current_pc, =IRQ_Handler \n" + "LDR __current_pc, =FIQ_Handler \n" ); } diff --git a/Platforms/FVP/ARMCM0/LinkScripts/AC5/lnk.sct b/Platforms/FVP/ARMCM0/LinkScripts/AC5/lnk.sct index 37072231..18ee8b29 100755 --- a/Platforms/FVP/ARMCM0/LinkScripts/AC5/lnk.sct +++ b/Platforms/FVP/ARMCM0/LinkScripts/AC5/lnk.sct @@ -14,7 +14,7 @@ ; *----------------------------------------------------------------------------*/ #define __ROM_BASE 0x00000000 -#define __ROM_SIZE 0x00200000 +#define __ROM_SIZE 0x00300000 /*--------------------- Embedded RAM Configuration --------------------------- ; RAM Configuration diff --git a/Platforms/FVP/ARMCM0/LinkScripts/GCC/lnk.ld b/Platforms/FVP/ARMCM0/LinkScripts/GCC/lnk.ld new file mode 100755 index 00000000..d2485ad7 --- /dev/null +++ b/Platforms/FVP/ARMCM0/LinkScripts/GCC/lnk.ld @@ -0,0 +1,296 @@ +/****************************************************************************** + * @file gcc_arm.ld + * @brief GNU Linker Script for Cortex-M based device + * @version V2.0.0 + * @date 21. May 2019 + ******************************************************************************/ +/* + * Copyright (c) 2009-2019 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "mem_ARMCM0.h" +/* + *-------- <<< Use Configuration Wizard in Context Menu >>> ------------------- + */ + +/*---------------------- Flash Configuration ---------------------------------- + Flash Configuration + Flash Base Address <0x0-0xFFFFFFFF:8> + Flash Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__ROM_BASE = 0x00000000; +__ROM_SIZE = 0x00400000; + +/*--------------------- Embedded RAM Configuration ---------------------------- + RAM Configuration + RAM Base Address <0x0-0xFFFFFFFF:8> + RAM Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__RAM_BASE = 0x20000000; +__RAM_SIZE = 0x00300000; + +/*--------------------- Stack / Heap Configuration ---------------------------- + Stack / Heap Configuration + Stack Size (in Bytes) <0x0-0xFFFFFFFF:8> + Heap Size (in Bytes) <0x0-0xFFFFFFFF:8> + + -----------------------------------------------------------------------------*/ +__STACK_SIZE = STACK_SIZE; +__HEAP_SIZE = HEAP_SIZE; + +/* + *-------------------- <<< end of configuration section >>> ------------------- + */ + +MEMORY +{ + FLASH (rx) : ORIGIN = __ROM_BASE, LENGTH = __ROM_SIZE + RAM (rwx) : ORIGIN = __RAM_BASE, LENGTH = __RAM_SIZE +} + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. + * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.vectors)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + /* + * SG veneers: + * All SG veneers are placed in the special output section .gnu.sgstubs. Its start address + * must be set, either with the command line option ‘--section-start’ or in a linker script, + * to indicate where to place these veneers in memory. + */ +/* + .gnu.sgstubs : + { + . = ALIGN(32); + } > FLASH +*/ + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + LONG (__etext) + LONG (__data_start__) + LONG (__data_end__ - __data_start__) + /* Add each additional data section here */ +/* + LONG (__etext2) + LONG (__data2_start__) + LONG (__data2_end__ - __data2_start__) +*/ + __copy_table_end__ = .; + } > FLASH + + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + /* Add each additional bss section here */ +/* + LONG (__bss2_start__) + LONG (__bss2_end__ - __bss2_start__) +*/ + __zero_table_end__ = .; + } > FLASH + + /** + * Location counter can end up 2byte aligned with narrow Thumb code but + * __etext is assumed by startup code to be the LMA of a section in RAM + * which must be 4byte aligned + */ + __etext = ALIGN (4); + + .data : AT (__etext) + { + __data_start__ = .; + *(vtable) + *(.data) + *(.data.*) + + . = ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM + + /* + * Secondary data section, optional + * + * Remember to add each additional data section + * to the .copy.table above to asure proper + * initialization during startup. + */ +/* + __etext2 = ALIGN (4); + + .data2 : AT (__etext2) + { + . = ALIGN(4); + __data2_start__ = .; + *(.data2) + *(.data2.*) + . = ALIGN(4); + __data2_end__ = .; + + } > RAM2 +*/ + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM AT > RAM + + /* + * Secondary bss section, optional + * + * Remember to add each additional bss section + * to the .zero.table above to asure proper + * initialization during startup. + */ +/* + .bss2 : + { + . = ALIGN(4); + __bss2_start__ = .; + *(.bss2) + *(.bss2.*) + . = ALIGN(4); + __bss2_end__ = .; + } > RAM2 AT > RAM2 +*/ + + .heap (COPY) : + { + . = ALIGN(8); + __end__ = .; + __HeapBase = .; + PROVIDE(end = .); + . = . + __HEAP_SIZE; + . = ALIGN(8); + __HeapLimit = .; + } > RAM + + .stack (ORIGIN(RAM) + LENGTH(RAM) - __STACK_SIZE) (COPY) : + { + . = ALIGN(8); + __StackLimit = .; + . = . + __STACK_SIZE; + . = ALIGN(8); + __StackTop = .; + } > RAM + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/Platforms/FVP/ARMCM0/LinkScripts/GCC/mem_ARMCM0.h b/Platforms/FVP/ARMCM0/LinkScripts/GCC/mem_ARMCM0.h new file mode 100755 index 00000000..b25b6182 --- /dev/null +++ b/Platforms/FVP/ARMCM0/LinkScripts/GCC/mem_ARMCM0.h @@ -0,0 +1,38 @@ +/**************************************************************************//** + * @file mem_ARMCM0.h + * @brief Memory base and size definitions (used in scatter file) + * @version V1.1.0 + * @date 15. May 2019 + * + * @note + * + ******************************************************************************/ +/* + * Copyright (c) 2009-2019 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MEM_ARMCM0_H +#define __MEM_ARMCM0_H + + + +#define STACK_SIZE 0x00003000 +#define HEAP_SIZE 0x00100000 + + + +#endif /* __MEM_ARMCM7_H */ diff --git a/Platforms/FVP/ARMCM0/Startup/GCC/startup_ARMCM0.S b/Platforms/FVP/ARMCM0/Startup/GCC/startup_ARMCM0.S new file mode 100755 index 00000000..7bbd2dd6 --- /dev/null +++ b/Platforms/FVP/ARMCM0/Startup/GCC/startup_ARMCM0.S @@ -0,0 +1,179 @@ +/**************************************************************************//** + * @file startup_ARMCM0.S + * @brief CMSIS-Core(M) Device Startup File for Cortex-M0 Device + * @version V2.0.1 + * @date 23. July 2019 + ******************************************************************************/ +/* + * Copyright (c) 2009-2019 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .syntax unified + .arch armv6-m + + .section .vectors + .align 2 + .globl __Vectors + .globl __Vectors_End + .globl __Vectors_Size +__Vectors: + .long __StackTop /* Top of Stack */ + .long Reset_Handler /* Reset Handler */ + .long NMI_Handler /* -14 NMI Handler */ + .long HardFault_Handler /* -13 Hard Fault Handler */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long SVC_Handler /* -5 SVCall Handler */ + .long 0 /* Reserved */ + .long 0 /* Reserved */ + .long PendSV_Handler /* -2 PendSV Handler */ + .long SysTick_Handler /* -1 SysTick Handler */ + + /* Interrupts */ + .long Interrupt0_Handler /* 0 Interrupt 0 */ + .long Interrupt1_Handler /* 1 Interrupt 1 */ + .long Interrupt2_Handler /* 2 Interrupt 2 */ + .long Interrupt3_Handler /* 3 Interrupt 3 */ + .long Interrupt4_Handler /* 4 Interrupt 4 */ + .long Interrupt5_Handler /* 5 Interrupt 5 */ + .long Interrupt6_Handler /* 6 Interrupt 6 */ + .long Interrupt7_Handler /* 7 Interrupt 7 */ + .long Interrupt8_Handler /* 8 Interrupt 8 */ + .long Interrupt9_Handler /* 9 Interrupt 9 */ + + .space ( 22 * 4) /* Interrupts 10 .. 31 are left out */ +__Vectors_End: + .equ __Vectors_Size, __Vectors_End - __Vectors + .size __Vectors, . - __Vectors + + + .thumb + .section .text + .align 2 + + .thumb_func + .type Reset_Handler, %function + .globl Reset_Handler + .fnstart +Reset_Handler: + bl SystemInit + + ldr r4, =__copy_table_start__ + ldr r5, =__copy_table_end__ + +.L_loop0: + cmp r4, r5 + bge .L_loop0_done + ldr r1, [r4] + ldr r2, [r4, #4] + ldr r3, [r4, #8] + +.L_loop0_0: + subs r3, #4 + blt .L_loop0_0_done + ldr r0, [r1, r3] + str r0, [r2, r3] + b .L_loop0_0 + +.L_loop0_0_done: + adds r4, #12 + b .L_loop0 + +.L_loop0_done: + + ldr r3, =__zero_table_start__ + ldr r4, =__zero_table_end__ + +.L_loop2: + cmp r3, r4 + bge .L_loop2_done + ldr r1, [r3] + ldr r2, [r3, #4] + movs r0, 0 + +.L_loop2_0: + subs r2, #4 + blt .L_loop2_0_done + str r0, [r1, r2] + b .L_loop2_0 +.L_loop2_0_done: + + adds r3, #8 + b .L_loop2 +.L_loop2_done: + + bl _start + + .fnend + .size Reset_Handler, . - Reset_Handler + +/* The default macro is not used for HardFault_Handler + * because this results in a poor debug illusion. + */ + .thumb_func + .type HardFault_Handler, %function + .weak HardFault_Handler + .fnstart +HardFault_Handler: + b . + .fnend + .size HardFault_Handler, . - HardFault_Handler + + .thumb_func + .type Default_Handler, %function + .weak Default_Handler + .fnstart +Default_Handler: + b . + .fnend + .size Default_Handler, . - Default_Handler + +/* Macro to define default exception/interrupt handlers. + * Default handler are weak symbols with an endless loop. + * They can be overwritten by real handlers. + */ + .macro Set_Default_Handler Handler_Name + .weak \Handler_Name + .set \Handler_Name, Default_Handler + .endm + + +/* Default exception/interrupt handler */ + + Set_Default_Handler NMI_Handler + Set_Default_Handler SVC_Handler + Set_Default_Handler PendSV_Handler + Set_Default_Handler SysTick_Handler + + Set_Default_Handler Interrupt0_Handler + Set_Default_Handler Interrupt1_Handler + Set_Default_Handler Interrupt2_Handler + Set_Default_Handler Interrupt3_Handler + Set_Default_Handler Interrupt4_Handler + Set_Default_Handler Interrupt5_Handler + Set_Default_Handler Interrupt6_Handler + Set_Default_Handler Interrupt7_Handler + Set_Default_Handler Interrupt8_Handler + Set_Default_Handler Interrupt9_Handler + + + .end diff --git a/Platforms/FVP/ARMCM0/Startup/GCC/support.c b/Platforms/FVP/ARMCM0/Startup/GCC/support.c new file mode 100755 index 00000000..740f6b08 --- /dev/null +++ b/Platforms/FVP/ARMCM0/Startup/GCC/support.c @@ -0,0 +1,36 @@ + +#ifdef __cplusplus +extern "C" +{ +#endif + +char * _sbrk(int incr); + +void __malloc_lock() ; +void __malloc_unlock(); + +char __HeapBase, __HeapLimit; // make sure to define these symbols in linker command file +#ifdef __cplusplus +} +#endif + +static int totalBytesProvidedBySBRK = 0; + +//! sbrk/_sbrk version supporting reentrant newlib (depends upon above symbols defined by linker control file). +char * sbrk(int incr) { + static char *currentHeapEnd = &__HeapBase; + char *previousHeapEnd = currentHeapEnd; + if (currentHeapEnd + incr > &__HeapLimit) { + return (char *)-1; // the malloc-family routine that called sbrk will return 0 + } + currentHeapEnd += incr; + + totalBytesProvidedBySBRK += incr; + + return (char *) previousHeapEnd; +} +//! Synonym for sbrk. +char * _sbrk(int incr) { return sbrk(incr); }; + +void __malloc_lock() { }; +void __malloc_unlock() { }; \ No newline at end of file diff --git a/Platforms/FVP/ARMCM7/LinkScripts/AC5/lnk.sct b/Platforms/FVP/ARMCM7/LinkScripts/AC5/lnk.sct index bc03375e..60b878c9 100755 --- a/Platforms/FVP/ARMCM7/LinkScripts/AC5/lnk.sct +++ b/Platforms/FVP/ARMCM7/LinkScripts/AC5/lnk.sct @@ -14,7 +14,7 @@ ; *----------------------------------------------------------------------------*/ #define __ROM_BASE 0x00000000 -#define __ROM_SIZE 0x00200000 +#define __ROM_SIZE 0x00300000 /*--------------------- Embedded RAM Configuration --------------------------- ; RAM Configuration diff --git a/Platforms/FVP/ARMCM7/LinkScripts/GCC/lnk.ld b/Platforms/FVP/ARMCM7/LinkScripts/GCC/lnk.ld index dd8398d9..5b1c56a5 100644 --- a/Platforms/FVP/ARMCM7/LinkScripts/GCC/lnk.ld +++ b/Platforms/FVP/ARMCM7/LinkScripts/GCC/lnk.ld @@ -33,7 +33,7 @@ -----------------------------------------------------------------------------*/ __ROM_BASE = 0x00000000; -__ROM_SIZE = 0x00300000; +__ROM_SIZE = 0x00400000; /*--------------------- Embedded RAM Configuration ---------------------------- RAM Configuration diff --git a/Source/BasicMathFunctions/arm_dot_prod_f32.c b/Source/BasicMathFunctions/arm_dot_prod_f32.c index ef0e2aa0..2bce15bf 100644 --- a/Source/BasicMathFunctions/arm_dot_prod_f32.c +++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c @@ -131,7 +131,8 @@ void arm_dot_prod_f32( #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) f32x4_t vec1; f32x4_t vec2; - f32x4_t accum = vdupq_n_f32(0); + f32x4_t accum = vdupq_n_f32(0); + f32x2_t tmp = vdup_n_f32(0); /* Compute 4 outputs at a time */ blkCnt = blockSize >> 2U; @@ -160,7 +161,9 @@ void arm_dot_prod_f32( #if __aarch64__ sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum))); #else - sum = (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[0] + (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[1]; + tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)); + sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1); + #endif /* Tail */ diff --git a/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c b/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c index 243a76e6..27bd793f 100755 --- a/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c +++ b/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c @@ -228,10 +228,10 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins vecBlkCnt--; } tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV)); - tmp += tmpV2[0] + tmpV2[1]; - + tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1); + tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1)); - tmp1 += tmpV2[0] + tmpV2[1]; + tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1); vecBlkCnt = S->vectorDimension & 3; while(vecBlkCnt > 0) @@ -300,7 +300,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins vecBlkCnt--; } tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV)); - tmp += tmpV2[0] + tmpV2[1]; + tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1); vecBlkCnt = S->vectorDimension & 3; while(vecBlkCnt > 0) diff --git a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c index 1f747ae9..4925f02c 100644 --- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c +++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c @@ -169,7 +169,7 @@ void arm_cmplx_dot_prod_f32( /* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */ /* Calculate dot product and then store the result in a temporary buffer. */ - vec1 = vld2q_f32(pSrcA); + vec1 = vld2q_f32(pSrcA); vec2 = vld2q_f32(pSrcB); /* Increment pointers */ @@ -204,10 +204,10 @@ void arm_cmplx_dot_prod_f32( } accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR)); - real_sum += accum[0] + accum[1]; + real_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI)); - imag_sum += accum[0] + accum[1]; + imag_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); /* Tail */ blkCnt = numSamples & 0x7; diff --git a/Source/DistanceFunctions/arm_boolean_distance_template.h b/Source/DistanceFunctions/arm_boolean_distance_template.h index f54adecf..6888e9e0 100755 --- a/Source/DistanceFunctions/arm_boolean_distance_template.h +++ b/Source/DistanceFunctions/arm_boolean_distance_template.h @@ -351,16 +351,16 @@ void FUNC(EXT)(const uint32_t *pA } #ifdef TT - _ctt += tmp4tt[0] + tmp4tt[1]; + _ctt += vgetq_lane_u64(tmp4tt, 0) + vgetq_lane_u64(tmp4tt, 1); #endif #ifdef FF - _cff += tmp4ff[0] + tmp4ff[1]; + _cff +=vgetq_lane_u64(tmp4ff, 0) + vgetq_lane_u64(tmp4ff, 1); #endif #ifdef TF - _ctf += tmp4tf[0] + tmp4tf[1]; + _ctf += vgetq_lane_u64(tmp4tf, 0) + vgetq_lane_u64(tmp4tf, 1); #endif #ifdef FT - _cft += tmp4ft[0] + tmp4ft[1]; + _cft += vgetq_lane_u64(tmp4ft, 0) + vgetq_lane_u64(tmp4ft, 1); #endif nbBoolBlock = numberOfBools & 0x7F; diff --git a/Source/DistanceFunctions/arm_braycurtis_distance_f32.c b/Source/DistanceFunctions/arm_braycurtis_distance_f32.c index 154827ed..38c9d18e 100755 --- a/Source/DistanceFunctions/arm_braycurtis_distance_f32.c +++ b/Source/DistanceFunctions/arm_braycurtis_distance_f32.c @@ -144,10 +144,10 @@ float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, u blkCnt --; } accumV2 = vpadd_f32(vget_low_f32(accumDiffV),vget_high_f32(accumDiffV)); - accumDiff = accumV2[0] + accumV2[1]; + accumDiff = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); accumV2 = vpadd_f32(vget_low_f32(accumSumV),vget_high_f32(accumSumV)); - accumSum = accumV2[0] + accumV2[1]; + accumSum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Source/DistanceFunctions/arm_canberra_distance_f32.c b/Source/DistanceFunctions/arm_canberra_distance_f32.c index 219d2bba..f71b74d3 100755 --- a/Source/DistanceFunctions/arm_canberra_distance_f32.c +++ b/Source/DistanceFunctions/arm_canberra_distance_f32.c @@ -135,7 +135,7 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin uint32_t blkCnt; float32x4_t a,b,c,accumV; float32x2_t accumV2; - int32x4_t isZeroV; + uint32x4_t isZeroV; float32x4_t zeroV = vdupq_n_f32(0.0f); accumV = vdupq_n_f32(0.0f); @@ -162,7 +162,7 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin * Force result of a division by 0 to 0. It the behavior of the * sklearn canberra function. */ - a = vbicq_s32(a,isZeroV); + a = vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(a),vreinterpretq_s32_u32(isZeroV))); c = vmulq_f32(c,a); accumV = vaddq_f32(accumV,c); @@ -171,7 +171,7 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin blkCnt --; } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); - accum = accumV2[0] + accumV2[1]; + accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); blkCnt = blockSize & 3; diff --git a/Source/DistanceFunctions/arm_chebyshev_distance_f32.c b/Source/DistanceFunctions/arm_chebyshev_distance_f32.c index 349158ac..f6a47365 100755 --- a/Source/DistanceFunctions/arm_chebyshev_distance_f32.c +++ b/Source/DistanceFunctions/arm_chebyshev_distance_f32.c @@ -158,7 +158,7 @@ float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, ui } maxValV2 = vpmax_f32(vget_low_f32(maxValV),vget_high_f32(maxValV)); maxValV2 = vpmax_f32(maxValV2,maxValV2); - maxVal = maxValV2[0]; + maxVal = vget_lane_f32(maxValV2,0); blkCnt = blockSize & 3; diff --git a/Source/DistanceFunctions/arm_cityblock_distance_f32.c b/Source/DistanceFunctions/arm_cityblock_distance_f32.c index 88173fa8..8dce697c 100755 --- a/Source/DistanceFunctions/arm_cityblock_distance_f32.c +++ b/Source/DistanceFunctions/arm_cityblock_distance_f32.c @@ -115,7 +115,7 @@ float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, ui } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); accumV2 = vpadd_f32(accumV2,accumV2); - accum = accumV2[0]; + accum = vget_lane_f32(accumV2,0); blkCnt = blockSize & 3; diff --git a/Source/DistanceFunctions/arm_euclidean_distance_f32.c b/Source/DistanceFunctions/arm_euclidean_distance_f32.c index af59ffbe..34df6d7d 100755 --- a/Source/DistanceFunctions/arm_euclidean_distance_f32.c +++ b/Source/DistanceFunctions/arm_euclidean_distance_f32.c @@ -114,7 +114,7 @@ float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, ui blkCnt --; } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); - accum = accumV2[0] + accumV2[1]; + accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c b/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c index 3c3a4fac..6673e472 100755 --- a/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c +++ b/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c @@ -168,7 +168,7 @@ float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); - accum = accumV2[0] + accumV2[1]; + accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Source/DistanceFunctions/arm_minkowski_distance_f32.c b/Source/DistanceFunctions/arm_minkowski_distance_f32.c index bf30fe55..3d550fb4 100755 --- a/Source/DistanceFunctions/arm_minkowski_distance_f32.c +++ b/Source/DistanceFunctions/arm_minkowski_distance_f32.c @@ -126,7 +126,7 @@ float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, in } sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); - sum = sumV2[0] + sumV2[1]; + sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1); blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c b/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c index c9745914..0abc1659 100644 --- a/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c @@ -370,19 +370,20 @@ void arm_biquad_cascade_df1_f32( while (stage > 0U) { /* Reading the coefficients */ - Xn = vdupq_n_f32(0.0); - Xn[2] = pState[0]; - Xn[3] = pState[1]; - Yn[0] = pState[2]; - Yn[1] = pState[3]; + Xn = vdupq_n_f32(0.0f); + Xn = vsetq_lane_f32(pState[0],Xn,2); + Xn = vsetq_lane_f32(pState[1],Xn,3); + Yn = vset_lane_f32(pState[2],Yn,0); + Yn = vset_lane_f32(pState[3],Yn,1); + b = vld1q_f32(pCoeffs); b = vrev64q_f32(b); b = vcombine_f32(vget_high_f32(b), vget_low_f32(b)); a = vld1_f32(pCoeffs + 3); a = vrev64_f32(a); - b[0] = 0.0; + b = vsetq_lane_f32(0.0f,b,0); pCoeffs += 5; /* Reading the pState values */ @@ -460,7 +461,11 @@ void arm_biquad_cascade_df1_f32( Xns = *pIn++; /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */ - acc = (b[1] * Xn[2]) + (b[2] * Xn[3]) + (b[3] * Xns) + (a[0] * Yn[0]) + (a[1] * Yn[1]); + acc = (vgetq_lane_f32(b, 1) * vgetq_lane_f32(Xn, 2)) + + (vgetq_lane_f32(b, 2) * vgetq_lane_f32(Xn, 3)) + + (vgetq_lane_f32(b, 3) * Xns) + + (vget_lane_f32(a, 0) * vget_lane_f32(Yn, 0)) + + (vget_lane_f32(a, 1) * vget_lane_f32(Yn, 1)); /* Store the result in the accumulator in the destination buffer. */ *pOut++ = acc; @@ -471,10 +476,10 @@ void arm_biquad_cascade_df1_f32( /* Xn1 = Xn */ /* Yn2 = Yn1 */ /* Yn1 = acc */ - Xn[2] = Xn[3]; - Xn[3] = Xns; - Yn[0] = Yn[1]; - Yn[1] = acc; + Xn = vsetq_lane_f32(vgetq_lane_f32(Xn, 3),Xn,2); + Xn = vsetq_lane_f32(Xns,Xn,3); + Yn = vset_lane_f32(vget_lane_f32(Yn, 1),Yn,0); + Yn = vset_lane_f32(acc,Yn,1); /* Decrement the loop counter */ sample--; diff --git a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c index 574dc28b..46454dad 100644 --- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c +++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c @@ -268,7 +268,7 @@ void arm_biquad_cascade_df2T_f32( dV.val[1] = vmulq_f32(s, b2V); dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V); - *pOut++ = YnV[3]; + *pOut++ = vgetq_lane_f32(YnV, 3) ; sample--; } diff --git a/Source/FilteringFunctions/arm_conv_f32.c b/Source/FilteringFunctions/arm_conv_f32.c index ea64b80f..f217f1ed 100644 --- a/Source/FilteringFunctions/arm_conv_f32.c +++ b/Source/FilteringFunctions/arm_conv_f32.c @@ -263,7 +263,8 @@ void arm_conv_f32( uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */ uint32_t j, k, count, blkCnt; /* Loop counters */ -#if defined (ARM_MATH_LOOPUNROLL) + +#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) float32_t acc0, acc1, acc2, acc3, c0; /* Accumulators */ #if !defined(ARM_MATH_NEON) float32_t x0, x1, x2, x3; /* Temporary variables to hold state and coefficient values */ diff --git a/Source/FilteringFunctions/arm_conv_partial_f32.c b/Source/FilteringFunctions/arm_conv_partial_f32.c index 179b6dd0..34c77f24 100644 --- a/Source/FilteringFunctions/arm_conv_partial_f32.c +++ b/Source/FilteringFunctions/arm_conv_partial_f32.c @@ -142,7 +142,7 @@ arm_status arm_conv_partial_f32( blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; - blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0; blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex); blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; diff --git a/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/Source/FilteringFunctions/arm_conv_partial_fast_q31.c index 9d934f65..fe2e2bbb 100644 --- a/Source/FilteringFunctions/arm_conv_partial_fast_q31.c +++ b/Source/FilteringFunctions/arm_conv_partial_fast_q31.c @@ -118,7 +118,7 @@ arm_status arm_conv_partial_fast_q31( blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; - blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0; blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex); blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; diff --git a/Source/FilteringFunctions/arm_conv_partial_q15.c b/Source/FilteringFunctions/arm_conv_partial_q15.c index b8a965be..eb246417 100644 --- a/Source/FilteringFunctions/arm_conv_partial_q15.c +++ b/Source/FilteringFunctions/arm_conv_partial_q15.c @@ -119,7 +119,7 @@ arm_status arm_conv_partial_q15( blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; - blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0; blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex); blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; diff --git a/Source/FilteringFunctions/arm_conv_partial_q31.c b/Source/FilteringFunctions/arm_conv_partial_q31.c index 388ef6e7..cfb1ad01 100644 --- a/Source/FilteringFunctions/arm_conv_partial_q31.c +++ b/Source/FilteringFunctions/arm_conv_partial_q31.c @@ -121,7 +121,7 @@ arm_status arm_conv_partial_q31( blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; - blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0; blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex); blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; diff --git a/Source/FilteringFunctions/arm_conv_partial_q7.c b/Source/FilteringFunctions/arm_conv_partial_q7.c index 1123591c..bcfc51eb 100644 --- a/Source/FilteringFunctions/arm_conv_partial_q7.c +++ b/Source/FilteringFunctions/arm_conv_partial_q7.c @@ -123,7 +123,7 @@ arm_status arm_conv_partial_q7( blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3; blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; - blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0; + blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0; blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex); blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; diff --git a/Source/FilteringFunctions/arm_correlate_f32.c b/Source/FilteringFunctions/arm_correlate_f32.c index 3a8aa00e..4dd5bb93 100644 --- a/Source/FilteringFunctions/arm_correlate_f32.c +++ b/Source/FilteringFunctions/arm_correlate_f32.c @@ -320,7 +320,7 @@ void arm_correlate_f32( uint32_t outBlockSize; /* Loop counter */ int32_t inc = 1; /* Destination address modifier */ -#if defined (ARM_MATH_LOOPUNROLL) +#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) float32_t acc0, acc1, acc2, acc3,c0; /* Accumulators */ #if !defined(ARM_MATH_NEON) float32_t x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */ diff --git a/Source/FilteringFunctions/arm_fir_decimate_f32.c b/Source/FilteringFunctions/arm_fir_decimate_f32.c index 6ebbe49e..7d445dda 100644 --- a/Source/FilteringFunctions/arm_fir_decimate_f32.c +++ b/Source/FilteringFunctions/arm_fir_decimate_f32.c @@ -219,16 +219,16 @@ void arm_fir_decimate_f32( } temp = vpadd_f32(vget_low_f32(acc0v),vget_high_f32(acc0v)); - accv[0] = temp[0] + temp[1]; + accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,0); temp = vpadd_f32(vget_low_f32(acc1v),vget_high_f32(acc1v)); - accv[1] = temp[0] + temp[1]; + accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,1); temp = vpadd_f32(vget_low_f32(acc2v),vget_high_f32(acc2v)); - accv[2] = temp[0] + temp[1]; + accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,2); temp = vpadd_f32(vget_low_f32(acc3v),vget_high_f32(acc3v)); - accv[3] = temp[0] + temp[1]; + accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,3); /* If the filter length is not a multiple of 4, compute the remaining filter taps */ tapCnt = numTaps % 0x4U; @@ -245,10 +245,10 @@ void arm_fir_decimate_f32( x3 = *(px3++); /* Perform the multiply-accumulate */ - accv[0] += x0 * c0; - accv[1] += x1 * c0; - accv[2] += x2 * c0; - accv[3] += x3 * c0; + accv = vsetq_lane_f32(vgetq_lane_f32(accv, 0) + x0 * c0,accv,0); + accv = vsetq_lane_f32(vgetq_lane_f32(accv, 1) + x1 * c0,accv,1); + accv = vsetq_lane_f32(vgetq_lane_f32(accv, 2) + x2 * c0,accv,2); + accv = vsetq_lane_f32(vgetq_lane_f32(accv, 3) + x3 * c0,accv,3); /* Decrement the loop counter */ tapCnt--; @@ -306,7 +306,7 @@ void arm_fir_decimate_f32( } temp = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v)); - sum0 = temp[0] + temp[1]; + sum0 = vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1); /* If the filter length is not a multiple of 4, compute the remaining filter taps */ tapCnt = numTaps % 0x4U; diff --git a/Source/FilteringFunctions/arm_fir_f32.c b/Source/FilteringFunctions/arm_fir_f32.c index d20a9b7d..2f9632e5 100644 --- a/Source/FilteringFunctions/arm_fir_f32.c +++ b/Source/FilteringFunctions/arm_fir_f32.c @@ -767,26 +767,26 @@ uint32_t blockSize) b = vld1q_f32(pb); xa = x0; xb = x1; - accv0 = vmlaq_n_f32(accv0,xa,b[0]); - accv1 = vmlaq_n_f32(accv1,xb,b[0]); + accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0)); + accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0)); xa = vextq_f32(x0,x1,1); xb = vextq_f32(x1,x2,1); - - accv0 = vmlaq_n_f32(accv0,xa,b[1]); - accv1 = vmlaq_n_f32(accv1,xb,b[1]); + + accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1)); + accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1)); - xa = vextq_f32(x0,x1,2); + xa = vextq_f32(x0,x1,2); xb = vextq_f32(x1,x2,2); - accv0 = vmlaq_n_f32(accv0,xa,b[2]); - accv1 = vmlaq_n_f32(accv1,xb,b[2]); + accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2)); + accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2)); - xa = vextq_f32(x0,x1,3); - xb = vextq_f32(x1,x2,3); + xa = vextq_f32(x0,x1,3); + xb = vextq_f32(x1,x2,3); - accv0 = vmlaq_n_f32(accv0,xa,b[3]); - accv1 = vmlaq_n_f32(accv1,xb,b[3]); + accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3)); + accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3)); pb += 4; x0 = x1; @@ -810,8 +810,8 @@ uint32_t blockSize) pb++; - xa = vextq_f32(x0,x1,1); - xb = vextq_f32(x1,x2,1); + xa = vextq_f32(x0,x1,1); + xb = vextq_f32(x1,x2,1); accv0 = vmlaq_n_f32(accv0,xa,*pb); accv1 = vmlaq_n_f32(accv1,xb,*pb); @@ -821,7 +821,7 @@ uint32_t blockSize) xa = vextq_f32(x0,x1,2); xb = vextq_f32(x1,x2,2); - accv0 = vmlaq_n_f32(accv0,xa,*pb); + accv0 = vmlaq_n_f32(accv0,xa,*pb); accv1 = vmlaq_n_f32(accv1,xb,*pb); } @@ -836,7 +836,7 @@ uint32_t blockSize) xa = vextq_f32(x0,x1,1); xb = vextq_f32(x1,x2,1); - accv0 = vmlaq_n_f32(accv0,xa,*pb); + accv0 = vmlaq_n_f32(accv0,xa,*pb); accv1 = vmlaq_n_f32(accv1,xb,*pb); } diff --git a/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/Source/FilteringFunctions/arm_fir_interpolate_f32.c index 23ba08d7..726549cb 100644 --- a/Source/FilteringFunctions/arm_fir_interpolate_f32.c +++ b/Source/FilteringFunctions/arm_fir_interpolate_f32.c @@ -314,15 +314,15 @@ void arm_fir_interpolate_f32( } /* The result is in the accumulator, store in the destination buffer. */ - *pDst = accV0[0]; - *(pDst + S->L) = accV0[1]; - *(pDst + 2 * S->L) = accV0[2]; - *(pDst + 3 * S->L) = accV0[3]; + *pDst = vgetq_lane_f32(accV0, 0); + *(pDst + S->L) = vgetq_lane_f32(accV0, 1); + *(pDst + 2 * S->L) = vgetq_lane_f32(accV0, 2); + *(pDst + 3 * S->L) = vgetq_lane_f32(accV0, 3); - *(pDst + 4 * S->L) = accV1[0]; - *(pDst + 5 * S->L) = accV1[1]; - *(pDst + 6 * S->L) = accV1[2]; - *(pDst + 7 * S->L) = accV1[3]; + *(pDst + 4 * S->L) = vgetq_lane_f32(accV1, 0); + *(pDst + 5 * S->L) = vgetq_lane_f32(accV1, 1); + *(pDst + 6 * S->L) = vgetq_lane_f32(accV1, 2); + *(pDst + 7 * S->L) = vgetq_lane_f32(accV1, 3); pDst++; @@ -375,7 +375,7 @@ void arm_fir_interpolate_f32( while (tapCnt > 0U) { /* Read the coefficient */ - x1v[0] = *(ptr2); + x1v = vsetq_lane_f32(*(ptr2),x1v,0); /* Upsampling is done by stuffing L-1 zeros between each sample. * So instead of multiplying zeros with coefficients, @@ -387,19 +387,19 @@ void arm_fir_interpolate_f32( ptr1 += 4; /* Read the coefficient */ - x1v[1] = *(ptr2); + x1v = vsetq_lane_f32(*(ptr2),x1v,1); /* Increment the coefficient pointer by interpolation factor times. */ ptr2 += S->L; /* Read the coefficient */ - x1v[2] = *(ptr2); + x1v = vsetq_lane_f32(*(ptr2),x1v,2); /* Increment the coefficient pointer by interpolation factor times. */ ptr2 += S->L; /* Read the coefficient */ - x1v[3] = *(ptr2); + x1v = vsetq_lane_f32(*(ptr2),x1v,3); /* Increment the coefficient pointer by interpolation factor times. */ ptr2 += S->L; @@ -411,7 +411,7 @@ void arm_fir_interpolate_f32( } tempV = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v)); - sum0 = tempV[0] + tempV[1]; + sum0 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1); /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */ tapCnt = phaseLen % 0x4U; diff --git a/Source/FilteringFunctions/arm_lms_f32.c b/Source/FilteringFunctions/arm_lms_f32.c index 4fc6e7e2..4a863594 100644 --- a/Source/FilteringFunctions/arm_lms_f32.c +++ b/Source/FilteringFunctions/arm_lms_f32.c @@ -225,7 +225,7 @@ void arm_lms_f32( tapCnt--; } tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); - sum = tempV2[0] + tempV2[1]; + sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1); /* If the filter length is not a multiple of 4, compute the remaining filter taps */ diff --git a/Source/FilteringFunctions/arm_lms_norm_f32.c b/Source/FilteringFunctions/arm_lms_norm_f32.c index f62494d0..693d45f8 100644 --- a/Source/FilteringFunctions/arm_lms_norm_f32.c +++ b/Source/FilteringFunctions/arm_lms_norm_f32.c @@ -233,7 +233,7 @@ void arm_lms_norm_f32( tapCnt--; } tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); - sum = tempV2[0] + tempV2[1]; + sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1); /* If the filter length is not a multiple of 4, compute the remaining filter taps */ tapCnt = numTaps % 0x4U; diff --git a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c index e65bb42e..1b79eabe 100644 --- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c +++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c @@ -938,20 +938,21 @@ arm_status arm_mat_cmplx_mult_f32( pIn1 += 8; pIn1B += 8; - tempR[0] = *pIn2; - tempI[0] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,0); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0); pIn2 += 2 * numColsB; - tempR[1] = *pIn2; - tempI[1] = *(pIn2 + 1U); + + tempR = vsetq_lane_f32(*pIn2,tempR,1); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1); pIn2 += 2 * numColsB; - tempR[2] = *pIn2; - tempI[2] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,2); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2); pIn2 += 2 * numColsB; - tempR[3] = *pIn2; - tempI[3] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,3); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3); pIn2 += 2 * numColsB; accR0 = vmlaq_f32(accR0,a0V.val[0],tempR); @@ -971,16 +972,16 @@ arm_status arm_mat_cmplx_mult_f32( } accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0)); - sumReal1 += accum[0] + accum[1]; + sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0)); - sumImag1 += accum[0] + accum[1]; + sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1)); - sumReal1B += accum[0] + accum[1]; + sumReal1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1)); - sumImag1B += accum[0] + accum[1]; + sumImag1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. ** No loop unrolling is used. */ @@ -1088,20 +1089,20 @@ arm_status arm_mat_cmplx_mult_f32( a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2) pIn1 += 8; - tempR[0] = *pIn2; - tempI[0] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,0); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0); pIn2 += 2 * numColsB; - tempR[1] = *pIn2; - tempI[1] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,1); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1); pIn2 += 2 * numColsB; - tempR[2] = *pIn2; - tempI[2] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,2); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2); pIn2 += 2 * numColsB; - tempR[3] = *pIn2; - tempI[3] = *(pIn2 + 1U); + tempR = vsetq_lane_f32(*pIn2,tempR,3); + tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3); pIn2 += 2 * numColsB; accR0 = vmlaq_f32(accR0,a0V.val[0],tempR); @@ -1115,10 +1116,10 @@ arm_status arm_mat_cmplx_mult_f32( } accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0)); - sumReal1 += accum[0] + accum[1]; + sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0)); - sumImag1 += accum[0] + accum[1]; + sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. ** No loop unrolling is used. */ diff --git a/Source/MatrixFunctions/arm_mat_inverse_f32.c b/Source/MatrixFunctions/arm_mat_inverse_f32.c index ca19f97b..7e7ca80e 100644 --- a/Source/MatrixFunctions/arm_mat_inverse_f32.c +++ b/Source/MatrixFunctions/arm_mat_inverse_f32.c @@ -774,7 +774,7 @@ arm_status arm_mat_inverse_f32( /* Pivot element of the row */ in = *pPivotRowIn; - tmpV = vdupq_n_f32(1.0/in); + tmpV = vdupq_n_f32(1.0f/in); /* Loop over number of columns * to the right of the pilot element */ diff --git a/Source/MatrixFunctions/arm_mat_mult_f32.c b/Source/MatrixFunctions/arm_mat_mult_f32.c index 577c1f19..90b8e952 100644 --- a/Source/MatrixFunctions/arm_mat_mult_f32.c +++ b/Source/MatrixFunctions/arm_mat_mult_f32.c @@ -633,7 +633,7 @@ arm_status arm_mat_mult_f32( a6V = vld1q_f32(pIn1G); a7V = vld1q_f32(pIn1H); - pIn1 += 4; + pIn1 += 4; pIn1B += 4; pIn1C += 4; pIn1D += 4; @@ -642,13 +642,13 @@ arm_status arm_mat_mult_f32( pIn1G += 4; pIn1H += 4; - temp[0] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,0); pIn2 += numColsB; - temp[1] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,1); pIn2 += numColsB; - temp[2] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,2); pIn2 += numColsB; - temp[3] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,3); pIn2 += numColsB; acc0 = vmlaq_f32(acc0,a0V,temp); @@ -665,28 +665,28 @@ arm_status arm_mat_mult_f32( } accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); - sum0 += accum[0] + accum[1]; + sum0 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)); - sum1 += accum[0] + accum[1]; + sum1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)); - sum2 += accum[0] + accum[1]; + sum2 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)); - sum3 += accum[0] + accum[1]; + sum3 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4)); - sum4 += accum[0] + accum[1]; + sum4 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5)); - sum5 += accum[0] + accum[1]; + sum5 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6)); - sum6 += accum[0] + accum[1]; + sum6 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7)); - sum7 += accum[0] + accum[1]; + sum7 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. ** No loop unrolling is used. */ @@ -782,13 +782,13 @@ arm_status arm_mat_mult_f32( a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2) pIn1 += 4; - temp[0] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,0); pIn2 += numColsB; - temp[1] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,1); pIn2 += numColsB; - temp[2] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,2); pIn2 += numColsB; - temp[3] = *pIn2; + temp = vsetq_lane_f32(*pIn2,temp,3); pIn2 += numColsB; acc0 = vmlaq_f32(acc0,a0V,temp); @@ -798,7 +798,7 @@ arm_status arm_mat_mult_f32( } accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); - sum += accum[0] + accum[1]; + sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1); /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here. ** No loop unrolling is used. */ diff --git a/Source/SVMFunctions/arm_svm_linear_predict_f32.c b/Source/SVMFunctions/arm_svm_linear_predict_f32.c index d9c9ffd6..503ba16f 100755 --- a/Source/SVMFunctions/arm_svm_linear_predict_f32.c +++ b/Source/SVMFunctions/arm_svm_linear_predict_f32.c @@ -344,25 +344,25 @@ void arm_svm_linear_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); - dotV[0] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); - dotV[1] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); - dotV[2] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); - dotV[3] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); blkCnt = S->vectorDimension & 3; while (blkCnt > 0U) { - dotV[0] = dotV[0] + *pIn * *pSupporta++; - dotV[1] = dotV[1] + *pIn * *pSupportb++; - dotV[2] = dotV[2] + *pIn * *pSupportc++; - dotV[3] = dotV[3] + *pIn * *pSupportd++; + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); pIn++; @@ -374,7 +374,7 @@ void arm_svm_linear_predict_f32( accum = vmulq_f32(vec1,dotV); accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - sum += accum2[0] + accum2[1]; + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); pSupporta += 3*S->vectorDimension; pSupportb += 3*S->vectorDimension; @@ -406,7 +406,7 @@ void arm_svm_linear_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - dot = accum2[0] + accum2[1]; + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); blkCnt = S->vectorDimension & 3; diff --git a/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c b/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c index 703ab243..0873bff6 100755 --- a/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c +++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c @@ -370,25 +370,25 @@ void arm_svm_polynomial_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); - dotV[0] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); - dotV[1] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); - dotV[2] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); - dotV[3] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); blkCnt = S->vectorDimension & 3; while (blkCnt > 0U) { - dotV[0] = dotV[0] + *pIn * *pSupporta++; - dotV[1] = dotV[1] + *pIn * *pSupportb++; - dotV[2] = dotV[2] + *pIn * *pSupportc++; - dotV[3] = dotV[3] + *pIn * *pSupportd++; + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); pIn++; @@ -406,7 +406,7 @@ void arm_svm_polynomial_predict_f32( accum = vmulq_f32(vec1,dotV); accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - sum += accum2[0] + accum2[1]; + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); pSupporta += 3*S->vectorDimension; pSupportb += 3*S->vectorDimension; @@ -439,7 +439,7 @@ void arm_svm_polynomial_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - dot = accum2[0] + accum2[1]; + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); blkCnt = S->vectorDimension & 3; diff --git a/Source/SVMFunctions/arm_svm_rbf_predict_f32.c b/Source/SVMFunctions/arm_svm_rbf_predict_f32.c index a4fa0318..8625ef49 100755 --- a/Source/SVMFunctions/arm_svm_rbf_predict_f32.c +++ b/Source/SVMFunctions/arm_svm_rbf_predict_f32.c @@ -396,25 +396,26 @@ void arm_svm_rbf_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); - dotV[0] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); - dotV[1] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); - dotV[2] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); - dotV[3] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); blkCnt = S->vectorDimension & 3; while (blkCnt > 0U) { - dotV[0] = dotV[0] + SQ(*pIn - *pSupporta); - dotV[1] = dotV[1] + SQ(*pIn - *pSupportb); - dotV[2] = dotV[2] + SQ(*pIn - *pSupportc); - dotV[3] = dotV[3] + SQ(*pIn - *pSupportd); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + SQ(*pIn - *pSupporta), dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + SQ(*pIn - *pSupportb), dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + SQ(*pIn - *pSupportc), dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + SQ(*pIn - *pSupportd), dotV,3); + pSupporta++; pSupportb++; pSupportc++; @@ -434,7 +435,7 @@ void arm_svm_rbf_predict_f32( accum = vmulq_f32(vec1,dotV); accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - sum += accum2[0] + accum2[1]; + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); pSupporta += 3*S->vectorDimension; pSupportb += 3*S->vectorDimension; @@ -468,7 +469,7 @@ void arm_svm_rbf_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - dot = accum2[0] + accum2[1]; + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); blkCnt = S->vectorDimension & 3; diff --git a/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c b/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c index 9be478a0..f3b2f2f4 100755 --- a/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c +++ b/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c @@ -368,25 +368,25 @@ void arm_svm_sigmoid_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma)); - dotV[0] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0); accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb)); - dotV[1] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1); accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc)); - dotV[2] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2); accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd)); - dotV[3] = accum2[0] + accum2[1]; + dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3); blkCnt = S->vectorDimension & 3; while (blkCnt > 0U) { - dotV[0] = dotV[0] + *pIn * *pSupporta++; - dotV[1] = dotV[1] + *pIn * *pSupportb++; - dotV[2] = dotV[2] + *pIn * *pSupportc++; - dotV[3] = dotV[3] + *pIn * *pSupportd++; + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2); + dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3); pIn++; @@ -404,7 +404,7 @@ void arm_svm_sigmoid_predict_f32( accum = vmulq_f32(vec1,dotV); accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - sum += accum2[0] + accum2[1]; + sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); pSupporta += 3*S->vectorDimension; pSupportb += 3*S->vectorDimension; @@ -437,7 +437,7 @@ void arm_svm_sigmoid_predict_f32( blkCnt -- ; } accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum)); - dot = accum2[0] + accum2[1]; + dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1); blkCnt = S->vectorDimension & 3; diff --git a/Source/StatisticsFunctions/arm_entropy_f32.c b/Source/StatisticsFunctions/arm_entropy_f32.c index 9b0b02e9..01f7bc78 100755 --- a/Source/StatisticsFunctions/arm_entropy_f32.c +++ b/Source/StatisticsFunctions/arm_entropy_f32.c @@ -125,7 +125,8 @@ float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize) } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); - accum = accumV2[0] + accumV2[1]; + accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); + blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Source/StatisticsFunctions/arm_kullback_leibler_f32.c b/Source/StatisticsFunctions/arm_kullback_leibler_f32.c index b2618621..c406e84d 100755 --- a/Source/StatisticsFunctions/arm_kullback_leibler_f32.c +++ b/Source/StatisticsFunctions/arm_kullback_leibler_f32.c @@ -142,7 +142,7 @@ float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSr } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); - accum = accumV2[0] + accumV2[1]; + accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Source/StatisticsFunctions/arm_logsumexp_f32.c b/Source/StatisticsFunctions/arm_logsumexp_f32.c index 2170ad99..03f778da 100755 --- a/Source/StatisticsFunctions/arm_logsumexp_f32.c +++ b/Source/StatisticsFunctions/arm_logsumexp_f32.c @@ -173,7 +173,7 @@ float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize) accumV2 = vpmax_f32(vget_low_f32(maxValV),vget_high_f32(maxValV)); accumV2 = vpmax_f32(accumV2,accumV2); - maxVal = accumV2[0]; + maxVal = vget_lane_f32(accumV2, 0) ; blkCnt = (blockSize - 4) & 3; @@ -211,7 +211,7 @@ float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize) } accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV)); - accum = accumV2[0] + accumV2[1]; + accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1); blkCnt = blockSize & 0x3; while(blkCnt > 0) diff --git a/Source/StatisticsFunctions/arm_max_f32.c b/Source/StatisticsFunctions/arm_max_f32.c index 18520cdf..677ad1dc 100644 --- a/Source/StatisticsFunctions/arm_max_f32.c +++ b/Source/StatisticsFunctions/arm_max_f32.c @@ -150,20 +150,26 @@ void arm_max_f32( uint32_t * pIndex) { float32_t maxVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ + uint32_t blkCnt, outIndex; /* loop counter */ float32x4_t outV, srcV; float32x2_t outV2; uint32x4_t idxV; - uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX}; - uint32x4_t index={4,5,6,7}; - uint32x4_t delta={4,4,4,4}; - uint32x4_t countV={0,1,2,3}; + uint32x4_t maxIdx; + static const uint32_t indexInit[4]={4,5,6,7}; + static const uint32_t countVInit[4]={0,1,2,3}; + + uint32x4_t index; + uint32x4_t delta; + uint32x4_t countV; uint32x2_t countV2; - /* Initialise the count value. */ - count = 0U; + maxIdx = vdupq_n_u32(ULONG_MAX); + delta = vdupq_n_u32(4); + index = vld1q_u32(indexInit); + countV = vld1q_u32(countVInit); + /* Initialise the index value to zero. */ outIndex = 0U; @@ -217,14 +223,14 @@ void arm_max_f32( outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV)); outV2 = vpmax_f32(outV2,outV2); - out = outV2[0]; + out = vget_lane_f32(outV2, 0); idxV = vceqq_f32(outV, vdupq_n_f32(out)); countV = vbslq_u32(idxV, countV,maxIdx); countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV)); countV2 = vpmin_u32(countV2,countV2); - outIndex = countV2[0]; + outIndex = vget_lane_u32(countV2,0); /* if (blockSize - 1U) is not multiple of 4 */ blkCnt = (blockSize - 4 ) % 4U; diff --git a/Source/StatisticsFunctions/arm_max_no_idx_f32.c b/Source/StatisticsFunctions/arm_max_no_idx_f32.c index 76fe44ef..ccb31c71 100755 --- a/Source/StatisticsFunctions/arm_max_no_idx_f32.c +++ b/Source/StatisticsFunctions/arm_max_no_idx_f32.c @@ -135,4 +135,4 @@ void arm_max_no_idx_f32( /** @} end of Max group - */ \ No newline at end of file + */ diff --git a/Source/StatisticsFunctions/arm_min_f32.c b/Source/StatisticsFunctions/arm_min_f32.c index 8b789d2d..7a4d2ab2 100644 --- a/Source/StatisticsFunctions/arm_min_f32.c +++ b/Source/StatisticsFunctions/arm_min_f32.c @@ -152,20 +152,24 @@ void arm_min_f32( uint32_t * pIndex) { float32_t maxVal1, out; /* Temporary variables to store the output value. */ - uint32_t blkCnt, outIndex, count; /* loop counter */ + uint32_t blkCnt, outIndex; /* loop counter */ float32x4_t outV, srcV; float32x2_t outV2; uint32x4_t idxV; - uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX}; - uint32x4_t index={4,5,6,7}; - uint32x4_t delta={4,4,4,4}; - uint32x4_t countV={0,1,2,3}; + static const uint32_t indexInit[4]={4,5,6,7}; + static const uint32_t countVInit[4]={0,1,2,3}; + uint32x4_t maxIdx; + uint32x4_t index; + uint32x4_t delta; + uint32x4_t countV; uint32x2_t countV2; - /* Initialise the count value. */ - count = 0U; + maxIdx = vdupq_n_u32(ULONG_MAX); + delta = vdupq_n_u32(4); + index = vld1q_u32(indexInit); + countV = vld1q_u32(countVInit); /* Initialise the index value to zero. */ outIndex = 0U; @@ -219,14 +223,14 @@ void arm_min_f32( outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV)); outV2 = vpmin_f32(outV2,outV2); - out = outV2[0]; + out = vget_lane_f32(outV2,0); idxV = vceqq_f32(outV, vdupq_n_f32(out)); countV = vbslq_u32(idxV, countV,maxIdx); countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV)); countV2 = vpmin_u32(countV2,countV2); - outIndex = countV2[0]; + outIndex = vget_lane_u32(countV2,0); /* if (blockSize - 1U) is not multiple of 4 */ blkCnt = (blockSize - 4 ) % 4U; diff --git a/Source/StatisticsFunctions/arm_power_f32.c b/Source/StatisticsFunctions/arm_power_f32.c index 3acbf2be..afe40cd4 100644 --- a/Source/StatisticsFunctions/arm_power_f32.c +++ b/Source/StatisticsFunctions/arm_power_f32.c @@ -136,7 +136,7 @@ void arm_power_f32( blkCnt--; } sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); - sum = sumV2[0] + sumV2[1]; + sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1); /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ diff --git a/Source/StatisticsFunctions/arm_rms_f32.c b/Source/StatisticsFunctions/arm_rms_f32.c index 118f5393..7f26aff2 100644 --- a/Source/StatisticsFunctions/arm_rms_f32.c +++ b/Source/StatisticsFunctions/arm_rms_f32.c @@ -103,7 +103,7 @@ void arm_rms_f32( } sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); - sum = sumV2[0] + sumV2[1]; + sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1); /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No loop unrolling is used. */ diff --git a/Source/SupportFunctions/arm_bitonic_sort_f32.c b/Source/SupportFunctions/arm_bitonic_sort_f32.c index 6f83e306..10267056 100644 --- a/Source/SupportFunctions/arm_bitonic_sort_f32.c +++ b/Source/SupportFunctions/arm_bitonic_sort_f32.c @@ -834,47 +834,41 @@ static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir) arm_bitonic_merge_128_f32(pSrc+128, dir); } +#define SWAP(a,i,j) \ + temp = vgetq_lane_f32(a, j); \ + a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\ + a = vsetq_lane_f32(temp, a, i); + static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir) { float32_t temp; - if( dir==(a[0]>a[1]) ) + + if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) ) { - temp = a[1]; - a[1] = a[0]; - a[0] = temp; + SWAP(a,0,1); } - if( dir==(a[2]>a[3]) ) + if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) ) { - temp = a[3]; - a[3] = a[2]; - a[2] = temp; + SWAP(a,2,3); } - if( dir==(a[0]>a[3]) ) + if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) ) { - temp = a[3]; - a[3] = a[0]; - a[0] = temp; + SWAP(a,0,3); } - if( dir==(a[1]>a[2]) ) + if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) ) { - temp = a[2]; - a[2] = a[1]; - a[1] = temp; + SWAP(a,1,2); } - if( dir==(a[0]>a[1]) ) + if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) ) { - temp = a[1]; - a[1] = a[0]; - a[0] = temp; + SWAP(a,0,1); } - if( dir==(a[2]>a[3]) ) + if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) ) { - temp = a[3]; - a[3] = a[2]; - a[2] = temp; + SWAP(a,2,3); } return a; diff --git a/Source/SupportFunctions/arm_spline_interp_f32.c b/Source/SupportFunctions/arm_spline_interp_f32.c index cfbf3763..3ec7c642 100644 --- a/Source/SupportFunctions/arm_spline_interp_f32.c +++ b/Source/SupportFunctions/arm_spline_interp_f32.c @@ -157,6 +157,11 @@ void arm_spline_f32( float32_t * pDst, uint32_t blockSize) { + /* + + As explained in arm_spline_interp_init_f32.c, this must be > 1 + + */ int32_t n = S->n_x; arm_spline_type type = S->type; @@ -169,6 +174,9 @@ void arm_spline_f32( float32_t bi, di; float32_t x_sc; + bi = 0.0f; + di = 0.0f; + const float32_t * pXq = xq; int32_t blkCnt = (int32_t)blockSize; diff --git a/Source/SupportFunctions/arm_spline_interp_init_f32.c b/Source/SupportFunctions/arm_spline_interp_init_f32.c index f9b5ee91..e014f201 100644 --- a/Source/SupportFunctions/arm_spline_interp_init_f32.c +++ b/Source/SupportFunctions/arm_spline_interp_init_f32.c @@ -40,7 +40,7 @@ /** * @brief Initialization function for the floating-point cubic spline interpolation. * @param[in,out] S points to an instance of the floating-point spline structure. - * @param[in] n number of known data points. + * @param[in] n number of known data points (must > 1). * @param[in] type type of cubic spline interpolation (boundary conditions) */ diff --git a/Source/SupportFunctions/arm_weighted_sum_f32.c b/Source/SupportFunctions/arm_weighted_sum_f32.c index 405c6c70..ac3d4819 100755 --- a/Source/SupportFunctions/arm_weighted_sum_f32.c +++ b/Source/SupportFunctions/arm_weighted_sum_f32.c @@ -136,10 +136,10 @@ float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uin } tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V)); - accum1 = tempV[0] + tempV[1]; + accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1); tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V)); - accum2 = tempV[0] + tempV[1]; + accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1); blkCnt = blockSize & 3; while(blkCnt > 0) diff --git a/Testing/FrameworkSource/Error.cpp b/Testing/FrameworkSource/Error.cpp index fedddc22..c621c8fd 100644 --- a/Testing/FrameworkSource/Error.cpp +++ b/Testing/FrameworkSource/Error.cpp @@ -25,9 +25,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "arm_math.h" -#include "Error.h" #include +#include +#include "Error.h" +#include "arm_math.h" namespace Client { diff --git a/Testing/FrameworkSource/IORunner.cpp b/Testing/FrameworkSource/IORunner.cpp index 4ebd0436..3c9e62e9 100644 --- a/Testing/FrameworkSource/IORunner.cpp +++ b/Testing/FrameworkSource/IORunner.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include "IORunner.h" #include "Error.h" #include "Timing.h" diff --git a/Testing/RTE_Components.h b/Testing/RTE_Components.h index db747a66..81b7e8dc 100644 --- a/Testing/RTE_Components.h +++ b/Testing/RTE_Components.h @@ -2,4 +2,4 @@ #define RTE_COMPONENTS_H -#endif /* RTE_COMPONENTS_H */ \ No newline at end of file +#endif /* RTE_COMPONENTS_H */ diff --git a/Testing/Source/Tests/BIQUADF32.cpp b/Testing/Source/Tests/BIQUADF32.cpp index a34b4d4a..7486ad8d 100755 --- a/Testing/Source/Tests/BIQUADF32.cpp +++ b/Testing/Source/Tests/BIQUADF32.cpp @@ -1,4 +1,5 @@ #include "BIQUADF32.h" +#include #include "Error.h" #define SNR_THRESHOLD 98 @@ -31,10 +32,7 @@ static __ALIGNED(8) float32_t coeffArray[32]; arm_biquad_mod_coef_f32 *coefsmodp = (arm_biquad_mod_coef_f32*)vecCoefs.ptr(); #endif - int i,j; int blockSize; - int numTaps; - int nb=0; @@ -48,7 +46,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; */ blockSize = inputs.nbSamples() >> 1; - numTaps = coefs.nbSamples(); /* @@ -107,10 +104,7 @@ static __ALIGNED(8) float32_t coeffArray[32]; const float32_t *inputp = inputs.ptr(); float32_t *outp = output.ptr(); - int i,j; int blockSize; - int numTaps; - int nb=0; @@ -124,7 +118,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; */ blockSize = inputs.nbSamples() >> 1; - numTaps = coefs.nbSamples(); /* @@ -189,7 +182,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; int blockSize; int numStages; - int nb=0; int i; @@ -267,7 +259,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; int blockSize; int numStages; - int nb=0; int i; @@ -350,7 +341,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; int blockSize; int numStages; - int nb=0; int i; @@ -410,8 +400,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; void BIQUADF32::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; - switch(id) { diff --git a/Testing/Source/Tests/BIQUADF64.cpp b/Testing/Source/Tests/BIQUADF64.cpp index 6faa3486..8c64ee0f 100755 --- a/Testing/Source/Tests/BIQUADF64.cpp +++ b/Testing/Source/Tests/BIQUADF64.cpp @@ -1,4 +1,5 @@ #include "BIQUADF64.h" +#include #include "Error.h" #define SNR_THRESHOLD 98 @@ -28,10 +29,7 @@ static __ALIGNED(8) float64_t coeffArray[64]; float64_t *inputp = inputs.ptr(); float64_t *outp = output.ptr(); - int i,j; int blockSize; - int numTaps; - int nb=0; @@ -45,7 +43,6 @@ static __ALIGNED(8) float64_t coeffArray[64]; */ blockSize = inputs.nbSamples() >> 1; - numTaps = coefs.nbSamples(); /* @@ -98,7 +95,6 @@ static __ALIGNED(8) float64_t coeffArray[64]; int blockSize; int numStages; - int nb=0; int i; @@ -158,8 +154,6 @@ static __ALIGNED(8) float64_t coeffArray[64]; void BIQUADF64::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; - switch(id) { diff --git a/Testing/Source/Tests/BIQUADQ15.cpp b/Testing/Source/Tests/BIQUADQ15.cpp index b0985284..5b2625d0 100755 --- a/Testing/Source/Tests/BIQUADQ15.cpp +++ b/Testing/Source/Tests/BIQUADQ15.cpp @@ -1,4 +1,5 @@ #include "BIQUADQ15.h" +#include #include "Error.h" @@ -26,10 +27,7 @@ static __ALIGNED(8) q15_t coeffArray[32]; const q15_t *inputp = inputs.ptr(); q15_t *outp = output.ptr(); - int i,j; int blockSize; - int numTaps; - int nb=0; @@ -43,7 +41,6 @@ static __ALIGNED(8) q15_t coeffArray[32]; */ blockSize = inputs.nbSamples() >> 1; - numTaps = coefs.nbSamples(); /* @@ -85,7 +82,6 @@ static __ALIGNED(8) q15_t coeffArray[32]; void BIQUADQ15::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; switch(id) diff --git a/Testing/Source/Tests/BIQUADQ31.cpp b/Testing/Source/Tests/BIQUADQ31.cpp index c9bf9a14..14dc4586 100755 --- a/Testing/Source/Tests/BIQUADQ31.cpp +++ b/Testing/Source/Tests/BIQUADQ31.cpp @@ -1,4 +1,5 @@ #include "BIQUADQ31.h" +#include #include "Error.h" @@ -24,10 +25,7 @@ static __ALIGNED(8) q31_t coeffArray[32]; const q31_t *inputp = inputs.ptr(); q31_t *outp = output.ptr(); - int i,j; int blockSize; - int numTaps; - int nb=0; @@ -41,7 +39,6 @@ static __ALIGNED(8) q31_t coeffArray[32]; */ blockSize = inputs.nbSamples() >> 1; - numTaps = coefs.nbSamples(); /* @@ -87,10 +84,7 @@ static __ALIGNED(8) q31_t coeffArray[32]; q31_t *inputp = inputs.ptr(); q31_t *outp = output.ptr(); - int i,j; int blockSize; - int numTaps; - int nb=0; @@ -104,7 +98,6 @@ static __ALIGNED(8) q31_t coeffArray[32]; */ blockSize = inputs.nbSamples() >> 1; - numTaps = coefs.nbSamples(); /* @@ -145,8 +138,6 @@ static __ALIGNED(8) q31_t coeffArray[32]; void BIQUADQ31::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; - switch(id) { diff --git a/Testing/Source/Tests/BasicTestsF32.cpp b/Testing/Source/Tests/BasicTestsF32.cpp index a447caec..a00dec56 100644 --- a/Testing/Source/Tests/BasicTestsF32.cpp +++ b/Testing/Source/Tests/BasicTestsF32.cpp @@ -1,4 +1,5 @@ #include "BasicTestsF32.h" +#include #include "Error.h" #define SNR_THRESHOLD 120 diff --git a/Testing/Source/Tests/BasicTestsQ15.cpp b/Testing/Source/Tests/BasicTestsQ15.cpp index df24ed71..9369dd6f 100755 --- a/Testing/Source/Tests/BasicTestsQ15.cpp +++ b/Testing/Source/Tests/BasicTestsQ15.cpp @@ -1,4 +1,5 @@ #include "BasicTestsQ15.h" +#include #include "Error.h" #define SNR_THRESHOLD 70 diff --git a/Testing/Source/Tests/BasicTestsQ31.cpp b/Testing/Source/Tests/BasicTestsQ31.cpp index cc322ed6..bae5d4d9 100755 --- a/Testing/Source/Tests/BasicTestsQ31.cpp +++ b/Testing/Source/Tests/BasicTestsQ31.cpp @@ -1,4 +1,5 @@ #include "BasicTestsQ31.h" +#include #include "Error.h" #define SNR_THRESHOLD 100 diff --git a/Testing/Source/Tests/BasicTestsQ7.cpp b/Testing/Source/Tests/BasicTestsQ7.cpp index 83cc0b79..8c6474dc 100755 --- a/Testing/Source/Tests/BasicTestsQ7.cpp +++ b/Testing/Source/Tests/BasicTestsQ7.cpp @@ -1,4 +1,5 @@ #include "BasicTestsQ7.h" +#include #include "Error.h" #define SNR_THRESHOLD 20 diff --git a/Testing/Source/Tests/BayesF32.cpp b/Testing/Source/Tests/BayesF32.cpp index 656d7e6a..bb109454 100755 --- a/Testing/Source/Tests/BayesF32.cpp +++ b/Testing/Source/Tests/BayesF32.cpp @@ -1,9 +1,9 @@ #include "BayesF32.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include void BayesF32::test_gaussian_naive_bayes_predict_f32() diff --git a/Testing/Source/Tests/BinaryTestsF32.cpp b/Testing/Source/Tests/BinaryTestsF32.cpp index 10a00f54..ef9bd85d 100755 --- a/Testing/Source/Tests/BinaryTestsF32.cpp +++ b/Testing/Source/Tests/BinaryTestsF32.cpp @@ -1,4 +1,5 @@ #include "BinaryTestsF32.h" +#include #include "Error.h" #define SNR_THRESHOLD 120 diff --git a/Testing/Source/Tests/BinaryTestsQ15.cpp b/Testing/Source/Tests/BinaryTestsQ15.cpp index 1f16ab38..67cc38af 100755 --- a/Testing/Source/Tests/BinaryTestsQ15.cpp +++ b/Testing/Source/Tests/BinaryTestsQ15.cpp @@ -1,4 +1,5 @@ #include "BinaryTestsQ15.h" +#include #include "Error.h" #define SNR_THRESHOLD 70 diff --git a/Testing/Source/Tests/BinaryTestsQ31.cpp b/Testing/Source/Tests/BinaryTestsQ31.cpp index 31b88214..0701748e 100755 --- a/Testing/Source/Tests/BinaryTestsQ31.cpp +++ b/Testing/Source/Tests/BinaryTestsQ31.cpp @@ -1,4 +1,5 @@ #include "BinaryTestsQ31.h" +#include #include "Error.h" #define SNR_THRESHOLD 100 diff --git a/Testing/Source/Tests/ComplexTestsF32.cpp b/Testing/Source/Tests/ComplexTestsF32.cpp index c77ae7f9..e0b91888 100755 --- a/Testing/Source/Tests/ComplexTestsF32.cpp +++ b/Testing/Source/Tests/ComplexTestsF32.cpp @@ -1,4 +1,5 @@ #include "ComplexTestsF32.h" +#include #include "Error.h" #define SNR_THRESHOLD 120 diff --git a/Testing/Source/Tests/ComplexTestsQ15.cpp b/Testing/Source/Tests/ComplexTestsQ15.cpp index a9bff1a2..2e3bfa76 100755 --- a/Testing/Source/Tests/ComplexTestsQ15.cpp +++ b/Testing/Source/Tests/ComplexTestsQ15.cpp @@ -1,4 +1,5 @@ #include "ComplexTestsQ15.h" +#include #include "Error.h" #define SNR_THRESHOLD 25 diff --git a/Testing/Source/Tests/ComplexTestsQ31.cpp b/Testing/Source/Tests/ComplexTestsQ31.cpp index 86c03811..e76b3bb5 100755 --- a/Testing/Source/Tests/ComplexTestsQ31.cpp +++ b/Testing/Source/Tests/ComplexTestsQ31.cpp @@ -1,4 +1,5 @@ #include "ComplexTestsQ31.h" +#include #include "Error.h" #define SNR_THRESHOLD 100 diff --git a/Testing/Source/Tests/DistanceTestsF32.cpp b/Testing/Source/Tests/DistanceTestsF32.cpp index c1e21ec7..b75beeef 100755 --- a/Testing/Source/Tests/DistanceTestsF32.cpp +++ b/Testing/Source/Tests/DistanceTestsF32.cpp @@ -1,9 +1,9 @@ #include "DistanceTestsF32.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include void DistanceTestsF32::test_braycurtis_distance_f32() diff --git a/Testing/Source/Tests/DistanceTestsU32.cpp b/Testing/Source/Tests/DistanceTestsU32.cpp index c31c6417..877b4db1 100755 --- a/Testing/Source/Tests/DistanceTestsU32.cpp +++ b/Testing/Source/Tests/DistanceTestsU32.cpp @@ -1,10 +1,9 @@ #include "DistanceTestsU32.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include - #define ERROR_THRESHOLD 1e-8 void DistanceTestsU32::test_dice_distance() diff --git a/Testing/Source/Tests/ExampleCategoryF32.cpp b/Testing/Source/Tests/ExampleCategoryF32.cpp index ef38b936..249204ea 100755 --- a/Testing/Source/Tests/ExampleCategoryF32.cpp +++ b/Testing/Source/Tests/ExampleCategoryF32.cpp @@ -1,4 +1,5 @@ #include "ExampleCategoryF32.h" +#include #include "Error.h" /* diff --git a/Testing/Source/Tests/ExampleCategoryQ15.cpp b/Testing/Source/Tests/ExampleCategoryQ15.cpp index 0a60e701..f88b926b 100755 --- a/Testing/Source/Tests/ExampleCategoryQ15.cpp +++ b/Testing/Source/Tests/ExampleCategoryQ15.cpp @@ -1,4 +1,5 @@ #include "ExampleCategoryQ15.h" +#include #include "Error.h" #define SNR_THRESHOLD 70 diff --git a/Testing/Source/Tests/ExampleCategoryQ31.cpp b/Testing/Source/Tests/ExampleCategoryQ31.cpp index 478e851d..50a4724c 100755 --- a/Testing/Source/Tests/ExampleCategoryQ31.cpp +++ b/Testing/Source/Tests/ExampleCategoryQ31.cpp @@ -1,4 +1,5 @@ #include "ExampleCategoryQ31.h" +#include #include "Error.h" #define SNR_THRESHOLD 100 diff --git a/Testing/Source/Tests/ExampleCategoryQ7.cpp b/Testing/Source/Tests/ExampleCategoryQ7.cpp index e598aec0..e4ba59f8 100755 --- a/Testing/Source/Tests/ExampleCategoryQ7.cpp +++ b/Testing/Source/Tests/ExampleCategoryQ7.cpp @@ -1,4 +1,5 @@ #include "ExampleCategoryQ7.h" +#include #include "Error.h" #define SNR_THRESHOLD 20 diff --git a/Testing/Source/Tests/FIRF32.cpp b/Testing/Source/Tests/FIRF32.cpp index 1bf9c418..5523a48f 100644 --- a/Testing/Source/Tests/FIRF32.cpp +++ b/Testing/Source/Tests/FIRF32.cpp @@ -1,4 +1,5 @@ #include "FIRF32.h" +#include #include "Error.h" #define SNR_THRESHOLD 120 @@ -27,10 +28,12 @@ static __ALIGNED(8) float32_t coeffArray[32]; const float32_t *inputp = inputs.ptr(); float32_t *outp = output.ptr(); - int i,j; + int i; +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + int j; +#endif int blockSize; int numTaps; - int nb=0; @@ -93,7 +96,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; configp += 2; orgcoefsp += numTaps; - nb += 2*blockSize; } @@ -109,7 +111,6 @@ static __ALIGNED(8) float32_t coeffArray[32]; void FIRF32::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; switch(id) diff --git a/Testing/Source/Tests/FIRQ15.cpp b/Testing/Source/Tests/FIRQ15.cpp index 10f9eb85..052f219d 100644 --- a/Testing/Source/Tests/FIRQ15.cpp +++ b/Testing/Source/Tests/FIRQ15.cpp @@ -1,4 +1,5 @@ #include "FIRQ15.h" +#include #include "Error.h" #define SNR_THRESHOLD 59 @@ -21,10 +22,12 @@ static __ALIGNED(8) q15_t coeffArray[32]; const q15_t *inputp = inputs.ptr(); q15_t *outp = output.ptr(); - int i,j; + int i; +#if defined(ARM_MATH_MVEI) + int j; +#endif int blockSize; int numTaps; - int nb=0; /* @@ -86,7 +89,6 @@ static __ALIGNED(8) q15_t coeffArray[32]; configp += 2; orgcoefsp += numTaps; - nb += 2*blockSize; } @@ -102,8 +104,6 @@ static __ALIGNED(8) q15_t coeffArray[32]; void FIRQ15::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; - switch(id) { diff --git a/Testing/Source/Tests/FIRQ31.cpp b/Testing/Source/Tests/FIRQ31.cpp index 16832077..c3d4a405 100644 --- a/Testing/Source/Tests/FIRQ31.cpp +++ b/Testing/Source/Tests/FIRQ31.cpp @@ -1,4 +1,5 @@ #include "FIRQ31.h" +#include #include "Error.h" #define SNR_THRESHOLD 100 @@ -20,10 +21,12 @@ static __ALIGNED(8) q31_t coeffArray[32]; const q31_t *inputp = inputs.ptr(); q31_t *outp = output.ptr(); - int i,j; + int i; +#if defined(ARM_MATH_MVEI) + int j; +#endif int blockSize; int numTaps; - int nb=0; /* @@ -83,7 +86,6 @@ static __ALIGNED(8) q31_t coeffArray[32]; configp += 2; orgcoefsp += numTaps; - nb += 2*blockSize; } @@ -99,8 +101,6 @@ static __ALIGNED(8) q31_t coeffArray[32]; void FIRQ31::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; - switch(id) { diff --git a/Testing/Source/Tests/FIRQ7.cpp b/Testing/Source/Tests/FIRQ7.cpp index e079389e..329d37c9 100644 --- a/Testing/Source/Tests/FIRQ7.cpp +++ b/Testing/Source/Tests/FIRQ7.cpp @@ -1,4 +1,5 @@ #include "FIRQ7.h" +#include #include "Error.h" #define SNR_THRESHOLD 10 @@ -21,7 +22,10 @@ static __ALIGNED(8) q7_t coeffArray[32]; const q7_t *inputp = inputs.ptr(); q7_t *outp = output.ptr(); - int i,j; + int i; +#if defined(ARM_MATH_MVEI) + int j; +#endif int blockSize; int numTaps; @@ -97,8 +101,6 @@ static __ALIGNED(8) q7_t coeffArray[32]; void FIRQ7::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { - Testing::nbSamples_t nb=MAX_NB_SAMPLES; - switch(id) { diff --git a/Testing/Source/Tests/FastMathF32.cpp b/Testing/Source/Tests/FastMathF32.cpp index 7c5cbd0a..d5ec8627 100755 --- a/Testing/Source/Tests/FastMathF32.cpp +++ b/Testing/Source/Tests/FastMathF32.cpp @@ -1,10 +1,10 @@ #include "FastMathF32.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_vec_math.h" #include "Test.h" -#include #define SNR_THRESHOLD 120 /* diff --git a/Testing/Source/Tests/FastMathQ15.cpp b/Testing/Source/Tests/FastMathQ15.cpp index 270dfd00..977faaa7 100755 --- a/Testing/Source/Tests/FastMathQ15.cpp +++ b/Testing/Source/Tests/FastMathQ15.cpp @@ -1,9 +1,9 @@ #include "FastMathQ15.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include #define SNR_THRESHOLD 70 /* diff --git a/Testing/Source/Tests/FastMathQ31.cpp b/Testing/Source/Tests/FastMathQ31.cpp index a9c160b3..0de59869 100755 --- a/Testing/Source/Tests/FastMathQ31.cpp +++ b/Testing/Source/Tests/FastMathQ31.cpp @@ -1,9 +1,9 @@ #include "FastMathQ31.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include #define SNR_THRESHOLD 100 /* diff --git a/Testing/Source/Tests/FullyConnected.cpp b/Testing/Source/Tests/FullyConnected.cpp index 70593c46..231d3181 100644 --- a/Testing/Source/Tests/FullyConnected.cpp +++ b/Testing/Source/Tests/FullyConnected.cpp @@ -1,10 +1,11 @@ #include "FullyConnected.h" +#include #include "Error.h" #include "arm_nnfunctions.h" #include "Test.h" #include "stdio.h" -#include + void printPattern(char *s,Client::AnyPattern pat) { diff --git a/Testing/Source/Tests/MISCF32.cpp b/Testing/Source/Tests/MISCF32.cpp index b25d0207..2920b25b 100755 --- a/Testing/Source/Tests/MISCF32.cpp +++ b/Testing/Source/Tests/MISCF32.cpp @@ -1,11 +1,10 @@ #include "MISCF32.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_vec_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 120 /* diff --git a/Testing/Source/Tests/MISCQ15.cpp b/Testing/Source/Tests/MISCQ15.cpp index 6f6e0305..96e1b866 100755 --- a/Testing/Source/Tests/MISCQ15.cpp +++ b/Testing/Source/Tests/MISCQ15.cpp @@ -1,11 +1,10 @@ #include "MISCQ15.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_vec_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 70 /* diff --git a/Testing/Source/Tests/MISCQ31.cpp b/Testing/Source/Tests/MISCQ31.cpp index 9f538606..d56f4f3f 100755 --- a/Testing/Source/Tests/MISCQ31.cpp +++ b/Testing/Source/Tests/MISCQ31.cpp @@ -1,11 +1,10 @@ #include "MISCQ31.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_vec_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 100 /* diff --git a/Testing/Source/Tests/MISCQ7.cpp b/Testing/Source/Tests/MISCQ7.cpp index 11bf6558..6b496521 100755 --- a/Testing/Source/Tests/MISCQ7.cpp +++ b/Testing/Source/Tests/MISCQ7.cpp @@ -1,11 +1,10 @@ #include "MISCQ7.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_vec_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 15 /* diff --git a/Testing/Source/Tests/NNSupport.cpp b/Testing/Source/Tests/NNSupport.cpp index 550d96d9..4e87792f 100755 --- a/Testing/Source/Tests/NNSupport.cpp +++ b/Testing/Source/Tests/NNSupport.cpp @@ -1,10 +1,9 @@ #include "NNSupport.h" +#include #include "Error.h" #include "arm_nnfunctions.h" #include "Test.h" -#include - void NNSupport::test_nn_elementwise_add_s8() diff --git a/Testing/Source/Tests/Pooling.cpp b/Testing/Source/Tests/Pooling.cpp index a1bd9f50..0b144efb 100755 --- a/Testing/Source/Tests/Pooling.cpp +++ b/Testing/Source/Tests/Pooling.cpp @@ -1,10 +1,9 @@ #include "Pooling.h" +#include #include "Error.h" #include "arm_nnfunctions.h" #include "Test.h" -#include - void Pooling::test_avgpool_s8() { diff --git a/Testing/Source/Tests/SVMF32.cpp b/Testing/Source/Tests/SVMF32.cpp index 939fe157..95dda2c3 100755 --- a/Testing/Source/Tests/SVMF32.cpp +++ b/Testing/Source/Tests/SVMF32.cpp @@ -1,4 +1,5 @@ #include "SVMF32.h" +#include #include "Error.h" diff --git a/Testing/Source/Tests/Softmax.cpp b/Testing/Source/Tests/Softmax.cpp index 10019334..a979ad24 100755 --- a/Testing/Source/Tests/Softmax.cpp +++ b/Testing/Source/Tests/Softmax.cpp @@ -1,10 +1,9 @@ #include "Softmax.h" +#include #include "Error.h" #include "arm_nnfunctions.h" #include "Test.h" -#include - /* Tests have shown that, compared to a float32 implementation diff --git a/Testing/Source/Tests/StatsTestsF32.cpp b/Testing/Source/Tests/StatsTestsF32.cpp index e1320095..7de2b932 100755 --- a/Testing/Source/Tests/StatsTestsF32.cpp +++ b/Testing/Source/Tests/StatsTestsF32.cpp @@ -1,9 +1,9 @@ #include "StatsTestsF32.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include #define SNR_THRESHOLD 120 /* diff --git a/Testing/Source/Tests/StatsTestsQ15.cpp b/Testing/Source/Tests/StatsTestsQ15.cpp index 3b697be8..b02dea32 100755 --- a/Testing/Source/Tests/StatsTestsQ15.cpp +++ b/Testing/Source/Tests/StatsTestsQ15.cpp @@ -1,5 +1,6 @@ #include "arm_math.h" #include "StatsTestsQ15.h" +#include #include "Error.h" #include "Test.h" diff --git a/Testing/Source/Tests/StatsTestsQ31.cpp b/Testing/Source/Tests/StatsTestsQ31.cpp index 0e105106..900aef3a 100755 --- a/Testing/Source/Tests/StatsTestsQ31.cpp +++ b/Testing/Source/Tests/StatsTestsQ31.cpp @@ -1,5 +1,6 @@ #include "arm_math.h" #include "StatsTestsQ31.h" +#include #include "Error.h" #include "Test.h" diff --git a/Testing/Source/Tests/StatsTestsQ7.cpp b/Testing/Source/Tests/StatsTestsQ7.cpp index f3f521dc..895a93a2 100755 --- a/Testing/Source/Tests/StatsTestsQ7.cpp +++ b/Testing/Source/Tests/StatsTestsQ7.cpp @@ -1,5 +1,6 @@ #include "arm_math.h" #include "StatsTestsQ7.h" +#include #include "Error.h" #include "Test.h" diff --git a/Testing/Source/Tests/SupportBarTestsF32.cpp b/Testing/Source/Tests/SupportBarTestsF32.cpp index f3914cdc..33bdb292 100755 --- a/Testing/Source/Tests/SupportBarTestsF32.cpp +++ b/Testing/Source/Tests/SupportBarTestsF32.cpp @@ -1,10 +1,9 @@ #include "SupportBarTestsF32.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include - void SupportBarTestsF32::test_barycenter_f32() { diff --git a/Testing/Source/Tests/SupportTestsF32.cpp b/Testing/Source/Tests/SupportTestsF32.cpp index 736d663b..58468372 100755 --- a/Testing/Source/Tests/SupportTestsF32.cpp +++ b/Testing/Source/Tests/SupportTestsF32.cpp @@ -1,10 +1,9 @@ #include "SupportTestsF32.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 120 #define REL_ERROR (1.0e-5) #define ABS_Q15_ERROR ((q15_t)10) diff --git a/Testing/Source/Tests/SupportTestsQ15.cpp b/Testing/Source/Tests/SupportTestsQ15.cpp index 2610e838..baf7a40a 100755 --- a/Testing/Source/Tests/SupportTestsQ15.cpp +++ b/Testing/Source/Tests/SupportTestsQ15.cpp @@ -1,10 +1,9 @@ #include "SupportTestsQ15.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 120 #define REL_ERROR (1.0e-3) #define ABS_Q15_ERROR ((q15_t)10) diff --git a/Testing/Source/Tests/SupportTestsQ31.cpp b/Testing/Source/Tests/SupportTestsQ31.cpp index d2e1cb20..db5c3ed6 100755 --- a/Testing/Source/Tests/SupportTestsQ31.cpp +++ b/Testing/Source/Tests/SupportTestsQ31.cpp @@ -1,10 +1,9 @@ #include "SupportTestsQ31.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 120 #define REL_ERROR (1.0e-5) #define ABS_Q15_ERROR ((q15_t)10) diff --git a/Testing/Source/Tests/SupportTestsQ7.cpp b/Testing/Source/Tests/SupportTestsQ7.cpp index 47356e87..d84b27f8 100755 --- a/Testing/Source/Tests/SupportTestsQ7.cpp +++ b/Testing/Source/Tests/SupportTestsQ7.cpp @@ -1,10 +1,9 @@ #include "SupportTestsQ7.h" +#include #include "Error.h" #include "arm_math.h" #include "Test.h" -#include - #define SNR_THRESHOLD 120 #define REL_ERROR (1.0e-5) #define ABS_Q15_ERROR ((q15_t)(1<<8)) diff --git a/Testing/Source/Tests/TransformCF32.cpp b/Testing/Source/Tests/TransformCF32.cpp index 0cc6ae7c..5c68d5cc 100755 --- a/Testing/Source/Tests/TransformCF32.cpp +++ b/Testing/Source/Tests/TransformCF32.cpp @@ -1,11 +1,10 @@ #include "TransformCF32.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_const_structs.h" #include "Test.h" -#include - #define SNR_THRESHOLD 120 void TransformCF32::test_cfft_f32() diff --git a/Testing/Source/Tests/TransformCF64.cpp b/Testing/Source/Tests/TransformCF64.cpp index 05c542b2..02b54f32 100755 --- a/Testing/Source/Tests/TransformCF64.cpp +++ b/Testing/Source/Tests/TransformCF64.cpp @@ -1,11 +1,10 @@ #include "TransformCF64.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_const_structs.h" #include "Test.h" -#include - #define SNR_THRESHOLD 250 void TransformCF64::test_cfft_f64() diff --git a/Testing/Source/Tests/TransformQ15.cpp b/Testing/Source/Tests/TransformQ15.cpp index ea1393fc..16d26014 100755 --- a/Testing/Source/Tests/TransformQ15.cpp +++ b/Testing/Source/Tests/TransformQ15.cpp @@ -1,11 +1,10 @@ #include "TransformQ15.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_const_structs.h" #include "Test.h" -#include - #define SNR_THRESHOLD 30 void TransformQ15::test_cfft_q15() diff --git a/Testing/Source/Tests/TransformQ31.cpp b/Testing/Source/Tests/TransformQ31.cpp index b61be559..a7c0e610 100755 --- a/Testing/Source/Tests/TransformQ31.cpp +++ b/Testing/Source/Tests/TransformQ31.cpp @@ -1,11 +1,10 @@ #include "TransformQ31.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_const_structs.h" #include "Test.h" -#include - #define SNR_THRESHOLD 90 void TransformQ31::test_cfft_q31() diff --git a/Testing/Source/Tests/TransformRF32.cpp b/Testing/Source/Tests/TransformRF32.cpp index 5a3d301a..3dbf2634 100755 --- a/Testing/Source/Tests/TransformRF32.cpp +++ b/Testing/Source/Tests/TransformRF32.cpp @@ -1,10 +1,10 @@ #include "TransformRF32.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_const_structs.h" #include "Test.h" -#include #define SNR_THRESHOLD 120 diff --git a/Testing/Source/Tests/TransformRF64.cpp b/Testing/Source/Tests/TransformRF64.cpp index cb19870f..bcadcbd3 100755 --- a/Testing/Source/Tests/TransformRF64.cpp +++ b/Testing/Source/Tests/TransformRF64.cpp @@ -1,10 +1,10 @@ #include "TransformRF64.h" +#include #include "Error.h" #include "arm_math.h" #include "arm_const_structs.h" #include "Test.h" -#include #define SNR_THRESHOLD 250 diff --git a/Testing/Source/Tests/UnaryTestsF32.cpp b/Testing/Source/Tests/UnaryTestsF32.cpp index 3494521b..d7224d37 100755 --- a/Testing/Source/Tests/UnaryTestsF32.cpp +++ b/Testing/Source/Tests/UnaryTestsF32.cpp @@ -1,4 +1,5 @@ #include "UnaryTestsF32.h" +#include #include "Error.h" #define SNR_THRESHOLD 120 @@ -207,7 +208,7 @@ void UnaryTestsF32::test_mat_inverse_f32() PREPAREDATA1(false); status=arm_mat_inverse_f32(&this->in1,&this->out); - //ASSERT_TRUE(status==ARM_MATH_SUCCESS); + ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); inp1 += (rows * columns); @@ -216,7 +217,7 @@ void UnaryTestsF32::test_mat_inverse_f32() ASSERT_EMPTY_TAIL(output); - //ASSERT_SNR(output,ref,(float32_t)SNR_THRESHOLD_INV); + ASSERT_SNR(output,ref,(float32_t)SNR_THRESHOLD_INV); ASSERT_CLOSE_ERROR(output,ref,ABS_ERROR_INV,REL_ERROR_INV); diff --git a/Testing/Source/Tests/UnaryTestsF64.cpp b/Testing/Source/Tests/UnaryTestsF64.cpp index df506c14..141c8321 100755 --- a/Testing/Source/Tests/UnaryTestsF64.cpp +++ b/Testing/Source/Tests/UnaryTestsF64.cpp @@ -1,5 +1,6 @@ #include "arm_math.h" #include "UnaryTestsF64.h" +#include #include "Error.h" #define SNR_THRESHOLD 120 diff --git a/Testing/Source/Tests/UnaryTestsQ15.cpp b/Testing/Source/Tests/UnaryTestsQ15.cpp index 0ce730b1..e9a44de2 100755 --- a/Testing/Source/Tests/UnaryTestsQ15.cpp +++ b/Testing/Source/Tests/UnaryTestsQ15.cpp @@ -1,4 +1,5 @@ #include "UnaryTestsQ15.h" +#include #include "Error.h" #define SNR_THRESHOLD 70 diff --git a/Testing/Source/Tests/UnaryTestsQ31.cpp b/Testing/Source/Tests/UnaryTestsQ31.cpp index 23992786..adc06840 100755 --- a/Testing/Source/Tests/UnaryTestsQ31.cpp +++ b/Testing/Source/Tests/UnaryTestsQ31.cpp @@ -1,4 +1,5 @@ #include "UnaryTestsQ31.h" +#include #include "Error.h" #define SNR_THRESHOLD 100 diff --git a/Toolchain/GCC.cmake b/Toolchain/GCC.cmake index c2c96a4e..4fff3c1b 100644 --- a/Toolchain/GCC.cmake +++ b/Toolchain/GCC.cmake @@ -37,6 +37,11 @@ function(compilerSpecificCompileOptions PROJECTNAME) target_compile_options(${PROJECTNAME} PUBLIC "-march=armv7e-m;-mfpu=fpv5-d16") target_link_options(${PROJECTNAME} PUBLIC "-march=armv7e-m;-mfpu=fpv5-d16") endif() + + if (ARM_CPU STREQUAL "cortex-m0" ) + target_compile_options(${PROJECTNAME} PUBLIC "-march=armv6-m") + target_link_options(${PROJECTNAME} PUBLIC "-march=armv6-m") + endif() if (ARM_CPU STREQUAL "cortex-a9" ) @@ -113,14 +118,14 @@ function(toolchainSpecificLinkForCortexA PROJECTNAME ROOT CORE PLATFORMFOLDER) # RTE Components target_include_directories(${PROJECTNAME} PRIVATE ${ROOT}/CMSIS/DSP/Testing) + target_include_directories(${PROJECTNAME} PRIVATE ${PLATFORMFOLDER}/${CORE}/LinkScripts/GCC) - # Using the mem file which is included. - # Since meme file in same temp directory, it is found by linker - # when processing the scatter file + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tempLink) set(SCATTERFILE ${CMAKE_CURRENT_BINARY_DIR}/tempLink/lnk.ld) preprocessScatter(${CORE} ${PLATFORMFOLDER} ${SCATTERFILE}) - target_include_directories(${PROJECTNAME} PRIVATE ${PLATFORMFOLDER}/${CORE}/LinkScripts/GCC) + + set_target_properties(${PROJECTNAME} PROPERTIES LINK_DEPENDS "${SCATTERFILE}") target_link_options(${PROJECTNAME} PRIVATE "--entry=Reset_Handler;-T${SCATTERFILE}") endfunction()