CMSIS-DSP: Added support for Helium.

Currently, only arm_dot_prod_f32 provides a Helium implementation.
pull/19/head
Christophe Favergeon 6 years ago
parent 499a9ecb70
commit 4a0aa2ad26

@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_helium_utils.h
* Description: Utility functions for Helium development
*
* $Date: 09. September 2019
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_UTILS_HELIUM_H_
#define _ARM_UTILS_HELIUM_H_
#if defined (ARM_MATH_HELIUM)
/*
 * Number of lanes of width sz (in bits) obtained by dividing 128 by sz.
 * The parameter is parenthesized so that an expression argument such as
 * nbLanes(a + b) divides by the whole expression rather than by its
 * first operand only.
 */
#define nbLanes(sz) (128/(sz))
/* Lane counts for each element type, derived from the element width in bits. */
#define VEC_LANES_F32 nbLanes(32)
#define VEC_LANES_F16 nbLanes(16)
#define VEC_LANES_Q63 nbLanes(64)
#define VEC_LANES_Q31 nbLanes(32)
#define VEC_LANES_Q15 nbLanes(16)
#define VEC_LANES_Q7 nbLanes(8)
/*
 * Select the per-vector lane count that matches the pointee type of ptr,
 * using a C11 _Generic selection on the pointer type.
 *
 * Both mutable and const-qualified pointers are accepted for every listed
 * element type (the const unsigned variants were previously missing and
 * fell through to the default).
 *
 * Any pointer type that is not listed selects the string literal "err";
 * presumably this is meant to make a misuse such as `ptr += "err"` fail
 * to compile.
 *
 * NOTE(review): VEC_LANES_Q63 has no association here; confirm whether
 * q63_t pointers are expected to be supported.
 */
#define nb_vec_lanes(ptr) _Generic((ptr), \
uint32_t *: VEC_LANES_Q31, \
uint16_t *: VEC_LANES_Q15, \
uint8_t *: VEC_LANES_Q7, \
q31_t *: VEC_LANES_Q31, \
q15_t *: VEC_LANES_Q15, \
q7_t *: VEC_LANES_Q7, \
float32_t*: VEC_LANES_F32, \
float16_t*: VEC_LANES_F16, \
const uint32_t *: VEC_LANES_Q31, \
const uint16_t *: VEC_LANES_Q15, \
const uint8_t *: VEC_LANES_Q7, \
const q31_t *: VEC_LANES_Q31, \
const q15_t *: VEC_LANES_Q15, \
const q7_t *: VEC_LANES_Q7, \
const float32_t*: VEC_LANES_F32, \
const float16_t*: VEC_LANES_F16, \
default: "err")
/*
 * Sum the four lanes of a float32x4_t vector into one scalar.
 *
 * in: vector whose lanes 0..3 are read with vgetq_lane.
 * Returns the sum of the four lanes, added strictly in lane-index order.
 */
__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
{
    const float32_t lane0 = vgetq_lane(in, 0);
    const float32_t lane1 = vgetq_lane(in, 1);
    const float32_t lane2 = vgetq_lane(in, 2);
    const float32_t lane3 = vgetq_lane(in, 3);

    /* Explicit left-to-right association keeps the rounding order fixed. */
    return ((lane0 + lane1) + lane2) + lane3;
}
/*
 * Advance ptr by the lane count that nb_vec_lanes() selects for its type.
 * The expansion and the parameter are fully parenthesized so the macro
 * behaves as a single expression wherever it is used. The value of the
 * expression is the already-advanced pointer (it is a compound assignment).
 */
#define post_incr_vec_size(ptr) ((ptr) += nb_vec_lanes(ptr))
#endif
#endif

@ -128,6 +128,7 @@ void Reset_Handler(void)
__set_MSPLIM((uint32_t)(&__STACK_LIMIT));
SystemInit(); /* CMSIS System Initialization */
__PROGRAM_START(); /* Enter PreMain (C library entry point) */
}

@ -40,6 +40,11 @@
#define SYSTEM_CLOCK (5U * XTAL)
#define DEBUG_DEMCR (*((unsigned int *)0xE000EDFC))
#define DEBUG_TRCENA (1<<24) //Global debug enable bit
#define CCR (*((volatile unsigned int *)0xE000ED14))
#define CCR_DL (1 << 19)
/*----------------------------------------------------------------------------
Externals
@ -88,4 +93,10 @@ void SystemInit (void)
SystemCoreClock = SYSTEM_CLOCK;
//Disable debug
DEBUG_DEMCR &=~ DEBUG_TRCENA;
// enable DL branch cache
CCR |= CCR_DL;
}

@ -59,6 +59,46 @@
@return none
*/
#if defined (ARM_MATH_HELIUM)
#include "arm_mve.h"
#include "arm_helium_utils.h"
/*
 * Helium (MVE) variant of the floating-point dot product.
 *
 * pSrcA, pSrcB: input vectors, read VEC_LANES_F32 elements per iteration.
 * blockSize:    number of elements to process in each input vector.
 * result:       receives A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1].
 */
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
    /* Per-lane partial sums; reduced to a scalar after the loop. */
    float32x4_t laneSums = vdupq_n_f32(0.0);

    do {
        /* Predicate built from the number of elements still to process. */
        mve_pred16_t tailPred = vctp32q(blockSize);

        /* Predicated loads and multiply-accumulate share the same predicate. */
        float32x4_t chunkA = vldrwq_z_f32(pSrcA, tailPred);
        float32x4_t chunkB = vldrwq_z_f32(pSrcB, tailPred);
        laneSums = vfmaq_m(laneSums, chunkA, chunkB, tailPred);

        /* Step both inputs forward by one vector and shrink the count. */
        pSrcA += VEC_LANES_F32;
        pSrcB += VEC_LANES_F32;
        blockSize -= VEC_LANES_F32;

        /*
         * blockSize is unsigned: the signed cast lets the final partial
         * chunk (where the subtraction wraps) terminate the loop.
         */
    } while ((int32_t) blockSize > 0);

    *result = vecAddAcrossF32Mve(laneSums);
}
#else
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
@ -158,6 +198,7 @@ void arm_dot_prod_f32(
*result = sum;
}
#endif /* ARM_MATH_HELIUM */
/**
@} end of BasicDotProd group
*/

@ -17,6 +17,7 @@ option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF)
option(LOOPUNROLL "Loop unrolling" ON)
option(ROUNDING "Rounding" OFF)
option(MATRIXCHECK "Matrix Checks" OFF)
option(HELIUM "Helium acceleration" OFF)
# Select which parts of the CMSIS-DSP must be compiled.
# There are some dependencies between the parts but they are not tracked

@ -7,7 +7,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/..)
function(writeConfig path)
set(output "")
list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n")
list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,HELIUM,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n")
if (OPTIMIZED)
list(APPEND output "1")
@ -33,6 +33,12 @@ function(writeConfig path)
list(APPEND output ",0")
endif()
if (HELIUM)
list(APPEND output ",1")
else()
list(APPEND output ",0")
endif()
if (LOOPUNROLL)
list(APPEND output ",1")
else()

@ -19,7 +19,7 @@ import re
# For table creation
MKSTRFIELD=['NAME']
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
MKINTFIELD=['ID', 'CYCLES']
MKDATEFIELD=['DATE']
MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
@ -31,7 +31,7 @@ MKKEYFIELDID={'CATEGORY':'categoryid',
# For table value extraction
VALSTRFIELD=['NAME','VERSION']
VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
VALINTFIELD=['ID', 'CYCLES']
VALDATEFIELD=['DATE']
VALKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
@ -246,6 +246,7 @@ def addRows(conn,elem,tableName,full):
conn.commit()
def addOneBenchmark(elem,fullPath,db,group):
if os.path.isfile(fullPath):
full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
full['DATE'] = datetime.datetime.now()
if group:

@ -19,7 +19,7 @@ import re
# For table creation
MKSTRFIELD=['NAME','Regression']
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
MKINTFIELD=['ID','MAX']
MKREALFIELD=['MAXREGCOEF']
MKDATEFIELD=['DATE']
@ -32,7 +32,7 @@ MKKEYFIELDID={'CATEGORY':'categoryid',
# For table value extraction
VALSTRFIELD=['NAME','VERSION','Regression']
VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
VALINTFIELD=['ID', 'MAX']
VALREALFIELD=['MAXREGCOEF']
VALDATEFIELD=['DATE']
@ -257,6 +257,7 @@ def addRows(conn,elem,tableName,full):
conn.commit()
def addOneBenchmark(elem,fullPath,db,group):
if os.path.isfile(fullPath):
full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
full['DATE'] = datetime.datetime.now()
if group:

@ -70,3 +70,4 @@ INSERT INTO CORE VALUES(9,"a5","ARMCA5");
INSERT INTO CORE VALUES(10,"a7","ARMCA7");
INSERT INTO CORE VALUES(11,"a9","ARMCA9");
INSERT INTO CORE VALUES(12,"a15","ARMCA15");
INSERT INTO CORE VALUES(13,"helium","ARMv81MML_DSP_DP_MVE_FP");

@ -57,6 +57,8 @@ def formatProd(a,b):
def summaryBenchmark(resultPath,elem,path):
regressionPath=os.path.join(os.path.dirname(path),"regression.csv")
if os.path.isfile(path):
print(" Generating %s" % regressionPath)
full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False)
#print(full)

@ -18,10 +18,14 @@ using namespace std;
#include "Patterns.h"
int testmain()
{
char *memoryBuf=NULL;
memoryBuf = (char*)malloc(MEMSIZE);
if (memoryBuf !=NULL)
{

@ -180,7 +180,6 @@ function(configcore PROJECTNAME ROOT)
#
if (NEON AND NOT CORTEXM)
#target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON __FPU_PRESENT)
target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON)
endif()
@ -189,6 +188,10 @@ function(configcore PROJECTNAME ROOT)
target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON_EXPERIMENTAL)
endif()
if (HELIUM AND CORTEXM)
target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_HELIUM)
endif()
compilerSpecificCompileOptions(${PROJECTNAME} ${ROOT})
endfunction()
Loading…
Cancel
Save