CMSIS-DSP: Added support for Helium.

Only arm_dot_prod_f32 currently provides a Helium implementation.
pull/19/head
Christophe Favergeon 6 years ago
parent 499a9ecb70
commit 4a0aa2ad26

@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_helium_utils.h
* Description: Utility functions for Helium development
*
* $Date: 09. September 2019
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_UTILS_HELIUM_H_
#define _ARM_UTILS_HELIUM_H_
#if defined (ARM_MATH_HELIUM)
/* MVE vectors are 128 bits wide; nbLanes(sz) is the lane count for an
   element size of sz bits. The parameter is parenthesized so the macro
   stays correct for expression arguments (e.g. nbLanes(8*2)). */
#define nbLanes(sz) (128/(sz))
#define VEC_LANES_F32 nbLanes(32)
#define VEC_LANES_F16 nbLanes(16)
#define VEC_LANES_Q63 nbLanes(64)
#define VEC_LANES_Q31 nbLanes(32)
#define VEC_LANES_Q15 nbLanes(16)
#define VEC_LANES_Q7 nbLanes(8)
/* Map a pointer's element type to its per-vector lane count at compile time
   (C11 _Generic). The default branch yields a string literal, so any
   arithmetic use of the result with an unsupported pointer type fails to
   compile instead of silently picking a wrong lane count. */
#define nb_vec_lanes(ptr) _Generic((ptr), \
uint32_t *: VEC_LANES_Q31, \
uint16_t *: VEC_LANES_Q15, \
uint8_t *: VEC_LANES_Q7, \
q31_t *: VEC_LANES_Q31, \
q15_t *: VEC_LANES_Q15, \
q7_t *: VEC_LANES_Q7, \
float32_t*: VEC_LANES_F32, \
float16_t*: VEC_LANES_F16, \
const q31_t *: VEC_LANES_Q31, \
const q15_t *: VEC_LANES_Q15, \
const q7_t *: VEC_LANES_Q7, \
const float32_t*: VEC_LANES_F32, \
const float16_t*: VEC_LANES_F16, \
default: "err")
/* Horizontal add: reduce the four f32 lanes of an MVE vector to a scalar.
   Lanes are summed left to right — ((lane0 + lane1) + lane2) + lane3 —
   preserving the original accumulation order (float addition is not
   associative, so the order matters for bit-exact results). */
__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
{
    float32_t lane0 = vgetq_lane(in, 0);
    float32_t lane1 = vgetq_lane(in, 1);
    float32_t lane2 = vgetq_lane(in, 2);
    float32_t lane3 = vgetq_lane(in, 3);
    return ((lane0 + lane1) + lane2) + lane3;
}
/* Advance ptr by one full vector's worth of elements; the step is chosen
   from ptr's element type via nb_vec_lanes (e.g. 4 for float32_t*). */
#define post_incr_vec_size(ptr) ptr += nb_vec_lanes(ptr)
#endif
#endif

@ -128,6 +128,7 @@ void Reset_Handler(void)
__set_MSPLIM((uint32_t)(&__STACK_LIMIT)); __set_MSPLIM((uint32_t)(&__STACK_LIMIT));
SystemInit(); /* CMSIS System Initialization */ SystemInit(); /* CMSIS System Initialization */
__PROGRAM_START(); /* Enter PreMain (C library entry point) */ __PROGRAM_START(); /* Enter PreMain (C library entry point) */
} }

@ -40,6 +40,11 @@
#define SYSTEM_CLOCK (5U * XTAL) #define SYSTEM_CLOCK (5U * XTAL)
/* DEMCR (Debug Exception and Monitor Control Register). Marked volatile —
   like CCR below — so each access really reads/writes the hardware register
   instead of being cached or elided by the compiler. */
#define DEBUG_DEMCR (*((volatile unsigned int *)0xE000EDFC))
#define DEBUG_TRCENA (1<<24) /* DEMCR.TRCENA: global enable for DWT/ITM trace */
#define CCR (*((volatile unsigned int *)0xE000ED14)) /* Configuration and Control Register */
#define CCR_DL (1 << 19) /* CCR bit 19 ("DL") — used below to enable the branch/loop cache; confirm against the core TRM */
/*---------------------------------------------------------------------------- /*----------------------------------------------------------------------------
Externals Externals
@ -88,4 +93,10 @@ void SystemInit (void)
SystemCoreClock = SYSTEM_CLOCK; SystemCoreClock = SYSTEM_CLOCK;
//Disable debug
DEBUG_DEMCR &=~ DEBUG_TRCENA;
// enable DL branch cache
CCR |= CCR_DL;
} }

@ -59,6 +59,46 @@
@return none @return none
*/ */
#if defined (ARM_MATH_HELIUM)
#include "arm_mve.h"
#include "arm_helium_utils.h"
/* Helium/MVE implementation of the f32 dot product:
   *result = sum over i in [0, blockSize) of pSrcA[i] * pSrcB[i].
   The tail (blockSize not a multiple of 4 lanes) is handled with a
   loop-tail predicate, so no scalar cleanup loop is needed. */
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
float32x4_t vecA, vecB;
float32x4_t vecSum; /* four per-lane partial sums */
vecSum = vdupq_n_f32(0.0);
/* NOTE(review): do-while runs once even when blockSize == 0; the predicate
   then disables every lane, so *result comes out 0 — confirm intended. */
do {
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
/* vctp32q enables only the lanes still in range, so the final (possibly
   partial) vector is loaded (zeroing inactive lanes) and accumulated
   without reading past the buffers. */
mve_pred16_t p = vctp32q(blockSize);
vecA = vldrwq_z_f32(pSrcA, p);
vecB = vldrwq_z_f32(pSrcB, p);
vecSum = vfmaq_m(vecSum, vecA, vecB, p);
/*
* Decrement the blockSize loop counter
* Advance vector source and destination pointers
*/
post_incr_vec_size(pSrcA);
post_incr_vec_size(pSrcB);
/* May wrap around (unsigned) on the last partial vector; the signed cast
   in the loop condition below turns that wrap into a negative value and
   terminates the loop. */
blockSize -= VEC_LANES_F32;
}
while ((int32_t) blockSize > 0);
/* Reduce the four per-lane partial sums to the scalar result. */
*result = vecAddAcrossF32Mve(vecSum);
}
#else
void arm_dot_prod_f32( void arm_dot_prod_f32(
const float32_t * pSrcA, const float32_t * pSrcA,
const float32_t * pSrcB, const float32_t * pSrcB,
@ -158,6 +198,7 @@ void arm_dot_prod_f32(
*result = sum; *result = sum;
} }
#endif /* ARM_MATH_HELIUM */
/** /**
@} end of BasicDotProd group @} end of BasicDotProd group
*/ */

@ -17,6 +17,7 @@ option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF)
option(LOOPUNROLL "Loop unrolling" ON) option(LOOPUNROLL "Loop unrolling" ON)
option(ROUNDING "Rounding" OFF) option(ROUNDING "Rounding" OFF)
option(MATRIXCHECK "Matrix Checks" OFF) option(MATRIXCHECK "Matrix Checks" OFF)
option(HELIUM "Helium acceleration" OFF)
# Select which parts of the CMSIS-DSP must be compiled. # Select which parts of the CMSIS-DSP must be compiled.
# There are some dependencies between the parts but they are not tracked # There are some dependencies between the parts but they are not tracked

@ -7,7 +7,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/..)
function(writeConfig path) function(writeConfig path)
set(output "") set(output "")
list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n") list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,HELIUM,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n")
if (OPTIMIZED) if (OPTIMIZED)
list(APPEND output "1") list(APPEND output "1")
@ -33,6 +33,12 @@ function(writeConfig path)
list(APPEND output ",0") list(APPEND output ",0")
endif() endif()
# Record the HELIUM acceleration option as a 1/0 column in the config CSV row
# (must stay in sync with the HELIUM entry in the header line above).
if (HELIUM)
list(APPEND output ",1")
else()
list(APPEND output ",0")
endif()
if (LOOPUNROLL) if (LOOPUNROLL)
list(APPEND output ",1") list(APPEND output ",1")
else() else()

@ -19,7 +19,7 @@ import re
# For table creation # For table creation
MKSTRFIELD=['NAME'] MKSTRFIELD=['NAME']
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
MKINTFIELD=['ID', 'CYCLES'] MKINTFIELD=['ID', 'CYCLES']
MKDATEFIELD=['DATE'] MKDATEFIELD=['DATE']
MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE'] MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
@ -31,7 +31,7 @@ MKKEYFIELDID={'CATEGORY':'categoryid',
# For table value extraction # For table value extraction
VALSTRFIELD=['NAME','VERSION'] VALSTRFIELD=['NAME','VERSION']
VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
VALINTFIELD=['ID', 'CYCLES'] VALINTFIELD=['ID', 'CYCLES']
VALDATEFIELD=['DATE'] VALDATEFIELD=['DATE']
VALKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE'] VALKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
@ -246,16 +246,17 @@ def addRows(conn,elem,tableName,full):
conn.commit() conn.commit()
def addOneBenchmark(elem,fullPath,db,group): def addOneBenchmark(elem,fullPath,db,group):
full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) if os.path.isfile(fullPath):
full['DATE'] = datetime.datetime.now() full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
if group: full['DATE'] = datetime.datetime.now()
tableName = group if group:
else: tableName = group
tableName = elem.data["class"] else:
conn = sqlite3.connect(db) tableName = elem.data["class"]
createTableIfMissing(conn,elem,tableName,full) conn = sqlite3.connect(db)
addRows(conn,elem,tableName,full) createTableIfMissing(conn,elem,tableName,full)
conn.close() addRows(conn,elem,tableName,full)
conn.close()
def addToDB(benchmark,dbpath,elem,group): def addToDB(benchmark,dbpath,elem,group):

@ -19,7 +19,7 @@ import re
# For table creation # For table creation
MKSTRFIELD=['NAME','Regression'] MKSTRFIELD=['NAME','Regression']
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
MKINTFIELD=['ID','MAX'] MKINTFIELD=['ID','MAX']
MKREALFIELD=['MAXREGCOEF'] MKREALFIELD=['MAXREGCOEF']
MKDATEFIELD=['DATE'] MKDATEFIELD=['DATE']
@ -32,7 +32,7 @@ MKKEYFIELDID={'CATEGORY':'categoryid',
# For table value extraction # For table value extraction
VALSTRFIELD=['NAME','VERSION','Regression'] VALSTRFIELD=['NAME','VERSION','Regression']
VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
VALINTFIELD=['ID', 'MAX'] VALINTFIELD=['ID', 'MAX']
VALREALFIELD=['MAXREGCOEF'] VALREALFIELD=['MAXREGCOEF']
VALDATEFIELD=['DATE'] VALDATEFIELD=['DATE']
@ -257,16 +257,17 @@ def addRows(conn,elem,tableName,full):
conn.commit() conn.commit()
def addOneBenchmark(elem,fullPath,db,group): def addOneBenchmark(elem,fullPath,db,group):
full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) if os.path.isfile(fullPath):
full['DATE'] = datetime.datetime.now() full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
if group: full['DATE'] = datetime.datetime.now()
tableName = group if group:
else: tableName = group
tableName = elem.data["class"] else:
conn = sqlite3.connect(db) tableName = elem.data["class"]
createTableIfMissing(conn,elem,tableName,full) conn = sqlite3.connect(db)
addRows(conn,elem,tableName,full) createTableIfMissing(conn,elem,tableName,full)
conn.close() addRows(conn,elem,tableName,full)
conn.close()
def addToDB(benchmark,dbpath,elem,group): def addToDB(benchmark,dbpath,elem,group):

@ -70,3 +70,4 @@ INSERT INTO CORE VALUES(9,"a5","ARMCA5");
INSERT INTO CORE VALUES(10,"a7","ARMCA7"); INSERT INTO CORE VALUES(10,"a7","ARMCA7");
INSERT INTO CORE VALUES(11,"a9","ARMCA9"); INSERT INTO CORE VALUES(11,"a9","ARMCA9");
INSERT INTO CORE VALUES(12,"a15","ARMCA15"); INSERT INTO CORE VALUES(12,"a15","ARMCA15");
INSERT INTO CORE VALUES(13,"helium","ARMv81MML_DSP_DP_MVE_FP");

@ -57,38 +57,40 @@ def formatProd(a,b):
def summaryBenchmark(resultPath,elem,path): def summaryBenchmark(resultPath,elem,path):
regressionPath=os.path.join(os.path.dirname(path),"regression.csv") regressionPath=os.path.join(os.path.dirname(path),"regression.csv")
print(" Generating %s" % regressionPath)
full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False)
#print(full)
csvheaders = []
with open(os.path.join(resultPath,'currentConfig.csv'), 'r') as f:
reader = csv.reader(f)
csvheaders = next(reader, None)
groupList = list(set(elem.params.full) - set(elem.params.summary)) if os.path.isfile(path):
#grouped=full.groupby(list(elem.params.summary) + ['ID','CATEGORY']).max() print(" Generating %s" % regressionPath)
#grouped.reset_index(level=grouped.index.names, inplace=True) full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False)
#print(grouped) #print(full)
#print(grouped.columns)
csvheaders = []
with open(os.path.join(resultPath,'currentConfig.csv'), 'r') as f:
reader = csv.reader(f)
csvheaders = next(reader, None)
groupList = list(set(elem.params.full) - set(elem.params.summary))
#grouped=full.groupby(list(elem.params.summary) + ['ID','CATEGORY']).max()
#grouped.reset_index(level=grouped.index.names, inplace=True)
#print(grouped)
#print(grouped.columns)
def reg(d): def reg(d):
m=d["CYCLES"].max() m=d["CYCLES"].max()
results = smf.ols('CYCLES ~ ' + elem.params.formula, data=d).fit() results = smf.ols('CYCLES ~ ' + elem.params.formula, data=d).fit()
f=joinit([formatProd(a,b) for (a,b) in zip(results.params.index,results.params.values)]," + ") f=joinit([formatProd(a,b) for (a,b) in zip(results.params.index,results.params.values)]," + ")
f="".join(f) f="".join(f)
f = re.sub(r':','*',f) f = re.sub(r':','*',f)
#print(results.summary()) #print(results.summary())
return(pd.Series({'Regression':"%s" % f,'MAX' : m,'MAXREGCOEF' : results.params.values[-1]})) return(pd.Series({'Regression':"%s" % f,'MAX' : m,'MAXREGCOEF' : results.params.values[-1]}))
regList = ['ID','OLDID','CATEGORY','NAME'] + csvheaders + groupList
regression=full.groupby(regList).apply(reg) regList = ['ID','OLDID','CATEGORY','NAME'] + csvheaders + groupList
regression.reset_index(level=regression.index.names, inplace=True)
renamingDict = { a : b for (a,b) in zip(elem.params.full,elem.params.paramNames)} regression=full.groupby(regList).apply(reg)
regression = regression.rename(columns=renamingDict) regression.reset_index(level=regression.index.names, inplace=True)
regression.to_csv(regressionPath,index=False,quoting=csv.QUOTE_NONNUMERIC) renamingDict = { a : b for (a,b) in zip(elem.params.full,elem.params.paramNames)}
regression = regression.rename(columns=renamingDict)
regression.to_csv(regressionPath,index=False,quoting=csv.QUOTE_NONNUMERIC)
def extractBenchmarks(resultPath,benchmark,elem): def extractBenchmarks(resultPath,benchmark,elem):

@ -18,9 +18,13 @@ using namespace std;
#include "Patterns.h" #include "Patterns.h"
int testmain() int testmain()
{ {
char *memoryBuf=NULL; char *memoryBuf=NULL;
memoryBuf = (char*)malloc(MEMSIZE); memoryBuf = (char*)malloc(MEMSIZE);
if (memoryBuf !=NULL) if (memoryBuf !=NULL)

@ -180,7 +180,6 @@ function(configcore PROJECTNAME ROOT)
# #
if (NEON AND NOT CORTEXM) if (NEON AND NOT CORTEXM)
#target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON __FPU_PRESENT)
target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON) target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON)
endif() endif()
@ -189,6 +188,10 @@ function(configcore PROJECTNAME ROOT)
target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON_EXPERIMENTAL) target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON_EXPERIMENTAL)
endif() endif()
# Helium/MVE applies only to Cortex-M targets: define ARM_MATH_HELIUM so the
# library compiles its MVE code paths (guarded in arm_helium_utils.h).
if (HELIUM AND CORTEXM)
target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_HELIUM)
endif()
compilerSpecificCompileOptions(${PROJECTNAME} ${ROOT}) compilerSpecificCompileOptions(${PROJECTNAME} ${ROOT})
endfunction() endfunction()
Loading…
Cancel
Save