diff --git a/Include/arm_helium_utils.h b/Include/arm_helium_utils.h new file mode 100755 index 00000000..fd4c707b --- /dev/null +++ b/Include/arm_helium_utils.h @@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_helium_utils.h + * Description: Utility functions for Helium development + * + * $Date: 09. September 2019 + * $Revision: V.1.5.1 + * + * Target Processor: Cortex-M cores + * -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _ARM_UTILS_HELIUM_H_ +#define _ARM_UTILS_HELIUM_H_ + +#if defined (ARM_MATH_HELIUM) + +#define nbLanes(sz) (128/(sz)) /* how many lanes of sz bits fit in 128 bits; sz is parenthesized so expressions expand safely */ + +#define VEC_LANES_F32 nbLanes(32) +#define VEC_LANES_F16 nbLanes(16) +#define VEC_LANES_Q63 nbLanes(64) +#define VEC_LANES_Q31 nbLanes(32) +#define VEC_LANES_Q15 nbLanes(16) +#define VEC_LANES_Q7 nbLanes(8) + +#define nb_vec_lanes(ptr) _Generic((ptr), \ + uint32_t *: VEC_LANES_Q31, \ + uint16_t *: VEC_LANES_Q15, \ + uint8_t *: VEC_LANES_Q7, \ + q31_t *: VEC_LANES_Q31, \ + q15_t *: VEC_LANES_Q15, \ + q7_t *: VEC_LANES_Q7, \ + float32_t*: VEC_LANES_F32, \ + float16_t*: VEC_LANES_F16, \ + const q31_t *: VEC_LANES_Q31, \ + const q15_t *: VEC_LANES_Q15, \ + const q7_t *: VEC_LANES_Q7, \ + const float32_t*: VEC_LANES_F32, \ + const float16_t*: VEC_LANES_F16, \ + default: "err") + +__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in) /* returns the sum of lanes 0..3 of in */ +{ + float32_t acc; + + acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) + + vgetq_lane(in, 2) + vgetq_lane(in, 3); + + return acc; +} + +#define post_incr_vec_size(ptr) ((ptr) += nb_vec_lanes(ptr)) /* advance ptr by the lane count selected for its pointer type */ + +#endif + +#endif diff --git a/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c b/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c index b373ffc2..eccf7258 100755 --- a/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c +++ b/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c @@ -128,6 +128,7 @@ void Reset_Handler(void) __set_MSPLIM((uint32_t)(&__STACK_LIMIT)); SystemInit(); /* CMSIS System Initialization */ + __PROGRAM_START(); /* Enter PreMain (C library entry point) */ } diff --git a/Platforms/FVP/ARMv81MML/system_ARMv81MML.c b/Platforms/FVP/ARMv81MML/system_ARMv81MML.c index 2919ec05..75e51649 100644 --- a/Platforms/FVP/ARMv81MML/system_ARMv81MML.c +++ b/Platforms/FVP/ARMv81MML/system_ARMv81MML.c @@ -40,6 +40,11 @@ #define SYSTEM_CLOCK (5U * XTAL) +#define DEBUG_DEMCR (*((volatile unsigned int *)0xE000EDFC)) +#define DEBUG_TRCENA (1U << 24) /* DEMCR.TRCENA: global enable for the DWT/ITM trace features */
+ +#define CCR (*((volatile unsigned int *)0xE000ED14)) +#define CCR_DL (1 << 19) /*---------------------------------------------------------------------------- Externals @@ -88,4 +93,10 @@ void SystemInit (void) SystemCoreClock = SYSTEM_CLOCK; + // Disable trace: clear the TRCENA bit of DEMCR + DEBUG_DEMCR &= ~DEBUG_TRCENA; + + // enable DL branch cache + CCR |= CCR_DL; + } diff --git a/Source/BasicMathFunctions/arm_dot_prod_f32.c b/Source/BasicMathFunctions/arm_dot_prod_f32.c index 3eee3b97..99ca7dc4 100644 --- a/Source/BasicMathFunctions/arm_dot_prod_f32.c +++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c @@ -59,6 +59,46 @@ @return none */ +#if defined (ARM_MATH_HELIUM) + +#include "arm_mve.h" +#include "arm_helium_utils.h" + +void arm_dot_prod_f32( + const float32_t * pSrcA, + const float32_t * pSrcB, + uint32_t blockSize, + float32_t * result) +{ + float32x4_t vecA, vecB; + float32x4_t vecSum; + vecSum = vdupq_n_f32(0.0f); + + do { + /* + * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] + * Multiply the active lanes and accumulate the products lane-wise into vecSum. 
+ */ + mve_pred16_t p = vctp32q(blockSize); /* tail predicate built from the remaining element count */ + + vecA = vldrwq_z_f32(pSrcA, p); + vecB = vldrwq_z_f32(pSrcB, p); + vecSum = vfmaq_m(vecSum, vecA, vecB, p); + /* + * Advance both source pointers by one vector + * Decrement the blockSize loop counter by the same number of elements + */ + post_incr_vec_size(pSrcA); + post_incr_vec_size(pSrcB); + blockSize -= VEC_LANES_F32; + } + while ((int32_t) blockSize > 0); /* signed compare: the unsigned counter wraps below zero after a partial last vector */ + + *result = vecAddAcrossF32Mve(vecSum); /* reduce the per-lane partial sums to the scalar result */ +} + +#else + void arm_dot_prod_f32( const float32_t * pSrcA, const float32_t * pSrcB, @@ -158,6 +198,7 @@ void arm_dot_prod_f32( *result = sum; } +#endif /* ARM_MATH_HELIUM */ /** @} end of BasicDotProd group */ diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt index ac56523f..8248c21a 100755 --- a/Source/CMakeLists.txt +++ b/Source/CMakeLists.txt @@ -17,6 +17,7 @@ option(NEONEXPERIMENTAL "Neon experimental acceleration" OFF) option(LOOPUNROLL "Loop unrolling" ON) option(ROUNDING "Rounding" OFF) option(MATRIXCHECK "Matrix Checks" OFF) +option(HELIUM "Helium acceleration" OFF) # Select which parts of the CMSIS-DSP must be compiled. # There are some dependencies between the parts but they are not tracked diff --git a/Testing/CMakeLists.txt b/Testing/CMakeLists.txt index de00fe25..be96fef1 100644 --- a/Testing/CMakeLists.txt +++ b/Testing/CMakeLists.txt @@ -7,7 +7,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/..) 
function(writeConfig path) set(output "") - list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n") + list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,HELIUM,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n") if (OPTIMIZED) list(APPEND output "1") @@ -33,6 +33,12 @@ function(writeConfig path) list(APPEND output ",0") endif() + if (HELIUM) + list(APPEND output ",1") + else() + list(APPEND output ",0") + endif() + if (LOOPUNROLL) list(APPEND output ",1") else() diff --git a/Testing/addToDB.py b/Testing/addToDB.py index 514df546..bdf570ad 100755 --- a/Testing/addToDB.py +++ b/Testing/addToDB.py @@ -19,7 +19,7 @@ import re # For table creation MKSTRFIELD=['NAME'] -MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] +MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED'] MKINTFIELD=['ID', 'CYCLES'] MKDATEFIELD=['DATE'] MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE'] @@ -31,7 +31,7 @@ MKKEYFIELDID={'CATEGORY':'categoryid', # For table value extraction VALSTRFIELD=['NAME','VERSION'] -VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] +VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED'] VALINTFIELD=['ID', 'CYCLES'] VALDATEFIELD=['DATE'] VALKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE'] @@ -246,16 +246,17 @@ def addRows(conn,elem,tableName,full): conn.commit() def addOneBenchmark(elem,fullPath,db,group): - full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) - full['DATE'] = datetime.datetime.now() - if group: - tableName = group - else: - tableName = elem.data["class"] - conn = sqlite3.connect(db) - createTableIfMissing(conn,elem,tableName,full) - addRows(conn,elem,tableName,full) - conn.close() + if os.path.isfile(fullPath): + full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) + full['DATE'] = datetime.datetime.now() + if group: + 
tableName = group + else: + tableName = elem.data["class"] + conn = sqlite3.connect(db) + createTableIfMissing(conn,elem,tableName,full) + addRows(conn,elem,tableName,full) + conn.close() def addToDB(benchmark,dbpath,elem,group): diff --git a/Testing/addToRegDB.py b/Testing/addToRegDB.py index 50c3e76a..acbf3fd3 100755 --- a/Testing/addToRegDB.py +++ b/Testing/addToRegDB.py @@ -19,7 +19,7 @@ import re # For table creation MKSTRFIELD=['NAME','Regression'] -MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] +MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED'] MKINTFIELD=['ID','MAX'] MKREALFIELD=['MAXREGCOEF'] MKDATEFIELD=['DATE'] @@ -32,7 +32,7 @@ MKKEYFIELDID={'CATEGORY':'categoryid', # For table value extraction VALSTRFIELD=['NAME','VERSION','Regression'] -VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED'] +VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED'] VALINTFIELD=['ID', 'MAX'] VALREALFIELD=['MAXREGCOEF'] VALDATEFIELD=['DATE'] @@ -257,16 +257,17 @@ def addRows(conn,elem,tableName,full): conn.commit() def addOneBenchmark(elem,fullPath,db,group): - full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) - full['DATE'] = datetime.datetime.now() - if group: - tableName = group - else: - tableName = elem.data["class"] - conn = sqlite3.connect(db) - createTableIfMissing(conn,elem,tableName,full) - addRows(conn,elem,tableName,full) - conn.close() + if os.path.isfile(fullPath): + full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) + full['DATE'] = datetime.datetime.now() + if group: + tableName = group + else: + tableName = elem.data["class"] + conn = sqlite3.connect(db) + createTableIfMissing(conn,elem,tableName,full) + addRows(conn,elem,tableName,full) + conn.close() def addToDB(benchmark,dbpath,elem,group): diff --git a/Testing/createDb.sql b/Testing/createDb.sql index 64737b35..2c9b161e 100755 --- 
a/Testing/createDb.sql +++ b/Testing/createDb.sql @@ -70,3 +70,4 @@ INSERT INTO CORE VALUES(9,"a5","ARMCA5"); INSERT INTO CORE VALUES(10,"a7","ARMCA7"); INSERT INTO CORE VALUES(11,"a9","ARMCA9"); INSERT INTO CORE VALUES(12,"a15","ARMCA15"); +INSERT INTO CORE VALUES(13,"helium","ARMv81MML_DSP_DP_MVE_FP"); diff --git a/Testing/summaryBench.py b/Testing/summaryBench.py index 14cb2be7..ff758bc0 100644 --- a/Testing/summaryBench.py +++ b/Testing/summaryBench.py @@ -57,38 +57,40 @@ def formatProd(a,b): def summaryBenchmark(resultPath,elem,path): regressionPath=os.path.join(os.path.dirname(path),"regression.csv") - print(" Generating %s" % regressionPath) - full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False) - #print(full) - - csvheaders = [] - with open(os.path.join(resultPath,'currentConfig.csv'), 'r') as f: - reader = csv.reader(f) - csvheaders = next(reader, None) - groupList = list(set(elem.params.full) - set(elem.params.summary)) - #grouped=full.groupby(list(elem.params.summary) + ['ID','CATEGORY']).max() - #grouped.reset_index(level=grouped.index.names, inplace=True) - #print(grouped) - #print(grouped.columns) + if os.path.isfile(path): + print(" Generating %s" % regressionPath) + full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False) + #print(full) + + csvheaders = [] + with open(os.path.join(resultPath,'currentConfig.csv'), 'r') as f: + reader = csv.reader(f) + csvheaders = next(reader, None) + + groupList = list(set(elem.params.full) - set(elem.params.summary)) + #grouped=full.groupby(list(elem.params.summary) + ['ID','CATEGORY']).max() + #grouped.reset_index(level=grouped.index.names, inplace=True) + #print(grouped) + #print(grouped.columns) - def reg(d): - m=d["CYCLES"].max() - results = smf.ols('CYCLES ~ ' + elem.params.formula, data=d).fit() - f=joinit([formatProd(a,b) for (a,b) in zip(results.params.index,results.params.values)]," + ") - f="".join(f) - f = re.sub(r':','*',f) - #print(results.summary()) - 
return(pd.Series({'Regression':"%s" % f,'MAX' : m,'MAXREGCOEF' : results.params.values[-1]})) - - regList = ['ID','OLDID','CATEGORY','NAME'] + csvheaders + groupList + def reg(d): + m=d["CYCLES"].max() + results = smf.ols('CYCLES ~ ' + elem.params.formula, data=d).fit() + f=joinit([formatProd(a,b) for (a,b) in zip(results.params.index,results.params.values)]," + ") + f="".join(f) + f = re.sub(r':','*',f) + #print(results.summary()) + return(pd.Series({'Regression':"%s" % f,'MAX' : m,'MAXREGCOEF' : results.params.values[-1]})) - regression=full.groupby(regList).apply(reg) - regression.reset_index(level=regression.index.names, inplace=True) - renamingDict = { a : b for (a,b) in zip(elem.params.full,elem.params.paramNames)} - regression = regression.rename(columns=renamingDict) - regression.to_csv(regressionPath,index=False,quoting=csv.QUOTE_NONNUMERIC) + regList = ['ID','OLDID','CATEGORY','NAME'] + csvheaders + groupList + + regression=full.groupby(regList).apply(reg) + regression.reset_index(level=regression.index.names, inplace=True) + renamingDict = { a : b for (a,b) in zip(elem.params.full,elem.params.paramNames)} + regression = regression.rename(columns=renamingDict) + regression.to_csv(regressionPath,index=False,quoting=csv.QUOTE_NONNUMERIC) def extractBenchmarks(resultPath,benchmark,elem): diff --git a/Testing/testmain.cpp b/Testing/testmain.cpp index eb5014a7..51d7dd1f 100644 --- a/Testing/testmain.cpp +++ b/Testing/testmain.cpp @@ -18,9 +18,13 @@ using namespace std; #include "Patterns.h" + int testmain() { char *memoryBuf=NULL; + + + memoryBuf = (char*)malloc(MEMSIZE); if (memoryBuf !=NULL) diff --git a/configCore.cmake b/configCore.cmake index eda57ec9..f7a24f65 100644 --- a/configCore.cmake +++ b/configCore.cmake @@ -180,7 +180,6 @@ function(configcore PROJECTNAME ROOT) # if (NEON AND NOT CORTEXM) - #target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON __FPU_PRESENT) target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON) endif() @@ 
-189,6 +188,10 @@ function(configcore PROJECTNAME ROOT) target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON_EXPERIMENTAL) endif() + if (HELIUM AND CORTEXM) + target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_HELIUM) + endif() + compilerSpecificCompileOptions(${PROJECTNAME} ${ROOT}) endfunction() \ No newline at end of file