CMSIS-DSP: Improvements to MVE code for min/max.

pull/19/head
Christophe Favergeon 5 years ago
parent 380fbca6a1
commit cf32f9527e

@ -28,6 +28,11 @@
#error device not specified!
#endif
#define SERIAL_BASE_ADDRESS (0xA8000000ul)
#define SERIAL_DATA *((volatile unsigned *) SERIAL_BASE_ADDRESS)
/*----------------------------------------------------------------------------
Exception / Interrupt Handler Function Prototype
*----------------------------------------------------------------------------*/
@ -138,6 +143,8 @@ void Reset_Handler(void)
*----------------------------------------------------------------------------*/
void HardFault_Handler(void)
{
SERIAL_DATA = 'H';
SERIAL_DATA = '\n';
while(1);
}
@ -146,5 +153,7 @@ void HardFault_Handler(void)
*----------------------------------------------------------------------------*/
void Default_Handler(void)
{
SERIAL_DATA = 'D';
SERIAL_DATA = '\n';
while(1);
}

@ -55,78 +55,49 @@ void arm_max_q15(
q15_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15x8_t curExtremValVec = vdupq_n_s16(Q15_MIN);
q15_t maxValue = Q15_MIN, temp;
uint32_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u16((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u16(0);
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
pSrc += 8;
int32_t blkCnt; /* loop counters */
q15x8_t extremValVec = vdupq_n_s16(Q15_MIN);
q15_t maxValue = Q15_MIN;
uint16x8_t indexVec;
uint16x8_t extremIdxVec;
mve_pred16_t p0;
uint16_t extremIdxArr[8];
indexVec = vidupq_u16(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp16q(blkCnt);
q15x8_t extremIdxVal = vld1q_z(pSrc, p);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
indexVec = indexVec + 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x7;
while (blkCnt > 0U)
{
/* Initialize temp to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the maximum value */
if (maxValue < temp)
{
/* Update the maximum value and it's index */
maxValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p(extremIdxArr, indexVec, p0);
indexVec += 8;
pSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
/*
* Save result
*/
*pIndex = idx;
/* Get max value across the vector */
maxValue = vmaxvq(maxValue, extremValVec);
/* set index for lower values to max possible index */
p0 = vcmpgeq(extremValVec, maxValue);
extremIdxVec = vld1q(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
*pIndex = vminvq(blockSize - 1, indexVec);
*pResult = maxValue;
}
#else
void arm_max_q15(
const q15_t * pSrc,

@ -50,86 +50,54 @@
#include "arm_helium_utils.h"
void arm_max_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31x4_t curExtremValVec = vdupq_n_s32( Q31_MIN);
q31_t maxValue = Q31_MIN;
q31_t temp;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u32((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u32(0);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
pSrc += 4;
int32_t blkCnt; /* loop counters */
q31x4_t extremValVec = vdupq_n_s32(Q31_MIN);
q31_t maxValue = Q31_MIN;
uint32x4_t indexVec;
uint32x4_t extremIdxVec;
mve_pred16_t p0;
uint32_t extremIdxArr[4];
indexVec = vidupq_u32(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp32q(blkCnt);
q31x4_t extremIdxVal = vld1q_z(pSrc, p);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
indexVec = indexVec + 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the maximum value */
if (maxValue < temp)
{
/* Update the maximum value and it's index */
maxValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p(extremIdxArr, indexVec, p0);
indexVec += 4;
pSrc += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
/*
* Save result
*/
*pIndex = idx;
/* Get max value across the vector */
maxValue = vmaxvq(maxValue, extremValVec);
/* set index for lower values to max possible index */
p0 = vcmpgeq(extremValVec, maxValue);
extremIdxVec = vld1q(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
*pIndex = vminvq(blockSize - 1, indexVec);
*pResult = maxValue;
}
#else
void arm_max_q31(
const q31_t * pSrc,

@ -51,83 +51,50 @@
static void arm_small_blk_max_q7(
const q7_t * pSrc,
uint8_t blockSize,
uint16_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q7x16_t curExtremValVec = vdupq_n_s8( Q7_MIN);
q7_t maxValue = Q7_MIN, temp;
uint32_t idx = blockSize;
uint8x16_t indexVec;
uint8x16_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u8((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u8(0);
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vldrbq_s8(pSrc);
pSrc += 16;
int32_t blkCnt; /* loop counters */
q7x16_t extremValVec = vdupq_n_s8(Q7_MIN);
q7_t maxValue = Q7_MIN;
uint8x16_t indexVec;
uint8x16_t extremIdxVec;
mve_pred16_t p0;
uint8_t extremIdxArr[16];
indexVec = vidupq_u8(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp8q(blkCnt);
q7x16_t extremIdxVal = vld1q_z(pSrc, p);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
indexVec = indexVec + 16;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get max value across the vector
*/
maxValue = vmaxvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* tail
*/
blkCnt = blockSize & 0xF;
while (blkCnt > 0U)
{
/* Initialize temp to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the maximum value */
if (maxValue < temp)
{
/* Update the maximum value and it's index */
maxValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p(extremIdxArr, indexVec, p0);
indexVec += 16;
pSrc += 16;
blkCnt -= 16;
}
/*
* Save result
*/
*pIndex = idx;
while (blkCnt > 0);
/* Get max value across the vector */
maxValue = vmaxvq(maxValue, extremValVec);
/* set index for lower values to max possible index */
p0 = vcmpgeq(extremValVec, maxValue);
extremIdxVec = vld1q(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
*pIndex = vminvq_u8(blockSize - 1, indexVec);
*pResult = maxValue;
}
@ -138,8 +105,9 @@ void arm_max_q7(
uint32_t * pIndex)
{
int32_t totalSize = blockSize;
const uint16_t sub_blk_sz = UINT8_MAX + 1;
if (totalSize <= UINT8_MAX)
if (totalSize <= sub_blk_sz)
{
arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex);
}
@ -152,11 +120,11 @@ void arm_max_q7(
/*
* process blocks of 255 elts
*/
while (totalSize >= UINT8_MAX)
while (totalSize >= sub_blk_sz)
{
const q7_t *curSrc = pSrc;
arm_small_blk_max_q7(curSrc, UINT8_MAX, pResult, pIndex);
arm_small_blk_max_q7(curSrc, sub_blk_sz, pResult, pIndex);
if (*pResult > curBlkExtr)
{
/*
@ -167,8 +135,8 @@ void arm_max_q7(
curBlkIdx = curIdx;
}
curIdx++;
pSrc += UINT8_MAX;
totalSize -= UINT8_MAX;
pSrc += sub_blk_sz;
totalSize -= sub_blk_sz;
}
/*
* remainder
@ -180,7 +148,7 @@ void arm_max_q7(
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
*pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
*pIndex = curBlkIdx * sub_blk_sz + curBlkPos;
*pResult = curBlkExtr;
}
}

@ -56,79 +56,48 @@ void arm_min_q15(
q15_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15x8_t curExtremValVec = vdupq_n_s16(Q15_MAX);
q15_t minValue = Q15_MAX,temp;
uint32_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
mve_pred16_t p0;
int32_t blkCnt; /* loop counters */
q15x8_t extremValVec = vdupq_n_s16(Q15_MAX);
q15_t minValue = Q15_MAX;
uint16x8_t indexVec;
uint16x8_t extremIdxVec;
mve_pred16_t p0;
uint16_t extremIdxArr[8];
indexVec = vidupq_u16((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u16(0);
indexVec = vidupq_u16(0U, 1);
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_s16(pSrc);
pSrc += 8;
blkCnt = blockSize;
do {
mve_pred16_t p = vctp16q(blkCnt);
q15x8_t extremIdxVal = vld1q_z(pSrc, p);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
indexVec = indexVec + 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
/*
* tail
*/
blkCnt = blockSize & 7;
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the minimum value */
if (minValue > temp)
{
/* Update the minimum value and it's index */
minValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p(extremIdxArr, indexVec, p0);
indexVec += 8;
pSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
/* Get min value across the vector */
minValue = vminvq(minValue, extremValVec);
/* set index for lower values to min possible index */
p0 = vcmpleq(extremValVec, minValue);
extremIdxVec = vld1q(extremIdxArr);
/*
* Save result
*/
*pIndex = idx;
indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
*pIndex = vminvq(blockSize - 1, indexVec);
*pResult = minValue;
}
#else
void arm_min_q15(

@ -56,79 +56,49 @@ void arm_min_q31(
q31_t * pResult,
uint32_t * pIndex)
{
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31x4_t curExtremValVec = vdupq_n_s32(Q31_MAX);
q31_t minValue = Q31_MAX, temp;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u32((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u32(0);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrc);
pSrc += 4;
int32_t blkCnt; /* loop counters */
q31x4_t extremValVec = vdupq_n_s32(Q31_MAX);
q31_t minValue = Q31_MAX;
uint32x4_t indexVec;
uint32x4_t extremIdxVec;
mve_pred16_t p0;
uint32_t extremIdxArr[4];
indexVec = vidupq_u32(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp32q(blkCnt);
q31x4_t extremIdxVal = vld1q_z(pSrc, p);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
indexVec = indexVec + 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
/* Tail */
blkCnt = blockSize & 0x3;
while (blkCnt > 0U)
{
/* Initialize temp to the next consecutive values one by one */
temp = *pSrc++;
/* compare for the minimum value */
if (minValue > temp)
{
/* Update the minimum value and it's index */
minValue = temp;
idx = blockSize - blkCnt;
}
/* Decrement loop counter */
blkCnt--;
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p(extremIdxArr, indexVec, p0);
indexVec += 4;
pSrc += 4;
blkCnt -= 4;
}
/*
* Save result
*/
*pIndex = idx;
while (blkCnt > 0);
/* Get min value across the vector */
minValue = vminvq(minValue, extremValVec);
/* set index for lower values to min possible index */
p0 = vcmpleq(extremValVec, minValue);
extremIdxVec = vld1q(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
*pIndex = vminvq(blockSize - 1, indexVec);
*pResult = minValue;
}
#else
void arm_min_q31(
const q31_t * pSrc,

Loading…
Cancel
Save