CMSIS-DSP: Helium versions of absmax and absmin.

pull/19/head
Christophe Favergeon 5 years ago
parent b1c5560344
commit f1bd948a66

@ -51,6 +51,90 @@
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmax_f16(
const float16_t * pSrc,
uint32_t blockSize,
float16_t * pResult,
uint32_t * pIndex)
{
uint16_t blkCnt; /* loop counters */
f16x8_t vecSrc;
float16_t const *pSrcVec;
f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMIN);
float16_t maxValue = F16_ABSMIN;
uint16_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u16((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u16(0);
pSrcVec = (float16_t const *) pSrc;
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_f16(pSrcVec);
pSrcVec += 8;
vecSrc = vabsq(vecSrc);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
vecSrc = vldrhq_f16(pSrcVec);
pSrcVec += 8;
vecSrc = vabsq(vecSrc);
p0 = vctp16q(blkCnt);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq_m(vecSrc, curExtremValVec, p0);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
}
/*
* Get max value across the vector
*/
maxValue = vmaxnmvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* Save result
*/
*pIndex = idx;
*pResult = maxValue;
}
#else
#if defined(ARM_MATH_LOOPUNROLL)
void arm_absmax_f16(
const float16_t * pSrc,
@ -179,6 +263,7 @@ void arm_absmax_f16(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_LOOPUNROLL) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMax group
*/

@ -54,7 +54,73 @@
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmax_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
int32_t blkSize = blockSize;
f32x4_t vecSrc;
f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMIN);
float32_t maxValue = F32_ABSMIN;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
uint32_t curIdx = 0;
mve_pred16_t p0;
indexVec = vidupq_wb_u32(&curIdx, 1);
curExtremIdxVec = vdupq_n_u32(0);
do {
mve_pred16_t p = vctp32q(blkSize);
vecSrc = vldrwq_z_f32((float32_t const *) pSrc, p);
vecSrc = vabsq_m(vuninitializedq_f32(), vecSrc, p);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq_m(vecSrc, curExtremValVec, p);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
/* Does TP detection works here ?? */
indexVec = vidupq_wb_u32(&curIdx, 1);
blkSize -= 4;
pSrc += 4;
}
while (blkSize > 0);
/*
* Get max value across the vector
*/
maxValue = vmaxnmvq(maxValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpgeq(curExtremValVec, maxValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* Save result
*/
*pIndex = idx;
*pResult = maxValue;
}
#else
#if defined(ARM_MATH_LOOPUNROLL)
void arm_absmax_f32(
const float32_t * pSrc,
@ -186,6 +252,7 @@ void arm_absmax_f32(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_LOOPUNROLL) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMax group
*/

@ -43,7 +43,62 @@
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmax_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
int32_t blkCnt; /* loop counters */
q15x8_t extremValVec = vdupq_n_s16(Q15_ABSMIN);
q15_t maxValue = Q15_ABSMIN;
uint16x8_t indexVec;
uint16x8_t extremIdxVec;
mve_pred16_t p0;
uint16_t extremIdxArr[8];
indexVec = vidupq_u16(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp16q(blkCnt);
q15x8_t extremIdxVal = vld1q_z_s16(pSrc, p);
extremIdxVal = vabsq(extremIdxVal);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p_u16(extremIdxArr, indexVec, p0);
indexVec += 8;
pSrc += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
/* Get max value across the vector */
maxValue = vmaxvq(maxValue, extremValVec);
/* set index for lower values to max possible index */
p0 = vcmpgeq(extremValVec, maxValue);
extremIdxVec = vld1q_u16(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
*pIndex = vminvq(blockSize - 1, indexVec);
*pResult = maxValue;
}
#else
#if defined(ARM_MATH_DSP)
void arm_absmax_q15(
const q15_t * pSrc,
@ -173,7 +228,7 @@ void arm_absmax_q15(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_DSP) */
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMax group
*/

@ -44,6 +44,60 @@
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmax_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
int32_t blkCnt; /* loop counters */
q31x4_t extremValVec = vdupq_n_s32(Q31_ABSMIN);
q31_t maxValue = Q31_ABSMIN;
uint32x4_t indexVec;
uint32x4_t extremIdxVec;
mve_pred16_t p0;
uint32_t extremIdxArr[4];
indexVec = vidupq_u32(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp32q(blkCnt);
q31x4_t extremIdxVal = vld1q_z_s32(pSrc, p);
extremIdxVal = vabsq(extremIdxVal);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p_u32(extremIdxArr, indexVec, p0);
indexVec += 4;
pSrc += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
/* Get max value across the vector */
maxValue = vmaxvq(maxValue, extremValVec);
/* set index for lower values to max possible index */
p0 = vcmpgeq(extremValVec, maxValue);
extremIdxVec = vld1q_u32(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
*pIndex = vminvq(blockSize - 1, indexVec);
*pResult = maxValue;
}
#else
#if defined(ARM_MATH_DSP)
void arm_absmax_q31(
const q31_t * pSrc,
@ -174,6 +228,7 @@ void arm_absmax_q31(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_DSP) */
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMax group
*/

@ -43,6 +43,119 @@
@param[out] pIndex index of maximum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <stdint.h>
#include "arm_helium_utils.h"
#define MAX_BLKSZ_S8 (UINT8_MAX+1)
static void arm_small_blk_absmax_q7(
const q7_t * pSrc,
uint16_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
int32_t blkCnt; /* loop counters */
q7x16_t extremValVec = vdupq_n_s8(Q7_ABSMIN);
q7_t maxValue = Q7_ABSMIN;
uint8x16_t indexVec;
uint8x16_t extremIdxVec;
mve_pred16_t p0;
uint8_t extremIdxArr[16];
indexVec = vidupq_u8(0U, 1);
blkCnt = blockSize;
do {
mve_pred16_t p = vctp8q(blkCnt);
q7x16_t extremIdxVal = vld1q_z_s8(pSrc, p);
extremIdxVal = vabsq(extremIdxVal);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
/* store per-lane extrema indexes */
vst1q_p_u8(extremIdxArr, indexVec, p0);
indexVec += 16;
pSrc += 16;
blkCnt -= 16;
}
while (blkCnt > 0);
/* Get max value across the vector */
maxValue = vmaxvq(maxValue, extremValVec);
/* set index for lower values to max possible index */
p0 = vcmpgeq(extremValVec, maxValue);
extremIdxVec = vld1q_u8(extremIdxArr);
indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
*pIndex = vminvq_u8(blockSize - 1, indexVec);
*pResult = maxValue;
}
void arm_absmax_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
int32_t totalSize = blockSize;
if (totalSize <= MAX_BLKSZ_S8)
{
arm_small_blk_absmax_q7(pSrc, blockSize, pResult, pIndex);
}
else
{
uint32_t curIdx = 0;
q7_t curBlkExtr = Q7_MIN;
uint32_t curBlkPos = 0;
uint32_t curBlkIdx = 0;
/*
* process blocks of 255 elts
*/
while (totalSize >= MAX_BLKSZ_S8)
{
const q7_t *curSrc = pSrc;
arm_small_blk_absmax_q7(curSrc, MAX_BLKSZ_S8, pResult, pIndex);
if (*pResult > curBlkExtr)
{
/*
* update partial extrema
*/
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
curIdx++;
pSrc += MAX_BLKSZ_S8;
totalSize -= MAX_BLKSZ_S8;
}
/*
* remainder
*/
arm_small_blk_absmax_q7(pSrc, totalSize, pResult, pIndex);
if (*pResult > curBlkExtr)
{
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
*pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos;
*pResult = curBlkExtr;
}
}
#else
#if defined(ARM_MATH_DSP)
void arm_absmax_q7(
const q7_t * pSrc,
@ -173,7 +286,7 @@ void arm_absmax_q7(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_DSP) */
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMax group
*/

@ -52,7 +52,91 @@
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmin_f16(
const float16_t * pSrc,
uint32_t blockSize,
float16_t * pResult,
uint32_t * pIndex)
{
uint16_t blkCnt; /* loop counters */
f16x8_t vecSrc;
float16_t const *pSrcVec;
f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMAX);
float16_t minValue = F16_ABSMAX;
uint16_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u16((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u16(0);
pSrcVec = (float16_t const *) pSrc;
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vldrhq_f16(pSrcVec);
pSrcVec += 8;
vecSrc = vabsq(vecSrc);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
p0 = vctp16q(blkCnt);
vecSrc = vldrhq_f16(pSrcVec);
pSrcVec += 8;
vecSrc = vabsq(vecSrc);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
}
/*
* Get min value across the vector
*/
minValue = vminnmvq(minValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
#if defined(ARM_MATH_LOOPUNROLL)
void arm_absmin_f16(
const float16_t * pSrc,
@ -181,6 +265,7 @@ void arm_absmin_f16(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_LOOPUNROLL) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMin group
*/

@ -57,6 +57,91 @@
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmin_f32(
const float32_t * pSrc,
uint32_t blockSize,
float32_t * pResult,
uint32_t * pIndex)
{
int32_t blkCnt; /* loop counters */
f32x4_t vecSrc;
float32_t const *pSrcVec;
f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMAX);
float32_t minValue = F32_ABSMAX;
uint32_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
mve_pred16_t p0;
indexVec = vidupq_u32((uint32_t)0, 1);
curExtremIdxVec = vdupq_n_u32(0);
pSrcVec = (float32_t const *) pSrc;
blkCnt = blockSize >> 2;
while (blkCnt > 0)
{
vecSrc = vldrwq_f32(pSrcVec);
pSrcVec += 4;
vecSrc = vabsq(vecSrc);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = indexVec + 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = blockSize & 3;
if (blkCnt > 0)
{
p0 = vctp32q(blkCnt);
vecSrc = vldrwq_f32(pSrcVec);
pSrcVec += 4;
vecSrc = vabsq(vecSrc);
/*
* Get current max per lane and current index per lane
* when a max is selected
*/
p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
}
/*
* Get min value across the vector
*/
minValue = vminnmvq(minValue, curExtremValVec);
/*
* set index for lower values to max possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a max value
*/
idx = vminvq(idx, indexVec);
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
#if defined(ARM_MATH_LOOPUNROLL)
void arm_absmin_f32(
const float32_t * pSrc,
@ -186,6 +271,7 @@ void arm_absmin_f32(
}
#endif /* defined(ARM_MATH_LOOPUNROLL) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMin group
*/

@ -44,7 +44,93 @@
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmin_q15(
const q15_t * pSrc,
uint32_t blockSize,
q15_t * pResult,
uint32_t * pIndex)
{
uint16_t blkCnt; /* loop counters */
q15x8_t vecSrc;
q15_t const *pSrcVec;
q15x8_t curExtremValVec = vdupq_n_s16(Q15_ABSMAX);
q15_t minValue = Q15_ABSMAX;
uint16_t idx = blockSize;
uint16x8_t indexVec;
uint16x8_t curExtremIdxVec;
uint32_t startIdx = 0;
mve_pred16_t p0;
indexVec = vidupq_wb_u16(&startIdx, 1);
curExtremIdxVec = vdupq_n_u16(0);
pSrcVec = (q15_t const *) pSrc;
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrcVec);
pSrcVec += 8;
vecSrc = vabsq(vecSrc);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = vidupq_wb_u16(&startIdx, 1);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
vecSrc = vld1q(pSrcVec);
pSrcVec += 8;
vecSrc = vabsq(vecSrc);
p0 = vctp16q(blkCnt);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
#if defined(ARM_MATH_DSP)
void arm_absmin_q15(
const q15_t * pSrc,
@ -175,7 +261,7 @@ void arm_absmin_q15(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_DSP) */
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMin group
*/

@ -45,6 +45,93 @@
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_absmin_q31(
const q31_t * pSrc,
uint32_t blockSize,
q31_t * pResult,
uint32_t * pIndex)
{
uint16_t blkCnt; /* loop counters */
q31x4_t vecSrc;
q31_t const *pSrcVec;
q31x4_t curExtremValVec = vdupq_n_s32(Q31_ABSMAX);
q31_t minValue = Q31_ABSMAX;
uint16_t idx = blockSize;
uint32x4_t indexVec;
uint32x4_t curExtremIdxVec;
uint32_t startIdx = 0;
mve_pred16_t p0;
indexVec = vidupq_wb_u32(&startIdx, 1);
curExtremIdxVec = vdupq_n_u32(0);
pSrcVec = (q31_t const *) pSrc;
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrcVec);
pSrcVec += 4;
vecSrc = vabsq(vecSrc);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = vidupq_wb_u32(&startIdx, 1);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
vecSrc = vldrwq_s32(pSrcVec);
pSrcVec += 4;
vecSrc = vabsq(vecSrc);
p0 = vctp32q(blkCnt);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
/*
* Get min index which is thus for a min value
*/
idx = vminvq(idx, indexVec);
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
#else
#if defined(ARM_MATH_DSP)
void arm_absmin_q31(
const q31_t * pSrc,
@ -174,6 +261,7 @@ void arm_absmin_q31(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_DSP) */
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMin group
*/

@ -44,6 +44,147 @@
@param[out] pIndex index of minimum value returned here
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include <stdint.h>
#include "arm_helium_utils.h"
#define MAX_BLKSZ_S8 (UINT8_MAX+1)
static void arm_small_blk_absmin_q7(
const q7_t *pSrc,
uint32_t blockSize,
q7_t *pResult,
uint32_t *pIndex)
{
uint16_t blkCnt; /* loop counters */
q7x16_t vecSrc;
q7_t const *pSrcVec;
q7x16_t curExtremValVec = vdupq_n_s8(Q7_ABSMAX);
q7_t minValue = Q7_ABSMAX;
uint16_t idx = blockSize - 1;
uint8x16_t indexVec;
uint8x16_t curExtremIdxVec;
uint32_t startIdx = 0;
mve_pred16_t p0;
indexVec = vidupq_wb_u8(&startIdx, 1);
curExtremIdxVec = vdupq_n_u8(0);
pSrcVec = (q7_t const *) pSrc;
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrcVec);
pSrcVec += 16;
vecSrc = vabsq(vecSrc);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq(vecSrc, curExtremValVec);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
indexVec = vidupq_wb_u8(&startIdx, 1);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
vecSrc = vld1q(pSrcVec);
pSrcVec += 16;
vecSrc = vabsq(vecSrc);
p0 = vctp8q(blkCnt);
/*
* Get current min per lane and current index per lane
* when a min is selected
*/
p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
}
/*
* Get min value across the vector
*/
minValue = vminvq(minValue, curExtremValVec);
/*
* set index for lower values to min possible index
*/
p0 = vcmpleq(curExtremValVec, minValue);
idx = vminvq_p_u8(idx, curExtremIdxVec, p0);
/*
* Save result
*/
*pIndex = idx;
*pResult = minValue;
}
void arm_absmin_q7(
const q7_t * pSrc,
uint32_t blockSize,
q7_t * pResult,
uint32_t * pIndex)
{
int32_t totalSize = blockSize;
if (totalSize <= MAX_BLKSZ_S8)
{
arm_small_blk_absmin_q7(pSrc, blockSize, pResult, pIndex);
}
else
{
uint32_t curIdx = 0;
q7_t curBlkExtr = Q7_MAX;
uint32_t curBlkPos = 0;
uint32_t curBlkIdx = 0;
/*
* process blocks of 255 elts
*/
while (totalSize >= MAX_BLKSZ_S8)
{
const q7_t *curSrc = pSrc;
arm_small_blk_absmin_q7(curSrc, MAX_BLKSZ_S8, pResult, pIndex);
if (*pResult < curBlkExtr)
{
/*
* update partial extrema
*/
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
curIdx++;
pSrc += MAX_BLKSZ_S8;
totalSize -= MAX_BLKSZ_S8;
}
/*
* remainder
*/
arm_small_blk_absmin_q7(pSrc, totalSize, pResult, pIndex);
if (*pResult < curBlkExtr)
{
curBlkExtr = *pResult;
curBlkPos = *pIndex;
curBlkIdx = curIdx;
}
*pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos;
*pResult = curBlkExtr;
}
}
#else
#if defined(ARM_MATH_DSP)
void arm_absmin_q7(
const q7_t * pSrc,
@ -173,6 +314,7 @@ void arm_absmin_q7(
*pIndex = outIndex;
}
#endif /* defined(ARM_MATH_DSP) */
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of AbsMin group
*/

@ -48,6 +48,13 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT)
target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
endif()
if (ARM_CPU STREQUAL "cortex-m55+nomve.fp+nofp" )
target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8.1-m.main+dsp+fp.dp")
target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
endif()
if (ARM_CPU STREQUAL "cortex-m33" )
target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-sp-d16")
target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-sp-d16")
@ -68,6 +75,12 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT)
# target_link_options(${PROJECTNAME} PUBLIC "")
#endif()
if (ARM_CPU STREQUAL "cortex-a32" )
if (NOT (NEON OR NEONEXPERIMENTAL))
target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8-a;-mfpu=vfpv3-d16")
target_link_options(${PROJECTNAME} PUBLIC "-march=armv8-a;-mfpu=vfpv3-d16")
endif()
endif()
if (ARM_CPU STREQUAL "cortex-a9" )
if (NOT (NEON OR NEONEXPERIMENTAL))

Loading…
Cancel
Save