diff --git a/Source/StatisticsFunctions/arm_absmax_f16.c b/Source/StatisticsFunctions/arm_absmax_f16.c index 1f82d51f..b2c50769 100755 --- a/Source/StatisticsFunctions/arm_absmax_f16.c +++ b/Source/StatisticsFunctions/arm_absmax_f16.c @@ -51,6 +51,90 @@ @return none */ +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +void arm_absmax_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult, + uint32_t * pIndex) +{ + uint16_t blkCnt; /* loop counters */ + f16x8_t vecSrc; + float16_t const *pSrcVec; + f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMIN); + float16_t maxValue = F16_ABSMIN; + uint16_t idx = blockSize; + uint16x8_t indexVec; + uint16x8_t curExtremIdxVec; + mve_pred16_t p0; + + + indexVec = vidupq_u16((uint32_t)0, 1); + curExtremIdxVec = vdupq_n_u16(0); + + pSrcVec = (float16_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + vecSrc = vldrhq_f16(pSrcVec); + pSrcVec += 8; + vecSrc = vabsq(vecSrc); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpgeq(vecSrc, curExtremValVec); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + indexVec = indexVec + 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 7; + if (blkCnt > 0U) + { + vecSrc = vldrhq_f16(pSrcVec); + pSrcVec += 8; + vecSrc = vabsq(vecSrc); + + p0 = vctp16q(blkCnt); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpgeq_m(vecSrc, curExtremValVec, p0); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + } + /* + * Get max value across the vector + */ + maxValue = vmaxnmvq(maxValue, curExtremValVec); + /* + * set index for lower values to max possible index + */ + p0 = vcmpgeq(curExtremValVec, maxValue); + indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0); + /* + * Get min index which is thus for a max value + */ + idx = vminvq(idx, indexVec); + /* + * Save result + */ + *pIndex = idx; + *pResult = maxValue; +} +#else #if defined(ARM_MATH_LOOPUNROLL) void arm_absmax_f16( const float16_t * pSrc, @@ -179,6 +263,7 @@ void arm_absmax_f16( *pIndex = outIndex; } #endif /* defined(ARM_MATH_LOOPUNROLL) */ +#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMax group */ diff --git a/Source/StatisticsFunctions/arm_absmax_f32.c b/Source/StatisticsFunctions/arm_absmax_f32.c index 3367f730..85e0e5cb 100755 --- a/Source/StatisticsFunctions/arm_absmax_f32.c +++ b/Source/StatisticsFunctions/arm_absmax_f32.c @@ -54,7 +54,73 @@ @param[out] pIndex index of maximum value returned here @return none */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_helium_utils.h" + +void arm_absmax_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex) +{ + int32_t blkSize = blockSize; + f32x4_t vecSrc; + f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMIN); + float32_t maxValue = F32_ABSMIN; + uint32_t idx = blockSize; + uint32x4_t indexVec; + uint32x4_t curExtremIdxVec; + uint32_t curIdx = 0; + mve_pred16_t p0; + + + indexVec = vidupq_wb_u32(&curIdx, 1); + curExtremIdxVec = vdupq_n_u32(0); + + do { + mve_pred16_t p = vctp32q(blkSize); + + vecSrc = vldrwq_z_f32((float32_t const *) pSrc, p); + vecSrc = vabsq_m(vuninitializedq_f32(), vecSrc, p); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpgeq_m(vecSrc, curExtremValVec, p); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + /* Does TP detection works here ?? */ + indexVec = vidupq_wb_u32(&curIdx, 1); + + blkSize -= 4; + pSrc += 4; + } + while (blkSize > 0); + + /* + * Get max value across the vector + */ + maxValue = vmaxnmvq(maxValue, curExtremValVec); + /* + * set index for lower values to max possible index + */ + p0 = vcmpgeq(curExtremValVec, maxValue); + indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0); + /* + * Get min index which is thus for a max value + */ + idx = vminvq(idx, indexVec); + /* + * Save result + */ + *pIndex = idx; + *pResult = maxValue; +} + + +#else #if defined(ARM_MATH_LOOPUNROLL) void arm_absmax_f32( const float32_t * pSrc, @@ -186,6 +252,7 @@ void arm_absmax_f32( *pIndex = outIndex; } #endif /* defined(ARM_MATH_LOOPUNROLL) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMax group */ diff --git a/Source/StatisticsFunctions/arm_absmax_q15.c b/Source/StatisticsFunctions/arm_absmax_q15.c index 8473fb67..50182e54 100755 --- a/Source/StatisticsFunctions/arm_absmax_q15.c +++ b/Source/StatisticsFunctions/arm_absmax_q15.c @@ -43,7 +43,62 @@ @param[out] pIndex index of maximum value returned here @return none */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_helium_utils.h" + +void arm_absmax_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex) +{ + int32_t blkCnt; /* loop counters */ + q15x8_t extremValVec = vdupq_n_s16(Q15_ABSMIN); + q15_t maxValue = Q15_ABSMIN; + uint16x8_t indexVec; + uint16x8_t extremIdxVec; + mve_pred16_t p0; + uint16_t extremIdxArr[8]; + + indexVec = vidupq_u16(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp16q(blkCnt); + q15x8_t extremIdxVal = vld1q_z_s16(pSrc, p); + + extremIdxVal = vabsq(extremIdxVal); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpgeq_m(extremIdxVal, extremValVec, p); + + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p_u16(extremIdxArr, indexVec, p0); + + indexVec += 8; + pSrc += 8; + blkCnt -= 8; + } + while (blkCnt > 0); + + + /* Get max value across the vector */ + maxValue = vmaxvq(maxValue, extremValVec); + + /* set index for lower values to max possible index */ + p0 = vcmpgeq(extremValVec, maxValue); + extremIdxVec = vld1q_u16(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0); + *pIndex = vminvq(blockSize - 1, indexVec); + *pResult = maxValue; +} + +#else #if defined(ARM_MATH_DSP) void arm_absmax_q15( const q15_t * pSrc, @@ -173,7 +228,7 @@ void arm_absmax_q15( *pIndex = outIndex; } #endif /* defined(ARM_MATH_DSP) */ - +#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMax group */ diff --git a/Source/StatisticsFunctions/arm_absmax_q31.c b/Source/StatisticsFunctions/arm_absmax_q31.c index f8a99fae..cd65ce40 100755 --- a/Source/StatisticsFunctions/arm_absmax_q31.c +++ b/Source/StatisticsFunctions/arm_absmax_q31.c @@ -44,6 +44,60 @@ @return none */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +void arm_absmax_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex) +{ + int32_t blkCnt; /* loop counters */ + q31x4_t extremValVec = vdupq_n_s32(Q31_ABSMIN); + q31_t maxValue = Q31_ABSMIN; + uint32x4_t indexVec; + uint32x4_t extremIdxVec; + mve_pred16_t p0; + uint32_t extremIdxArr[4]; + + indexVec = vidupq_u32(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp32q(blkCnt); + q31x4_t extremIdxVal = vld1q_z_s32(pSrc, p); + + extremIdxVal = vabsq(extremIdxVal); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpgeq_m(extremIdxVal, extremValVec, p); + + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p_u32(extremIdxArr, indexVec, p0); + + indexVec += 4; + pSrc += 4; + blkCnt -= 4; + } + while (blkCnt > 0); + + + /* Get max value across the vector */ + maxValue = vmaxvq(maxValue, extremValVec); + + /* set index for lower values to max possible index */ + p0 = vcmpgeq(extremValVec, maxValue); + extremIdxVec = vld1q_u32(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0); + *pIndex = vminvq(blockSize - 1, indexVec); + *pResult = maxValue; +} +#else #if defined(ARM_MATH_DSP) void arm_absmax_q31( const q31_t * pSrc, @@ -174,6 +228,7 @@ void arm_absmax_q31( *pIndex = outIndex; } #endif /* defined(ARM_MATH_DSP) */ +#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMax group */ diff --git a/Source/StatisticsFunctions/arm_absmax_q7.c b/Source/StatisticsFunctions/arm_absmax_q7.c index acf0ebda..2321983d 100755 --- a/Source/StatisticsFunctions/arm_absmax_q7.c +++ b/Source/StatisticsFunctions/arm_absmax_q7.c @@ -43,6 +43,119 @@ @param[out] pIndex index of maximum value returned here @return none */ + +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include +#include "arm_helium_utils.h" + +#define MAX_BLKSZ_S8 (UINT8_MAX+1) + +static void arm_small_blk_absmax_q7( + const q7_t * pSrc, + uint16_t blockSize, + q7_t * pResult, + uint32_t * pIndex) +{ + int32_t blkCnt; /* loop counters */ + q7x16_t extremValVec = vdupq_n_s8(Q7_ABSMIN); + q7_t maxValue = Q7_ABSMIN; + uint8x16_t indexVec; + uint8x16_t extremIdxVec; + mve_pred16_t p0; + uint8_t extremIdxArr[16]; + + indexVec = vidupq_u8(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp8q(blkCnt); + q7x16_t extremIdxVal = vld1q_z_s8(pSrc, p); + + extremIdxVal = vabsq(extremIdxVal); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpgeq_m(extremIdxVal, extremValVec, p); + + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p_u8(extremIdxArr, indexVec, p0); + + indexVec += 16; + pSrc += 16; + blkCnt -= 16; + } + while (blkCnt > 0); + + + /* Get max value across the vector */ + maxValue = vmaxvq(maxValue, extremValVec); + + /* set index for lower values to max possible index */ + p0 = vcmpgeq(extremValVec, maxValue); + extremIdxVec = vld1q_u8(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0); + *pIndex = vminvq_u8(blockSize - 1, indexVec); + *pResult = maxValue; +} + +void arm_absmax_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex) +{ + int32_t totalSize = blockSize; + + if (totalSize <= MAX_BLKSZ_S8) + { + arm_small_blk_absmax_q7(pSrc, blockSize, pResult, pIndex); + } + else + { + uint32_t curIdx = 0; + q7_t curBlkExtr = Q7_MIN; + uint32_t curBlkPos = 0; + uint32_t curBlkIdx = 0; + /* + * process blocks of 255 elts + */ + while (totalSize >= MAX_BLKSZ_S8) + { + const q7_t *curSrc = pSrc; + + arm_small_blk_absmax_q7(curSrc, MAX_BLKSZ_S8, pResult, pIndex); + if (*pResult > curBlkExtr) + { + /* + * update partial extrema + */ + curBlkExtr = *pResult; + curBlkPos = *pIndex; + curBlkIdx = curIdx; + } + curIdx++; + pSrc += MAX_BLKSZ_S8; + totalSize -= MAX_BLKSZ_S8; + } + /* + * remainder + */ + arm_small_blk_absmax_q7(pSrc, totalSize, pResult, pIndex); + if (*pResult > curBlkExtr) + { + curBlkExtr = *pResult; + curBlkPos = *pIndex; + curBlkIdx = curIdx; + } + *pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos; + *pResult = curBlkExtr; + } +} +#else #if defined(ARM_MATH_DSP) void arm_absmax_q7( const q7_t * pSrc, @@ -173,7 +286,7 @@ void arm_absmax_q7( *pIndex = outIndex; } #endif /* defined(ARM_MATH_DSP) */ - +#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMax group */ diff --git a/Source/StatisticsFunctions/arm_absmin_f16.c b/Source/StatisticsFunctions/arm_absmin_f16.c index 84200cb4..2a68e98a 100755 --- a/Source/StatisticsFunctions/arm_absmin_f16.c +++ b/Source/StatisticsFunctions/arm_absmin_f16.c @@ -52,7 +52,91 @@ @return none */ +#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_helium_utils.h" +void arm_absmin_f16( + const float16_t * pSrc, + uint32_t blockSize, + float16_t * pResult, + uint32_t * pIndex) +{ + uint16_t blkCnt; /* loop counters */ + f16x8_t vecSrc; + float16_t const *pSrcVec; + f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMAX); + float16_t minValue = F16_ABSMAX; + uint16_t idx = blockSize; + uint16x8_t indexVec; + uint16x8_t curExtremIdxVec; + mve_pred16_t p0; + + + indexVec = vidupq_u16((uint32_t)0, 1); + curExtremIdxVec = vdupq_n_u16(0); + + pSrcVec = (float16_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + vecSrc = vldrhq_f16(pSrcVec); + pSrcVec += 8; + vecSrc = vabsq(vecSrc); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpleq(vecSrc, curExtremValVec); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + indexVec = indexVec + 8; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 7; + if (blkCnt > 0U) + { + p0 = vctp16q(blkCnt); + + vecSrc = vldrhq_f16(pSrcVec); + pSrcVec += 8; + vecSrc = vabsq(vecSrc); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpleq_m(vecSrc, curExtremValVec, p0); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + } + /* + * Get min value across the vector + */ + minValue = vminnmvq(minValue, curExtremValVec); + /* + * set index for lower values to max possible index + */ + p0 = vcmpleq(curExtremValVec, minValue); + indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0); + /* + * Get min index which is thus for a max value + */ + idx = vminvq(idx, indexVec); + /* + * Save result + */ + *pIndex = idx; + *pResult = minValue; +} + +#else #if defined(ARM_MATH_LOOPUNROLL) void arm_absmin_f16( const float16_t * pSrc, @@ -181,6 +265,7 @@ void arm_absmin_f16( *pIndex = outIndex; } #endif /* defined(ARM_MATH_LOOPUNROLL) */ +#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMin group */ diff --git a/Source/StatisticsFunctions/arm_absmin_f32.c b/Source/StatisticsFunctions/arm_absmin_f32.c index 4a2ec2a6..2ca7c42e 100755 --- a/Source/StatisticsFunctions/arm_absmin_f32.c +++ b/Source/StatisticsFunctions/arm_absmin_f32.c @@ -57,6 +57,91 @@ @return none */ +#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" +void arm_absmin_f32( + const float32_t * pSrc, + uint32_t blockSize, + float32_t * pResult, + uint32_t * pIndex) +{ + int32_t blkCnt; /* loop counters */ + f32x4_t vecSrc; + float32_t const *pSrcVec; + f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMAX); + float32_t minValue = F32_ABSMAX; + uint32_t idx = blockSize; + uint32x4_t indexVec; + uint32x4_t curExtremIdxVec; + mve_pred16_t p0; + + + indexVec = vidupq_u32((uint32_t)0, 1); + curExtremIdxVec = vdupq_n_u32(0); + + pSrcVec = (float32_t const *) pSrc; + blkCnt = blockSize >> 2; + while (blkCnt > 0) + { + vecSrc = vldrwq_f32(pSrcVec); + pSrcVec += 4; + vecSrc = vabsq(vecSrc); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpleq(vecSrc, curExtremValVec); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + indexVec = indexVec + 4; + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 3; + if (blkCnt > 0) + { + p0 = vctp32q(blkCnt); + + vecSrc = vldrwq_f32(pSrcVec); + pSrcVec += 4; + vecSrc = vabsq(vecSrc); + /* + * Get current max per lane and current index per lane + * when a max is selected + */ + p0 = vcmpleq_m(vecSrc, curExtremValVec, p0); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + } + /* + * Get min value across the vector + */ + minValue = vminnmvq(minValue, curExtremValVec); + /* + * set index for lower values to max possible index + */ + p0 = vcmpleq(curExtremValVec, minValue); + indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0); + /* + * Get min index which is thus for a max value + */ + idx = vminvq(idx, indexVec); + /* + * Save result + */ + *pIndex = idx; + *pResult = minValue; +} + +#else #if defined(ARM_MATH_LOOPUNROLL) void arm_absmin_f32( const float32_t * pSrc, @@ -186,6 +271,7 @@ void arm_absmin_f32( } #endif /* defined(ARM_MATH_LOOPUNROLL) */ +#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMin group */ diff --git a/Source/StatisticsFunctions/arm_absmin_q15.c b/Source/StatisticsFunctions/arm_absmin_q15.c index b6b1fee5..10c27818 100755 --- a/Source/StatisticsFunctions/arm_absmin_q15.c +++ b/Source/StatisticsFunctions/arm_absmin_q15.c @@ -44,7 +44,93 @@ @param[out] pIndex index of minimum value returned here @return none */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) +#include "arm_helium_utils.h" + +void arm_absmin_q15( + const q15_t * pSrc, + uint32_t blockSize, + q15_t * pResult, + uint32_t * pIndex) +{ + uint16_t blkCnt; /* loop counters */ + q15x8_t vecSrc; + q15_t const *pSrcVec; + q15x8_t curExtremValVec = vdupq_n_s16(Q15_ABSMAX); + q15_t minValue = Q15_ABSMAX; + uint16_t idx = blockSize; + uint16x8_t indexVec; + uint16x8_t curExtremIdxVec; + uint32_t startIdx = 0; + mve_pred16_t p0; + + + indexVec = vidupq_wb_u16(&startIdx, 1); + curExtremIdxVec = vdupq_n_u16(0); + + pSrcVec = (q15_t const *) pSrc; + blkCnt = blockSize >> 3; + while (blkCnt > 0U) + { + vecSrc = vld1q(pSrcVec); + pSrcVec += 8; + vecSrc = vabsq(vecSrc); + /* + * Get current min per lane and current index per lane + * when a min is selected + */ + p0 = vcmpleq(vecSrc, curExtremValVec); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + indexVec = vidupq_wb_u16(&startIdx, 1); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 7; + if (blkCnt > 0U) + { + vecSrc = vld1q(pSrcVec); + pSrcVec += 8; + vecSrc = vabsq(vecSrc); + + p0 = vctp16q(blkCnt); + /* + * Get current min per lane and current index per lane + * when a min is selected + */ + p0 = vcmpleq_m(vecSrc, curExtremValVec, p0); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + } + /* + * Get min value across the vector + */ + minValue = vminvq(minValue, curExtremValVec); + /* + * set index for lower values to min possible index + */ + p0 = vcmpleq(curExtremValVec, minValue); + indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0); + /* + * Get min index which is thus for a min value + */ + idx = vminvq(idx, indexVec); + /* + * Save result + */ + *pIndex = idx; + *pResult = minValue; +} + +#else #if defined(ARM_MATH_DSP) void arm_absmin_q15( const q15_t * pSrc, @@ -175,7 +261,7 @@ void arm_absmin_q15( *pIndex = outIndex; } #endif /* defined(ARM_MATH_DSP) */ - +#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMin group */ diff --git a/Source/StatisticsFunctions/arm_absmin_q31.c b/Source/StatisticsFunctions/arm_absmin_q31.c index c02523e3..82765c08 100755 --- a/Source/StatisticsFunctions/arm_absmin_q31.c +++ b/Source/StatisticsFunctions/arm_absmin_q31.c @@ -45,6 +45,93 @@ @return none */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include "arm_helium_utils.h" + +void arm_absmin_q31( + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex) +{ + uint16_t blkCnt; /* loop counters */ + q31x4_t vecSrc; + q31_t const *pSrcVec; + q31x4_t curExtremValVec = vdupq_n_s32(Q31_ABSMAX); + q31_t minValue = Q31_ABSMAX; + uint16_t idx = blockSize; + uint32x4_t indexVec; + uint32x4_t curExtremIdxVec; + uint32_t startIdx = 0; + mve_pred16_t p0; + + + indexVec = vidupq_wb_u32(&startIdx, 1); + curExtremIdxVec = vdupq_n_u32(0); + + pSrcVec = (q31_t const *) pSrc; + blkCnt = blockSize >> 2; + while (blkCnt > 0U) + { + vecSrc = vldrwq_s32(pSrcVec); + pSrcVec += 4; + vecSrc = vabsq(vecSrc); + /* + * Get current min per lane and current index per lane + * when a min is selected + */ + p0 = vcmpleq(vecSrc, curExtremValVec); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + indexVec = vidupq_wb_u32(&startIdx, 1); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 3; + if (blkCnt > 0U) + { + vecSrc = vldrwq_s32(pSrcVec); + pSrcVec += 4; + vecSrc = vabsq(vecSrc); + + p0 = vctp32q(blkCnt); + /* + * Get current min per lane and current index per lane + * when a min is selected + */ + p0 = vcmpleq_m(vecSrc, curExtremValVec, p0); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + } + /* + * Get min value across the vector + */ + minValue = vminvq(minValue, curExtremValVec); + /* + * set index for lower values to min possible index + */ + p0 = vcmpleq(curExtremValVec, minValue); + indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0); + /* + * Get min index which is thus for a min value + */ + idx = vminvq(idx, indexVec); + /* + * Save result + */ + *pIndex = idx; + *pResult = minValue; +} + +#else #if defined(ARM_MATH_DSP) void arm_absmin_q31( const q31_t * pSrc, @@ -174,6 +261,7 @@ void arm_absmin_q31( *pIndex = outIndex; } #endif /* defined(ARM_MATH_DSP) */ +#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMin group */ diff --git a/Source/StatisticsFunctions/arm_absmin_q7.c b/Source/StatisticsFunctions/arm_absmin_q7.c index a0f1fb73..ccd67f09 100755 --- a/Source/StatisticsFunctions/arm_absmin_q7.c +++ b/Source/StatisticsFunctions/arm_absmin_q7.c @@ -44,6 +44,147 @@ @param[out] pIndex index of minimum value returned here @return none */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#include +#include "arm_helium_utils.h" + +#define MAX_BLKSZ_S8 (UINT8_MAX+1) + +static void arm_small_blk_absmin_q7( + const q7_t *pSrc, + uint32_t blockSize, + q7_t *pResult, + uint32_t *pIndex) +{ + uint16_t blkCnt; /* loop counters */ + q7x16_t vecSrc; + q7_t const *pSrcVec; + q7x16_t curExtremValVec = vdupq_n_s8(Q7_ABSMAX); + q7_t minValue = Q7_ABSMAX; + uint16_t idx = blockSize - 1; + uint8x16_t indexVec; + uint8x16_t curExtremIdxVec; + uint32_t startIdx = 0; + mve_pred16_t p0; + + + indexVec = vidupq_wb_u8(&startIdx, 1); + curExtremIdxVec = vdupq_n_u8(0); + + pSrcVec = (q7_t const *) pSrc; + blkCnt = blockSize >> 4; + while (blkCnt > 0U) + { + vecSrc = vld1q(pSrcVec); + pSrcVec += 16; + vecSrc = vabsq(vecSrc); + /* + * Get current min per lane and current index per lane + * when a min is selected + */ + p0 = vcmpleq(vecSrc, curExtremValVec); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + + indexVec = vidupq_wb_u8(&startIdx, 1); + /* + * Decrement the blockSize loop counter + */ + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = blockSize & 0xF; + if (blkCnt > 0U) + { + vecSrc = vld1q(pSrcVec); + pSrcVec += 16; + vecSrc = vabsq(vecSrc); + + p0 = vctp8q(blkCnt); + /* + * Get current min per lane and current index per lane + * when a min is selected + */ + p0 = vcmpleq_m(vecSrc, curExtremValVec, p0); + curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); + curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + } + /* + * Get min value across the vector + */ + minValue = vminvq(minValue, curExtremValVec); + /* + * set index for lower values to min possible index + */ + p0 = vcmpleq(curExtremValVec, minValue); + idx = vminvq_p_u8(idx, curExtremIdxVec, p0); + /* + * Save result + */ + *pIndex = idx; + *pResult = minValue; +} + + +void arm_absmin_q7( + const q7_t * pSrc, + uint32_t blockSize, + q7_t * pResult, + uint32_t * pIndex) +{ + int32_t totalSize = blockSize; + + if (totalSize <= MAX_BLKSZ_S8) + { + arm_small_blk_absmin_q7(pSrc, blockSize, pResult, pIndex); + } + else + { + uint32_t curIdx = 0; + q7_t curBlkExtr = Q7_MAX; + uint32_t curBlkPos = 0; + uint32_t curBlkIdx = 0; + /* + * process blocks of 255 elts + */ + while (totalSize >= MAX_BLKSZ_S8) + { + const q7_t *curSrc = pSrc; + + arm_small_blk_absmin_q7(curSrc, MAX_BLKSZ_S8, pResult, pIndex); + if (*pResult < curBlkExtr) + { + /* + * update partial extrema + */ + curBlkExtr = *pResult; + curBlkPos = *pIndex; + curBlkIdx = curIdx; + } + curIdx++; + pSrc += MAX_BLKSZ_S8; + totalSize -= MAX_BLKSZ_S8; + } + /* + * remainder + */ + arm_small_blk_absmin_q7(pSrc, totalSize, pResult, pIndex); + if (*pResult < curBlkExtr) + { + curBlkExtr = *pResult; + curBlkPos = *pIndex; + curBlkIdx = curIdx; + } + *pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos; + *pResult = curBlkExtr; + } +} + +#else #if defined(ARM_MATH_DSP) void arm_absmin_q7( const q7_t * pSrc, @@ -173,6 +314,7 @@ void arm_absmin_q7( *pIndex = outIndex; } #endif /* defined(ARM_MATH_DSP) */ +#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */ /** @} end of AbsMin group */ diff --git a/Toolchain/GCC.cmake b/Toolchain/GCC.cmake index c0e051d4..6c495802 100644 --- a/Toolchain/GCC.cmake +++ b/Toolchain/GCC.cmake @@ -48,6 +48,13 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT) target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16") endif() + if (ARM_CPU STREQUAL "cortex-m55+nomve.fp+nofp" ) + target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8.1-m.main+dsp+fp.dp") + target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16") + target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16") + endif() + + if (ARM_CPU STREQUAL "cortex-m33" ) target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-sp-d16") target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-sp-d16") @@ -68,6 +75,12 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT) # target_link_options(${PROJECTNAME} PUBLIC "") #endif() + if (ARM_CPU STREQUAL "cortex-a32" ) + if (NOT (NEON OR NEONEXPERIMENTAL)) + target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8-a;-mfpu=vfpv3-d16") + target_link_options(${PROJECTNAME} PUBLIC "-march=armv8-a;-mfpu=vfpv3-d16") + endif() + endif() if (ARM_CPU STREQUAL "cortex-a9" ) if (NOT (NEON OR NEONEXPERIMENTAL))