diff --git a/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c b/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c index eccf7258..6c9123fd 100755 --- a/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c +++ b/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c @@ -28,6 +28,11 @@ #error device not specified! #endif +#define SERIAL_BASE_ADDRESS (0xA8000000ul) + +#define SERIAL_DATA *((volatile unsigned *) SERIAL_BASE_ADDRESS) + + /*---------------------------------------------------------------------------- Exception / Interrupt Handler Function Prototype *----------------------------------------------------------------------------*/ @@ -138,6 +143,8 @@ void Reset_Handler(void) *----------------------------------------------------------------------------*/ void HardFault_Handler(void) { + SERIAL_DATA = 'H'; + SERIAL_DATA = '\n'; while(1); } @@ -146,5 +153,7 @@ void HardFault_Handler(void) *----------------------------------------------------------------------------*/ void Default_Handler(void) { + SERIAL_DATA = 'D'; + SERIAL_DATA = '\n'; while(1); } diff --git a/Source/StatisticsFunctions/arm_max_q15.c b/Source/StatisticsFunctions/arm_max_q15.c index 0ae049b4..cfc8cd60 100644 --- a/Source/StatisticsFunctions/arm_max_q15.c +++ b/Source/StatisticsFunctions/arm_max_q15.c @@ -55,78 +55,49 @@ void arm_max_q15( q15_t * pResult, uint32_t * pIndex) { - uint32_t blkCnt; /* loop counters */ - q15x8_t vecSrc; - q15x8_t curExtremValVec = vdupq_n_s16(Q15_MIN); - q15_t maxValue = Q15_MIN, temp; - uint32_t idx = blockSize; - uint16x8_t indexVec; - uint16x8_t curExtremIdxVec; - mve_pred16_t p0; - - - indexVec = vidupq_u16((uint32_t)0, 1); - curExtremIdxVec = vdupq_n_u16(0); - - blkCnt = blockSize >> 3; - while (blkCnt > 0U) - { - vecSrc = vldrhq_s16(pSrc); - pSrc += 8; + int32_t blkCnt; /* loop counters */ + q15x8_t extremValVec = vdupq_n_s16(Q15_MIN); + q15_t maxValue = Q15_MIN; + uint16x8_t indexVec; + uint16x8_t extremIdxVec; + mve_pred16_t p0; + uint16_t extremIdxArr[8]; + + indexVec = vidupq_u16(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp16q(blkCnt); + q15x8_t extremIdxVal = vld1q_z(pSrc, p); /* * Get current max per lane and current index per lane * when a max is selected */ - p0 = vcmpgeq(vecSrc, curExtremValVec); - curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); - curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + p0 = vcmpgeq_m(extremIdxVal, extremValVec, p); - indexVec = indexVec + 8; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - /* - * Get max value across the vector - */ - maxValue = vmaxvq(maxValue, curExtremValVec); - /* - * set index for lower values to max possible index - */ - p0 = vcmpgeq(curExtremValVec, maxValue); - indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0); - /* - * Get min index which is thus for a max value - */ - idx = vminvq(idx, indexVec); - - /* Tail */ - blkCnt = blockSize & 0x7; - while (blkCnt > 0U) - { - /* Initialize temp to the next consecutive values one by one */ - temp = *pSrc++; - - /* compare for the maximum value */ - if (maxValue < temp) - { - /* Update the maximum value and it's index */ - maxValue = temp; - idx = blockSize - blkCnt; - } - - /* Decrement loop counter */ - blkCnt--; + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p(extremIdxArr, indexVec, p0); + + indexVec += 8; + pSrc += 8; + blkCnt -= 8; } + while (blkCnt > 0); + - /* - * Save result - */ - *pIndex = idx; + /* Get max value across the vector */ + maxValue = vmaxvq(maxValue, extremValVec); + + /* set index for lower values to max possible index */ + p0 = vcmpgeq(extremValVec, maxValue); + extremIdxVec = vld1q(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0); + *pIndex = vminvq(blockSize - 1, indexVec); *pResult = maxValue; } + #else void arm_max_q15( const q15_t * pSrc, diff --git a/Source/StatisticsFunctions/arm_max_q31.c b/Source/StatisticsFunctions/arm_max_q31.c index 9b4a7436..9cff80a8 100644 --- a/Source/StatisticsFunctions/arm_max_q31.c +++ b/Source/StatisticsFunctions/arm_max_q31.c @@ -50,86 +50,54 @@ #include "arm_helium_utils.h" void arm_max_q31( - const q31_t * pSrc, - uint32_t blockSize, - q31_t * pResult, - uint32_t * pIndex) + const q31_t * pSrc, + uint32_t blockSize, + q31_t * pResult, + uint32_t * pIndex) { - uint32_t blkCnt; /* loop counters */ - q31x4_t vecSrc; - q31x4_t curExtremValVec = vdupq_n_s32( Q31_MIN); - q31_t maxValue = Q31_MIN; - q31_t temp; - uint32_t idx = blockSize; - uint32x4_t indexVec; - uint32x4_t curExtremIdxVec; - mve_pred16_t p0; - - - indexVec = vidupq_u32((uint32_t)0, 1); - curExtremIdxVec = vdupq_n_u32(0); - - /* Compute 4 outputs at a time */ - blkCnt = blockSize >> 2U; - while (blkCnt > 0U) - { - vecSrc = vldrwq_s32(pSrc); - pSrc += 4; + int32_t blkCnt; /* loop counters */ + q31x4_t extremValVec = vdupq_n_s32(Q31_MIN); + q31_t maxValue = Q31_MIN; + uint32x4_t indexVec; + uint32x4_t extremIdxVec; + mve_pred16_t p0; + uint32_t extremIdxArr[4]; + + indexVec = vidupq_u32(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp32q(blkCnt); + q31x4_t extremIdxVal = vld1q_z(pSrc, p); /* * Get current max per lane and current index per lane * when a max is selected */ - p0 = vcmpgeq(vecSrc, curExtremValVec); - curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); - curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + p0 = vcmpgeq_m(extremIdxVal, extremValVec, p); - indexVec = indexVec + 4; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - /* - * Get max value across the vector - */ - maxValue = vmaxvq(maxValue, curExtremValVec); - /* - * set index for lower values to max possible index - */ - p0 = vcmpgeq(curExtremValVec, maxValue); - indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0); - /* - * Get min index which is thus for a max value - */ - idx = vminvq(idx, indexVec); - - /* Tail */ - blkCnt = blockSize & 0x3; - - while (blkCnt > 0U) - { - /* Initialize maxVal to the next consecutive values one by one */ - temp = *pSrc++; - - /* compare for the maximum value */ - if (maxValue < temp) - { - /* Update the maximum value and it's index */ - maxValue = temp; - idx = blockSize - blkCnt; - } - - /* Decrement loop counter */ - blkCnt--; + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p(extremIdxArr, indexVec, p0); + + indexVec += 4; + pSrc += 4; + blkCnt -= 4; } + while (blkCnt > 0); - /* - * Save result - */ - *pIndex = idx; + + /* Get max value across the vector */ + maxValue = vmaxvq(maxValue, extremValVec); + + /* set index for lower values to max possible index */ + p0 = vcmpgeq(extremValVec, maxValue); + extremIdxVec = vld1q(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0); + *pIndex = vminvq(blockSize - 1, indexVec); *pResult = maxValue; } + #else void arm_max_q31( const q31_t * pSrc, diff --git a/Source/StatisticsFunctions/arm_max_q7.c b/Source/StatisticsFunctions/arm_max_q7.c index 1bb2a1f9..517c082e 100644 --- a/Source/StatisticsFunctions/arm_max_q7.c +++ b/Source/StatisticsFunctions/arm_max_q7.c @@ -51,83 +51,50 @@ static void arm_small_blk_max_q7( const q7_t * pSrc, - uint8_t blockSize, + uint16_t blockSize, q7_t * pResult, uint32_t * pIndex) { - uint32_t blkCnt; /* loop counters */ - q7x16_t vecSrc; - q7x16_t curExtremValVec = vdupq_n_s8( Q7_MIN); - q7_t maxValue = Q7_MIN, temp; - uint32_t idx = blockSize; - uint8x16_t indexVec; - uint8x16_t curExtremIdxVec; - mve_pred16_t p0; - - - indexVec = vidupq_u8((uint32_t)0, 1); - curExtremIdxVec = vdupq_n_u8(0); - - blkCnt = blockSize >> 4; - while (blkCnt > 0U) - { - vecSrc = vldrbq_s8(pSrc); - pSrc += 16; + int32_t blkCnt; /* loop counters */ + q7x16_t extremValVec = vdupq_n_s8(Q7_MIN); + q7_t maxValue = Q7_MIN; + uint8x16_t indexVec; + uint8x16_t extremIdxVec; + mve_pred16_t p0; + uint8_t extremIdxArr[16]; + + indexVec = vidupq_u8(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp8q(blkCnt); + q7x16_t extremIdxVal = vld1q_z(pSrc, p); /* * Get current max per lane and current index per lane * when a max is selected */ - p0 = vcmpgeq(vecSrc, curExtremValVec); - curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); - curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + p0 = vcmpgeq_m(extremIdxVal, extremValVec, p); - indexVec = indexVec + 16; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - - /* - * Get max value across the vector - */ - maxValue = vmaxvq(maxValue, curExtremValVec); - /* - * set index for lower values to max possible index - */ - p0 = vcmpgeq(curExtremValVec, maxValue); - indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0); - /* - * Get min index which is thus for a max value - */ - idx = vminvq(idx, indexVec); - - /* - * tail - */ - blkCnt = blockSize & 0xF; - - while (blkCnt > 0U) - { - /* Initialize temp to the next consecutive values one by one */ - temp = *pSrc++; - - /* compare for the maximum value */ - if (maxValue < temp) - { - /* Update the maximum value and it's index */ - maxValue = temp; - idx = blockSize - blkCnt; - } - - /* Decrement loop counter */ - blkCnt--; + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p(extremIdxArr, indexVec, p0); + + indexVec += 16; + pSrc += 16; + blkCnt -= 16; } - /* - * Save result - */ - *pIndex = idx; + while (blkCnt > 0); + + + /* Get max value across the vector */ + maxValue = vmaxvq(maxValue, extremValVec); + + /* set index for lower values to max possible index */ + p0 = vcmpgeq(extremValVec, maxValue); + extremIdxVec = vld1q(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0); + *pIndex = vminvq_u8(blockSize - 1, indexVec); *pResult = maxValue; } @@ -138,8 +105,9 @@ void arm_max_q7( uint32_t * pIndex) { int32_t totalSize = blockSize; + const uint16_t sub_blk_sz = UINT8_MAX + 1; - if (totalSize <= UINT8_MAX) + if (totalSize <= sub_blk_sz) { arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex); } @@ -152,11 +120,11 @@ void arm_max_q7( /* * process blocks of 255 elts */ - while (totalSize >= UINT8_MAX) + while (totalSize >= sub_blk_sz) { const q7_t *curSrc = pSrc; - arm_small_blk_max_q7(curSrc, UINT8_MAX, pResult, pIndex); + arm_small_blk_max_q7(curSrc, sub_blk_sz, pResult, pIndex); if (*pResult > curBlkExtr) { /* @@ -167,8 +135,8 @@ void arm_max_q7( curBlkIdx = curIdx; } curIdx++; - pSrc += UINT8_MAX; - totalSize -= UINT8_MAX; + pSrc += sub_blk_sz; + totalSize -= sub_blk_sz; } /* * remainder @@ -180,7 +148,7 @@ void arm_max_q7( curBlkPos = *pIndex; curBlkIdx = curIdx; } - *pIndex = curBlkIdx * UINT8_MAX + curBlkPos; + *pIndex = curBlkIdx * sub_blk_sz + curBlkPos; *pResult = curBlkExtr; } } diff --git a/Source/StatisticsFunctions/arm_min_q15.c b/Source/StatisticsFunctions/arm_min_q15.c index f8ef0942..76db76a6 100644 --- a/Source/StatisticsFunctions/arm_min_q15.c +++ b/Source/StatisticsFunctions/arm_min_q15.c @@ -56,79 +56,48 @@ void arm_min_q15( q15_t * pResult, uint32_t * pIndex) { - uint32_t blkCnt; /* loop counters */ - q15x8_t vecSrc; - q15x8_t curExtremValVec = vdupq_n_s16(Q15_MAX); - q15_t minValue = Q15_MAX,temp; - uint32_t idx = blockSize; - uint16x8_t indexVec; - uint16x8_t curExtremIdxVec; - mve_pred16_t p0; + int32_t blkCnt; /* loop counters */ + q15x8_t extremValVec = vdupq_n_s16(Q15_MAX); + q15_t minValue = Q15_MAX; + uint16x8_t indexVec; + uint16x8_t extremIdxVec; + mve_pred16_t p0; + uint16_t extremIdxArr[8]; - indexVec = vidupq_u16((uint32_t)0, 1); - curExtremIdxVec = vdupq_n_u16(0); + indexVec = vidupq_u16(0U, 1); - blkCnt = blockSize >> 3; - while (blkCnt > 0U) - { - vecSrc = vldrhq_s16(pSrc); - pSrc += 8; + blkCnt = blockSize; + do { + mve_pred16_t p = vctp16q(blkCnt); + q15x8_t extremIdxVal = vld1q_z(pSrc, p); /* * Get current min per lane and current index per lane * when a min is selected */ - p0 = vcmpleq(vecSrc, curExtremValVec); - curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); - curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + p0 = vcmpleq_m(extremIdxVal, extremValVec, p); - indexVec = indexVec + 8; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - /* - * Get min value across the vector - */ - minValue = vminvq(minValue, curExtremValVec); - /* - * set index for lower values to min possible index - */ - p0 = vcmpleq(curExtremValVec, minValue); - indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0); - /* - * Get min index which is thus for a min value - */ - idx = vminvq(idx, indexVec); - - /* - * tail - */ - blkCnt = blockSize & 7; - while (blkCnt > 0U) - { - /* Initialize minVal to the next consecutive values one by one */ - temp = *pSrc++; - - /* compare for the minimum value */ - if (minValue > temp) - { - /* Update the minimum value and it's index */ - minValue = temp; - idx = blockSize - blkCnt; - } - - /* Decrement loop counter */ - blkCnt--; + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p(extremIdxArr, indexVec, p0); + + indexVec += 8; + pSrc += 8; + blkCnt -= 8; } + while (blkCnt > 0); + + /* Get min value across the vector */ + minValue = vminvq(minValue, extremValVec); + + /* set index for lower values to min possible index */ + p0 = vcmpleq(extremValVec, minValue); + extremIdxVec = vld1q(extremIdxArr); - /* - * Save result - */ - *pIndex = idx; + indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0); + *pIndex = vminvq(blockSize - 1, indexVec); *pResult = minValue; + } #else void arm_min_q15( diff --git a/Source/StatisticsFunctions/arm_min_q31.c b/Source/StatisticsFunctions/arm_min_q31.c index 2b0c8127..2cc451f8 100644 --- a/Source/StatisticsFunctions/arm_min_q31.c +++ b/Source/StatisticsFunctions/arm_min_q31.c @@ -56,79 +56,49 @@ void arm_min_q31( q31_t * pResult, uint32_t * pIndex) { - uint32_t blkCnt; /* loop counters */ - q31x4_t vecSrc; - q31x4_t curExtremValVec = vdupq_n_s32(Q31_MAX); - q31_t minValue = Q31_MAX, temp; - uint32_t idx = blockSize; - uint32x4_t indexVec; - uint32x4_t curExtremIdxVec; - mve_pred16_t p0; - - - indexVec = vidupq_u32((uint32_t)0, 1); - curExtremIdxVec = vdupq_n_u32(0); - - /* Compute 4 outputs at a time */ - blkCnt = blockSize >> 2U; - while (blkCnt > 0U) - { - vecSrc = vldrwq_s32(pSrc); - pSrc += 4; + int32_t blkCnt; /* loop counters */ + q31x4_t extremValVec = vdupq_n_s32(Q31_MAX); + q31_t minValue = Q31_MAX; + uint32x4_t indexVec; + uint32x4_t extremIdxVec; + mve_pred16_t p0; + uint32_t extremIdxArr[4]; + + indexVec = vidupq_u32(0U, 1); + + blkCnt = blockSize; + do { + mve_pred16_t p = vctp32q(blkCnt); + q31x4_t extremIdxVal = vld1q_z(pSrc, p); /* * Get current min per lane and current index per lane * when a min is selected */ - p0 = vcmpleq(vecSrc, curExtremValVec); - curExtremValVec = vpselq(vecSrc, curExtremValVec, p0); - curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0); + p0 = vcmpleq_m(extremIdxVal, extremValVec, p); - indexVec = indexVec + 4; - /* - * Decrement the blockSize loop counter - */ - blkCnt--; - } - - /* - * Get min value across the vector - */ - minValue = vminvq(minValue, curExtremValVec); - /* - * set index for lower values to min possible index - */ - p0 = vcmpleq(curExtremValVec, minValue); - indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0); - /* - * Get min index which is thus for a min value - */ - idx = vminvq(idx, indexVec); - - - /* Tail */ - blkCnt = blockSize & 0x3; - while (blkCnt > 0U) - { - /* Initialize temp to the next consecutive values one by one */ - temp = *pSrc++; - - /* compare for the minimum value */ - if (minValue > temp) - { - /* Update the minimum value and it's index */ - minValue = temp; - idx = blockSize - blkCnt; - } - - /* Decrement loop counter */ - blkCnt--; + extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0); + /* store per-lane extrema indexes */ + vst1q_p(extremIdxArr, indexVec, p0); + + indexVec += 4; + pSrc += 4; + blkCnt -= 4; } - /* - * Save result - */ - *pIndex = idx; + while (blkCnt > 0); + + + /* Get min value across the vector */ + minValue = vminvq(minValue, extremValVec); + + /* set index for lower values to min possible index */ + p0 = vcmpleq(extremValVec, minValue); + extremIdxVec = vld1q(extremIdxArr); + + indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0); + *pIndex = vminvq(blockSize - 1, indexVec); *pResult = minValue; } + #else void arm_min_q31( const q31_t * pSrc,