From c2ca0dd2f8099c6e6100038de5fd80b4bb0cd4f1 Mon Sep 17 00:00:00 2001 From: Christophe Favergeon Date: Wed, 13 May 2020 13:06:58 +0200 Subject: [PATCH] CMSIS-DSP: Improvement to testing scripts --- PrivateInclude/arm_vec_filtering.h | 2519 ++++++++++---------- Testing/CMakeLists.txt | 8 +- Testing/TestScripts/Regression/Commands.py | 19 +- Testing/addToDB.py | 33 +- Testing/addToRegDB.py | 34 +- Testing/bench.txt | 60 +- Testing/createDb.sql | 21 +- Testing/examples.sql | 7 +- Testing/extractDb.py | 283 +++ Testing/runAllBenchmarks.bat | 70 - Testing/runAllBenchmarks.py | 105 - Testing/runAllTests.py | 42 +- Toolchain/AC6.cmake | 2 +- 13 files changed, 1654 insertions(+), 1549 deletions(-) create mode 100755 Testing/extractDb.py delete mode 100755 Testing/runAllBenchmarks.bat delete mode 100755 Testing/runAllBenchmarks.py diff --git a/PrivateInclude/arm_vec_filtering.h b/PrivateInclude/arm_vec_filtering.h index 65fc752a..b2a06903 100755 --- a/PrivateInclude/arm_vec_filtering.h +++ b/PrivateInclude/arm_vec_filtering.h @@ -253,291 +253,223 @@ extern "C" acc1 = vecAddAcrossF32Mve(acc1Vec); \ } -#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \ -{ \ - float32_t const *pSrcX; \ - f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ - uint32_t k; \ - \ - acc0Vec = vdupq_n_f32(0.0f); \ - acc1Vec = vdupq_n_f32(0.0f); \ - pSrcX = (float32_t const *) pX; \ - k = (count - 1) >> 2; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - pY-=4; \ - xVec = vldrwq_f32(&pSrcX[1]); \ - acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = (count - 1) % 0x4U; \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - xVec = vldrwq_f32(&pSrcX[1]); \ - acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - p0 = vctp32q(k+1); \ - acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ - \ - acc0 = vecAddAcrossF32Mve(acc0Vec); \ - acc1 = vecAddAcrossF32Mve(acc1Vec); \ +#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \ +{ \ + float32_t const *pSrcX; \ + f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ + uint32_t k; \ + \ + acc0Vec = vdupq_n_f32(0.0f); \ + acc1Vec = vdupq_n_f32(0.0f); \ + pSrcX = (float32_t const *) pX; \ + k = (count - 1) >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vldrwq_f32(&pSrcX[1]); \ + acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = (count - 1) % 0x4U; \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + xVec = vldrwq_f32(&pSrcX[1]); \ + acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + p0 = vctp32q(k+1); \ + acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ + \ + acc0 = vecAddAcrossF32Mve(acc0Vec); \ + acc1 = vecAddAcrossF32Mve(acc1Vec); \ } -#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \ -{ \ - float32_t const *pSrcX; \ - f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ - uint32_t k; \ - \ - acc0Vec = vdupq_n_f32(0.0f); \ - acc1Vec = vdupq_n_f32(0.0f); \ - pSrcX = (float32_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - pY-=4; \ - xVec = vldrwq_f32(&pSrcX[1]); \ - acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - xVec = vldrwq_f32(&pSrcX[1]); \ - acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ - } \ - acc0 = vecAddAcrossF32Mve(acc0Vec); \ - acc1 = vecAddAcrossF32Mve(acc1Vec); \ +#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \ +{ \ + float32_t const *pSrcX; \ + f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ + uint32_t k; \ + \ + acc0Vec = vdupq_n_f32(0.0f); \ + acc1Vec = vdupq_n_f32(0.0f); \ + pSrcX = (float32_t const *) pX; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vldrwq_f32(&pSrcX[1]); \ + acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + xVec = vldrwq_f32(&pSrcX[1]); \ + acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ + } \ + acc0 = vecAddAcrossF32Mve(acc0Vec); \ + acc1 = vecAddAcrossF32Mve(acc1Vec); \ } -#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count) \ -{ \ - float32_t const *pSrcX; \ - const float32_t *pY1 = pY + 1; \ - f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ - uint32_t k; \ - \ - acc0Vec = vdupq_n_f32(0.0f); \ - acc1Vec = vdupq_n_f32(0.0f); \ - pSrcX = (float32_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - pY-=4; \ - acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ - yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \ - pY1-=4; \ - acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x4U; \ - /* use predication to finalize MAC sum */ \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - mve_pred16_t p0 = vctp32q(k); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ - yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp32q(k+1); \ - acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ - \ - acc0 = vecAddAcrossF32Mve(acc0Vec); \ - acc1 = vecAddAcrossF32Mve(acc1Vec); \ +#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\ +{ \ + float32_t const *pSrcX; \ + const float32_t *pY1 = pY + 1; \ + f32x4_t acc0Vec, acc1Vec, xVec, yVec; \ + uint32_t k; \ + \ + acc0Vec = vdupq_n_f32(0.0f); \ + acc1Vec = vdupq_n_f32(0.0f); \ + pSrcX = (float32_t const *) pX; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + pY-=4; \ + acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \ + yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \ + pY1-=4; \ + acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = count % 0x4U; \ + /* use predication to finalize MAC sum */ \ + /* acc0 requires exact number of sample */ \ + /* disable extra lanes in final MAC computation */ \ + mve_pred16_t p0 = vctp32q(k); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \ + yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \ + /* acc1 requires 1 additional sample */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp32q(k+1); \ + acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \ + \ + acc0 = vecAddAcrossF32Mve(acc0Vec); \ + acc1 = vecAddAcrossF32Mve(acc1Vec); \ } -#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count) \ -{ \ - float32_t const *pSrcX; \ - f32x4_t accVec, xVec, yVec; \ - uint32_t k; \ - \ - accVec = vdupq_n_f32(0.0f); \ - pSrcX = (float32_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - pY-=4; \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - accVec = vfmaq_f32(accVec, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ - accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \ - } \ - acc = vecAddAcrossF32Mve(accVec); \ +#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count) \ +{ \ + float32_t const *pSrcX; \ + f32x4_t accVec, xVec, yVec; \ + uint32_t k; \ + \ + accVec = vdupq_n_f32(0.0f); \ + pSrcX = (float32_t const *) pX; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + accVec = vfmaq_f32(accVec, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \ + accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \ + } \ + acc = vecAddAcrossF32Mve(accVec); \ } #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/ #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) -#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count) \ -{ \ - q31_t const *pSrcX; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - pY-=4; \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc = vmlaldavaq(acc, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ - } \ - acc = asrl(acc, 31); \ -} - - - -#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count) \ -{ \ - q31_t const *pSrcX; \ - const q31_t *pY1 = pY + 1; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - pY-=4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \ - pY1-=4; \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x4U; \ - /* use predication to finalize MAC sum */ \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - mve_pred16_t p0 = vctp32q(k); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp32q(k+1); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ +#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count) \ +{ \ + q31_t const *pSrcX; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc = vmlaldavaq(acc, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ + } \ + acc = asrl(acc, 31); \ } - -#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\ +#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\ { \ q31_t const *pSrcX; \ + const q31_t *pY1 = pY + 1; \ q31x4_t xVec, yVec; \ uint32_t k; \ \ pSrcX = (q31_t const *) pX; \ - k = (count-1) >> 2; \ + k = count >> 2; \ \ while (k > 0U) \ { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ + xVec = vld1q(pSrcX); pSrcX += 4; \ yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ pY-=4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); \ - pSrcX += 4; \ acc0 = vmlaldavaq(acc0, xVec, yVec); \ + yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \ + pY1-=4; \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ /* Decrement the loop counter */ \ k--; \ } \ - k = (count - 1) % 0x4U; \ + k = count % 0x4U; \ /* use predication to finalize MAC sum */ \ - /* acc1 requires exact number of sample (count-1) */ \ + /* acc0 requires exact number of sample */ \ /* disable extra lanes in final MAC computation */ \ mve_pred16_t p0 = vctp32q(k); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - /* acc0 requires 1 additional sample (count) */ \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \ + /* acc1 requires 1 additional sample */ \ /* so add 1 to unmask an extra lane in final MAC computation */ \ p0 = vctp32q(k+1); \ - xVec = vld1q(pSrcX); \ - pSrcX += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ \ acc0 = asrl(acc0, 31); \ acc1 = asrl(acc1, 31); \ @@ -545,1110 +477,1103 @@ extern "C" -#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \ -{ \ - q31_t const *pSrcX; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - pY-=4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ + +#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \ +{ \ + q31_t const *pSrcX; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + k = (count-1) >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); \ + pSrcX += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = (count - 1) % 0x4U; \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires exact number of sample (count-1) */ \ + /* disable extra lanes in final MAC computation */ \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + /* acc0 requires 1 additional sample (count) */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp32q(k+1); \ + xVec = vld1q(pSrcX); \ + pSrcX += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ } -#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \ -{ \ - q31_t const *pSrcX; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - pY-=4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vldrwq_s32(&pSrcX[2]); \ - acc2 = vmlaldavaq(acc2, xVec, yVec); \ - xVec = vldrwq_s32(&pSrcX[3]); \ - acc3 = vmlaldavaq(acc3, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vldrwq_s32(&pSrcX[2]); \ - acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ - xVec = vldrwq_s32(&pSrcX[3]); \ - acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ - acc2 = asrl(acc2, 31); \ - acc3 = asrl(acc3, 31); \ +#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \ +{ \ + q31_t const *pSrcX; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ } -#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count) \ -{ \ - q31_t const *pSrcX, *pSrcY; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - pSrcY = (q31_t const *) pY; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_s32(&pSrcY[-1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x4U; \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - mve_pred16_t p0 = vctp32q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vldrwq_s32(&pSrcY[-1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - p0 = vctp32q(k); \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \ - \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ + + +#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \ +{ \ + q31_t const *pSrcX; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + pY-=4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vldrwq_s32(&pSrcX[2]); \ + acc2 = vmlaldavaq(acc2, xVec, yVec); \ + xVec = vldrwq_s32(&pSrcX[3]); \ + acc3 = vmlaldavaq(acc3, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vldrwq_s32(&pSrcX[2]); \ + acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ + xVec = vldrwq_s32(&pSrcX[3]); \ + acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ + acc2 = asrl(acc2, 31); \ + acc3 = asrl(acc3, 31); \ } -#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count) \ -{ \ - q31_t const *pSrcX, *pSrcY; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - pSrcY = (q31_t const *) pY; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - acc = vmlaldavaq(acc, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ - } \ - acc = asrl(acc, 31); \ +#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\ +{ \ + q31_t const *pSrcX, *pSrcY; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + pSrcY = (q31_t const *) pY; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vldrwq_s32(&pSrcY[-1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = count % 0x4U; \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires 1 additional sample */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + mve_pred16_t p0 = vctp32q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vldrwq_s32(&pSrcY[-1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \ + /* acc0 requires exact number of sample */ \ + /* disable extra lanes in final MAC computation */ \ + p0 = vctp32q(k); \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \ + \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ } -#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \ -{ \ - q31_t const *pSrcX, *pSrcY; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - pSrcY = (q31_t const *) pY; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vldrwq_s32(&pSrcX[2]); \ - acc2 = vmlaldavaq(acc2, xVec, yVec); \ - xVec = vldrwq_s32(&pSrcX[3]); \ - acc3 = vmlaldavaq(acc3, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* loop + tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vldrwq_s32(&pSrcX[2]); \ - acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ - xVec = vldrwq_s32(&pSrcX[3]); \ - acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ - acc2 = asrl(acc2, 31); \ - acc3 = asrl(acc3, 31); \ +#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)\ +{ \ + q31_t const *pSrcX, *pSrcY; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + pSrcY = (q31_t const *) pY; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + acc = vmlaldavaq(acc, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ + } \ + acc = asrl(acc, 31); \ } -#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \ -{ \ - q31_t const *pSrcX, *pSrcY; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - pSrcY = (q31_t const *) pY; \ - k = count >> 2; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* loop + tail predication expected here */ \ - k = count % 0x4U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ +#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)\ +{ \ + q31_t const *pSrcX, *pSrcY; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + pSrcY = (q31_t const *) pY; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vldrwq_s32(&pSrcX[2]); \ + acc2 = vmlaldavaq(acc2, xVec, yVec); \ + xVec = vldrwq_s32(&pSrcX[3]); \ + acc3 = vmlaldavaq(acc3, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* loop + tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vldrwq_s32(&pSrcX[2]); \ + acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ + xVec = vldrwq_s32(&pSrcX[3]); \ + acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ + acc2 = asrl(acc2, 31); \ + acc3 = asrl(acc3, 31); \ } -#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \ -{ \ - q31_t const *pSrcX, *pSrcY; \ - q31x4_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q31_t const *) pX; \ - pSrcY = (q31_t const *) pY; \ - k = (count-1) >> 2; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires exact number of sample (count-1) */ \ - /* disable extra lanes in final MAC computation */ \ - k = (count-1) % 0x4U; \ - mve_pred16_t p0 = vctp32q(k); \ - yVec = vld1q(pSrcY); pSrcY += 4; \ - xVec = vldrwq_s32(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - /* acc0 requires 1 additional sample (count) */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp32q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 4; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - \ - acc0 = asrl(acc0, 31); \ - acc1 = asrl(acc1, 31); \ +#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)\ +{ \ + q31_t const *pSrcX, *pSrcY; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + pSrcY = (q31_t const *) pY; \ + k = count >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* loop + tail predication expected here */ \ + k = count % 0x4U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ } -#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count) \ -{ \ - q15_t const *pSrcX, *pSrcY; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - pSrcY = (q15_t const *) pY; \ - k = count >> 3; \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vldrhq_s16(&pSrcY[-1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x8U; \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - mve_pred16_t p0 = vctp16q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vldrhq_s16(&pSrcY[-1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - p0 = vctp16q(k); \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \ - \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ +#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\ +{ \ + q31_t const *pSrcX, *pSrcY; \ + q31x4_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q31_t const *) pX; \ + pSrcY = (q31_t const *) pY; \ + k = (count-1) >> 2; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires exact number of sample (count-1) */ \ + /* disable extra lanes in final MAC computation */ \ + k = (count-1) % 0x4U; \ + mve_pred16_t p0 = vctp32q(k); \ + yVec = vld1q(pSrcY); pSrcY += 4; \ + xVec = vldrwq_s32(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + /* acc0 requires 1 additional sample (count) */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp32q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 4; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + \ + acc0 = asrl(acc0, 31); \ + acc1 = asrl(acc1, 31); \ +} + +#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\ +{ \ + q15_t const *pSrcX, *pSrcY; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + pSrcY = (q15_t const *) pY; \ + k = count >> 3; \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vldrhq_s16(&pSrcY[-1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = count % 0x8U; \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires 1 additional sample */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + mve_pred16_t p0 = vctp16q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vldrhq_s16(&pSrcY[-1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \ + /* acc0 requires exact number of sample */ \ + /* disable extra lanes in final MAC computation */ \ + p0 = vctp16q(k); \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \ + \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ } -#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count) \ -{ \ - q15_t const *pSrcX, *pSrcY; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - pSrcY = (q15_t const *) pY; \ - k = count >> 3; \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - acc = vmlaldavaq(acc, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* tail predication expected here */ \ - k = count % 0x8U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp16q(k); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ - } \ - acc = asrl(acc, 15); \ - acc = __SSAT(acc, 16); \ +#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)\ +{ \ + q15_t const *pSrcX, *pSrcY; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + pSrcY = (q15_t const *) pY; \ + k = count >> 3; \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + acc = vmlaldavaq(acc, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* tail predication expected here */ \ + k = count % 0x8U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp16q(k); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ + } \ + acc = asrl(acc, 15); \ + acc = __SSAT(acc, 16); \ } -#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \ -{ \ - q15_t const *pSrcX, *pSrcY; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - pSrcY = (q15_t const *) pY; \ - k = count >> 3; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vldrhq_s16(&pSrcX[2]); \ - acc2 = vmlaldavaq(acc2, xVec, yVec); \ - xVec = vldrhq_s16(&pSrcX[3]); \ - acc3 = vmlaldavaq(acc3, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* loop + tail predication expected here */ \ - k = count % 0x8U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp16q(k); \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vldrhq_s16(&pSrcX[2]); \ - acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ - xVec = vldrhq_s16(&pSrcX[3]); \ - acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc2 = asrl(acc2, 15); \ - acc3 = asrl(acc3, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ - acc2 = __SSAT(acc2, 16); \ - acc3 = __SSAT(acc3, 16); \ +#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)\ +{ \ + q15_t const *pSrcX, *pSrcY; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + pSrcY = (q15_t const *) pY; \ + k = count >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vldrhq_s16(&pSrcX[2]); \ + acc2 = vmlaldavaq(acc2, xVec, yVec); \ + xVec = vldrhq_s16(&pSrcX[3]); \ + acc3 = vmlaldavaq(acc3, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* loop + tail predication expected here */ \ + k = count % 0x8U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp16q(k); \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vldrhq_s16(&pSrcX[2]); \ + acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ + xVec = vldrhq_s16(&pSrcX[3]); \ + acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc2 = asrl(acc2, 15); \ + acc3 = asrl(acc3, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ + acc2 = __SSAT(acc2, 16); \ + acc3 = __SSAT(acc3, 16); \ } -#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \ -{ \ - q15_t const *pSrcX, *pSrcY; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - pSrcY = (q15_t const *) pY; \ - k = count >> 3; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* loop + tail predication expected here */ \ - k = count % 0x8U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp16q(k); \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ +#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)\ +{ \ + q15_t const *pSrcX, *pSrcY; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + pSrcY = (q15_t const *) pY; \ + k = count >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* loop + tail predication expected here */ \ + k = count % 0x8U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp16q(k); \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ } -#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \ -{ \ - q15_t const *pSrcX, *pSrcY; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - pSrcY = (q15_t const *) pY; \ - k = (count-1) >> 3; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires exact number of sample (count-1) */ \ - /* disable extra lanes in final MAC computation */ \ - k = (count-1) % 0x8U; \ - mve_pred16_t p0 = vctp16q(k); \ - yVec = vld1q(pSrcY); pSrcY += 8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - /* acc0 requires 1 additional sample (count) */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp16q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ +#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)\ +{ \ + q15_t const *pSrcX, *pSrcY; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + pSrcY = (q15_t const *) pY; \ + k = (count-1) >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires exact number of sample (count-1) */ \ + /* disable extra lanes in final MAC computation */ \ + k = (count-1) % 0x8U; \ + mve_pred16_t p0 = vctp16q(k); \ + yVec = vld1q(pSrcY); pSrcY += 8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + /* acc0 requires 1 additional sample (count) */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp16q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ } -#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count) \ -{ \ - q15_t const *pSrcX; \ - const q15_t *pY1 = pY + 1; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - k = count >> 3; \ - \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - pY-=8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \ - pY1-=8; \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x8U; \ - /* use predication to finalize MAC sum */ \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - mve_pred16_t p0 = vctp16q(k); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp16q(k+1); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ +#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\ +{ \ + q15_t const *pSrcX; \ + const q15_t *pY1 = pY + 1; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + k = count >> 3; \ + \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + pY-=8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \ + pY1-=8; \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = count % 0x8U; \ + /* use predication to finalize MAC sum */ \ + /* acc0 requires exact number of sample */ \ + /* disable extra lanes in final MAC computation */ \ + mve_pred16_t p0 = vctp16q(k); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \ + /* acc1 requires 1 additional sample */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp16q(k+1); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ } -#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count) \ -{ \ - q15_t const *pSrcX; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - k = count >> 3; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - pY-=8; \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc = vmlaldavaq(acc, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x8U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp16q(k); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ - } \ - acc = asrl(acc, 15); \ - acc = __SSAT(acc, 16); \ +#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count) \ +{ \ + q15_t const *pSrcX; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + k = count >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + pY-=8; \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc = vmlaldavaq(acc, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x8U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp16q(k); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + acc = vmlaldavaq_p(acc, xVec, yVec, p0); \ + } \ + acc = asrl(acc, 15); \ + acc = __SSAT(acc, 16); \ } -#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \ -{ \ - q15_t const *pSrcX; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - k = count >> 3; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - pY-=8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vldrhq_s16(&pSrcX[2]); \ - acc2 = vmlaldavaq(acc2, xVec, yVec); \ - xVec = vldrhq_s16(&pSrcX[3]); \ - acc3 = vmlaldavaq(acc3, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x8U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp16q(k); \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vldrhq_s16(&pSrcX[2]); \ - acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ - xVec = vldrhq_s16(&pSrcX[3]); \ - acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc2 = asrl(acc2, 15); \ - acc3 = asrl(acc3, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ - acc2 = __SSAT(acc2, 16); \ - acc3 = __SSAT(acc3, 16); \ +#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \ +{ \ + q15_t const *pSrcX; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + k = count >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + pY-=8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vldrhq_s16(&pSrcX[2]); \ + acc2 = vmlaldavaq(acc2, xVec, yVec); \ + xVec = vldrhq_s16(&pSrcX[3]); \ + acc3 = vmlaldavaq(acc3, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x8U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp16q(k); \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vldrhq_s16(&pSrcX[2]); \ + acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \ + xVec = vldrhq_s16(&pSrcX[3]); \ + acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc2 = asrl(acc2, 15); \ + acc3 = asrl(acc3, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ + acc2 = __SSAT(acc2, 16); \ + acc3 = __SSAT(acc3, 16); \ } -#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \ -{ \ - q15_t const *pSrcX; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - k = count >> 3; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - pY-=8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x8U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp16q(k); \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - } \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ +#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \ +{ \ + q15_t const *pSrcX; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + k = count >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + pY-=8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x8U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp16q(k); \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + } \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ } -#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \ -{ \ - q15_t const *pSrcX; \ - q15x8_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q15_t const *) pX; \ - k = (count-1) >> 3; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - pY-=8; \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = (count - 1) % 0x8U; \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires exact number of sample (count-1) */ \ - /* disable extra lanes in final MAC computation */ \ - mve_pred16_t p0 = vctp16q(k); \ - yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ - xVec = vldrhq_s16(&pSrcX[1]); \ - acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ - /* acc0 requires 1 additional sample (count) */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp16q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 8; \ - acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ - \ - acc0 = asrl(acc0, 15); \ - acc1 = asrl(acc1, 15); \ - acc0 = __SSAT(acc0, 16); \ - acc1 = __SSAT(acc1, 16); \ +#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \ +{ \ + q15_t const *pSrcX; \ + q15x8_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q15_t const *) pX; \ + k = (count-1) >> 3; \ + \ + while (k > 0U) \ + { \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + pY-=8; \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = (count - 1) % 0x8U; \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires exact number of sample (count-1) */ \ + /* disable extra lanes in final MAC computation */ \ + mve_pred16_t p0 = vctp16q(k); \ + yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \ + xVec = vldrhq_s16(&pSrcX[1]); \ + acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \ + /* acc0 requires 1 additional sample (count) */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp16q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 8; \ + acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \ + \ + acc0 = asrl(acc0, 15); \ + acc1 = asrl(acc1, 15); \ + acc0 = __SSAT(acc0, 16); \ + acc1 = __SSAT(acc1, 16); \ } -#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count) \ -{ \ - q7_t const *pSrcX, *pSrcY; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - pSrcY = (q7_t const *) pY; \ - k = count >> 4; \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vldrbq_s8(&pSrcY[-1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x10U; \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - mve_pred16_t p0 = vctp8q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vldrbq_s8(&pSrcY[-1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec,p0); \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - p0 = vctp8q(k); \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec,p0); \ - \ - acc0 = (acc0 >> 7); \ - acc1 = (acc1 >> 7); \ - acc0 = __SSAT(acc0, 8); \ - acc1 = __SSAT(acc1, 8); \ +#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\ +{ \ + q7_t const *pSrcX, *pSrcY; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + pSrcY = (q7_t const *) pY; \ + k = count >> 4; \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vldrbq_s8(&pSrcY[-1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = count % 0x10U; \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires 1 additional sample */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + mve_pred16_t p0 = vctp8q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vldrbq_s8(&pSrcY[-1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec,p0); \ + /* acc0 requires exact number of sample */ \ + /* disable extra lanes in final MAC computation */ \ + p0 = vctp8q(k); \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec,p0); \ + \ + acc0 = (acc0 >> 7); \ + acc1 = (acc1 >> 7); \ + acc0 = __SSAT(acc0, 8); \ + acc1 = __SSAT(acc1, 8); \ } -#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count) \ -{ \ - q7_t const *pSrcX, *pSrcY; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - pSrcY = (q7_t const *) pY; \ - k = count >> 4; \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - acc = vmladavaq(acc, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* tail predication expected here */ \ - k = count % 0x10U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp8q(k); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - acc = vmladavaq_p(acc, xVec, yVec, p0); \ - } \ - acc =(acc >> 7); \ - acc = __SSAT(acc, 8); \ +#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)\ +{ \ + q7_t const *pSrcX, *pSrcY; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + pSrcY = (q7_t const *) pY; \ + k = count >> 4; \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + acc = vmladavaq(acc, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* tail predication expected here */ \ + k = count % 0x10U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp8q(k); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + acc = vmladavaq_p(acc, xVec, yVec, p0); \ + } \ + acc =(acc >> 7); \ + acc = __SSAT(acc, 8); \ } -#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \ -{ \ - q7_t const *pSrcX, *pSrcY; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - pSrcY = (q7_t const *) pY; \ - k = count >> 4; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - xVec = vldrbq_s8(&pSrcX[2]); \ - acc2 = vmladavaq(acc2, xVec, yVec); \ - xVec = vldrbq_s8(&pSrcX[3]); \ - acc3 = vmladavaq(acc3, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* loop + tail predication expected here */ \ - k = count % 0x10U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp8q(k); \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - xVec = vldrbq_s8(&pSrcX[2]); \ - acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \ - xVec = vldrbq_s8(&pSrcX[3]); \ - acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - } \ - \ - acc0 = (acc0 >> 7); \ - acc1 = (acc1 >> 7); \ - acc2 = (acc2 >> 7); \ - acc3 = (acc3 >> 7); \ - acc0 = __SSAT(acc0, 8); \ - acc1 = __SSAT(acc1, 8); \ - acc2 = __SSAT(acc2, 8); \ - acc3 = __SSAT(acc3, 8); \ +#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)\ +{ \ + q7_t const *pSrcX, *pSrcY; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + pSrcY = (q7_t const *) pY; \ + k = count >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + xVec = vldrbq_s8(&pSrcX[2]); \ + acc2 = vmladavaq(acc2, xVec, yVec); \ + xVec = vldrbq_s8(&pSrcX[3]); \ + acc3 = vmladavaq(acc3, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* loop + tail predication expected here */ \ + k = count % 0x10U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp8q(k); \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + xVec = vldrbq_s8(&pSrcX[2]); \ + acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \ + xVec = vldrbq_s8(&pSrcX[3]); \ + acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + } \ + \ + acc0 = (acc0 >> 7); \ + acc1 = (acc1 >> 7); \ + acc2 = (acc2 >> 7); \ + acc3 = (acc3 >> 7); \ + acc0 = __SSAT(acc0, 8); \ + acc1 = __SSAT(acc1, 8); \ + acc2 = __SSAT(acc2, 8); \ + acc3 = __SSAT(acc3, 8); \ } -#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \ -{ \ - q7_t const *pSrcX, *pSrcY; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - pSrcY = (q7_t const *) pY; \ - k = count >> 4; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* loop + tail predication expected here */ \ - k = count % 0x10U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp8q(k); \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - } \ - \ - acc0 = (acc0 >> 7); \ - acc1 = (acc1 >> 7); \ - acc0 = __SSAT(acc0, 8); \ - acc1 = __SSAT(acc1, 8); \ +#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)\ +{ \ + q7_t const *pSrcX, *pSrcY; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + pSrcY = (q7_t const *) pY; \ + k = count >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* loop + tail predication expected here */ \ + k = count % 0x10U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp8q(k); \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + } \ + \ + acc0 = (acc0 >> 7); \ + acc1 = (acc1 >> 7); \ + acc0 = __SSAT(acc0, 8); \ + acc1 = __SSAT(acc1, 8); \ } -#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \ -{ \ - q7_t const *pSrcX, *pSrcY; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - pSrcY = (q7_t const *) pY; \ - k = (count-1) >> 4; \ - \ - while (k > 0U) \ - { \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires exact number of sample (count-1) */ \ - /* disable extra lanes in final MAC computation */ \ - k = (count-1) % 0x10U; \ - mve_pred16_t p0 = vctp8q(k); \ - yVec = vld1q(pSrcY); pSrcY += 16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - /* acc0 requires 1 additional sample (count) */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp8q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - \ - acc0 = (acc0 >> 7); \ - acc1 = (acc1 >> 7); \ - acc0 = __SSAT(acc0, 8); \ - acc1 = __SSAT(acc1, 8); \ +#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)\ +{ \ + q7_t const *pSrcX, *pSrcY; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + pSrcY = (q7_t const *) pY; \ + k = (count-1) >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires exact number of sample (count-1) */ \ + /* disable extra lanes in final MAC computation */ \ + k = (count-1) % 0x10U; \ + mve_pred16_t p0 = vctp8q(k); \ + yVec = vld1q(pSrcY); pSrcY += 16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + /* acc0 requires 1 additional sample (count) */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp8q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + \ + acc0 = (acc0 >> 7); \ + acc1 = (acc1 >> 7); \ + acc0 = __SSAT(acc0, 8); \ + acc1 = __SSAT(acc1, 8); \ } -#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count) \ -{ \ - q7_t const *pSrcX; \ - const q7_t *pY1 = pY + 1; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - k = count >> 4; \ - \ - while (k > 0U) \ - { \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - pY-=16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \ - pY1-=16; \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = count % 0x10U; \ - /* use predication to finalize MAC sum */ \ - /* acc0 requires exact number of sample */ \ - /* disable extra lanes in final MAC computation */ \ - mve_pred16_t p0 = vctp8q(k); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \ - /* acc1 requires 1 additional sample */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp8q(k+1); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - \ - acc0 = (acc0 >> 7); \ - acc1 = (acc1 >> 7); \ - acc0 = __SSAT(acc0, 8); \ - acc1 = __SSAT(acc1, 8); \ +#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\ +{ \ + q7_t const *pSrcX; \ + const q7_t *pY1 = pY + 1; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + k = count >> 4; \ + \ + while (k > 0U) \ + { \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + pY-=16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \ + pY1-=16; \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = count % 0x10U; \ + /* use predication to finalize MAC sum */ \ + /* acc0 requires exact number of sample */ \ + /* disable extra lanes in final MAC computation */ \ + mve_pred16_t p0 = vctp8q(k); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \ + /* acc1 requires 1 additional sample */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp8q(k+1); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + \ + acc0 = (acc0 >> 7); \ + acc1 = (acc1 >> 7); \ + acc0 = __SSAT(acc0, 8); \ + acc1 = __SSAT(acc1, 8); \ } -#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count) \ -{ \ - q7_t const *pSrcX; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - k = count >> 4; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - pY-=16; \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc = vmladavaq(acc, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x10U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp8q(k); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - acc = vmladavaq_p(acc, xVec, yVec, p0); \ - } \ - acc = __SSAT(acc >> 7, 8); \ +#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count) \ +{ \ + q7_t const *pSrcX; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + k = count >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + pY-=16; \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc = vmladavaq(acc, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x10U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp8q(k); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + acc = vmladavaq_p(acc, xVec, yVec, p0); \ + } \ + acc = __SSAT(acc >> 7, 8); \ } -#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \ -{ \ - q7_t const *pSrcX; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - k = count >> 4; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - pY-=16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - xVec = vldrbq_s8(&pSrcX[2]); \ - acc2 = vmladavaq(acc2, xVec, yVec); \ - xVec = vldrbq_s8(&pSrcX[3]); \ - acc3 = vmladavaq(acc3, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x10U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp8q(k); \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - xVec = vldrbq_s8(&pSrcX[2]); \ - acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \ - xVec = vldrbq_s8(&pSrcX[3]); \ - acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - } \ - acc0 = __SSAT(acc0 >> 7, 8); \ - acc1 = __SSAT(acc1 >> 7, 8); \ - acc2 = __SSAT(acc2 >> 7, 8); \ - acc3 = __SSAT(acc3 >> 7, 8); \ +#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \ +{ \ + q7_t const *pSrcX; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + k = count >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + pY-=16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + xVec = vldrbq_s8(&pSrcX[2]); \ + acc2 = vmladavaq(acc2, xVec, yVec); \ + xVec = vldrbq_s8(&pSrcX[3]); \ + acc3 = vmladavaq(acc3, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x10U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp8q(k); \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + xVec = vldrbq_s8(&pSrcX[2]); \ + acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \ + xVec = vldrbq_s8(&pSrcX[3]); \ + acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + } \ + acc0 = __SSAT(acc0 >> 7, 8); \ + acc1 = __SSAT(acc1 >> 7, 8); \ + acc2 = __SSAT(acc2 >> 7, 8); \ + acc3 = __SSAT(acc3 >> 7, 8); \ } -#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \ -{ \ - q7_t const *pSrcX; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - k = count >> 4; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - pY-=16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - /* Loop with tail predication expected here */ \ - k = count % 0x10U; \ - if (k > 0U) \ - { \ - mve_pred16_t p0 = vctp8q(k); \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - } \ - acc0 = __SSAT(acc0 >> 7, 8); \ - acc1 = __SSAT(acc1 >> 7, 8); \ +#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \ +{ \ + q7_t const *pSrcX; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + k = count >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + pY-=16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + /* Loop with tail predication expected here */ \ + k = count % 0x10U; \ + if (k > 0U) \ + { \ + mve_pred16_t p0 = vctp8q(k); \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + } \ + acc0 = __SSAT(acc0 >> 7, 8); \ + acc1 = __SSAT(acc1 >> 7, 8); \ } -#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \ -{ \ - q7_t const *pSrcX; \ - q7x16_t xVec, yVec; \ - uint32_t k; \ - \ - pSrcX = (q7_t const *) pX; \ - k = (count-1) >> 4; \ - \ - while (k > 0U) \ - { \ - /* note */ \ - /* could can be more efficient using Vector Scatter Store: */ \ - /* + pre-increment + WB */ \ - /* To be revisited when intrinsic available */ \ - /* SDCOMP-52618 */ \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - pY-=16; \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq(acc1, xVec, yVec); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq(acc0, xVec, yVec); \ - /* Decrement the loop counter */ \ - k--; \ - } \ - k = (count - 1) % 0x10U; \ - /* use predication to finalize MAC sum */ \ - /* acc1 requires exact number of sample (count-1) */ \ - /* disable extra lanes in final MAC computation */ \ - mve_pred16_t p0 = vctp8q(k); \ - yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ - xVec = vldrbq_s8(&pSrcX[1]); \ - acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ - /* acc0 requires 1 additional sample (count) */ \ - /* so add 1 to unmask an extra lane in final MAC computation */ \ - p0 = vctp8q(k+1); \ - xVec = vld1q(pSrcX); pSrcX += 16; \ - acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ - \ - acc0 = (acc0 >> 7); \ - acc1 = (acc1 >> 7); \ - acc0 = __SSAT(acc0, 8); \ - acc1 = __SSAT(acc1, 8); \ +#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \ +{ \ + q7_t const *pSrcX; \ + q7x16_t xVec, yVec; \ + uint32_t k; \ + \ + pSrcX = (q7_t const *) pX; \ + k = (count-1) >> 4; \ + \ + while (k > 0U) \ + { \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + pY-=16; \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq(acc1, xVec, yVec); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq(acc0, xVec, yVec); \ + /* Decrement the loop counter */ \ + k--; \ + } \ + k = (count - 1) % 0x10U; \ + /* use predication to finalize MAC sum */ \ + /* acc1 requires exact number of sample (count-1) */ \ + /* disable extra lanes in final MAC computation */ \ + mve_pred16_t p0 = vctp8q(k); \ + yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \ + xVec = vldrbq_s8(&pSrcX[1]); \ + acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \ + /* acc0 requires 1 additional sample (count) */ \ + /* so add 1 to unmask an extra lane in final MAC computation */ \ + p0 = vctp8q(k+1); \ + xVec = vld1q(pSrcX); pSrcX += 16; \ + acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \ + \ + acc0 = (acc0 >> 7); \ + acc1 = (acc1 >> 7); \ + acc0 = __SSAT(acc0, 8); \ + acc1 = __SSAT(acc1, 8); \ } #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */ diff --git a/Testing/CMakeLists.txt b/Testing/CMakeLists.txt index d132de86..c3d5267d 100644 --- a/Testing/CMakeLists.txt +++ b/Testing/CMakeLists.txt @@ -54,7 +54,13 @@ function(writeConfig path) endif() list(APPEND output ",${PLATFORMID}") - list(APPEND output ",${COREID},") + if (CONFIGID) + # Specific config ID (like M55 with autovectorizer) + list(APPEND output ",${CONFIGID},") + else() + # Core ID is used as config ID when not specified + list(APPEND output ",${COREID},") + endif() if (ARMAC6) list(APPEND output "AC6") elseif(GCC) diff --git a/Testing/TestScripts/Regression/Commands.py b/Testing/TestScripts/Regression/Commands.py index 6c970e19..19d1866f 100755 --- a/Testing/TestScripts/Regression/Commands.py +++ b/Testing/TestScripts/Regression/Commands.py @@ -48,14 +48,14 @@ def joinit(iterable, delimiter): yield delimiter yield x -def addToDb(db,desc): +def addToDb(db,desc,runid): msg("Add %s to summary database\n" % desc) - completed = subprocess.run([sys.executable, "addToDB.py","-o",db,desc], timeout=3600) + completed = subprocess.run([sys.executable, "addToDB.py","-o",db,"-r",str(runid),desc], timeout=3600) check(completed) -def addToRegDb(db,desc): +def addToRegDb(db,desc,runid): msg("Add %s to regression database\n" % desc) - completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,desc], timeout=3600) + completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,"-r",str(runid),desc], timeout=3600) check(completed) @@ -187,7 +187,7 @@ class BuildConfig: # Launch cmake command. - def createCMake(self,flags,benchMode,platform): + def createCMake(self,configid,flags,benchMode,platform): with self.buildFolder() as b: self.saveEnv() if benchMode: @@ -198,7 +198,8 @@ class BuildConfig: cmd += ["-DCMAKE_PREFIX_PATH=%s" % self.compiler(), "-DCMAKE_TOOLCHAIN_FILE=%s" % toolchainCmake, "-DARM_CPU=%s" % self.core(), - "-DPLATFORM=%s" % platform + "-DPLATFORM=%s" % platform, + "-DCONFIGID=%s" % configid ] cmd += flags @@ -373,7 +374,7 @@ class Test: else: return(TESTFAILED) - def runAndProcess(self,compiler,fvp,sim,benchmode,db,regdb): + def runAndProcess(self,compiler,fvp,sim,benchmode,db,regdb,benchid,regid): # If we can't parse test description we fail all tests self.processTest() # Otherwise if only building or those tests are failing, we continue @@ -393,9 +394,9 @@ class Test: if benchmode and (error == NOTESTFAILED): error = self.computeSummaryStat() if db is not None: - addToDb(db,self.testName()) + addToDb(db,self.testName(),benchid) if regdb is not None: - addToRegDb(regdb,self.testName()) + addToRegDb(regdb,self.testName(),regid) return(error) else: msg("No FVP available") diff --git a/Testing/addToDB.py b/Testing/addToDB.py index 4d7b6688..5b43cff0 100755 --- a/Testing/addToDB.py +++ b/Testing/addToDB.py @@ -22,12 +22,13 @@ MKSTRFIELD=['NAME'] MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED'] MKINTFIELD=['ID', 'CYCLES'] MKDATEFIELD=['DATE'] -MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE'] +MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE',"RUN"] MKKEYFIELDID={'CATEGORY':'categoryid', 'PLATFORM':'platformid', 'CORE':'coreid', 'COMPILER':'compilerid', - 'TYPE':'typeid'} + 'TYPE':'typeid', + 'RUN':'runid'} # For table value extraction VALSTRFIELD=['NAME','VERSION'] @@ -56,7 +57,7 @@ def getColumns(elem,full): colsToKeep=[] cols = list(full.columns) params = list(elem.params.full) - common = diff(cols + ["TYPE"] , ['OLDID'] + params) + common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params) for field in common: if field in MKSTRFIELD: @@ -76,7 +77,7 @@ def createTableIfMissing(conn,elem,tableName,full): sql = "CREATE TABLE %s (" % tableName cols = list(full.columns) params = list(elem.params.full) - common = diff(cols + ["TYPE"] , ['OLDID'] + params) + common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params) sql += "%sid INTEGER PRIMARY KEY" % (tableName) start = "," @@ -103,6 +104,7 @@ def createTableIfMissing(conn,elem,tableName,full): sql += "FOREIGN KEY(platformid) REFERENCES PLATFORM(platformid)," sql += "FOREIGN KEY(coreid) REFERENCES CORE(coreid)," sql += "FOREIGN KEY(compilerid) REFERENCES COMPILER(compilerid)" + sql += "FOREIGN KEY(runid) REFERENCES RUN(runid)" sql += " )" conn.execute(sql) @@ -143,7 +145,7 @@ def findInCompilerTable(conn,kind,version): return(None) -def addRows(conn,elem,tableName,full): +def addRows(conn,elem,tableName,full,runid=0): # List of columns we have in DB which is # different from the columns in the table compilerid = 0 @@ -211,6 +213,7 @@ def addRows(conn,elem,tableName,full): keys[field]=row[field] + keys['RUN']=runid # Get foreign keys and create missing data for field in common: if field in VALKEYFIELD: @@ -261,31 +264,34 @@ def addConfig(conn,config,fullDate): conn.execute("INSERT INTO CONFIG(compilerid,platformid,coreid,date) VALUES(?,?,?,?)" ,(config['compilerid'],config['platformid'],config['coreid'],fullDate)) conn.commit() -def addOneBenchmark(elem,fullPath,db,group): +def getGroup(a): + return(re.sub(r'^(.+)(F64|F32|F16|Q31|Q15|Q7|U32|U16|U8|S32|S16|S8)$',r'\1',a)) + +def addOneBenchmark(elem,fullPath,db,group,runid): if os.path.isfile(fullPath): full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) fullDate = datetime.datetime.now() full['DATE'] = fullDate if group: - tableName = group + tableName = getGroup(group) else: - tableName = elem.data["class"] + tableName = getGroup(elem.data["class"]) conn = sqlite3.connect(db) createTableIfMissing(conn,elem,tableName,full) - config = addRows(conn,elem,tableName,full) + config = addRows(conn,elem,tableName,full,runid) addConfig(conn,config,fullDate) conn.close() -def addToDB(benchmark,dbpath,elem,group): +def addToDB(benchmark,dbpath,elem,group,runid): if not elem.data["deprecated"]: if elem.params: benchPath = os.path.join(benchmark,elem.fullPath(),"fullBenchmark.csv") print("Processing %s" % benchPath) - addOneBenchmark(elem,benchPath,dbpath,group) + addOneBenchmark(elem,benchPath,dbpath,group,runid) for c in elem.children: - addToDB(benchmark,dbpath,c,group) + addToDB(benchmark,dbpath,c,group,runid) @@ -295,6 +301,7 @@ parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="F parser.add_argument('-b', nargs='?',type = str, default="FullBenchmark", help="Full Benchmark dir path") #parser.add_argument('-e', action='store_true', help="Embedded test") parser.add_argument('-o', nargs='?',type = str, default="bench.db", help="Benchmark database") +parser.add_argument('-r', nargs='?',type = int, default=0, help="Run ID") parser.add_argument('others', nargs=argparse.REMAINDER) @@ -310,7 +317,7 @@ if args.f is not None: group=args.others[0] else: group=None - addToDB(args.b,args.o,root,group) + addToDB(args.b,args.o,root,group,args.r) else: parser.print_help() \ No newline at end of file diff --git a/Testing/addToRegDB.py b/Testing/addToRegDB.py index 3e7a54a6..cf6297d4 100755 --- a/Testing/addToRegDB.py +++ b/Testing/addToRegDB.py @@ -23,12 +23,13 @@ MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMI MKINTFIELD=['ID','MAX'] MKREALFIELD=['MAXREGCOEF'] MKDATEFIELD=['DATE'] -MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE'] +MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE','RUN'] MKKEYFIELDID={'CATEGORY':'categoryid', 'PLATFORM':'platformid', 'CORE':'coreid', 'COMPILER':'compilerid', - 'TYPE':'typeid'} + 'TYPE':'typeid', + 'RUN':'runid'} # For table value extraction VALSTRFIELD=['NAME','VERSION','Regression'] @@ -58,7 +59,7 @@ def getColumns(elem,full): colsToKeep=[] cols = list(full.columns) params=diff(elem.params.full , elem.params.summary) - common = diff(cols + ["TYPE"] , ['OLDID'] + params) + common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params) for field in common: if field in MKSTRFIELD: @@ -80,7 +81,7 @@ def createTableIfMissing(conn,elem,tableName,full): sql = "CREATE TABLE %s (" % tableName cols = list(full.columns) params=diff(elem.params.full , elem.params.summary) - common = diff(cols + ["TYPE"] , ['OLDID'] + params) + common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params) sql += "%sid INTEGER PRIMARY KEY" % (tableName) start = "," @@ -109,6 +110,7 @@ def createTableIfMissing(conn,elem,tableName,full): sql += "FOREIGN KEY(platformid) REFERENCES PLATFORM(platformid)," sql += "FOREIGN KEY(coreid) REFERENCES CORE(coreid)," sql += "FOREIGN KEY(compilerid) REFERENCES COMPILER(compilerid)" + sql += "FOREIGN KEY(runid) REFERENCES RUN(runid)" sql += " )" conn.execute(sql) @@ -149,7 +151,7 @@ def findInCompilerTable(conn,kind,version): return(None) -def addRows(conn,elem,tableName,full): +def addRows(conn,elem,tableName,full,runid=0): # List of columns we have in DB which is # different from the columns in the table compilerid = 0 @@ -218,7 +220,7 @@ def addRows(conn,elem,tableName,full): if field in VALBOOLFIELD: keys[field]=row[field] - + keys['RUN']=runid # Get foreign keys and create missing data for field in common: if field in VALKEYFIELD: @@ -272,31 +274,34 @@ def addConfig(conn,config,fullDate): conn.execute("INSERT INTO CONFIG(compilerid,platformid,coreid,date) VALUES(?,?,?,?)" ,(config['compilerid'],config['platformid'],config['coreid'],fullDate)) conn.commit() -def addOneBenchmark(elem,fullPath,db,group): +def getGroup(a): + return(re.sub(r'^(.+)(F64|F32|F16|Q31|Q15|Q7|U32|U16|U8|S32|S16|S8)$',r'\1',a)) + +def addOneBenchmark(elem,fullPath,db,group,runid): if os.path.isfile(fullPath): full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False) fullDate = datetime.datetime.now() full['DATE'] = fullDate if group: - tableName = group + tableName = getGroup(group) else: - tableName = elem.data["class"] + tableName = getGroup(elem.data["class"]) conn = sqlite3.connect(db) createTableIfMissing(conn,elem,tableName,full) - config = addRows(conn,elem,tableName,full) + config = addRows(conn,elem,tableName,full,runid) addConfig(conn,config,fullDate) conn.close() -def addToDB(benchmark,dbpath,elem,group): +def addToDB(benchmark,dbpath,elem,group,runid): if not elem.data["deprecated"]: if elem.params: benchPath = os.path.join(benchmark,elem.fullPath(),"regression.csv") print("Processing %s" % benchPath) - addOneBenchmark(elem,benchPath,dbpath,group) + addOneBenchmark(elem,benchPath,dbpath,group,runid) for c in elem.children: - addToDB(benchmark,dbpath,c,group) + addToDB(benchmark,dbpath,c,group,runid) @@ -306,6 +311,7 @@ parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="F parser.add_argument('-b', nargs='?',type = str, default="FullBenchmark", help="Full Benchmark dir path") #parser.add_argument('-e', action='store_true', help="Embedded test") parser.add_argument('-o', nargs='?',type = str, default="reg.db", help="Regression benchmark database") +parser.add_argument('-r', nargs='?',type = int, default=0, help="Run ID") parser.add_argument('others', nargs=argparse.REMAINDER) @@ -321,7 +327,7 @@ if args.f is not None: group=args.others[0] else: group=None - addToDB(args.b,args.o,root,group) + addToDB(args.b,args.o,root,group,args.r) else: parser.print_help() \ No newline at end of file diff --git a/Testing/bench.txt b/Testing/bench.txt index c4a9a3ad..2a5c83c2 100755 --- a/Testing/bench.txt +++ b/Testing/bench.txt @@ -351,8 +351,8 @@ group Root { Output ERR_F32_ID : Err Params PARAM1_ID = { - NumTaps = [4,8,16,32,64] - NB = [16,64,128,256] + NumTaps = [16,32,64] + NB = [64,128,256] } Functions { @@ -388,8 +388,8 @@ group Root { Output ERR_Q31_ID : Err Params PARAM1_ID = { - NumTaps = [4,8,16,32,64] - NB = [16,64,128,256] + NumTaps = [16,32,64] + NB = [64,128,256] } Functions { @@ -425,8 +425,8 @@ group Root { Output ERR_Q15_ID : Err Params PARAM1_ID = { - NumTaps = [4,8,16,32,64] - NB = [16,64,128,256] + NumTaps = [16,32,64] + NB = [64,128,256] } Functions { @@ -464,8 +464,8 @@ group Root { Output OUT_SAMPLES_F32_ID : Output Params PARAM1_ID = { - NBA = [4,5,9,16,64] - NBB = [5,9,16,128] + NBA = [9,16,64] + NBB = [9,16,128] } Functions { @@ -495,8 +495,8 @@ group Root { Output OUT_SAMPLES_Q31_ID : Output Params PARAM1_ID = { - NBA = [4,5,9,16,64] - NBB = [5,9,16,128] + NBA = [9,16,64] + NBB = [9,16,128] } Functions { @@ -526,8 +526,8 @@ group Root { Output OUT_SAMPLES_Q15_ID : Output Params PARAM1_ID = { - NBA = [4,5,9,16,64] - NBB = [5,9,16,128] + NBA = [9,16,64] + NBB = [9,16,128] } Functions { @@ -557,8 +557,8 @@ group Root { Output OUT_SAMPLES_Q7_ID : Output Params PARAM1_ID = { - NBA = [4,5,9,16,64] - NBB = [5,9,16,128] + NBA = [9,16,64] + NBB = [9,16,128] } Functions { @@ -762,7 +762,7 @@ group Root { Output OUT_SAMPLES_F32_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -792,7 +792,7 @@ group Root { Output OUT_SAMPLES_Q31_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -820,7 +820,7 @@ group Root { Output OUT_SAMPLES_Q15_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -848,7 +848,7 @@ group Root { Output OUT_SAMPLES_F32_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -873,7 +873,7 @@ group Root { Output OUT_SAMPLES_Q31_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -898,7 +898,7 @@ group Root { Output OUT_SAMPLES_Q15_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -962,7 +962,7 @@ group Root { Output OUT_SAMPLES_F32_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -992,7 +992,7 @@ group Root { Output OUT_SAMPLES_Q31_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -1020,7 +1020,7 @@ group Root { Output OUT_SAMPLES_Q15_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -1048,7 +1048,7 @@ group Root { Output OUT_SAMPLES_Q7_ID : Output Params PARAM1_ID = { - NB = [16,64,128,256] + NB = [64,128,256] } Functions { @@ -1292,7 +1292,7 @@ group Root { Params CFFT_PARAM_ID = { - NB = [16,64,128,256] + NB = [64,128,256] IFFT = [0,1] REV = [0,1] } @@ -1304,7 +1304,7 @@ group Root { } Params RFFT_PARAM_ID = { - NB = [32,64,128,256] + NB = [64,128,256] IFFT = [0,1] REV = [1] } @@ -1342,7 +1342,7 @@ group Root { Params CFFT_PARAM_ID = { - NB = [16,64,128,256] + NB = [64,128,256] IFFT = [0,1] REV = [0,1] } @@ -1354,7 +1354,7 @@ group Root { } Params RFFT_PARAM_ID = { - NB = [32,64,128,256] + NB = [64,128,256] IFFT = [0,1] REV = [0,1] } @@ -1392,7 +1392,7 @@ group Root { Params CFFT_PARAM_ID = { - NB = [16,64,128,256] + NB = [64,128,256] IFFT = [0,1] REV = [0,1] } @@ -1404,7 +1404,7 @@ group Root { } Params RFFT_PARAM_ID = { - NB = [32,64,128,256] + NB = [64,128,256] IFFT = [0,1] REV = [1] } diff --git a/Testing/createDb.sql b/Testing/createDb.sql index eca7a75e..9285be6e 100755 --- a/Testing/createDb.sql +++ b/Testing/createDb.sql @@ -29,6 +29,11 @@ CREATE INDEX compiler_index ON COMPILER(compilerkindid,version); CREATE INDEX compiler_date_index ON COMPILER(date); CREATE INDEX compiler_all_index ON COMPILER(compilerkindid,version,date); +CREATE TABLE RUN ( + runid INTEGER PRIMARY KEY, + date text + ); + CREATE TABLE TYPE ( typeid INTEGER PRIMARY KEY, type text ); @@ -76,15 +81,23 @@ INSERT INTO CORE VALUES(2,"m0p","ARMCM0P"); INSERT INTO CORE VALUES(3,"m3","ARMCM3"); INSERT INTO CORE VALUES(4,"m4","ARMCM4"); INSERT INTO CORE VALUES(5,"m4f","ARMCM4_FP"); -INSERT INTO CORE VALUES(6,"m7","ARMCM7_DP"); +INSERT INTO CORE VALUES(6,"m7d","ARMCM7_DP"); INSERT INTO CORE VALUES(7,"m23","ARMCM23"); -INSERT INTO CORE VALUES(8,"m33","ARMCM33_DSP_FP"); -INSERT INTO CORE VALUES(9,"m35","ARMCM35P_DSP_FP"); +INSERT INTO CORE VALUES(8,"m33f","ARMCM33_DSP_FP"); +INSERT INTO CORE VALUES(9,"m35f","ARMCM35P_DSP_FP"); INSERT INTO CORE VALUES(10,"a5","ARMCA5"); INSERT INTO CORE VALUES(11,"a7","ARMCA7"); INSERT INTO CORE VALUES(12,"a9","ARMCA9"); INSERT INTO CORE VALUES(13,"a15","ARMCA15"); -INSERT INTO CORE VALUES(14,"m55","ARMv81MML_DSP_DP_MVE_FP"); +INSERT INTO CORE VALUES(14,"m55mvef","ARMv81MML_DSP_DP_MVE_FP"); + +INSERT INTO CORE VALUES(15,"m0","M0"); +INSERT INTO CORE VALUES(16,"m7","M7"); +INSERT INTO CORE VALUES(17,"m33","M33"); +INSERT INTO CORE VALUES(18,"m4","M4"); +INSERT INTO CORE VALUES(19,"m55 mve","M55"); +INSERT INTO CORE VALUES(20,"m55 scalar","M55SCALAR"); + .quit diff --git a/Testing/examples.sql b/Testing/examples.sql index e4c201c6..660ff20f 100755 --- a/Testing/examples.sql +++ b/Testing/examples.sql @@ -6,7 +6,7 @@ Build the table with the platform, compiler and core names. .headers ON .mode csv -/* + select NB,CATEGORY.category,NAME,CYCLES,PLATFORM.platform,CORE.core,COMPILERKIND.compiler,COMPILER.version,BasicMathsBenchmarksF32.DATE from BasicMathsBenchmarksF32 INNER JOIN CATEGORY USING(categoryid) @@ -15,8 +15,9 @@ select NB,CATEGORY.category,NAME,CYCLES,PLATFORM.platform,CORE.core,COMPILERKIND INNER JOIN COMPILER USING(compilerid) INNER JOIN COMPILERKIND USING(compilerkindid) ; -*/ + +/* select Regression,MAX,MAXREGCOEF,CATEGORY.category,NAME,PLATFORM.platform,CORE.core,COMPILERKIND.compiler,COMPILER.version,BasicMathsBenchmarksF32.DATE from BasicMathsBenchmarksF32 INNER JOIN CATEGORY USING(categoryid) @@ -25,7 +26,7 @@ select Regression,MAX,MAXREGCOEF,CATEGORY.category,NAME,PLATFORM.platform,CORE.c INNER JOIN COMPILER USING(compilerid) INNER JOIN COMPILERKIND USING(compilerkindid) ; - +*/ /* Compute the max cycles for a test configuration (category + name) diff --git a/Testing/extractDb.py b/Testing/extractDb.py new file mode 100755 index 00000000..13595bd9 --- /dev/null +++ b/Testing/extractDb.py @@ -0,0 +1,283 @@ +import argparse +import sqlite3 +import re +import pandas as pd +import numpy as np + +# Command to get last runid +lastID="""SELECT runid FROM RUN ORDER BY runid DESC LIMIT 1 +""" + +def getLastRunID(): + r=c.execute(lastID) + return(int(r.fetchone()[0])) + + +runid = 1 + +parser = argparse.ArgumentParser(description='Generate summary benchmarks') + +parser.add_argument('-b', nargs='?',type = str, default="bench.db", help="Benchmark database") +parser.add_argument('-o', nargs='?',type = str, default="full.md", help="Full summary") +parser.add_argument('-r', action='store_true', help="Regression database") + +# For runid or runid range +parser.add_argument('others', nargs=argparse.REMAINDER) + +args = parser.parse_args() + +c = sqlite3.connect(args.b) + +if args.others: + runid=int(args.others[0]) +else: + runid=getLastRunID() + +# We extract data only from data tables +# Those tables below are used for descriptions +REMOVETABLES=['RUN','CORE', 'PLATFORM', 'COMPILERKIND', 'COMPILER', 'TYPE', 'CATEGORY', 'CONFIG'] + +# This is assuming the database is generated by the regression script +# So platform is the same for all benchmarks. +# Category and type is coming from the test name in the yaml +# So no need to add this information here +# Name is removed here because it is added at the beginning +REMOVECOLUMNS=['runid','NAME','type','platform','category','coredef','OPTIMIZED','HARDFP','FASTMATH','NEON','HELIUM','UNROLL','ROUNDING','DATE','compilerkindid','date','categoryid', 'ID', 'platformid', 'coreid', 'compilerid', 'typeid'] + +# Get existing benchmark tables +def getBenchTables(): + r=c.execute("SELECT name FROM sqlite_master WHERE type='table'") + benchtables=[] + for table in r: + if not table[0] in REMOVETABLES: + benchtables.append(table[0]) + return(benchtables) + +# get existing types in a table +def getExistingTypes(benchTable): + r=c.execute("select distinct typeid from %s" % benchTable).fetchall() + result=[x[0] for x in r] + return(result) + +# Get compilers from specific type and table +versioncompiler="""select distinct compiler,version from %s + INNER JOIN COMPILER USING(compilerid) + INNER JOIN COMPILERKIND USING(compilerkindid) WHERE typeid=?""" + +# Get existing compiler in a table for a specific type +# (In case report is structured by types) +def getExistingCompiler(benchTable,typeid): + r=c.execute(versioncompiler % benchTable,(typeid,)).fetchall() + return(r) + +# Get type name from type id +def getTypeName(typeid): + r=c.execute("select type from TYPE where typeid=?",(typeid,)).fetchone() + return(r[0]) + +# Diff of 2 lists +def diff(first, second): + second = set(second) + return [item for item in first if item not in second] + + +# Command to get data for specific compiler +# and type +benchCmd="""select %s from %s + INNER JOIN CATEGORY USING(categoryid) + INNER JOIN PLATFORM USING(platformid) + INNER JOIN CORE USING(coreid) + INNER JOIN COMPILER USING(compilerid) + INNER JOIN COMPILERKIND USING(compilerkindid) + INNER JOIN TYPE USING(typeid) + WHERE compiler=? AND VERSION=? AND typeid = ? AND runid = ? + """ + +# Command to get test names for specific compiler +# and type +benchNames="""select distinct NAME from %s + INNER JOIN COMPILER USING(compilerid) + INNER JOIN COMPILERKIND USING(compilerkindid) + INNER JOIN TYPE USING(typeid) + WHERE compiler=? AND VERSION=? AND typeid = ? AND runid = ? + """ + +# Command to get columns for specific table +benchCmdColumns="""select * from %s + INNER JOIN CATEGORY USING(categoryid) + INNER JOIN PLATFORM USING(platformid) + INNER JOIN CORE USING(coreid) + INNER JOIN COMPILER USING(compilerid) + INNER JOIN COMPILERKIND USING(compilerkindid) + INNER JOIN TYPE USING(typeid) + """ + +def joinit(iterable, delimiter): + it = iter(iterable) + yield next(it) + for x in it: + yield delimiter + yield x + +# Is not a column name finishing by id +# (often primary key for thetable) +def isNotIDColumn(col): + if re.match(r'^.*id$',col): + return(False) + else: + return(True) + +# Get test names +# for specific typeid and compiler (for the data) +def getTestNames(benchTable,comp,typeid): + vals=(comp[0],comp[1],typeid,runid) + result=c.execute(benchNames % benchTable,vals).fetchall() + return([x[0] for x in list(result)]) + +# Get names of columns and data for a table +# for specific typeid and compiler (for the data) +def getColNamesAndData(benchTable,comp,typeid): + cursor=c.cursor() + result=cursor.execute(benchCmdColumns % (benchTable)) + cols= [member[0] for member in cursor.description] + keepCols = ['NAME'] + [c for c in diff(cols , REMOVECOLUMNS) if isNotIDColumn(c)] + keepColsStr = "".join(joinit(keepCols,",")) + vals=(comp[0],comp[1],typeid,runid) + result=cursor.execute(benchCmd % (keepColsStr,benchTable),vals) + vals =np.array([list(x) for x in list(result)]) + return(keepCols,vals) + +# Write columns in markdown format +def writeColumns(f,cols): + colStr = "".join(joinit(cols,"|")) + f.write("|") + f.write(colStr) + f.write("|\n") + sepStr="".join(joinit([":-:" for x in cols],"|")) + f.write("|") + f.write(sepStr) + f.write("|\n") + +# Write row in markdown format +def writeRow(f,row): + row=[str(x) for x in row] + rowStr = "".join(joinit(row,"|")) + f.write("|") + f.write(rowStr) + f.write("|\n") + +PARAMS=["NB","NumTaps", "NBA", "NBB", "Factor", "NumStages","VECDIM","NBR","NBC","NBI","IFFT", "BITREV"] + +def regressionTableFor(name,output,ref,toSort,indexCols,field): + data=ref.pivot_table(index=indexCols, columns='core', + values=[field], aggfunc='first') + + data=data.sort_values(toSort) + + cores = [c[1] for c in list(data.columns)] + columns = diff(indexCols,['NAME']) + cores + + writeColumns(output,columns) + dataForFunc=data.loc[name] + if type(dataForFunc) is pd.DataFrame: + for row in dataForFunc.itertuples(): + row=list(row) + if type(row[0]) is int: + row=[row[0]] + row[1:] + else: + row=list(row[0]) + row[1:] + writeRow(output,row) + else: + writeRow(output,dataForFunc) + +def formatTableByCore(output,testNames,cols,vals): + if vals.size != 0: + ref=pd.DataFrame(vals,columns=cols) + toSort=["NAME"] + + for param in PARAMS: + if param in ref.columns: + ref[param]=pd.to_numeric(ref[param]) + toSort.append(param) + if args.r: + # Regression table + ref['MAX']=pd.to_numeric(ref['MAX']) + ref['MAXREGCOEF']=pd.to_numeric(ref['MAXREGCOEF']) + + indexCols=diff(cols,['core','Regression','MAXREGCOEF','MAX','version','compiler']) + valList = ['Regression'] + else: + ref['CYCLES']=pd.to_numeric(ref['CYCLES']) + + indexCols=diff(cols,['core','CYCLES','version','compiler']) + valList = ['CYCLES'] + + + + for name in testNames: + if args.r: + output.write("#### %s\n" % name) + + output.write("##### Regression\n" ) + regressionTableFor(name,output,ref,toSort,indexCols,'Regression') + + output.write("##### Max cycles\n" ) + regressionTableFor(name,output,ref,toSort,indexCols,'MAX') + + output.write("##### Max Reg Coef\n" ) + regressionTableFor(name,output,ref,toSort,indexCols,'MAXREGCOEF') + + else: + data=ref.pivot_table(index=indexCols, columns='core', + values=valList, aggfunc='first') + + data=data.sort_values(toSort) + + cores = [c[1] for c in list(data.columns)] + columns = diff(indexCols,['NAME']) + cores + + output.write("#### %s\n" % name) + writeColumns(output,columns) + dataForFunc=data.loc[name] + if type(dataForFunc) is pd.DataFrame: + for row in dataForFunc.itertuples(): + row=list(row) + if type(row[0]) is int: + row=[row[0]] + row[1:] + else: + row=list(row[0]) + row[1:] + writeRow(output,row) + else: + writeRow(output,dataForFunc) + +# Add a report for each table +def addReportFor(output,benchName): + print("Process %s\n" % benchName) + output.write("# %s\n" % benchName) + allTypes = getExistingTypes(benchName) + # Add report for each type + for aTypeID in allTypes: + typeName = getTypeName(aTypeID) + output.write("## %s\n" % typeName) + ## Add report for each compiler + allCompilers = getExistingCompiler(benchName,aTypeID) + for compiler in allCompilers: + #print(compiler) + output.write("### %s (%s)\n" % compiler) + cols,vals=getColNamesAndData(benchName,compiler,aTypeID) + names=getTestNames(benchName,compiler,aTypeID) + formatTableByCore(output,names,cols,vals) + + + + + +try: + with open(args.o,"w") as output: + benchtables=getBenchTables() + for bench in benchtables: + addReportFor(output,bench) +finally: + c.close() + + diff --git a/Testing/runAllBenchmarks.bat b/Testing/runAllBenchmarks.bat deleted file mode 100755 index 3074ddcf..00000000 --- a/Testing/runAllBenchmarks.bat +++ /dev/null @@ -1,70 +0,0 @@ -@ECHO OFF - -echo "Basic Maths" -python processTests.py -e BasicBenchmarks -call:runBench - -echo "Complex Maths" -python processTests.py -e ComplexBenchmarks -call:runBench - -echo "FIR" -python processTests.py -e FIR -call:runBench - -echo "Convolution / Correlation" -python processTests.py -e MISC -call:runBench - -echo "Decimation / Interpolation" -python processTests.py -e DECIM -call:runBench - -echo "BiQuad" -python processTests.py -e BIQUAD -call:runBench - -echo "Controller" -python processTests.py -e Controller -call:runBench - -echo "Fast Math" -python processTests.py -e FastMath -call:runBench - -echo "Barycenter" -python processTests.py -e SupportBarF32 -call:runBench - -echo "Support" -python processTests.py -e Support -call:runBench - -echo "Unary Matrix" -python processTests.py -e Unary -call:runBench - -echo "Binary Matrix" -python processTests.py -e Binary -call:runBench - -echo "Transform" -python processTests.py -e Transform -call:runBench - -EXIT /B - -:runBench -REM pushd build_m7 -REM pushd build_m0 -pushd build_a5 -make -REM "C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_MPS2_Cortex-M7.exe" -a Testing > result.txt -REM "C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_MPS2_Cortex-M0.exe" -a Testing > result.txt -"C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_VE_Cortex-A5x1.exe" -a Testing > result.txt -popd -echo "Parse result" -REM python processResult.py -e -r build_m7\result.txt -REM python processResult.py -e -r build_m0\result.txt -python processResult.py -e -r build_a5\result.txt -goto:eof \ No newline at end of file diff --git a/Testing/runAllBenchmarks.py b/Testing/runAllBenchmarks.py deleted file mode 100755 index b62a15ca..00000000 --- a/Testing/runAllBenchmarks.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import os.path -import subprocess -import colorama -from colorama import init,Fore, Back, Style -import argparse - -GROUPS = [ -"BasicBenchmarks", -"ComplexBenchmarks", -"FIR", -"MISC", -"DECIM", -"BIQUAD", -"Controller", -"FastMath", -"SupportBarF32", -"Support", -"Unary", -"Binary", -"Transform" -] - -init() - -def msg(t): - print(Fore.CYAN + t + Style.RESET_ALL) - -def processTest(test): - subprocess.call(["python","processTests.py","-e",test]) - -def addToDB(cmd): - for g in GROUPS: - msg("Add group %s" % g) - subprocess.call(["python",cmd,g]) - -def run(build,fvp,custom=None): - result = "results.txt" - resultPath = os.path.join(build,result) - - current=os.getcwd() - try: - msg("Build" ) - os.chdir(build) - subprocess.call(["make"]) - msg("Run") - with open(result,"w") as results: - if custom: - subprocess.call([fvp] + custom,stdout=results) - else: - subprocess.call([fvp,"-a","Testing"],stdout=results) - finally: - os.chdir(current) - - msg("Parse result") - subprocess.call(["python","processResult.py","-e","-r",resultPath]) - - msg("Regression computations") - subprocess.call(["python","summaryBench.py","-r",resultPath]) - - msg("Add results to benchmark database") - addToDB("addToDB.py") - - msg("Add results to regression database") - addToDB("addToRegDB.py") - - - -def processAndRun(buildfolder,fvp,custom=None): - processTest("DSPBenchmarks") - run(buildfolder,fvp,custom=custom) - -parser = argparse.ArgumentParser(description='Parse test description') -parser.add_argument('-f', nargs='?',type = str, default="build_benchmark_m7", help="Build folder") -parser.add_argument('-v', nargs='?',type = str, default="C:\\Program Files\\ARM\\Development Studio 2019.0\\sw\\models\\bin\\FVP_MPS2_Cortex-M7.exe", help="Fast Model") -parser.add_argument('-c', nargs='?',type = str, help="Custom args") - -args = parser.parse_args() - -if args.f is not None: - BUILDFOLDER=args.f -else: - BUILDFOLDER="build_benchmark_m7" - -if args.v is not None: - FVP=args.v -else: - FVP="C:\\Program Files\\ARM\\Development Studio 2019.0\\sw\\models\\bin\\FVP_MPS2_Cortex-M7.exe" - - -if args.c: - custom = args.c.split() -else: - custom = None - -print(Fore.RED + "bench.db and reg.db databases must exist before running this script" + Style.RESET_ALL) - -msg("Process benchmark description file") -subprocess.call(["python", "preprocess.py","-f","bench.txt"]) - -msg("Generate all missing C files") -subprocess.call(["python","processTests.py", "-e"]) - - -processAndRun(BUILDFOLDER,FVP,custom=custom) diff --git a/Testing/runAllTests.py b/Testing/runAllTests.py index d1455ed6..12d1e3e1 100755 --- a/Testing/runAllTests.py +++ b/Testing/runAllTests.py @@ -9,7 +9,29 @@ import yaml import sys import itertools from pathlib import Path +import sqlite3 +# Command to get last runid +lastID="""SELECT runid FROM RUN ORDER BY runid DESC LIMIT 1 +""" + +addNewIDCmd="""INSERT INTO RUN VALUES(?,date('now')) +""" + +benchID = 0 +regID = 0 + +def getLastRunID(c): + r=c.execute(lastID) + result=r.fetchone() + if result is None: + return(0) + else: + return(int(result[0])) + +def addNewID(c,newid): + c.execute(addNewIDCmd,(newid,)) + c.commit() # Small state machine def updateTestStatus(testStatusForThisBuild,newTestStatus): @@ -120,10 +142,26 @@ if args.db is not None: if not os.path.exists(args.db): createDb(args.sqlite,args.db) + conn = sqlite3.connect(args.db) + try: + currentID = getLastRunID(conn) + benchID = currentID + 1 + addNewID(conn,benchID) + finally: + conn.close() + if args.regdb is not None: if not os.path.exists(args.regdb): createDb(args.sqlite,args.regdb) + conn = sqlite3.connect(args.regdb) + try: + currentID = getLastRunID(conn) + regID = currentID + 1 + addNewID(conn,regID) + finally: + conn.close() + with open(args.i,"r") as f: config=yaml.safe_load(f) @@ -208,7 +246,7 @@ def buildAndTest(compiler,theConfig,cmake,sim): build.createArchive(flags) msg("Config " + str(flagConfig) + "\n") - build.createCMake(flags,args.b,args.p) + build.createCMake(core,flags,args.b,args.p) for test in config["TESTS"]: msg(test["testName"]+"\n") testClass=test["testClass"] @@ -220,7 +258,7 @@ def buildAndTest(compiler,theConfig,cmake,sim): if 'SIM' in config: if core in config['SIM']: fvp = config['SIM'][core] - newTestStatus = test.runAndProcess(compiler,fvp,sim,args.b,args.db,args.regdb) + newTestStatus = test.runAndProcess(compiler,fvp,sim,args.b,args.db,args.regdb,benchID,regID) testStatusForThisBuild = updateTestStatus(testStatusForThisBuild,newTestStatus) if testStatusForThisBuild != NOTESTFAILED: failedBuild[buildStr] = testStatusForThisBuild diff --git a/Toolchain/AC6.cmake b/Toolchain/AC6.cmake index ec988529..f232acd4 100644 --- a/Toolchain/AC6.cmake +++ b/Toolchain/AC6.cmake @@ -17,7 +17,7 @@ function(compilerSpecificCompileOptions PROJECTNAME ROOT) get_target_property(DISABLEOPTIM ${PROJECTNAME} DISABLEOPTIMIZATION) if ((OPTIMIZED) AND (NOT DISABLEOPTIM)) #cmake_print_variables(DISABLEOPTIM) - target_compile_options(${PROJECTNAME} PRIVATE "-O3") + target_compile_options(${PROJECTNAME} PRIVATE "-Ofast") endif() if (FASTMATHCOMPUTATIONS)