diff --git a/Include/dsp/matrix_functions.h b/Include/dsp/matrix_functions.h index 3d444883..f3801a98 100755 --- a/Include/dsp/matrix_functions.h +++ b/Include/dsp/matrix_functions.h @@ -444,6 +444,21 @@ arm_status arm_mat_mult_q31( const arm_matrix_instance_q31 * pSrcB, arm_matrix_instance_q31 * pDst); + /** + * @brief Q31 matrix multiplication + * @param[in] pSrcA points to the first input matrix structure + * @param[in] pSrcB points to the second input matrix structure + * @param[out] pDst points to output matrix structure + * @param[in] pState points to the array for storing intermediate results + * @return The function returns either + * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking. + */ +arm_status arm_mat_mult_opt_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst, + q31_t *pState); + /** * @brief Q31 matrix and vector multiplication * @param[in] pSrcMat points to the input matrix structure diff --git a/Source/MatrixFunctions/MatrixFunctions.c b/Source/MatrixFunctions/MatrixFunctions.c index cad08519..d4fa42c1 100644 --- a/Source/MatrixFunctions/MatrixFunctions.c +++ b/Source/MatrixFunctions/MatrixFunctions.c @@ -44,6 +44,7 @@ #include "arm_mat_mult_q7.c" #include "arm_mat_mult_q15.c" #include "arm_mat_mult_q31.c" +#include "arm_mat_mult_opt_q31.c" #include "arm_mat_scale_f32.c" #include "arm_mat_scale_q15.c" #include "arm_mat_scale_q31.c" diff --git a/Source/MatrixFunctions/arm_mat_mult_opt_q31.c b/Source/MatrixFunctions/arm_mat_mult_opt_q31.c new file mode 100755 index 00000000..91b1bcd5 --- /dev/null +++ b/Source/MatrixFunctions/arm_mat_mult_opt_q31.c @@ -0,0 +1,784 @@ +/* ---------------------------------------------------------------------- + * Project: CMSIS DSP Library + * Title: arm_mat_mult_opt_q31.c + * Description: Q31 matrix multiplication + * + * $Date: 3 Nov 2021 + * $Revision: V1.10.0 + * + * Target Processor: Cortex-M and Cortex-A cores + 
* -------------------------------------------------------------------- */ +/* + * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dsp/matrix_functions.h" + +/** + @ingroup groupMatrix + */ + +/** + @addtogroup MatrixMult + @{ + */ + +/** + @brief Q31 matrix multiplication. + @param[in] pSrcA points to the first input matrix structure + @param[in] pSrcB points to the second input matrix structure + @param[out] pDst points to output matrix structure + @param[in] pState points to the scratch array for intermediate results: the MVE implementation writes the transpose of pSrcB into it (a numColsB x numRowsB matrix); the scalar implementation ignores it + @return execution status + - \ref ARM_MATH_SUCCESS : Operation successful + - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed + + @par Scaling and Overflow Behavior + The function is implemented using an internal 64-bit accumulator. + The accumulator has a 2.62 format and maintains full precision of the intermediate + multiplication results but provides only a single guard bit. There is no saturation + on intermediate additions. Thus, if the accumulator overflows it wraps around and + distorts the result. The input signals should be scaled down to avoid intermediate + overflows. The input should thus be scaled down by log2(numColsA) bits + to avoid overflows, as a total of numColsA additions are performed internally. 
+ The 2.62 accumulator is right shifted by 31 bits and its low 32 bits are stored as the 1.31 result; the final result is not saturated. + @remark + Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function. + @remark + This function is a faster implementation of arm_mat_mult_q31 for MVE, but it requires + additional storage for intermediate results. + */ +#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) + +#define MATRIX_DIM2 2 +#define MATRIX_DIM3 3 +#define MATRIX_DIM4 4 + +__STATIC_INLINE arm_status arm_mat_mult_opt_q31_2x2_mve( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst) +{ + q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ + q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ + q31_t *pOut = pDst->pData; /* output data matrix pointer */ + uint32x4_t vecColBOffs; + q31_t *pInA0 = pInA; + q31_t *pInA1 = pInA0 + MATRIX_DIM2; + q63_t acc0, acc1; + q31x4_t vecB, vecA0, vecA1; + /* enable predication to disable half of vector elements */ + mve_pred16_t p0 = vctp32q(MATRIX_DIM2); + + vecColBOffs = vidupq_u32((uint32_t)0, 1); + vecColBOffs = vecColBOffs * MATRIX_DIM2; + + pInB = pSrcB->pData; + + /* load 1st B column (partial load) */ + vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0); + + /* load A rows */ + vecA0 = vldrwq_s32(pInA0); + vecA1 = vldrwq_s32(pInA1); + + acc0 = vrmlaldavhq(vecA0, vecB); + acc1 = vrmlaldavhq(vecA1, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + + pOut[0 * MATRIX_DIM2] = (q31_t) acc0; + pOut[1 * MATRIX_DIM2] = (q31_t) acc1; + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0); + + acc0 = vrmlaldavhq(vecA0, vecB); + acc1 = vrmlaldavhq(vecA1, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + + pOut[0 * MATRIX_DIM2] = (q31_t) acc0; + pOut[1 * MATRIX_DIM2] = (q31_t) acc1; + /* + * Return to application + 
*/ + return (ARM_MATH_SUCCESS); +} + + + +__STATIC_INLINE arm_status arm_mat_mult_opt_q31_3x3_mve( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst) +{ + q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ + q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ + q31_t *pOut = pDst->pData; /* output data matrix pointer */ + uint32x4_t vecColBOffs; + q31_t *pInA0 = pInA; + q31_t *pInA1 = pInA0 + MATRIX_DIM3; + q31_t *pInA2 = pInA1 + MATRIX_DIM3; + q63_t acc0, acc1, acc2; + q31x4_t vecB, vecA; + /* enable predication to disable last (4th) vector element */ + mve_pred16_t p0 = vctp32q(MATRIX_DIM3); + + vecColBOffs = vidupq_u32((uint32_t)0, 1); + vecColBOffs = vecColBOffs * MATRIX_DIM3; + + pInB = pSrcB->pData; + + vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); + acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + + pOut[0 * MATRIX_DIM3] = (q31_t) acc0; + pOut[1 * MATRIX_DIM3] = (q31_t) acc1; + pOut[2 * MATRIX_DIM3] = (q31_t) acc2; + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); + acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + + pOut[0 * MATRIX_DIM3] = (q31_t) acc0; + pOut[1 * MATRIX_DIM3] = (q31_t) acc1; + pOut[2 * MATRIX_DIM3] = (q31_t) acc2; + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); 
+ acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + + pOut[0 * MATRIX_DIM3] = (q31_t) acc0; + pOut[1 * MATRIX_DIM3] = (q31_t) acc1; + pOut[2 * MATRIX_DIM3] = (q31_t) acc2; + /* + * Return to application + */ + return (ARM_MATH_SUCCESS); +} + +__STATIC_INLINE arm_status arm_mat_mult_opt_q31_4x4_mve( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst) +{ + q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ + q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ + q31_t *pOut = pDst->pData; /* output data matrix pointer */ + uint32x4_t vecColBOffs; + q31_t *pInA0 = pInA; + q31_t *pInA1 = pInA0 + MATRIX_DIM4; + q31_t *pInA2 = pInA1 + MATRIX_DIM4; + q31_t *pInA3 = pInA2 + MATRIX_DIM4; + q63_t acc0, acc1, acc2, acc3; + q31x4_t vecB, vecA; + + vecColBOffs = vidupq_u32((uint32_t)0, 4); + + pInB = pSrcB->pData; + + vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); + acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA3); + acc3 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + acc3 = asrl(acc3, 23); + + pOut[0 * MATRIX_DIM4] = (q31_t) acc0; + pOut[1 * MATRIX_DIM4] = (q31_t) acc1; + pOut[2 * MATRIX_DIM4] = (q31_t) acc2; + pOut[3 * MATRIX_DIM4] = (q31_t) acc3; + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); + acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA3); + acc3 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); 
+ acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + acc3 = asrl(acc3, 23); + + pOut[0 * MATRIX_DIM4] = (q31_t) acc0; + pOut[1 * MATRIX_DIM4] = (q31_t) acc1; + pOut[2 * MATRIX_DIM4] = (q31_t) acc2; + pOut[3 * MATRIX_DIM4] = (q31_t) acc3; + + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); + acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA3); + acc3 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + acc3 = asrl(acc3, 23); + + pOut[0 * MATRIX_DIM4] = (q31_t) acc0; + pOut[1 * MATRIX_DIM4] = (q31_t) acc1; + pOut[2 * MATRIX_DIM4] = (q31_t) acc2; + pOut[3 * MATRIX_DIM4] = (q31_t) acc3; + + pOut++; + + /* move to next B column */ + pInB = pInB + 1; + + vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs); + + vecA = vldrwq_s32(pInA0); + acc0 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA1); + acc1 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA2); + acc2 = vrmlaldavhq(vecA, vecB); + vecA = vldrwq_s32(pInA3); + acc3 = vrmlaldavhq(vecA, vecB); + + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + acc3 = asrl(acc3, 23); + + pOut[0 * MATRIX_DIM4] = (q31_t) acc0; + pOut[1 * MATRIX_DIM4] = (q31_t) acc1; + pOut[2 * MATRIX_DIM4] = (q31_t) acc2; + pOut[3 * MATRIX_DIM4] = (q31_t) acc3; + /* + * Return to application + */ + return (ARM_MATH_SUCCESS); +} + + +arm_status arm_mat_mult_opt_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst, + q31_t *pState) +{ + q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ + q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ + q31_t *pInA2; + q31_t *pInB2; + q31_t *px; /* Temporary output data matrix pointer */ + q31_t *px2; /* 
Temporary output data matrix pointer */ + uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ + uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ + uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ + uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */ + uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */ + q31_t *pSrcBT = pState; /* input data matrix pointer for transpose */ + uint32_t blkCnt; /* loop counters */ + arm_status status; /* Status of matrix multiplication */ + arm_matrix_instance_q31 BT; +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((pSrcA->numCols != pSrcB->numRows) || + (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } else +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + { + + /* small squared matrix specialized routines */ + if(numRowsA == numColsB && numColsB == numColsA) { + if (numRowsA == 1) + { + q63_t sum = (q63_t) *pInA * *pInB; + pDst->pData[0] = (q31_t)(sum >> 31); + return (ARM_MATH_SUCCESS); + } + else if(numRowsA == 2) + return arm_mat_mult_opt_q31_2x2_mve(pSrcA, pSrcB, pDst); + else if(numRowsA == 3) + return arm_mat_mult_opt_q31_3x3_mve(pSrcA, pSrcB, pDst); + else if (numRowsA == 4) + return arm_mat_mult_opt_q31_4x4_mve(pSrcA, pSrcB, pDst); + } + + + /* + * Matrix transpose + */ + BT.numRows = numColsB; + BT.numCols = numRowsB; + BT.pData = pSrcBT; + + arm_mat_trans_q31(pSrcB, &BT); + + + /* + * Reset the variables for the usage in the following multiplication process + */ + i = 0; + row = numRowsA >> 1; + px = pDst->pData; + px2 = px + numColsB; + + /* + * main loop + * compute 2 x 2 output blocks + * with dot products (Matrix A rows * Transposed Matrix B rows) + */ + while (row > 0u) { + /* + * For every row wise process, the column loop counter is to be initiated + * Compute 2 columns 
and 2 rows in parallel + */ + col = numColsB >> 1; + j = 0; + + /* + * column pair loop + */ + while (col > 0u) { + q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec; + q31x4_t vecA, vecA2, vecB, vecB2; + q63_t acc0, acc1, acc2, acc3; + + /* + * Initiate the pointers + * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA) + * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB) + */ + pInA = pSrcA->pData + i; + pInA2 = pInA + numColsA; + pInB = pSrcBT + j; + pInB2 = pInB + numRowsB; + + + pSrcAVec = (q31_t const *) pInA; + pSrcA2Vec = (q31_t const *) pInA2; + pSrcBVec = (q31_t const *) pInB; + pSrcB2Vec = (q31_t const *) pInB2; + + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + acc3 = 0LL; + + /* load scheduling */ + vecA = vld1q(pSrcAVec); + pSrcAVec += 4; + + blkCnt = (numColsA / 4); + while (blkCnt > 0U) { + vecB = vld1q(pSrcBVec); + pSrcBVec += 4; + acc0 = vrmlaldavhaq(acc0, vecA, vecB); + vecA2 = vld1q(pSrcA2Vec); + pSrcA2Vec += 4; + acc1 = vrmlaldavhaq(acc1, vecA2, vecB); + vecB2 = vld1q(pSrcB2Vec); + pSrcB2Vec += 4; + acc2 = vrmlaldavhaq(acc2, vecA, vecB2); + vecA = vld1q(pSrcAVec); + pSrcAVec += 4; + acc3 = vrmlaldavhaq(acc3, vecA2, vecB2); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (numColsA & 3); + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + vecB = vld1q(pSrcBVec); + acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0); + vecA2 = vld1q(pSrcA2Vec); + acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0); + vecB2 = vld1q(pSrcB2Vec); + acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0); + vecA = vld1q(pSrcAVec); + acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0); + } + + /* Convert to 1.31 */ + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + acc3 = asrl(acc3, 23); + + /* Store the results (2 x 2 block) in the destination buffer */ + *px++ = (q31_t) acc0; + *px++ = (q31_t) acc2; + *px2++ = (q31_t) acc1; + *px2++ = (q31_t) acc3; + + j += numRowsB * 2; + /* + * Decrement the 
column pair loop counter + */ + col--; + + } + + i = i + numColsA * 2; + px = px2 + (numColsB & 1u); + px2 = px + numColsB; + /* + * Decrement the row pair loop counter + */ + row--; + } + + /* + * Compute remaining row and/or column below + */ + if (numColsB & 1u) { + row = numRowsA & (~0x1); //avoid redundant computation + px = pDst->pData + numColsB - 1; + i = 0; + + /* + * row loop + */ + while (row > 0) { + q31_t const *pSrcAVec, *pSrcBVec; + q31x4_t vecA, vecB; + q63_t acc0; + + /* + * point to last column in matrix B + */ + pInB = pSrcBT + numRowsB * (numColsB - 1); + pInA = pSrcA->pData + i; + + pSrcAVec = (q31_t const *) pInA; + pSrcBVec = (q31_t const *) pInB; + + /* single dot-product */ + acc0 = 0LL; + blkCnt = (numColsA / 4); + while (blkCnt > 0U) { + vecA = vld1q(pSrcAVec); + pSrcAVec += 4; + vecB = vld1q(pSrcBVec); + pSrcBVec += 4; + acc0 = vrmlaldavhaq(acc0, vecA, vecB); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (numColsA & 3); + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + vecA = vld1q(pSrcAVec); + vecB = vld1q(pSrcBVec); + acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0); + } + + acc0 = asrl(acc0, 23); + *px = (q31_t) acc0; + + px += numColsB; + + i += numColsA; + /* + * Decrement the row loop counter + */ + row--; + } + } + + if (numRowsA & 1u) { + col = numColsB; + i = 0u; + /* + * point to last row in output matrix + */ + px = pDst->pData + (numColsB) * (numRowsA - 1); + /* + * col loop + */ + while (col > 0) { + q31_t const *pSrcAVec, *pSrcBVec; + q31x4_t vecA, vecB; + q63_t acc0; + + /* + * point to last row in matrix A + */ + pInA = pSrcA->pData + (numRowsA - 1) * numColsA; + pInB = pSrcBT + i; + + /* + * Set the variable sum, that acts as accumulator, to zero + */ + pSrcAVec = (q31_t const *) pInA; + pSrcBVec = (q31_t const *) pInB; + acc0 = 0LL; + + blkCnt = (numColsA / 4); + while (blkCnt > 0U) { + vecA = vld1q(pSrcAVec); + pSrcAVec += 4; + vecB = vld1q(pSrcBVec); + pSrcBVec += 4; + 
acc0 = vrmlaldavhaq(acc0, vecA, vecB); + + blkCnt--; + } + /* + * tail + * (will be merged thru tail predication) + */ + blkCnt = (numColsA & 3); + if (blkCnt > 0U) { + mve_pred16_t p0 = vctp32q(blkCnt); + vecA = vld1q(pSrcAVec); + vecB = vld1q(pSrcBVec); + acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0); + } + + acc0 = asrl(acc0, 23); + *px++ = (q31_t) acc0; + + i += numColsA; + /* + * Decrement the col loop counter + */ + col--; + } + } + /* Set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + /* + * Return to application + */ + return (status); +} + +#else +arm_status arm_mat_mult_opt_q31( + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst, + q31_t *pState) +{ + q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */ + q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */ + q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */ + q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */ + q31_t *pOut = pDst->pData; /* Output data matrix pointer */ + q31_t *px; /* Temporary output data matrix pointer */ + q63_t sum; /* Accumulator */ + uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */ + uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */ + uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */ + uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */ + arm_status status; /* Status of matrix multiplication */ + (void)pState; +#ifdef ARM_MATH_MATRIX_CHECK + + /* Check for matrix mismatch condition */ + if ((pSrcA->numCols != pSrcB->numRows) || + (pSrcA->numRows != pDst->numRows) || + (pSrcB->numCols != pDst->numCols) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else + +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + + { + /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ + /* row loop */ + do + 
{ + /* Output pointer is set to starting address of row being processed */ + px = pOut + i; + + /* For every row wise process, column loop counter is to be initiated */ + col = numColsB; + + /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */ + pIn2 = pSrcB->pData; + + /* column loop */ + do + { + /* Set the variable sum, that acts as accumulator, to zero */ + sum = 0; + + /* Initialize pointer pIn1 to point to starting address of the pSrcA row being processed */ + pIn1 = pInA; + +#if defined (ARM_MATH_LOOPUNROLL) + + /* Loop unrolling: Compute 4 MACs at a time. */ + colCnt = numColsA >> 2U; + + /* matrix multiplication */ + while (colCnt > 0U) + { + /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */ + + /* Perform the multiply-accumulates */ + sum += (q63_t) *pIn1++ * *pIn2; + pIn2 += numColsB; + + sum += (q63_t) *pIn1++ * *pIn2; + pIn2 += numColsB; + + sum += (q63_t) *pIn1++ * *pIn2; + pIn2 += numColsB; + + sum += (q63_t) *pIn1++ * *pIn2; + pIn2 += numColsB; + + /* Decrement loop counter */ + colCnt--; + } + + /* Loop unrolling: Compute remaining MACs */ + colCnt = numColsA % 0x4U; + +#else + + /* Initialize colCnt with number of columns of matrix A */ + colCnt = numColsA; + +#endif /* #if defined (ARM_MATH_LOOPUNROLL) */ + + while (colCnt > 0U) + { + /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... 
+ a(m,p) * b(p,n) */ + + /* Perform the multiply-accumulates */ + sum += (q63_t) *pIn1++ * *pIn2; + pIn2 += numColsB; + + /* Decrement loop counter */ + colCnt--; + } + + /* Convert result from 2.62 to 1.31 format and store in destination buffer */ + *px++ = (q31_t) (sum >> 31); + + /* Decrement column loop counter */ + col--; + + /* Update pointer pIn2 to point to starting address of next column */ + pIn2 = pInB + (numColsB - col); + + } while (col > 0U); + + /* Update pointer pInA to point to starting address of next row */ + i = i + numColsB; + pInA = pInA + numColsA; + + /* Decrement row loop counter */ + row--; + + } while (row > 0U); + + /* Set status as ARM_MATH_SUCCESS */ + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); +} +#endif /* defined(ARM_MATH_MVEI) */ + +/** + @} end of MatrixMult group + */ diff --git a/Source/MatrixFunctions/arm_mat_mult_q15.c b/Source/MatrixFunctions/arm_mat_mult_q15.c index 9219ed02..3e1172c5 100644 --- a/Source/MatrixFunctions/arm_mat_mult_q15.c +++ b/Source/MatrixFunctions/arm_mat_mult_q15.c @@ -42,7 +42,7 @@ @param[in] pSrcA points to the first input matrix structure @param[in] pSrcB points to the second input matrix structure @param[out] pDst points to output matrix structure - @param[in] pState points to the array for storing intermediate results (Unused) + @param[in] pState points to the array for storing intermediate results @return execution status - \ref ARM_MATH_SUCCESS : Operation successful - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed @@ -617,7 +617,7 @@ arm_status arm_mat_mult_q15( return (status); } -#else +#else arm_status arm_mat_mult_q15( const arm_matrix_instance_q15 * pSrcA, const arm_matrix_instance_q15 * pSrcB, @@ -639,8 +639,8 @@ arm_status arm_mat_mult_q15( uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */ arm_status status; /* Status of matrix multiplication */ - q31_t in; /* Temporary variable to hold the input value */ q31_t inA1, inB1, 
inA2, inB2; + arm_matrix_instance_q15 BT; #ifdef ARM_MATH_MATRIX_CHECK @@ -655,89 +655,13 @@ arm_status arm_mat_mult_q15( else #endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - { - /* Matrix transpose */ - do - { - /* The pointer px is set to starting address of column being processed */ - px = pSrcBT + i; - - /* Apply loop unrolling and exchange columns with row elements */ - col = numColsB >> 2U; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (col > 0U) - { - /* Read two elements from row */ - in = read_q15x2_ia ((q15_t **) &pInB); - - /* Unpack and store one element in destination */ -#ifndef ARM_MATH_BIG_ENDIAN - *px = (q15_t) in; -#else - *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); -#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - - /* Update pointer px to point to next row of transposed matrix */ - px += numRowsB; - - /* Unpack and store second element in destination */ -#ifndef ARM_MATH_BIG_ENDIAN - *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); -#else - *px = (q15_t) in; -#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - - /* Update pointer px to point to next row of transposed matrix */ - px += numRowsB; - - /* Read two elements from row */ - in = read_q15x2_ia ((q15_t **) &pInB); - - /* Unpack and store one element in destination */ -#ifndef ARM_MATH_BIG_ENDIAN - *px = (q15_t) in; -#else - *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); -#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - px += numRowsB; - -#ifndef ARM_MATH_BIG_ENDIAN - *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16); -#else - *px = (q15_t) in; -#endif /* #ifndef ARM_MATH_BIG_ENDIAN */ - px += numRowsB; - /* Decrement column loop counter */ - col--; - } - - /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. 
*/ - col = numColsB % 0x4U; - - while (col > 0U) - { - /* Read and store input element in destination */ - *px = *pInB++; - - /* Update pointer px to point to next row of transposed matrix */ - px += numRowsB; - - /* Decrement column loop counter */ - col--; - } - - i++; - - /* Decrement row loop counter */ - row--; - - } while (row > 0U); + BT.numRows = numColsB; + BT.numCols = numRowsB; + BT.pData = pSrcBT; + arm_mat_trans_q15(pSrcB,&BT); /* Reset variables for usage in following multiplication process */ row = numRowsA; i = 0U; diff --git a/Source/MatrixFunctions/arm_mat_mult_q31.c b/Source/MatrixFunctions/arm_mat_mult_q31.c index 08001cc2..18738279 100644 --- a/Source/MatrixFunctions/arm_mat_mult_q31.c +++ b/Source/MatrixFunctions/arm_mat_mult_q31.c @@ -3,8 +3,8 @@ * Title: arm_mat_mult_q31.c * Description: Q31 matrix multiplication * - * $Date: 3 Nov 2021 - * $Revision: V1.10.0 + * $Date: 23 April 2021 + * $Revision: V1.9.0 * * Target Processor: Cortex-M and Cortex-A cores * -------------------------------------------------------------------- */ @@ -332,45 +332,44 @@ __STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve( return (ARM_MATH_SUCCESS); } - arm_status arm_mat_mult_q31( - const arm_matrix_instance_q31 * pSrcA, - const arm_matrix_instance_q31 * pSrcB, - arm_matrix_instance_q31 * pDst) + const arm_matrix_instance_q31 * pSrcA, + const arm_matrix_instance_q31 * pSrcB, + arm_matrix_instance_q31 * pDst) { - q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ - q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */ - q31_t *pInA2; - q31_t *pInB2; - q31_t *px; /* Temporary output data matrix pointer */ - q31_t *px2; /* Temporary output data matrix pointer */ - uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ - uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ - uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ - uint32_t numRowsB = pSrcB->numRows; /* number of 
rows of input matrix A */ - uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */ - q31_t State[numRowsB * numColsB * 1]; - q31_t *pSrcBT = State; /* input data matrix pointer for transpose */ - uint32_t blkCnt; /* loop counters */ - arm_status status; /* Status of matrix multiplication */ - arm_matrix_instance_q31 BT; -#ifdef ARM_MATH_MATRIX_CHECK + q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */ + q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */ + q31_t *pOut = pDst->pData; /* output data matrix pointer */ + q31_t *px; /* Temporary output data matrix pointer */ + uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ + uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ + uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ + uint16_t col, i = 0U, row = numRowsA; /* loop counters */ + arm_status status; /* status of matrix multiplication */ + uint32x4_t vecOffs, vecColBOffs; + uint32_t blkCnt, rowCnt; /* loop counters */ + + #ifdef ARM_MATH_MATRIX_CHECK - /* Check for matrix mismatch condition */ - if ((pSrcA->numCols != pSrcB->numRows) || - (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) { - /* Set status as ARM_MATH_SIZE_MISMATCH */ - status = ARM_MATH_SIZE_MISMATCH; - } else -#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ - { + /* Check for matrix mismatch condition */ + if ((pSrcA->numCols != pSrcB->numRows) || + (pSrcA->numRows != pDst->numRows) || + (pSrcB->numCols != pDst->numCols) ) + { + /* Set status as ARM_MATH_SIZE_MISMATCH */ + status = ARM_MATH_SIZE_MISMATCH; + } + else - /* small squared matrix specialized routines */ +#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ + + { + /* small squared matrix specialized routines */ if(numRowsA == numColsB && numColsB == numColsA) { if (numRowsA == 1) { q63_t sum = (q63_t) *pInA * *pInB; - pDst->pData[0] = (q31_t)(sum >> 31); + pOut[0] = (q31_t)(sum >> 
31); return (ARM_MATH_SUCCESS); } else if(numRowsA == 2) @@ -381,263 +380,246 @@ arm_status arm_mat_mult_q31( return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst); } + vecColBOffs = vidupq_u32((uint32_t)0, 1); + vecColBOffs = vecColBOffs * (uint32_t) (numColsB); + /* + * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB + */ + + /* + * row loop + */ + rowCnt = row >> 2; + while (rowCnt > 0U) + { /* - * Matrix transpose + * Output pointer is set to starting address of the row being processed */ - BT.numRows = numColsB; - BT.numCols = numRowsB; - BT.pData = pSrcBT; - - arm_mat_trans_q31(pSrcB, &BT); - - + px = pOut + i; + i = i + 4 * numColsB; /* - * Reset the variables for the usage in the following multiplication process + * For every row wise process, the column loop counter is to be initiated */ - i = 0; - row = numRowsA >> 1; - px = pDst->pData; - px2 = px + numColsB; - + col = numColsB; + /* + * For every row wise process, the pInB pointer is set + * to the starting address of the pSrcB data + */ + pInB = (q31_t const *)pSrcB->pData; /* - * main loop - * compute 2 x 2 output blocks - * with dot products (Matrix A rows * Transposed MAtrix B rows) + * column loop */ - while (row > 0u) { + while (col > 0U) + { + /* + * generate 4 columns elements + */ /* - * For every row wise process, the column loop counter is to be initiated - * Compute 2 columns and 2 rows in parrallel + * Matrix A columns number of MAC operations are to be performed */ - col = numColsB >> 1; - j = 0; + + q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec; + q31_t const *pInA0 = pInA; + q31_t const *pInA1 = pInA0 + numColsA; + q31_t const *pInA2 = pInA1 + numColsA; + q31_t const *pInA3 = pInA2 + numColsA; + q63_t acc0, acc1, acc2, acc3; + + acc0 = 0LL; + acc1 = 0LL; + acc2 = 0LL; + acc3 = 0LL; + + pSrcA0Vec = (q31_t const *) pInA0; + pSrcA1Vec = (q31_t const *) pInA1; + pSrcA2Vec = (q31_t const *) pInA2; + pSrcA3Vec = (q31_t const *) pInA3; + + 
vecOffs = vecColBOffs; + + /* process 1 x 4 block output */ + blkCnt = numColsA >> 2; + while (blkCnt > 0U) + { + q31x4_t vecB, vecA; + + vecB = vldrwq_gather_shifted_offset(pInB, vecOffs); + /* move Matrix B read offsets, 4 rows down */ + vecOffs = vecOffs + (uint32_t) (numColsB * 4); + + vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4; + acc0 = vrmlaldavhaq(acc0, vecA, vecB); + vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4; + acc1 = vrmlaldavhaq(acc1, vecA, vecB); + vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4; + acc2 = vrmlaldavhaq(acc2, vecA, vecB); + vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4; + acc3 = vrmlaldavhaq(acc3, vecA, vecB); + blkCnt--; + } /* - * column pair loop + * tail + * (will be merged thru tail predication) */ - while (col > 0u) { - q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec; - q31x4_t vecA, vecA2, vecB, vecB2; - q63_t acc0, acc1, acc2, acc3; - - /* - * Initiate the pointers - * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA) - * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB) - */ - pInA = pSrcA->pData + i; - pInA2 = pInA + numColsA; - pInB = pSrcBT + j; - pInB2 = pInB + numRowsB; - - - pSrcAVec = (q31_t const *) pInA; - pSrcA2Vec = (q31_t const *) pInA2; - pSrcBVec = (q31_t const *) pInB; - pSrcB2Vec = (q31_t const *) pInB2; - - acc0 = 0LL; - acc1 = 0LL; - acc2 = 0LL; - acc3 = 0LL; - - /* load scheduling */ - vecA = vld1q(pSrcAVec); - pSrcAVec += 4; - - blkCnt = (numColsA / 4); - while (blkCnt > 0U) { - vecB = vld1q(pSrcBVec); - pSrcBVec += 4; - acc0 = vrmlaldavhaq(acc0, vecA, vecB); - vecA2 = vld1q(pSrcA2Vec); - pSrcA2Vec += 4; - acc1 = vrmlaldavhaq(acc1, vecA2, vecB); - vecB2 = vld1q(pSrcB2Vec); - pSrcB2Vec += 4; - acc2 = vrmlaldavhaq(acc2, vecA, vecB2); - vecA = vld1q(pSrcAVec); - pSrcAVec += 4; - acc3 = vrmlaldavhaq(acc3, vecA2, vecB2); - - blkCnt--; - } - /* - * tail - * (will be merged thru tail predication) - */ - blkCnt = (numColsA & 3); - if (blkCnt > 0U) { - mve_pred16_t p0 = vctp32q(blkCnt); - vecB = 
vld1q(pSrcBVec); - acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0); - vecA2 = vld1q(pSrcA2Vec); - acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0); - vecB2 = vld1q(pSrcB2Vec); - acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0); - vecA = vld1q(pSrcAVec); - acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0); - } - - /* Convert to 1.31 */ - acc0 = asrl(acc0, 23); - acc1 = asrl(acc1, 23); - acc2 = asrl(acc2, 23); - acc3 = asrl(acc3, 23); - - /* Store the results (2 x 2 block) in the destination buffer */ - *px++ = (q31_t) acc0; - *px++ = (q31_t) acc2; - *px2++ = (q31_t) acc1; - *px2++ = (q31_t) acc3; - - j += numRowsB * 2; - /* - * Decrement the column pair loop counter - */ - col--; - + blkCnt = numColsA & 3; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp32q(blkCnt); + q31x4_t vecB, vecA; + + vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0); + //vecOffs = vecOffs + (uint32_t) (numColsB * 4); + + vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4; + acc0 = vrmlaldavhaq(acc0, vecA, vecB); + vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4; + acc1 = vrmlaldavhaq(acc1, vecA, vecB); + vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4; + acc2 = vrmlaldavhaq(acc2, vecA, vecB); + vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4; + acc3 = vrmlaldavhaq(acc3, vecA, vecB); } - i = i + numColsA * 2; - px = px2 + (numColsB & 1u); - px2 = px + numColsB; + acc0 = asrl(acc0, 23); + acc1 = asrl(acc1, 23); + acc2 = asrl(acc2, 23); + acc3 = asrl(acc3, 23); + + px[0] = (q31_t) acc0; + px[1 * numColsB] = (q31_t) acc1; + px[2 * numColsB] = (q31_t) acc2; + px[3 * numColsB] = (q31_t) acc3; + px++; /* - * Decrement the row pair loop counter + * Decrement the column loop counter */ - row--; + col--; + /* + * Update the pointer pInB to point to the starting address of the next column + */ + pInB = (q31_t const *)pSrcB->pData + (numColsB - col); } /* - * Compute remaining row and/or column below + * Update the pointer pInA to point to the starting address of the next row + */ + pInA += (numColsA * 4); + /* + * Decrement the row loop counter + 
*/ + rowCnt --; + + } + rowCnt = row & 3; + while (rowCnt > 0U) + { + /* + * Output pointer is set to starting address of the row being processed + */ + px = pOut + i; + i = i + numColsB; + /* + * For every row wise process, the column loop counter is to be initiated + */ + col = numColsB; + /* + * For every row wise process, the pInB pointer is set + * to the starting address of the pSrcB data + */ + pInB = (q31_t const *)pSrcB->pData; + /* + * column loop */ - if (numColsB & 1u) { - row = numRowsA & (~0x1); //avoid redundant computation - px = pDst->pData + numColsB - 1; - i = 0; + while (col > 0U) + { + /* + * generate 4 columns elements + */ + /* + * Matrix A columns number of MAC operations are to be performed + */ + + q31_t const *pSrcA0Vec; + q31_t const *pInA0 = pInA; + q63_t acc0; + + acc0 = 0LL; + + + pSrcA0Vec = (q31_t const *) pInA0; + + vecOffs = vecColBOffs; + + /* process 1 x 4 block output */ + blkCnt = numColsA >> 2; + while (blkCnt > 0U) + { + q31x4_t vecB, vecA; + + vecB = vldrwq_gather_shifted_offset(pInB, vecOffs); + /* move Matrix B read offsets, 4 rows down */ + vecOffs = vecOffs + (uint32_t) (numColsB * 4); + + vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4; + acc0 = vrmlaldavhaq(acc0, vecA, vecB); + + blkCnt--; + } /* - * row loop + * tail + * (will be merged thru tail predication) */ - while (row > 0) { - q31_t const *pSrcAVec, *pSrcBVec; - q31x4_t vecA, vecB; - q63_t acc0; - - /* - * point to last column in matrix B - */ - pInB = pSrcBT + numRowsB * (numColsB - 1); - pInA = pSrcA->pData + i; - - pSrcAVec = (q31_t const *) pInA; - pSrcBVec = (q31_t const *) pInB; - - /* single dot-product */ - acc0 = 0LL; - blkCnt = (numColsA / 4); - while (blkCnt > 0U) { - vecA = vld1q(pSrcAVec); - pSrcAVec += 4; - vecB = vld1q(pSrcBVec); - pSrcBVec += 4; - acc0 = vrmlaldavhaq(acc0, vecA, vecB); - - blkCnt--; - } - /* - * tail - * (will be merged thru tail predication) - */ - blkCnt = (numColsA & 3); - if (blkCnt > 0U) { - mve_pred16_t p0 = vctp32q(blkCnt); - 
vecA = vld1q(pSrcAVec); - vecB = vld1q(pSrcBVec); - acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0); - } - - acc0 = asrl(acc0, 23); - *px = (q31_t) acc0; - - px += numColsB; - - i += numColsA; - /* - * Decrement the row loop counter - */ - row--; + blkCnt = numColsA & 3; + if (blkCnt > 0U) + { + mve_pred16_t p0 = vctp32q(blkCnt); + q31x4_t vecB, vecA; + + vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0); + //vecOffs = vecOffs + (uint32_t) (numColsB * 4); + + vecA = vld1q(pSrcA0Vec); + pSrcA0Vec += 4; + acc0 = vrmlaldavhaq(acc0, vecA, vecB); + } - } - if (numRowsA & 1u) { - col = numColsB; - i = 0u; + acc0 = asrl(acc0, 23); + + + px[0] = (q31_t) acc0; + px++; /* - * point to last row in output matrix + * Decrement the column loop counter */ - px = pDst->pData + (numColsB) * (numRowsA - 1); + col--; /* - * col loop + * Update the pointer pInB to point to the starting address of the next column */ - while (col > 0) { - q31_t const *pSrcAVec, *pSrcBVec; - q31x4_t vecA, vecB; - q63_t acc0; - - /* - * point to last row in matrix A - */ - pInA = pSrcA->pData + (numRowsA - 1) * numColsA; - pInB = pSrcBT + i; - - /* - * Set the variable sum, that acts as accumulator, to zero - */ - pSrcAVec = (q31_t const *) pInA; - pSrcBVec = (q31_t const *) pInB; - acc0 = 0LL; - - blkCnt = (numColsA / 4); - while (blkCnt > 0U) { - vecA = vld1q(pSrcAVec); - pSrcAVec += 4; - vecB = vld1q(pSrcBVec); - pSrcBVec += 4; - acc0 = vrmlaldavhaq(acc0, vecA, vecB); - - blkCnt--; - } - /* - * tail - * (will be merged thru tail predication) - */ - blkCnt = (numColsA & 3); - if (blkCnt > 0U) { - mve_pred16_t p0 = vctp32q(blkCnt); - vecA = vld1q(pSrcAVec); - vecB = vld1q(pSrcBVec); - acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0); - } - - acc0 = asrl(acc0, 23); - *px++ = (q31_t) acc0; - - i += numColsA; - /* - * Decrement the col loop counter - */ - col--; - } + pInB = (q31_t const *)pSrcB->pData + (numColsB - col); } - /* Set status as ARM_MATH_SUCCESS */ - status = ARM_MATH_SUCCESS; + + /* + * Update 
the pointer pInA to point to the starting address of the next row + */ + pInA += numColsA; + /* + * Decrement the row loop counter + */ + rowCnt--; } + /* - * Return to application + * set status as ARM_MATH_SUCCESS */ - return (status); + status = ARM_MATH_SUCCESS; + } + + /* Return to application */ + return (status); } #else diff --git a/Testing/Include/Benchmarks/BinaryQ31.h b/Testing/Include/Benchmarks/BinaryQ31.h index 64502c2f..21f51508 100755 --- a/Testing/Include/Benchmarks/BinaryQ31.h +++ b/Testing/Include/Benchmarks/BinaryQ31.h @@ -14,6 +14,7 @@ class BinaryQ31:public Client::Suite Client::Pattern input1; Client::Pattern input2; Client::LocalPattern output; + Client::LocalPattern tmp; int nbr; int nbi; @@ -22,5 +23,6 @@ class BinaryQ31:public Client::Suite arm_matrix_instance_q31 in1; arm_matrix_instance_q31 in2; arm_matrix_instance_q31 out; + q31_t *tmpPtr; }; diff --git a/Testing/Include/Tests/BinaryTestsQ31.h b/Testing/Include/Tests/BinaryTestsQ31.h index 41459e40..b5d6d9a4 100755 --- a/Testing/Include/Tests/BinaryTestsQ31.h +++ b/Testing/Include/Tests/BinaryTestsQ31.h @@ -16,6 +16,8 @@ class BinaryTestsQ31:public Client::Suite Client::Pattern ref; Client::Pattern dims; Client::LocalPattern output; + Client::LocalPattern tmp; + /* Local copies of inputs since matrix instance in CMSIS-DSP are not using pointers to const. 
diff --git a/Testing/Source/Benchmarks/BinaryQ31.cpp b/Testing/Source/Benchmarks/BinaryQ31.cpp index f89abd90..7cbc836e 100755 --- a/Testing/Source/Benchmarks/BinaryQ31.cpp +++ b/Testing/Source/Benchmarks/BinaryQ31.cpp @@ -17,6 +17,11 @@ arm_mat_mult_fast_q31(&this->in1,&this->in2,&this->out); } + void BinaryQ31::test_mat_mult_opt_q31() + { + arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,this->tmpPtr); + } + void BinaryQ31::setUp(Testing::testID_t id,std::vector& params,Client::PatternMgr *mgr) { @@ -35,6 +40,14 @@ output.create(2*this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr); break; + case BinaryQ31::TEST_MAT_MULT_OPT_Q31_4: + input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi); + input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc); + output.create(this->nbr*this->nbc,BinaryQ31::OUT_Q31_ID,mgr); + tmp.create(this->nbi*this->nbc,BinaryQ31::TMP_Q31_ID,mgr); + this->tmpPtr=tmp.ptr(); + break; + default: input1.reload(BinaryQ31::INPUTA_Q31_ID,mgr,this->nbr*this->nbi); input2.reload(BinaryQ31::INPUTB_Q31_ID,mgr,this->nbi*this->nbc); diff --git a/Testing/Source/Tests/BinaryTestsF32.cpp b/Testing/Source/Tests/BinaryTestsF32.cpp index d552d87a..16e31263 100755 --- a/Testing/Source/Tests/BinaryTestsF32.cpp +++ b/Testing/Source/Tests/BinaryTestsF32.cpp @@ -16,6 +16,13 @@ a double precision computation. /* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 40 +static void checkInnerTail(float32_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); +} #define LOADDATA2() \ const float32_t *inp1=input1.ptr(); \ @@ -68,6 +75,7 @@ a double precision computation. ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } @@ -99,6 +107,7 @@ a double precision computation. 
ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (2*rows * columns); + checkInnerTail(outp); } diff --git a/Testing/Source/Tests/BinaryTestsQ15.cpp b/Testing/Source/Tests/BinaryTestsQ15.cpp index c6ff67a9..390a52a3 100755 --- a/Testing/Source/Tests/BinaryTestsQ15.cpp +++ b/Testing/Source/Tests/BinaryTestsQ15.cpp @@ -23,6 +23,19 @@ a double precision computation. /* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 40 +static void checkInnerTail(q15_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); + ASSERT_TRUE(b[4] == 0); + ASSERT_TRUE(b[5] == 0); + ASSERT_TRUE(b[6] == 0); + ASSERT_TRUE(b[7] == 0); +} + + #define LOADDATA2() \ const q15_t *inp1=input1.ptr(); \ @@ -39,7 +52,7 @@ a double precision computation. int i; -#define PREPAREDATA2() \ +#define PREPAREDATA2C() \ in1.numRows=rows; \ in1.numCols=internal; \ memcpy((void*)ap,(const void*)inp1,2*sizeof(q15_t)*rows*internal);\ @@ -54,29 +67,45 @@ a double precision computation. 
out.numCols=columns; \ out.pData = outp; - +#define PREPAREDATA2R() \ + in1.numRows=rows; \ + in1.numCols=internal; \ + memcpy((void*)ap,(const void*)inp1,sizeof(q15_t)*rows*internal);\ + in1.pData = ap; \ + \ + in2.numRows=internal; \ + in2.numCols=columns; \ + memcpy((void*)bp,(const void*)inp2,sizeof(q15_t)*internal*columns);\ + in2.pData = bp; \ + \ + out.numRows=rows; \ + out.numCols=columns; \ + out.pData = outp; void BinaryTestsQ15::test_mat_mult_q15() { LOADDATA2(); arm_status status; + for(i=0;i < nbMatrixes ; i ++) { rows = *dimsp++; internal = *dimsp++; columns = *dimsp++; - PREPAREDATA2(); + PREPAREDATA2R(); + memset(tmpPtr,0,sizeof(q15_t)*internal*columns + 16); status=arm_mat_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); + checkInnerTail(tmpPtr + internal * columns); } - ASSERT_EMPTY_TAIL(output); ASSERT_SNR(output,ref,(q15_t)SNR_LOW_THRESHOLD); @@ -99,17 +128,16 @@ a double precision computation. columns = *dimsp++; - PREPAREDATA2(); + PREPAREDATA2C(); status=arm_mat_cmplx_mult_q15(&this->in1,&this->in2,&this->out,tmpPtr); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (2*rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); - ASSERT_SNR(output,ref,(q15_t)MULT_SNR_THRESHOLD); ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15); diff --git a/Testing/Source/Tests/BinaryTestsQ31.cpp b/Testing/Source/Tests/BinaryTestsQ31.cpp index 7dee08ea..b894e530 100755 --- a/Testing/Source/Tests/BinaryTestsQ31.cpp +++ b/Testing/Source/Tests/BinaryTestsQ31.cpp @@ -18,6 +18,14 @@ a double precision computation. /* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 40 +static void checkInnerTail(q31_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); +} + #define LOADDATA2() \ const q31_t *inp1=input1.ptr(); \ @@ -68,11 +76,10 @@ a double precision computation. 
ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); - ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD); ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31); @@ -98,10 +105,38 @@ a double precision computation. ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (2*rows * columns); - + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); + ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD); + + ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q31); + + } + + void BinaryTestsQ31::test_mat_mult_opt_q31() + { + LOADDATA2(); + q31_t *tmpPtr=tmp.ptr(); + + arm_status status; + + for(i=0;i < nbMatrixes ; i ++) + { + rows = *dimsp++; + internal = *dimsp++; + columns = *dimsp++; + + PREPAREDATA2(); + memset(tmpPtr,0,sizeof(q31_t)*internal*columns + 16); + status=arm_mat_mult_opt_q31(&this->in1,&this->in2,&this->out,tmpPtr); + ASSERT_TRUE(status==ARM_MATH_SUCCESS); + + outp += (rows * columns); + checkInnerTail(outp); + checkInnerTail(tmpPtr + internal*columns); + + } ASSERT_SNR(output,ref,(q31_t)SNR_THRESHOLD); @@ -141,6 +176,21 @@ a double precision computation. b.create(2*MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr); break; + case TEST_MAT_MULT_OPT_Q31_3: + input1.reload(BinaryTestsQ31::INPUTS1_Q31_ID,mgr); + input2.reload(BinaryTestsQ31::INPUTS2_Q31_ID,mgr); + dims.reload(BinaryTestsQ31::DIMSBINARY1_S16_ID,mgr); + + ref.reload(BinaryTestsQ31::REFMUL1_Q31_ID,mgr); + + output.create(ref.nbSamples(),BinaryTestsQ31::OUT_Q31_ID,mgr); + a.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPA_Q31_ID,mgr); + b.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPB_Q31_ID,mgr); + + tmp.create(MAXMATRIXDIM*MAXMATRIXDIM,BinaryTestsQ31::TMPC_Q31_ID,mgr); + + break; + diff --git a/Testing/Source/Tests/BinaryTestsQ7.cpp b/Testing/Source/Tests/BinaryTestsQ7.cpp index accd3f4b..4fa0346d 100755 --- a/Testing/Source/Tests/BinaryTestsQ7.cpp +++ b/Testing/Source/Tests/BinaryTestsQ7.cpp @@ -19,6 +19,26 @@ a double precision computation. 
/* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 47 +static void checkInnerTail(q7_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); + ASSERT_TRUE(b[4] == 0); + ASSERT_TRUE(b[5] == 0); + ASSERT_TRUE(b[6] == 0); + ASSERT_TRUE(b[7] == 0); + ASSERT_TRUE(b[8] == 0); + ASSERT_TRUE(b[9] == 0); + ASSERT_TRUE(b[10] == 0); + ASSERT_TRUE(b[11] == 0); + ASSERT_TRUE(b[12] == 0); + ASSERT_TRUE(b[13] == 0); + ASSERT_TRUE(b[14] == 0); + ASSERT_TRUE(b[15] == 0); + +} #define LOADDATA2() \ const q7_t *inp1=input1.ptr(); \ @@ -65,12 +85,15 @@ a double precision computation. columns = *dimsp++; PREPAREDATA2(); + memset(tmpPtr,0,sizeof(q7_t)*internal*columns + 16); + checkInnerTail(tmpPtr + internal*columns); status=arm_mat_mult_q7(&this->in1,&this->in2,&this->out,tmpPtr); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); - + checkInnerTail(outp); + checkInnerTail(tmpPtr + internal*columns); } ASSERT_EMPTY_TAIL(output); diff --git a/Testing/Source/Tests/UnaryTestsF32.cpp b/Testing/Source/Tests/UnaryTestsF32.cpp index c91be8fc..8aee65c1 100755 --- a/Testing/Source/Tests/UnaryTestsF32.cpp +++ b/Testing/Source/Tests/UnaryTestsF32.cpp @@ -46,6 +46,14 @@ Comparison for Cholesky /* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 40 +static void checkInnerTailOverflow(float32_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); +} + #define LOADDATA2() \ const float32_t *inp1=input1.ptr(); \ const float32_t *inp2=input2.ptr(); \ @@ -192,6 +200,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32() arm_mat_vec_mult_f32(&this->in1, bp, outp); outp += rows ; + checkInnerTailOverflow(outp); } @@ -219,6 +228,7 @@ void UnaryTestsF32::test_mat_vec_mult_f32() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTailOverflow(outp); } @@ -246,6 +256,7 @@ void 
UnaryTestsF32::test_mat_sub_f32() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTailOverflow(outp); } @@ -273,6 +284,7 @@ void UnaryTestsF32::test_mat_scale_f32() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTailOverflow(outp); } @@ -300,6 +312,7 @@ void UnaryTestsF32::test_mat_trans_f32() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTailOverflow(outp); } @@ -327,6 +340,7 @@ void UnaryTestsF32::test_mat_cmplx_trans_f32() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += 2*(rows * columns); + checkInnerTailOverflow(outp); } @@ -421,6 +435,7 @@ void UnaryTestsF32::test_mat_inverse_f32() outp += (rows * columns); inp1 += (rows * columns); + checkInnerTailOverflow(outp); } @@ -461,6 +476,7 @@ void UnaryTestsF32::test_mat_inverse_f32() outp += (rows * columns); inp1 += (rows * rows); inp2 += (rows * columns); + checkInnerTailOverflow(outp); } @@ -501,6 +517,7 @@ void UnaryTestsF32::test_mat_inverse_f32() outp += (rows * columns); inp1 += (rows * rows); inp2 += (rows * columns); + checkInnerTailOverflow(outp); } @@ -668,6 +685,9 @@ void UnaryTestsF32::test_mat_inverse_f32() inp1 += (rows * columns); + checkInnerTailOverflow(outllp); + checkInnerTailOverflow(outdp); + } diff --git a/Testing/Source/Tests/UnaryTestsQ15.cpp b/Testing/Source/Tests/UnaryTestsQ15.cpp index eaf43648..3de4bce8 100755 --- a/Testing/Source/Tests/UnaryTestsQ15.cpp +++ b/Testing/Source/Tests/UnaryTestsQ15.cpp @@ -18,6 +18,31 @@ a double precision computation. 
/* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 40 +static void refInnerTail(q15_t *b) +{ + b[0] = 1; + b[1] = -1; + b[2] = 2; + b[3] = -2; + b[4] = 3; + b[5] = -3; + b[6] = 4; + b[7] = -4; +} + +static void checkInnerTail(q15_t *b) +{ + ASSERT_TRUE(b[0] == 1); + ASSERT_TRUE(b[1] == -1); + ASSERT_TRUE(b[2] == 2); + ASSERT_TRUE(b[3] == -2); + ASSERT_TRUE(b[4] == 3); + ASSERT_TRUE(b[5] == -3); + ASSERT_TRUE(b[6] == 4); + ASSERT_TRUE(b[7] == -4); +} + + #define LOADDATA2() \ const q15_t *inp1=input1.ptr(); \ const q15_t *inp2=input2.ptr(); \ @@ -127,14 +152,14 @@ a double precision computation. internal = *dimsp++; PREPAREVECDATA2(); - + refInnerTail(outp + rows); arm_mat_vec_mult_q15(&this->in1, bp, outp); outp += rows ; + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD); @@ -153,15 +178,15 @@ a double precision computation. columns = *dimsp++; PREPAREDATA2(); - + refInnerTail(outp + rows * columns); status=arm_mat_add_q15(&this->in1,&this->in2,&this->out); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD); @@ -180,15 +205,15 @@ void UnaryTestsQ15::test_mat_sub_q15() columns = *dimsp++; PREPAREDATA2(); - + refInnerTail(outp + rows * columns); status=arm_mat_sub_q15(&this->in1,&this->in2,&this->out); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD); @@ -207,15 +232,15 @@ void UnaryTestsQ15::test_mat_scale_q15() columns = *dimsp++; PREPAREDATA1(false); - + refInnerTail(outp + rows * columns); status=arm_mat_scale_q15(&this->in1,ONEHALF,0,&this->out); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD); @@ -234,16 +259,15 @@ void 
UnaryTestsQ15::test_mat_trans_q15() columns = *dimsp++; PREPAREDATA1(true); - + refInnerTail(outp + rows * columns); status=arm_mat_trans_q15(&this->in1,&this->out); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); - ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD); ASSERT_NEAR_EQ(output,ref,ABS_ERROR_Q15); @@ -261,15 +285,15 @@ void UnaryTestsQ15::test_mat_cmplx_trans_q15() columns = *dimsp++; PREPAREDATA1C(true); - + refInnerTail(outp + 2*rows * columns); status=arm_mat_cmplx_trans_q15(&this->in1,&this->out); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += 2*(rows * columns); + checkInnerTail(outp); } - ASSERT_EMPTY_TAIL(output); ASSERT_SNR(output,ref,(q15_t)SNR_THRESHOLD); diff --git a/Testing/Source/Tests/UnaryTestsQ31.cpp b/Testing/Source/Tests/UnaryTestsQ31.cpp index f17d8d1a..9c5aac84 100755 --- a/Testing/Source/Tests/UnaryTestsQ31.cpp +++ b/Testing/Source/Tests/UnaryTestsQ31.cpp @@ -18,6 +18,15 @@ a double precision computation. /* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 40 +static void checkInnerTail(q31_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); +} + + #define LOADDATA2() \ const q31_t *inp1=input1.ptr(); \ const q31_t *inp2=input2.ptr(); \ @@ -129,6 +138,7 @@ a double precision computation. arm_mat_vec_mult_q31(&this->in1, bp, outp); outp += rows ; + checkInnerTail(outp); } @@ -156,6 +166,7 @@ a double precision computation. 
ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } @@ -183,6 +194,7 @@ void UnaryTestsQ31::test_mat_sub_q31() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } @@ -210,6 +222,7 @@ void UnaryTestsQ31::test_mat_scale_q31() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } @@ -237,6 +250,7 @@ void UnaryTestsQ31::test_mat_trans_q31() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } @@ -264,6 +278,7 @@ void UnaryTestsQ31::test_mat_cmplx_trans_q31() ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += 2*(rows * columns); + checkInnerTail(outp); } diff --git a/Testing/Source/Tests/UnaryTestsQ7.cpp b/Testing/Source/Tests/UnaryTestsQ7.cpp index 6e4200b9..bb8b2ab1 100755 --- a/Testing/Source/Tests/UnaryTestsQ7.cpp +++ b/Testing/Source/Tests/UnaryTestsQ7.cpp @@ -19,6 +19,27 @@ a double precision computation. /* Upper bound of maximum matrix dimension used by Python */ #define MAXMATRIXDIM 47 +static void checkInnerTail(q7_t *b) +{ + ASSERT_TRUE(b[0] == 0); + ASSERT_TRUE(b[1] == 0); + ASSERT_TRUE(b[2] == 0); + ASSERT_TRUE(b[3] == 0); + ASSERT_TRUE(b[4] == 0); + ASSERT_TRUE(b[5] == 0); + ASSERT_TRUE(b[6] == 0); + ASSERT_TRUE(b[7] == 0); + + ASSERT_TRUE(b[8] == 0); + ASSERT_TRUE(b[9] == 0); + ASSERT_TRUE(b[10] == 0); + ASSERT_TRUE(b[11] == 0); + ASSERT_TRUE(b[12] == 0); + ASSERT_TRUE(b[13] == 0); + ASSERT_TRUE(b[14] == 0); + ASSERT_TRUE(b[15] == 0); +} + #define LOADDATA2() \ const q7_t *inp1=input1.ptr(); \ const q7_t *inp2=input2.ptr(); \ @@ -112,6 +133,7 @@ a double precision computation. 
arm_mat_vec_mult_q7(&this->in1, bp, outp); outp += rows ; + checkInnerTail(outp); } @@ -132,13 +154,13 @@ void UnaryTestsQ7::test_mat_trans_q7() { rows = *dimsp++; columns = *dimsp++; - PREPAREDATA1(true); status=arm_mat_trans_q7(&this->in1,&this->out); ASSERT_TRUE(status==ARM_MATH_SUCCESS); outp += (rows * columns); + checkInnerTail(outp); } diff --git a/Testing/bench.txt b/Testing/bench.txt index ea5dcbf5..ec95e513 100755 --- a/Testing/bench.txt +++ b/Testing/bench.txt @@ -1583,6 +1583,7 @@ group Root { Pattern INPUTAC_Q31_ID : InputAC1_q31.txt Pattern INPUTBC_Q31_ID : InputBC1_q31.txt Output OUT_Q31_ID : Output + Output TMP_Q31_ID : Temp Params PARAM1_ID = { NBR = [5,10,40] @@ -1595,6 +1596,7 @@ group Root { Matrix Multiplication:test_mat_mult_q31 Complex Matrix Multiplication:test_mat_cmplx_mult_q31 Fast Matrix Multiplication:test_mat_mult_fast_q31 + Opt Matrix Multiplication:test_mat_mult_opt_q31 } -> PARAM1_ID } @@ -1614,6 +1616,7 @@ group Root { Pattern INPUTAC_Q15_ID : InputAC1_q15.txt Pattern INPUTBC_Q15_ID : InputBC1_q15.txt Output OUT_Q15_ID : Output + Output TMP_Q15_ID : Temp Params PARAM1_ID = { NBR = [5,10,40] diff --git a/Testing/desc.txt b/Testing/desc.txt index 1233d2d7..8728218c 100644 --- a/Testing/desc.txt +++ b/Testing/desc.txt @@ -3459,10 +3459,12 @@ group Root { Output OUT_Q31_ID : Output Output TMPA_Q31_ID : TmpA Output TMPB_Q31_ID : TmpB + Output TMPC_Q31_ID : TmpC Functions { test mult:test_mat_mult_q31 test complex mult:test_mat_cmplx_mult_q31 + test mult opt:test_mat_mult_opt_q31 } }