performance optimizations for arm_mat_mult_fast_q15 and arm_mat_mult_fast_q31

pull/19/head
David Palframan 9 years ago
parent 31eba363dd
commit 365d910888

@ -1031,6 +1031,17 @@ extern "C"
((((q31_t)x << 8) >> 8) & (q31_t)0xFFFF0000) )); ((((q31_t)x << 8) >> 8) & (q31_t)0xFFFF0000) ));
} }
/*
* @brief C custom defined SMMLA for M3 and M0 processors
*/
CMSIS_INLINE __STATIC_INLINE int32_t __SMMLA(
int32_t x,
int32_t y,
int32_t sum)
{
return (sum + (int32_t) (((int64_t) x * y) >> 32));
}
#endif /* defined (ARM_MATH_CM3) || defined (ARM_MATH_CM0_FAMILY) */ #endif /* defined (ARM_MATH_CM3) || defined (ARM_MATH_CM0_FAMILY) */

@ -97,13 +97,16 @@ arm_status arm_mat_mult_fast_q15(
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */ uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */ uint32_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */ arm_status status; /* status of matrix multiplication */
#ifndef UNALIGNED_SUPPORT_DISABLE #ifndef UNALIGNED_SUPPORT_DISABLE
q31_t in; /* Temporary variable to hold the input value */ q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inA2, inB1, inB2; q31_t inA1, inA2, inB1, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
#else #else
@ -269,9 +272,15 @@ arm_status arm_mat_mult_fast_q15(
i = 0u; i = 0u;
px = pDst->pData; px = pDst->pData;
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Process two rows from matrix A at a time and output two rows at a time */
row = row >> 1;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */ /* row loop */
do while(row > 0u)
{ {
/* For every row wise process, the column loop counter is to be initiated */ /* For every row wise process, the column loop counter is to be initiated */
col = numColsB; col = numColsB;
@ -280,18 +289,35 @@ arm_status arm_mat_mult_fast_q15(
** to the starting address of the transposed pSrcB data */ ** to the starting address of the transposed pSrcB data */
pInB = pSrcBT; pInB = pSrcBT;
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Process two (transposed) columns from matrix B at a time */
col = col >> 1;
j = 0;
#endif
/* column loop */ /* column loop */
do while (col > 0u)
{ {
/* Set the variable sum, that acts as accumulator, to zero */ /* Set the variable sum, that acts as accumulator, to zero */
sum = 0; sum = 0;
/* Apply loop unrolling and compute 2 MACs simultaneously. */ /* Initiate the pointer pInA to point to the starting address of the column being processed */
colCnt = numColsA >> 2;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pInA = pSrcA->pData + i; pInA = pSrcA->pData + i;
#ifndef UNALIGNED_SUPPORT_DISABLE
sum2 = 0;
sum3 = 0;
sum4 = 0;
pInB = pSrcBT + j;
pInA2 = pInA + numColsA;
pInB2 = pInB + numRowsB;
/* Read in two elements at once - alows dual MAC instruction */
colCnt = numColsA >> 1;
#else
colCnt = numColsA >> 2;
#endif
/* matrix multiplication */ /* matrix multiplication */
while(colCnt > 0u) while(colCnt > 0u)
{ {
@ -300,29 +326,35 @@ arm_status arm_mat_mult_fast_q15(
inA1 = *__SIMD32(pInA)++; inA1 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++; inB1 = *__SIMD32(pInB)++;
inA2 = *__SIMD32(pInA)++; inA2 = *__SIMD32(pInA2)++;
inB2 = *__SIMD32(pInB)++; inB2 = *__SIMD32(pInB2)++;
sum = __SMLAD(inA1, inB1, sum); sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum); sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
#else #else
inA1 = *pInA++; inA1 = *pInA;
inB1 = *pInB++; inB1 = *pInB;
inA2 = *pInA++;
sum += inA1 * inB1; sum += inA1 * inB1;
inB2 = *pInB++;
inA1 = *pInA++; inA2 = pInA[1];
inB1 = *pInB++; inB2 = pInB[1];
sum += inA2 * inB2; sum += inA2 * inB2;
inA2 = *pInA++;
inB2 = *pInB++;
inA1 = pInA[2];
inB1 = pInB[2];
sum += inA1 * inB1; sum += inA1 * inB1;
inA2 = pInA[3];
inB2 = pInB[3];
sum += inA2 * inB2; sum += inA2 * inB2;
pInA += 4;
pInB += 4;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */ #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */ /* Decrement the loop counter */
@ -330,6 +362,18 @@ arm_status arm_mat_mult_fast_q15(
} }
/* process odd column samples */ /* process odd column samples */
#ifndef UNALIGNED_SUPPORT_DISABLE
if (numColsA & 1u) {
inA1 = *pInA++;
inB1 = *pInB++;
inA2 = *pInA2++;
inB2 = *pInB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
}
#else
colCnt = numColsA % 0x4u; colCnt = numColsA % 0x4u;
while(colCnt > 0u) while(colCnt > 0u)
@ -339,22 +383,146 @@ arm_status arm_mat_mult_fast_q15(
colCnt--; colCnt--;
} }
#endif
/* Saturate and store the result in the destination buffer */ /* Saturate and store the result in the destination buffer */
*px = (q15_t) (sum >> 15); *px++ = (q15_t) (sum >> 15);
px++;
#ifndef UNALIGNED_SUPPORT_DISABLE
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
/* Decrement the column loop counter */ /* Decrement the column loop counter */
col--; col--;
} while(col > 0u); }
i = i + numColsA; i = i + numColsA;
#ifndef UNALIGNED_SUPPORT_DISABLE
i = i + numColsA;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
#endif
/* Decrement the row loop counter */ /* Decrement the row loop counter */
row--; row--;
} while(row > 0u); }
/* Compute any remaining odd row/column below */
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Compute remaining output column */
if (numColsB & 1u) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData+numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcBT + numRowsB*(numColsB-1);
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
/* matrix multiplication */
while(colCnt > 0u)
{
inA1 = *__SIMD32(pInA)++;
inA2 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inB2 = *__SIMD32(pInB)++;
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement the loop counter */
colCnt--;
}
colCnt = numColsA & 3u;
while(colCnt > 0u) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store the result in the destination buffer */
*px = (q15_t) (sum >> 15);
px += numColsB;
/* Decrement the row loop counter */
row--;
}
}
/* Compute remaining output row */
if (numRowsA & 1u) {
/* point to last row in output matrix */
px = pDst->pData+(numColsB)*(numRowsA-1);
pInB = pSrcBT;
col = numColsB;
i = 0u;
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1)*numColsA;
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
/* matrix multiplication */
while(colCnt > 0u)
{
inA1 = *__SIMD32(pInA)++;
inA2 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inB2 = *__SIMD32(pInB)++;
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement the loop counter */
colCnt--;
}
colCnt = numColsA & 3u;
while(colCnt > 0u) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Decrement the col loop counter */
col--;
}
}
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* set status as ARM_MATH_SUCCESS */ /* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS; status = ARM_MATH_SUCCESS;

@ -85,22 +85,27 @@ arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcB, const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst) arm_matrix_instance_q31 * pDst)
{ {
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
// q31_t *pSrcB = pSrcB->pData; /* input data matrix pointer B */ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */ q31_t *px; /* Temporary output data matrix pointer */
q31_t sum; /* Accumulator */ q31_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */ uint32_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */ arm_status status; /* status of matrix multiplication */
q31_t inA1, inA2, inA3, inA4, inB1, inB2, inB3, inB4; q31_t inA1, inB1;
#ifdef ARM_MATH_MATRIX_CHECK #ifndef ARM_MATH_CM0_FAMILY
q31_t sum2, sum3, sum4;
q31_t inA2, inB2;
q31_t *pInA2;
q31_t *px2;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */ /* Check for matrix mismatch condition */
if((pSrcA->numCols != pSrcB->numRows) || if((pSrcA->numCols != pSrcB->numRows) ||
@ -113,110 +118,275 @@ arm_status arm_mat_mult_fast_q31(
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */ #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{ {
px = pDst->pData;
#ifndef ARM_MATH_CM0_FAMILY
row = row >> 1;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */ /* row loop */
do while(row > 0u)
{ {
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */ /* For every row wise process, the column loop counter is to be initiated */
col = numColsB; col = numColsB;
/* For every row wise process, the pIn2 pointer is set /* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */ ** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData; pInB = pSrcB->pData;
j = 0u; j = 0u;
#ifndef ARM_MATH_CM0_FAMILY
col = col >> 1;
#endif
/* column loop */ /* column loop */
do while (col > 0u)
{ {
/* Set the variable sum, that acts as accumulator, to zero */ /* Set the variable sum, that acts as accumulator, to zero */
sum = 0; sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pInA */ /* Initiate data pointers */
pIn1 = pInA; pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
#ifndef ARM_MATH_CM0_FAMILY
sum2 = 0;
sum3 = 0;
sum4 = 0;
pInA2 = pInA + numColsA;
colCnt = numColsA;
#else
colCnt = numColsA >> 2; colCnt = numColsA >> 2;
#endif
/* matrix multiplication */ /* matrix multiplication */
while(colCnt > 0u) while(colCnt > 0u)
{ {
#ifndef ARM_MATH_CM0_FAMILY
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
inB1 = *pIn2; inB1 = *pInB;
pIn2 += numColsB; pInB += numColsB;
inA1 = pInA[0];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[1];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[2];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[3];
sum = __SMMLA(inA1, inB1, sum);
pInA += 4u;
#endif
/* Decrement the loop counter */
colCnt--;
}
#ifdef ARM_MATH_CM0_FAMILY
/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. */
colCnt = numColsA % 0x4u;
while(colCnt > 0u)
{
sum = __SMMLA(*pInA++, *pInB, sum);
pInB += numColsB;
colCnt--;
}
j++;
#endif
inA1 = pIn1[0]; /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
inA2 = pIn1[1]; *px++ = sum << 1;
inB2 = *pIn2; #ifndef ARM_MATH_CM0_FAMILY
pIn2 += numColsB; *px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
j += 2;
#endif
inB3 = *pIn2; /* Decrement the column loop counter */
pIn2 += numColsB; col--;
sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA1 * inB1)) >> 32); }
sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA2 * inB2)) >> 32);
inA3 = pIn1[2]; i = i + numColsA;
inA4 = pIn1[3];
inB4 = *pIn2; #ifndef ARM_MATH_CM0_FAMILY
pIn2 += numColsB; i = i + numColsA;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
#endif
sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA3 * inB3)) >> 32); /* Decrement the row loop counter */
sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA4 * inB4)) >> 32); row--;
pIn1 += 4u; }
/* Decrement the loop counter */ /* Compute any remaining odd row/column below */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. #ifndef ARM_MATH_CM0_FAMILY
** No loop unrolling is used. */
colCnt = numColsA % 0x4u; /* Compute remaining output column */
if (numColsB & 1u) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData+numColsB-1;
pInA = pSrcA->pData;
/* row loop */
while (row > 0)
{
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
/* matrix multiplication */
while(colCnt > 0u) while(colCnt > 0u)
{ {
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */ inA1 = *pInA++;
/* Perform the multiply-accumulates */ inA2 = *pInA++;
sum = (q31_t) ((((q63_t) sum << 32) + inB1 = *pInB;
((q63_t) * pIn1++ * (*pIn2))) >> 32); pInB += numColsB;
pIn2 += numColsB; inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
/* Decrement the loop counter */ /* Decrement the loop counter */
colCnt--; colCnt--;
} }
colCnt = numColsA & 3u;
while(colCnt > 0u) {
sum = __SMMLA(*pInA++, *pInB, sum);
pInB += numColsB;
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */ /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px++ = sum << 1; *px = sum << 1;
px += numColsB;
/* Update the pointer pIn2 to point to the starting address of the next column */ /* Decrement the row loop counter */
j++; row--;
pIn2 = pSrcB->pData + j; }
}
/* Decrement the column loop counter */ /* Compute remaining output row */
col--; if (numRowsA & 1u) {
} while(col > 0u); /* point to last row in output matrix */
px = pDst->pData+(numColsB)*(numRowsA-1);
/* Update the pointer pInA to point to the starting address of the next row */ col = numColsB;
i = i + numColsB; i = 0u;
pInA = pInA + numColsA;
/* Decrement the row loop counter */ /* col loop */
row--; while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1)*numColsA;
pInB = pSrcB->pData + i;
} while(row > 0u); /* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
/* matrix multiplication */
while(colCnt > 0u)
{
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
/* Decrement the loop counter */
colCnt--;
}
colCnt = numColsA & 3u;
while(colCnt > 0u) {
sum = __SMMLA(*pInA++, *pInB, sum);
pInB += numColsB;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum << 1;
i++;
/* Decrement the col loop counter */
col--;
}
}
#endif /* #ifndef ARM_MATH_CM0_FAMILY */
/* set status as ARM_MATH_SUCCESS */ /* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS; status = ARM_MATH_SUCCESS;
} }
/* Return to application */ /* Return to application */
return (status); return (status);
} }

Loading…
Cancel
Save