From 365d910888fdfe156aee312cce4d33a94d9de938 Mon Sep 17 00:00:00 2001
From: David Palframan <david.palframan@arm.com>
Date: Wed, 14 Sep 2016 17:20:10 -0500
Subject: [PATCH] performance optimizations for arm_mat_mult_fast_q15 and
 arm_mat_mult_fast_q31

---
 Include/arm_math.h                            |  11 +
 .../MatrixFunctions/arm_mat_mult_fast_q15.c   | 214 +++++++++++--
 .../MatrixFunctions/arm_mat_mult_fast_q31.c   | 286 ++++++++++++++----
 3 files changed, 430 insertions(+), 81 deletions(-)

diff --git a/Include/arm_math.h b/Include/arm_math.h
index 37791c3c..d4133723 100644
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@@ -1031,6 +1031,17 @@ extern "C"
                        ((((q31_t)x <<  8) >>  8) & (q31_t)0xFFFF0000)  ));
   }
 
+  /*
+   * @brief C custom defined SMMLA for M3 and M0 processors
+   */
+  CMSIS_INLINE __STATIC_INLINE int32_t __SMMLA(
+  int32_t x,
+  int32_t y,
+  int32_t sum)
+  {
+    return (sum + (int32_t) (((int64_t) x * y) >> 32));
+  }
+
 #endif /* defined (ARM_MATH_CM3) || defined (ARM_MATH_CM0_FAMILY) */
 
 
diff --git a/Source/MatrixFunctions/arm_mat_mult_fast_q15.c b/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
index 11139b8a..d8189a56 100644
--- a/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
+++ b/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
@@ -97,13 +97,16 @@ arm_status arm_mat_mult_fast_q15(
   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
   uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix A    */
-  uint16_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
+  uint32_t col, i = 0u, row = numRowsB, colCnt;  /* loop counters */
   arm_status status;                             /* status of matrix multiplication */
 
 #ifndef UNALIGNED_SUPPORT_DISABLE
 
   q31_t in;                                      /* Temporary variable to hold the input value */
   q31_t inA1, inA2, inB1, inB2;
+  q31_t sum2, sum3, sum4;
+  q15_t *pInA2, *pInB2, *px2;
+  uint32_t j = 0;
 
 #else
 
@@ -269,9 +272,15 @@ arm_status arm_mat_mult_fast_q15(
     i = 0u;
     px = pDst->pData;
 
+#ifndef UNALIGNED_SUPPORT_DISABLE
+    /* Process two rows from matrix A at a time and output two rows at a time */
+    row = row >> 1;
+    px2 = px + numColsB;
+#endif
+
     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
     /* row loop */
-    do
+    while(row > 0u)
     {
       /* For every row wise process, the column loop counter is to be initiated */
       col = numColsB;
@@ -280,18 +289,35 @@ arm_status arm_mat_mult_fast_q15(
        ** to the starting address of the transposed pSrcB data */
       pInB = pSrcBT;
 
+#ifndef UNALIGNED_SUPPORT_DISABLE
+      /* Process two (transposed) columns from matrix B at a time */
+      col = col >> 1;
+      j = 0;
+#endif
+
       /* column loop */
-      do
+      while (col > 0u)
       {
         /* Set the variable sum, that acts as accumulator, to zero */
         sum = 0;
 
-        /* Apply loop unrolling and compute 2 MACs simultaneously. */
-        colCnt = numColsA >> 2;
-
-        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        /* Initiate the pointer pInA to point to the starting address of the column being processed */
         pInA = pSrcA->pData + i;
 
+#ifndef UNALIGNED_SUPPORT_DISABLE
+        sum2 = 0;
+        sum3 = 0;
+        sum4 = 0;
+        pInB  = pSrcBT + j;
+        pInA2 = pInA + numColsA;
+        pInB2 = pInB + numRowsB;
+        
+        /* Read in two elements at once - alows dual MAC instruction */
+        colCnt = numColsA >> 1;
+#else
+        colCnt = numColsA >> 2;
+#endif
+
         /* matrix multiplication */
         while(colCnt > 0u)
         {
@@ -300,29 +326,35 @@ arm_status arm_mat_mult_fast_q15(
 
           inA1 = *__SIMD32(pInA)++;
           inB1 = *__SIMD32(pInB)++;
-          inA2 = *__SIMD32(pInA)++;
-          inB2 = *__SIMD32(pInB)++;
+          inA2 = *__SIMD32(pInA2)++;
+          inB2 = *__SIMD32(pInB2)++;
 
-          sum = __SMLAD(inA1, inB1, sum);
-          sum = __SMLAD(inA2, inB2, sum);
+          sum  = __SMLAD(inA1, inB1, sum);
+          sum2 = __SMLAD(inA1, inB2, sum2);
+          sum3 = __SMLAD(inA2, inB1, sum3);
+          sum4 = __SMLAD(inA2, inB2, sum4);
 
 #else
 
-          inA1 = *pInA++;
-          inB1 = *pInB++;
-          inA2 = *pInA++;
+          inA1 = *pInA;
+          inB1 = *pInB;
           sum += inA1 * inB1;
-          inB2 = *pInB++;
 
-          inA1 = *pInA++;
-          inB1 = *pInB++;
+          inA2 = pInA[1];
+          inB2 = pInB[1];
           sum += inA2 * inB2;
-          inA2 = *pInA++;
-          inB2 = *pInB++;
 
+          inA1 = pInA[2];
+          inB1 = pInB[2];
           sum += inA1 * inB1;
+
+          inA2 = pInA[3];
+          inB2 = pInB[3];
           sum += inA2 * inB2;
 
+          pInA += 4;
+          pInB += 4;
+
 #endif	/*	#ifndef UNALIGNED_SUPPORT_DISABLE	*/
 
           /* Decrement the loop counter */
@@ -330,6 +362,18 @@ arm_status arm_mat_mult_fast_q15(
         }
 
         /* process odd column samples */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+        if (numColsA & 1u) {
+          inA1 = *pInA++;
+          inB1 = *pInB++;
+          inA2 = *pInA2++;
+          inB2 = *pInB2++;
+          sum  += inA1 * inB1;
+          sum2 += inA1 * inB2;
+          sum3 += inA2 * inB1;
+          sum4 += inA2 * inB2;
+        }
+#else
         colCnt = numColsA % 0x4u;
 
         while(colCnt > 0u)
@@ -339,22 +383,146 @@ arm_status arm_mat_mult_fast_q15(
 
           colCnt--;
         }
+#endif
 
         /* Saturate and store the result in the destination buffer */
-        *px = (q15_t) (sum >> 15);
-        px++;
+        *px++  = (q15_t) (sum >> 15);
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+        *px++  = (q15_t) (sum2 >> 15);
+        *px2++ = (q15_t) (sum3 >> 15);
+        *px2++ = (q15_t) (sum4 >> 15);
+        j += numRowsB * 2;
+#endif
 
         /* Decrement the column loop counter */
         col--;
 
-      } while(col > 0u);
+      }
 
       i = i + numColsA;
 
+#ifndef UNALIGNED_SUPPORT_DISABLE
+      i = i + numColsA;
+      px = px2 + (numColsB & 1u);
+      px2 = px + numColsB;
+#endif
+
       /* Decrement the row loop counter */
       row--;
 
-    } while(row > 0u);
+    }
+
+    /* Compute any remaining odd row/column below */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+    /* Compute remaining output column */
+    if (numColsB & 1u) {
+
+      /* Avoid redundant computation of last element */
+      row = numRowsA & (~0x1);
+
+      /* Point to remaining unfilled column in output matrix */
+      px = pDst->pData+numColsB-1;
+      pInA = pSrcA->pData;
+
+      /* row loop */
+      while (row > 0)
+      {
+
+        /* point to last column in matrix B */
+        pInB  = pSrcBT + numRowsB*(numColsB-1);
+
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum  = 0;
+
+        /* Compute 4 columns at once */
+        colCnt = numColsA >> 2;
+
+        /* matrix multiplication */
+        while(colCnt > 0u)
+        {
+          inA1 = *__SIMD32(pInA)++;
+          inA2 = *__SIMD32(pInA)++;
+          inB1 = *__SIMD32(pInB)++;
+          inB2 = *__SIMD32(pInB)++;
+
+          sum  = __SMLAD(inA1, inB1, sum);
+          sum  = __SMLAD(inA2, inB2, sum);
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        colCnt = numColsA & 3u;
+        while(colCnt > 0u) {
+          sum += (q31_t) (*pInA++) * (*pInB++);
+          colCnt--;
+        }
+
+        /* Store the result in the destination buffer */
+        *px  = (q15_t) (sum  >> 15);
+        px += numColsB;
+
+        /* Decrement the row loop counter */
+        row--;
+      } 
+    }
+
+    /* Compute remaining output row */
+    if (numRowsA & 1u) {
+
+      /* point to last row in output matrix */
+      px = pDst->pData+(numColsB)*(numRowsA-1);
+
+      pInB  = pSrcBT;
+      col = numColsB;
+      i = 0u;
+
+      /* col loop */
+      while (col > 0)
+      {
+
+        /* point to last row in matrix A */
+        pInA = pSrcA->pData + (numRowsA-1)*numColsA;
+
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum  = 0;
+
+        /* Compute 4 columns at once */
+        colCnt = numColsA >> 2;
+
+        /* matrix multiplication */
+        while(colCnt > 0u)
+        {
+          inA1 = *__SIMD32(pInA)++;
+          inA2 = *__SIMD32(pInA)++;
+          inB1 = *__SIMD32(pInB)++;
+          inB2 = *__SIMD32(pInB)++;
+
+          sum  = __SMLAD(inA1, inB1, sum);
+          sum  = __SMLAD(inA2, inB2, sum);
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        colCnt = numColsA & 3u;
+        while(colCnt > 0u) {
+          sum += (q31_t) (*pInA++) * (*pInB++);
+          colCnt--;
+        }
+
+        /* Store the result in the destination buffer */
+        *px++  = (q15_t) (sum  >> 15);
+
+        /* Decrement the col loop counter */
+        col--;
+      }
+    }
+
+#endif	/*	#ifndef UNALIGNED_SUPPORT_DISABLE	*/
 
     /* set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
diff --git a/Source/MatrixFunctions/arm_mat_mult_fast_q31.c b/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
index dbc31410..2b4b753c 100644
--- a/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
+++ b/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
@@ -85,22 +85,27 @@ arm_status arm_mat_mult_fast_q31(
   const arm_matrix_instance_q31 * pSrcB,
   arm_matrix_instance_q31 * pDst)
 {
-  q31_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
-  q31_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
   q31_t *pInA = pSrcA->pData;                    /* input data matrix pointer A */
-//  q31_t *pSrcB = pSrcB->pData;                    /* input data matrix pointer B */    
-  q31_t *pOut = pDst->pData;                     /* output data matrix pointer */
+  q31_t *pInB = pSrcB->pData;                    /* input data matrix pointer B */
   q31_t *px;                                     /* Temporary output data matrix pointer */
   q31_t sum;                                     /* Accumulator */
   uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A    */
   uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
   uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
-  uint16_t col, i = 0u, j, row = numRowsA, colCnt;      /* loop counters */
+  uint32_t col, i = 0u, j, row = numRowsA, colCnt;  /* loop counters */
   arm_status status;                             /* status of matrix multiplication */
-  q31_t inA1, inA2, inA3, inA4, inB1, inB2, inB3, inB4;
+  q31_t inA1, inB1;
 
-#ifdef ARM_MATH_MATRIX_CHECK
+#ifndef ARM_MATH_CM0_FAMILY
+
+  q31_t sum2, sum3, sum4;
+  q31_t inA2, inB2;
+  q31_t *pInA2;
+  q31_t *px2;
 
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
 
   /* Check for matrix mismatch condition */
   if((pSrcA->numCols != pSrcB->numRows) ||
@@ -113,110 +118,275 @@ arm_status arm_mat_mult_fast_q31(
 #endif /*      #ifdef ARM_MATH_MATRIX_CHECK    */
 
   {
+
+    px = pDst->pData;
+
+#ifndef ARM_MATH_CM0_FAMILY
+    row = row >> 1;
+    px2 = px + numColsB;
+#endif
+
     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
     /* row loop */
-    do
+    while(row > 0u)
     {
-      /* Output pointer is set to starting address of the row being processed */
-      px = pOut + i;
 
       /* For every row wise process, the column loop counter is to be initiated */
       col = numColsB;
 
       /* For every row wise process, the pIn2 pointer is set    
        ** to the starting address of the pSrcB data */
-      pIn2 = pSrcB->pData;
+      pInB = pSrcB->pData;
 
       j = 0u;
 
+#ifndef ARM_MATH_CM0_FAMILY
+      col = col >> 1;
+#endif
+
       /* column loop */
-      do
+      while (col > 0u)
       {
         /* Set the variable sum, that acts as accumulator, to zero */
         sum = 0;
 
-        /* Initiate the pointer pIn1 to point to the starting address of pInA */
-        pIn1 = pInA;
-
-        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        /* Initiate data pointers */
+        pInA = pSrcA->pData + i;
+        pInB  = pSrcB->pData + j;
+
+#ifndef ARM_MATH_CM0_FAMILY
+        sum2 = 0;
+        sum3 = 0;
+        sum4 = 0;
+        pInA2 = pInA + numColsA;
+        colCnt = numColsA;
+#else
         colCnt = numColsA >> 2;
-
+#endif
 
         /* matrix multiplication */
         while(colCnt > 0u)
         {
+
+#ifndef ARM_MATH_CM0_FAMILY
+          inA1 = *pInA++;
+          inB1 = pInB[0];
+          inA2 = *pInA2++;
+          inB2 = pInB[1];
+          pInB += numColsB;
+
+          sum  = __SMMLA(inA1, inB1, sum);
+          sum2 = __SMMLA(inA1, inB2, sum2);
+          sum3 = __SMMLA(inA2, inB1, sum3);
+          sum4 = __SMMLA(inA2, inB2, sum4);
+#else
           /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
           /* Perform the multiply-accumulates */
-          inB1 = *pIn2;
-          pIn2 += numColsB;
+          inB1 = *pInB;
+          pInB += numColsB;
+          inA1 = pInA[0];
+          sum = __SMMLA(inA1, inB1, sum);
+
+          inB1 = *pInB;
+          pInB += numColsB;
+          inA1 = pInA[1];
+          sum = __SMMLA(inA1, inB1, sum);
+
+          inB1 = *pInB;
+          pInB += numColsB;
+          inA1 = pInA[2];
+          sum = __SMMLA(inA1, inB1, sum);
+
+          inB1 = *pInB;
+          pInB += numColsB;
+          inA1 = pInA[3];
+          sum = __SMMLA(inA1, inB1, sum);
+
+          pInA += 4u;
+#endif
+          
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+#ifdef ARM_MATH_CM0_FAMILY
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. */
+        colCnt = numColsA % 0x4u;
+        while(colCnt > 0u)
+        {
+          sum = __SMMLA(*pInA++, *pInB, sum);
+          pInB += numColsB;
+          colCnt--;
+        }
+        j++;
+#endif
 
-          inA1 = pIn1[0];
-          inA2 = pIn1[1];
+        /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
+        *px++  = sum << 1;
 
-          inB2 = *pIn2;
-          pIn2 += numColsB;
+#ifndef ARM_MATH_CM0_FAMILY        
+        *px++  = sum2 << 1; 
+        *px2++ = sum3 << 1;
+        *px2++ = sum4 << 1; 
+        j += 2;
+#endif
 
-          inB3 = *pIn2;
-          pIn2 += numColsB;
+        /* Decrement the column loop counter */
+        col--;
 
-          sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA1 * inB1)) >> 32);
-          sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA2 * inB2)) >> 32);
+      }
 
-          inA3 = pIn1[2];
-          inA4 = pIn1[3];
+      i = i + numColsA;
 
-          inB4 = *pIn2;
-          pIn2 += numColsB;
+#ifndef ARM_MATH_CM0_FAMILY  
+      i = i + numColsA;
+      px = px2 + (numColsB & 1u);
+      px2 = px + numColsB;
+#endif
 
-          sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA3 * inB3)) >> 32);
-          sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA4 * inB4)) >> 32);
+      /* Decrement the row loop counter */
+      row--;
 
-          pIn1 += 4u;
+    }
 
-          /* Decrement the loop counter */
-          colCnt--;
-        }
+    /* Compute any remaining odd row/column below */
 
-        /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.    
-         ** No loop unrolling is used. */
-        colCnt = numColsA % 0x4u;
+#ifndef ARM_MATH_CM0_FAMILY
+
+    /* Compute remaining output column */
+    if (numColsB & 1u) {
+
+      /* Avoid redundant computation of last element */
+      row = numRowsA & (~0x1);
+
+      /* Point to remaining unfilled column in output matrix */
+      px = pDst->pData+numColsB-1;
+      pInA = pSrcA->pData;
+
+      /* row loop */
+      while (row > 0)
+      {
+
+        /* point to last column in matrix B */
+        pInB  = pSrcB->pData + numColsB-1;
+
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum  = 0;
 
+        /* Compute 4 columns at once */
+        colCnt = numColsA >> 2;
+
+        /* matrix multiplication */
         while(colCnt > 0u)
         {
-          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
-          /* Perform the multiply-accumulates */
-          sum = (q31_t) ((((q63_t) sum << 32) +
-                          ((q63_t) * pIn1++ * (*pIn2))) >> 32);
-          pIn2 += numColsB;
+          inA1 = *pInA++;
+          inA2 = *pInA++;
+          inB1 = *pInB;
+          pInB += numColsB;
+          inB2 = *pInB;
+          pInB += numColsB;
+          sum = __SMMLA(inA1, inB1, sum);
+          sum = __SMMLA(inA2, inB2, sum);
+
+          inA1 = *pInA++;
+          inA2 = *pInA++;
+          inB1 = *pInB;
+          pInB += numColsB;
+          inB2 = *pInB;
+          pInB += numColsB;
+          sum = __SMMLA(inA1, inB1, sum);
+          sum = __SMMLA(inA2, inB2, sum);
 
           /* Decrement the loop counter */
           colCnt--;
         }
 
+        colCnt = numColsA & 3u;
+        while(colCnt > 0u) {
+          sum = __SMMLA(*pInA++, *pInB, sum);
+          pInB += numColsB;
+          colCnt--;
+        }
+
         /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
-        *px++ = sum << 1;
+        *px = sum << 1;
+        px += numColsB;
 
-        /* Update the pointer pIn2 to point to the  starting address of the next column */
-        j++;
-        pIn2 = pSrcB->pData + j;
+        /* Decrement the row loop counter */
+        row--;
+      } 
+    }
 
-        /* Decrement the column loop counter */
-        col--;
+    /* Compute remaining output row */
+    if (numRowsA & 1u) {
 
-      } while(col > 0u);
+      /* point to last row in output matrix */
+      px = pDst->pData+(numColsB)*(numRowsA-1);
 
-      /* Update the pointer pInA to point to the  starting address of the next row */
-      i = i + numColsB;
-      pInA = pInA + numColsA;
+      col = numColsB;
+      i = 0u;
 
-      /* Decrement the row loop counter */
-      row--;
+      /* col loop */
+      while (col > 0)
+      {
+
+        /* point to last row in matrix A */
+        pInA = pSrcA->pData + (numRowsA-1)*numColsA;
+        pInB  = pSrcB->pData + i;
 
-    } while(row > 0u);
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sum  = 0;
+
+        /* Compute 4 columns at once */
+        colCnt = numColsA >> 2;
+
+        /* matrix multiplication */
+        while(colCnt > 0u)
+        {
+          inA1 = *pInA++;
+          inA2 = *pInA++;
+          inB1 = *pInB;
+          pInB += numColsB;
+          inB2 = *pInB;
+          pInB += numColsB;
+          sum = __SMMLA(inA1, inB1, sum);
+          sum = __SMMLA(inA2, inB2, sum);
+
+          inA1 = *pInA++;
+          inA2 = *pInA++;
+          inB1 = *pInB;
+          pInB += numColsB;
+          inB2 = *pInB;
+          pInB += numColsB;
+          sum = __SMMLA(inA1, inB1, sum);
+          sum = __SMMLA(inA2, inB2, sum);
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        colCnt = numColsA & 3u;
+        while(colCnt > 0u) {
+          sum = __SMMLA(*pInA++, *pInB, sum);
+          pInB += numColsB;
+          colCnt--;
+        }
+
+        /* Saturate and store the result in the destination buffer */
+        *px++ = sum << 1;
+        i++;
+
+        /* Decrement the col loop counter */
+        col--;
+      }
+    }
+
+#endif	/*	#ifndef ARM_MATH_CM0_FAMILY	*/
 
     /* set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
   }
+
   /* Return to application */
   return (status);
 }