|
|
|
@ -234,12 +234,13 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
/* Working pointer of inputA */
|
|
|
|
/* Working pointer of inputA */
|
|
|
|
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
|
|
|
|
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
px = pIn1 + firstIndex - srcBLen + 1;
|
|
|
|
pSrc1 = pIn1 + firstIndex - srcBLen + 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
else
|
|
|
|
{
|
|
|
|
{
|
|
|
|
px = pIn1;
|
|
|
|
pSrc1 = pIn1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
px = pSrc1;
|
|
|
|
|
|
|
|
|
|
|
|
/* Working pointer of inputB */
|
|
|
|
/* Working pointer of inputB */
|
|
|
|
pSrc2 = pIn2 + (srcBLen - 1U);
|
|
|
|
pSrc2 = pIn2 + (srcBLen - 1U);
|
|
|
|
@ -269,9 +270,9 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
acc3 = 0;
|
|
|
|
acc3 = 0;
|
|
|
|
|
|
|
|
|
|
|
|
/* read x[0], x[1], x[2] samples */
|
|
|
|
/* read x[0], x[1], x[2] samples */
|
|
|
|
x0 = *(px++);
|
|
|
|
x0 = *px++;
|
|
|
|
x1 = *(px++);
|
|
|
|
x1 = *px++;
|
|
|
|
x2 = *(px++);
|
|
|
|
x2 = *px++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Apply loop unrolling and compute 4 MACs simultaneously. */
|
|
|
|
/* Apply loop unrolling and compute 4 MACs simultaneously. */
|
|
|
|
k = srcBLen >> 2U;
|
|
|
|
k = srcBLen >> 2U;
|
|
|
|
@ -281,10 +282,10 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
do
|
|
|
|
do
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/* Read y[srcBLen - 1] sample */
|
|
|
|
/* Read y[srcBLen - 1] sample */
|
|
|
|
c0 = *(py--);
|
|
|
|
c0 = *py--;
|
|
|
|
|
|
|
|
|
|
|
|
/* Read x[3] sample */
|
|
|
|
/* Read x[3] sample */
|
|
|
|
x3 = *(px++);
|
|
|
|
x3 = *px++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulate */
|
|
|
|
/* Perform the multiply-accumulate */
|
|
|
|
/* acc0 += x[0] * y[srcBLen - 1] */
|
|
|
|
/* acc0 += x[0] * y[srcBLen - 1] */
|
|
|
|
@ -300,10 +301,10 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
|
|
|
|
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
|
|
|
|
|
|
|
|
|
|
|
|
/* Read y[srcBLen - 2] sample */
|
|
|
|
/* Read y[srcBLen - 2] sample */
|
|
|
|
c0 = *(py--);
|
|
|
|
c0 = *py--;
|
|
|
|
|
|
|
|
|
|
|
|
/* Read x[4] sample */
|
|
|
|
/* Read x[4] sample */
|
|
|
|
x0 = *(px++);
|
|
|
|
x0 = *px++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulate */
|
|
|
|
/* Perform the multiply-accumulate */
|
|
|
|
/* acc0 += x[1] * y[srcBLen - 2] */
|
|
|
|
/* acc0 += x[1] * y[srcBLen - 2] */
|
|
|
|
@ -316,10 +317,10 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
|
|
|
|
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
|
|
|
|
|
|
|
|
|
|
|
|
/* Read y[srcBLen - 3] sample */
|
|
|
|
/* Read y[srcBLen - 3] sample */
|
|
|
|
c0 = *(py--);
|
|
|
|
c0 = *py--;
|
|
|
|
|
|
|
|
|
|
|
|
/* Read x[5] sample */
|
|
|
|
/* Read x[5] sample */
|
|
|
|
x1 = *(px++);
|
|
|
|
x1 = *px++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
/* acc0 += x[2] * y[srcBLen - 3] */
|
|
|
|
/* acc0 += x[2] * y[srcBLen - 3] */
|
|
|
|
@ -332,10 +333,10 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
|
|
|
|
acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
|
|
|
|
|
|
|
|
|
|
|
|
/* Read y[srcBLen - 4] sample */
|
|
|
|
/* Read y[srcBLen - 4] sample */
|
|
|
|
c0 = *(py--);
|
|
|
|
c0 = *py--;
|
|
|
|
|
|
|
|
|
|
|
|
/* Read x[6] sample */
|
|
|
|
/* Read x[6] sample */
|
|
|
|
x2 = *(px++);
|
|
|
|
x2 = *px++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
/* acc0 += x[3] * y[srcBLen - 4] */
|
|
|
|
/* acc0 += x[3] * y[srcBLen - 4] */
|
|
|
|
@ -357,10 +358,10 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
while (k > 0U)
|
|
|
|
while (k > 0U)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/* Read y[srcBLen - 5] sample */
|
|
|
|
/* Read y[srcBLen - 5] sample */
|
|
|
|
c0 = *(py--);
|
|
|
|
c0 = *py--;
|
|
|
|
|
|
|
|
|
|
|
|
/* Read x[7] sample */
|
|
|
|
/* Read x[7] sample */
|
|
|
|
x3 = *(px++);
|
|
|
|
x3 = *px++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
/* Perform the multiply-accumulates */
|
|
|
|
/* acc0 += x[4] * y[srcBLen - 5] */
|
|
|
|
/* acc0 += x[4] * y[srcBLen - 5] */
|
|
|
|
@ -391,14 +392,7 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
count += 4U;
|
|
|
|
count += 4U;
|
|
|
|
|
|
|
|
|
|
|
|
/* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
|
/* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
|
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
|
|
|
|
px = pSrc1 + count;
|
|
|
|
{
|
|
|
|
|
|
|
|
px = pIn1 + firstIndex - srcBLen + 1 + count;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
px = pIn1 + count;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
py = pSrc2;
|
|
|
|
py = pSrc2;
|
|
|
|
|
|
|
|
|
|
|
|
/* Decrement the loop counter */
|
|
|
|
/* Decrement the loop counter */
|
|
|
|
@ -456,14 +450,7 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
count++;
|
|
|
|
count++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
|
/* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
|
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
|
|
|
|
px = pSrc1 + count;
|
|
|
|
{
|
|
|
|
|
|
|
|
px = pIn1 + firstIndex - srcBLen + 1 + count;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
px = pIn1 + count;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
py = pSrc2;
|
|
|
|
py = pSrc2;
|
|
|
|
|
|
|
|
|
|
|
|
/* Decrement the loop counter */
|
|
|
|
/* Decrement the loop counter */
|
|
|
|
@ -501,14 +488,7 @@ arm_status arm_conv_partial_fast_q31(
|
|
|
|
count++;
|
|
|
|
count++;
|
|
|
|
|
|
|
|
|
|
|
|
/* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
|
/* Update the inputA and inputB pointers for next MAC calculation */
|
|
|
|
if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
|
|
|
|
px = pSrc1 + count;
|
|
|
|
{
|
|
|
|
|
|
|
|
px = pIn1 + firstIndex - srcBLen + 1 + count;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
px = pIn1 + count;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
py = pSrc2;
|
|
|
|
py = pSrc2;
|
|
|
|
|
|
|
|
|
|
|
|
/* Decrement the loop counter */
|
|
|
|
/* Decrement the loop counter */
|
|
|
|
|