Corrected arm_conv_partial_q15(), arm_conv_q15(), arm_correlate_q15() for Cortex-M7 based cores.

pull/19/head
Martin Günther 10 years ago
parent 31eba363dd
commit 3a1cfe0e0b

@ -1,24 +1,24 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
* Copyright (C) 2010-2014 ARM Limited. All rights reserved. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
* *
* $Date: 19. March 2015 * $Date: 26. September 2016
* $Revision: V.1.4.5 * $Revision: V.1.4.5 a
* *
* Project: CMSIS DSP Library * Project: CMSIS DSP Library
* Title: arm_conv_partial_q15.c * Title: arm_conv_partial_q15.c
* *
* Description: Partial convolution of Q15 sequences. * Description: Partial convolution of Q15 sequences.
* *
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
* - Redistributions of source code must retain the above copyright * - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright * - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in * notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the * the documentation and/or other materials provided with the
* distribution. * distribution.
* - Neither the name of ARM LIMITED nor the names of its contributors * - Neither the name of ARM LIMITED nor the names of its contributors
* may be used to endorse or promote products derived from this * may be used to endorse or promote products derived from this
@ -27,7 +27,7 @@
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@ -35,39 +35,38 @@
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
* -------------------------------------------------------------------- */ * -------------------------------------------------------------------- */
#include "arm_math.h" #include "arm_math.h"
/** /**
* @ingroup groupFilters * @ingroup groupFilters
*/ */
/** /**
* @addtogroup PartialConv * @addtogroup PartialConv
* @{ * @{
*/ */
/** /**
* @brief Partial convolution of Q15 sequences. * @brief Partial convolution of Q15 sequences.
* @param[in] *pSrcA points to the first input sequence. * @param[in] *pSrcA points to the first input sequence.
* @param[in] srcALen length of the first input sequence. * @param[in] srcALen length of the first input sequence.
* @param[in] *pSrcB points to the second input sequence. * @param[in] *pSrcB points to the second input sequence.
* @param[in] srcBLen length of the second input sequence. * @param[in] srcBLen length of the second input sequence.
* @param[out] *pDst points to the location where the output result is written. * @param[out] *pDst points to the location where the output result is written.
* @param[in] firstIndex is the first output sample to start with. * @param[in] firstIndex is the first output sample to start with.
* @param[in] numPoints is the number of output points to be computed. * @param[in] numPoints is the number of output points to be computed.
* @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2]. * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
* *
* Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4. * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
* *
* \par * \par
* Refer to the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers. * Refer to the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
* *
*/ */
arm_status arm_conv_partial_q15( arm_status arm_conv_partial_q15(
q15_t * pSrcA, q15_t * pSrcA,
uint32_t srcALen, uint32_t srcALen,
@ -78,7 +77,8 @@ arm_status arm_conv_partial_q15(
uint32_t numPoints) uint32_t numPoints)
{ {
#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
#if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
/* Run the below code for Cortex-M4 and Cortex-M3 */ /* Run the below code for Cortex-M4 and Cortex-M3 */
@ -128,7 +128,7 @@ arm_status arm_conv_partial_q15(
srcALen = j; srcALen = j;
} }
/* Conditions to check which loopCounter holds /* Conditions to check which loopCounter holds
* the first and last indices of the output samples to be calculated. */ * the first and last indices of the output samples to be calculated. */
check = firstIndex + numPoints; check = firstIndex + numPoints;
blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0; blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
@ -141,31 +141,31 @@ arm_status arm_conv_partial_q15(
blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
/* The function is internally /* The function is internally
* divided into three stages according to the number of multiplications that has to be * divided into three stages according to the number of multiplications that has to be
* taken place between inputA samples and inputB samples. In the first stage of the * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration. * algorithm, the multiplications increase by one for every iteration.
* In the second stage of the algorithm, srcBLen number of multiplications are done. * In the second stage of the algorithm, srcBLen number of multiplications are done.
* In the third stage of the algorithm, the multiplications decrease by one * In the third stage of the algorithm, the multiplications decrease by one
* for every iteration. */ * for every iteration. */
/* Set the output pointer to point to the firstIndex /* Set the output pointer to point to the firstIndex
* of the output sample to be calculated. */ * of the output sample to be calculated. */
pOut = pDst + firstIndex; pOut = pDst + firstIndex;
/* -------------------------- /* --------------------------
* Initializations of stage1 * Initializations of stage1
* -------------------------*/ * -------------------------*/
/* sum = x[0] * y[0] /* sum = x[0] * y[0]
* sum = x[0] * y[1] + x[1] * y[0] * sum = x[0] * y[1] + x[1] * y[0]
* .... * ....
* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
*/ */
/* In this stage the MAC operations are increased by 1 for every iteration. /* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed. The count variable holds the number of MAC operations performed.
Since the partial convolution starts from firstIndex Since the partial convolution starts from firstIndex
Number of Macs to be performed is firstIndex + 1 */ Number of Macs to be performed is firstIndex + 1 */
count = 1u + firstIndex; count = 1u + firstIndex;
@ -176,8 +176,8 @@ arm_status arm_conv_partial_q15(
pSrc2 = pIn2 + firstIndex; pSrc2 = pIn2 + firstIndex;
py = pSrc2; py = pSrc2;
/* ------------------------ /* ------------------------
* Stage1 process * Stage1 process
* ----------------------*/ * ----------------------*/
/* For loop unrolling by 4, this stage is divided into two. */ /* For loop unrolling by 4, this stage is divided into two. */
@ -190,7 +190,7 @@ arm_status arm_conv_partial_q15(
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0; sum = 0;
/* Loop over number of MAC operations between /* Loop over number of MAC operations between
* inputA samples and inputB samples */ * inputA samples and inputB samples */
k = count; k = count;
@ -219,7 +219,7 @@ arm_status arm_conv_partial_q15(
/* The second part of the stage starts here */ /* The second part of the stage starts here */
/* The internal loop, over count, is unrolled by 4 */ /* The internal loop, over count, is unrolled by 4 */
/* To read the last two inputB samples using SIMD: /* To read the last two inputB samples using SIMD:
* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
py = py - 1; py = py - 1;
@ -231,7 +231,7 @@ arm_status arm_conv_partial_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = count >> 2u; k = count >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -245,11 +245,11 @@ arm_status arm_conv_partial_q15(
k--; k--;
} }
/* For the next MAC operations, the pointer py is used without SIMD /* For the next MAC operations, the pointer py is used without SIMD
* So, py is incremented by 1 */ * So, py is incremented by 1 */
py = py + 1u; py = py + 1u;
/* If the count is not a multiple of 4, compute any remaining MACs here. /* If the count is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = count % 0x4u; k = count % 0x4u;
@ -276,14 +276,14 @@ arm_status arm_conv_partial_q15(
blockSize1--; blockSize1--;
} }
/* -------------------------- /* --------------------------
* Initializations of stage2 * Initializations of stage2
* ------------------------*/ * ------------------------*/
/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
* .... * ....
* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
*/ */
/* Working pointer of inputA */ /* Working pointer of inputA */
@ -300,16 +300,16 @@ arm_status arm_conv_partial_q15(
pSrc2 = pIn2 + (srcBLen - 1u); pSrc2 = pIn2 + (srcBLen - 1u);
py = pSrc2; py = pSrc2;
/* count is the index by which the pointer pIn1 to be incremented */ /* count is the index by which the pointer pIn1 to be incremented */
count = 0u; count = 0u;
/* -------------------- /* --------------------
* Stage2 process * Stage2 process
* -------------------*/ * -------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2, * So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4 */ * srcBLen should be greater than or equal to 4 */
if(srcBLen >= 4u) if(srcBLen >= 4u)
{ {
@ -331,17 +331,17 @@ arm_status arm_conv_partial_q15(
x0 = *__SIMD32(px); x0 = *__SIMD32(px);
/* read x[1], x[2] samples */ /* read x[1], x[2] samples */
x1 = _SIMD32_OFFSET(px+1); x1 = _SIMD32_OFFSET(px+1);
px+= 2u; px+= 2u;
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2u; k = srcBLen >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
do do
{ {
/* Read the last two inputB samples using SIMD: /* Read the last two inputB samples using SIMD:
* y[srcBLen - 1] and y[srcBLen - 2] */ * y[srcBLen - 1] and y[srcBLen - 2] */
c0 = *__SIMD32(py)--; c0 = *__SIMD32(py)--;
@ -377,7 +377,7 @@ arm_status arm_conv_partial_q15(
/* Read x[5], x[6] */ /* Read x[5], x[6] */
x1 = _SIMD32_OFFSET(px+3); x1 = _SIMD32_OFFSET(px+3);
px += 4u; px += 4u;
/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
acc2 = __SMLALDX(x0, c0, acc2); acc2 = __SMLALDX(x0, c0, acc2);
@ -387,10 +387,10 @@ arm_status arm_conv_partial_q15(
} while(--k); } while(--k);
/* For the next MAC operations, SIMD is not used /* For the next MAC operations, SIMD is not used
* So, the 16 bit pointer of inputB, py is updated */ * So, the 16 bit pointer of inputB, py is updated */
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = srcBLen % 0x4u; k = srcBLen % 0x4u;
@ -411,7 +411,7 @@ arm_status arm_conv_partial_q15(
/* Read x[7] */ /* Read x[7] */
x3 = *__SIMD32(px); x3 = *__SIMD32(px);
px++; px++;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALD(x0, c0, acc0); acc0 = __SMLALD(x0, c0, acc0);
@ -430,7 +430,7 @@ arm_status arm_conv_partial_q15(
/* Read x[9] */ /* Read x[9] */
x2 = _SIMD32_OFFSET(px+1); x2 = _SIMD32_OFFSET(px+1);
px += 2u; px += 2u;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALDX(x0, c0, acc0); acc0 = __SMLALDX(x0, c0, acc0);
@ -456,7 +456,7 @@ arm_status arm_conv_partial_q15(
acc2 = __SMLALDX(x3, c0, acc2); acc2 = __SMLALDX(x3, c0, acc2);
acc3 = __SMLALDX(x2, c0, acc3); acc3 = __SMLALDX(x2, c0, acc3);
c0 = *(py-1); c0 = *(py-1);
#ifdef ARM_MATH_BIG_ENDIAN #ifdef ARM_MATH_BIG_ENDIAN
@ -468,7 +468,7 @@ arm_status arm_conv_partial_q15(
/* Read x[10] */ /* Read x[10] */
x3 = _SIMD32_OFFSET(px+2); x3 = _SIMD32_OFFSET(px+2);
px += 3u; px += 3u;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALDX(x1, c0, acc0); acc0 = __SMLALDX(x1, c0, acc0);
@ -507,10 +507,10 @@ arm_status arm_conv_partial_q15(
blkCnt--; blkCnt--;
} }
/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
blkCnt = (uint32_t) blockSize2 % 0x4u; blkCnt = (uint32_t) blockSize2 % 0x4u;
while(blkCnt > 0u) while(blkCnt > 0u)
{ {
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
@ -519,7 +519,7 @@ arm_status arm_conv_partial_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2u; k = srcBLen >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -533,7 +533,7 @@ arm_status arm_conv_partial_q15(
k--; k--;
} }
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = srcBLen % 0x4u; k = srcBLen % 0x4u;
@ -562,7 +562,7 @@ arm_status arm_conv_partial_q15(
} }
else else
{ {
/* If the srcBLen is less than 4, /* If the srcBLen is less than 4,
* the blockSize2 loop cannot be unrolled by 4 */ * the blockSize2 loop cannot be unrolled by 4 */
blkCnt = (uint32_t) blockSize2; blkCnt = (uint32_t) blockSize2;
@ -592,25 +592,25 @@ arm_status arm_conv_partial_q15(
/* Update the inputA and inputB pointers for next MAC calculation */ /* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count; px = pIn1 + count;
py = pSrc2; py = pSrc2;
/* Decrement the loop counter */ /* Decrement the loop counter */
blkCnt--; blkCnt--;
} }
} }
/* -------------------------- /* --------------------------
* Initializations of stage3 * Initializations of stage3
* -------------------------*/ * -------------------------*/
/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
* .... * ....
* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
* sum += x[srcALen-1] * y[srcBLen-1] * sum += x[srcALen-1] * y[srcBLen-1]
*/ */
/* In this stage the MAC operations are decreased by 1 for every iteration. /* In this stage the MAC operations are decreased by 1 for every iteration.
The count variable holds the number of MAC operations performed */ The count variable holds the number of MAC operations performed */
count = srcBLen - 1u; count = srcBLen - 1u;
@ -623,8 +623,8 @@ arm_status arm_conv_partial_q15(
pIn2 = pSrc2 - 1u; pIn2 = pSrc2 - 1u;
py = pIn2; py = pIn2;
/* ------------------- /* -------------------
* Stage3 process * Stage3 process
* ------------------*/ * ------------------*/
/* For loop unrolling by 4, this stage is divided into two. */ /* For loop unrolling by 4, this stage is divided into two. */
@ -642,14 +642,14 @@ arm_status arm_conv_partial_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = count >> 2u; k = count >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
* with y[srcBLen - 1], y[srcBLen - 2] respectively */ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
* with y[srcBLen - 3], y[srcBLen - 4] respectively */ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
@ -657,11 +657,11 @@ arm_status arm_conv_partial_q15(
k--; k--;
} }
/* For the next MAC operations, the pointer py is used without SIMD /* For the next MAC operations, the pointer py is used without SIMD
* So, py is incremented by 1 */ * So, py is incremented by 1 */
py = py + 1u; py = py + 1u;
/* If the count is not a multiple of 4, compute any remaining MACs here. /* If the count is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = count % 0x4u; k = count % 0x4u;
@ -691,7 +691,7 @@ arm_status arm_conv_partial_q15(
} }
/* The second part of the stage starts here */ /* The second part of the stage starts here */
/* SIMD is not used for the next MAC operations, /* SIMD is not used for the next MAC operations,
* so pointer py is updated to read only one sample at a time */ * so pointer py is updated to read only one sample at a time */
py = py + 1u; py = py + 1u;
@ -777,10 +777,10 @@ arm_status arm_conv_partial_q15(
} }
return (status); return (status);
#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
} }
/** /**
* @} end of PartialConv group * @} end of PartialConv group
*/ */

@ -1,24 +1,24 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
* Copyright (C) 2010-2014 ARM Limited. All rights reserved. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
* *
* $Date: 19. March 2015 * $Date: 26. September 2016
* $Revision: V.1.4.5 * $Revision: V.1.4.5 a
* *
* Project: CMSIS DSP Library * Project: CMSIS DSP Library
* Title: arm_conv_q15.c * Title: arm_conv_q15.c
* *
* Description: Convolution of Q15 sequences. * Description: Convolution of Q15 sequences.
* *
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
* - Redistributions of source code must retain the above copyright * - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright * - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in * notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the * the documentation and/or other materials provided with the
* distribution. * distribution.
* - Neither the name of ARM LIMITED nor the names of its contributors * - Neither the name of ARM LIMITED nor the names of its contributors
* may be used to endorse or promote products derived from this * may be used to endorse or promote products derived from this
@ -27,7 +27,7 @@
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@ -35,45 +35,45 @@
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
* -------------------------------------------------------------------- */ * -------------------------------------------------------------------- */
#include "arm_math.h" #include "arm_math.h"
/** /**
* @ingroup groupFilters * @ingroup groupFilters
*/ */
/** /**
* @addtogroup Conv * @addtogroup Conv
* @{ * @{
*/ */
/** /**
* @brief Convolution of Q15 sequences. * @brief Convolution of Q15 sequences.
* @param[in] *pSrcA points to the first input sequence. * @param[in] *pSrcA points to the first input sequence.
* @param[in] srcALen length of the first input sequence. * @param[in] srcALen length of the first input sequence.
* @param[in] *pSrcB points to the second input sequence. * @param[in] *pSrcB points to the second input sequence.
* @param[in] srcBLen length of the second input sequence. * @param[in] srcBLen length of the second input sequence.
* @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1. * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
* @return none. * @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
* This approach provides 33 guard bits and there is no risk of overflow.
* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*
* \par
* Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
* *
* \par * @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
* This approach provides 33 guard bits and there is no risk of overflow.
* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*
* \par
* Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
*
* \par
* Refer to the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers. * Refer to the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers.
* *
*/ */
void arm_conv_q15( void arm_conv_q15(
@ -84,7 +84,7 @@ void arm_conv_q15(
q15_t * pDst) q15_t * pDst)
{ {
#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
/* Run the below code for Cortex-M4 and Cortex-M3 */ /* Run the below code for Cortex-M4 and Cortex-M3 */
@ -124,30 +124,30 @@ void arm_conv_q15(
} }
/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
/* The function is internally /* The function is internally
* divided into three stages according to the number of multiplications that has to be * divided into three stages according to the number of multiplications that has to be
* taken place between inputA samples and inputB samples. In the first stage of the * taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration. * algorithm, the multiplications increase by one for every iteration.
* In the second stage of the algorithm, srcBLen number of multiplications are done. * In the second stage of the algorithm, srcBLen number of multiplications are done.
* In the third stage of the algorithm, the multiplications decrease by one * In the third stage of the algorithm, the multiplications decrease by one
* for every iteration. */ * for every iteration. */
/* The algorithm is implemented in three stages. /* The algorithm is implemented in three stages.
The loop counters of each stage is initiated here. */ The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1u; blockSize1 = srcBLen - 1u;
blockSize2 = srcALen - (srcBLen - 1u); blockSize2 = srcALen - (srcBLen - 1u);
/* -------------------------- /* --------------------------
* Initializations of stage1 * Initializations of stage1
* -------------------------*/ * -------------------------*/
/* sum = x[0] * y[0] /* sum = x[0] * y[0]
* sum = x[0] * y[1] + x[1] * y[0] * sum = x[0] * y[1] + x[1] * y[0]
* .... * ....
* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
*/ */
/* In this stage the MAC operations are increased by 1 for every iteration. /* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed */ The count variable holds the number of MAC operations performed */
count = 1u; count = 1u;
@ -158,8 +158,8 @@ void arm_conv_q15(
py = pIn2; py = pIn2;
/* ------------------------ /* ------------------------
* Stage1 process * Stage1 process
* ----------------------*/ * ----------------------*/
/* For loop unrolling by 4, this stage is divided into two. */ /* For loop unrolling by 4, this stage is divided into two. */
@ -172,7 +172,7 @@ void arm_conv_q15(
/* Accumulator is made zero for every iteration */ /* Accumulator is made zero for every iteration */
sum = 0; sum = 0;
/* Loop over number of MAC operations between /* Loop over number of MAC operations between
* inputA samples and inputB samples */ * inputA samples and inputB samples */
k = count; k = count;
@ -201,7 +201,7 @@ void arm_conv_q15(
/* The second part of the stage starts here */ /* The second part of the stage starts here */
/* The internal loop, over count, is unrolled by 4 */ /* The internal loop, over count, is unrolled by 4 */
/* To read the last two inputB samples using SIMD: /* To read the last two inputB samples using SIMD:
* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
py = py - 1; py = py - 1;
@ -213,7 +213,7 @@ void arm_conv_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = count >> 2u; k = count >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -227,11 +227,11 @@ void arm_conv_q15(
k--; k--;
} }
/* For the next MAC operations, the pointer py is used without SIMD /* For the next MAC operations, the pointer py is used without SIMD
* So, py is incremented by 1 */ * So, py is incremented by 1 */
py = py + 1u; py = py + 1u;
/* If the count is not a multiple of 4, compute any remaining MACs here. /* If the count is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = count % 0x4u; k = count % 0x4u;
@ -258,14 +258,14 @@ void arm_conv_q15(
blockSize1--; blockSize1--;
} }
/* -------------------------- /* --------------------------
* Initializations of stage2 * Initializations of stage2
* ------------------------*/ * ------------------------*/
/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
* .... * ....
* sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
*/ */
/* Working pointer of inputA */ /* Working pointer of inputA */
@ -279,12 +279,12 @@ void arm_conv_q15(
count = 0u; count = 0u;
/* -------------------- /* --------------------
* Stage2 process * Stage2 process
* -------------------*/ * -------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2, * So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4 */ * srcBLen should be greater than or equal to 4 */
if(srcBLen >= 4u) if(srcBLen >= 4u)
{ {
@ -306,17 +306,17 @@ void arm_conv_q15(
x0 = *__SIMD32(px); x0 = *__SIMD32(px);
/* read x[1], x[2] samples */ /* read x[1], x[2] samples */
x1 = _SIMD32_OFFSET(px+1); x1 = _SIMD32_OFFSET(px+1);
px+= 2u; px+= 2u;
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2u; k = srcBLen >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
do do
{ {
/* Read the last two inputB samples using SIMD: /* Read the last two inputB samples using SIMD:
* y[srcBLen - 1] and y[srcBLen - 2] */ * y[srcBLen - 1] and y[srcBLen - 2] */
c0 = *__SIMD32(py)--; c0 = *__SIMD32(py)--;
@ -352,7 +352,7 @@ void arm_conv_q15(
/* Read x[5], x[6] */ /* Read x[5], x[6] */
x1 = _SIMD32_OFFSET(px+3); x1 = _SIMD32_OFFSET(px+3);
px += 4u; px += 4u;
/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
acc2 = __SMLALDX(x0, c0, acc2); acc2 = __SMLALDX(x0, c0, acc2);
@ -362,10 +362,10 @@ void arm_conv_q15(
} while(--k); } while(--k);
/* For the next MAC operations, SIMD is not used /* For the next MAC operations, SIMD is not used
* So, the 16-bit pointer of inputB, py, is updated */ * So, the 16-bit pointer of inputB, py, is updated */
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = srcBLen % 0x4u; k = srcBLen % 0x4u;
@ -385,7 +385,7 @@ void arm_conv_q15(
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */ /* Read x[7] */
x3 = *__SIMD32(px); x3 = *__SIMD32(px);
px++; px++;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALD(x0, c0, acc0); acc0 = __SMLALD(x0, c0, acc0);
@ -404,7 +404,7 @@ void arm_conv_q15(
/* Read x[9] */ /* Read x[9] */
x2 = _SIMD32_OFFSET(px+1); x2 = _SIMD32_OFFSET(px+1);
px += 2u; px += 2u;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALDX(x0, c0, acc0); acc0 = __SMLALDX(x0, c0, acc0);
@ -430,7 +430,7 @@ void arm_conv_q15(
acc2 = __SMLALDX(x3, c0, acc2); acc2 = __SMLALDX(x3, c0, acc2);
acc3 = __SMLALDX(x2, c0, acc3); acc3 = __SMLALDX(x2, c0, acc3);
c0 = *(py-1); c0 = *(py-1);
#ifdef ARM_MATH_BIG_ENDIAN #ifdef ARM_MATH_BIG_ENDIAN
@ -441,7 +441,7 @@ void arm_conv_q15(
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[10] */ /* Read x[10] */
x3 = _SIMD32_OFFSET(px+2); x3 = _SIMD32_OFFSET(px+2);
px += 3u; px += 3u;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALDX(x1, c0, acc0); acc0 = __SMLALDX(x1, c0, acc0);
@ -480,7 +480,7 @@ void arm_conv_q15(
blkCnt--; blkCnt--;
} }
/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
blkCnt = blockSize2 % 0x4u; blkCnt = blockSize2 % 0x4u;
@ -492,7 +492,7 @@ void arm_conv_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2u; k = srcBLen >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -506,7 +506,7 @@ void arm_conv_q15(
k--; k--;
} }
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = srcBLen % 0x4u; k = srcBLen % 0x4u;
@ -535,7 +535,7 @@ void arm_conv_q15(
} }
else else
{ {
/* If srcBLen is less than 4, /* If srcBLen is less than 4,
* the blockSize2 loop cannot be unrolled by 4 */ * the blockSize2 loop cannot be unrolled by 4 */
blkCnt = blockSize2; blkCnt = blockSize2;
@ -572,18 +572,18 @@ void arm_conv_q15(
} }
/* -------------------------- /* --------------------------
* Initializations of stage3 * Initializations of stage3
* -------------------------*/ * -------------------------*/
/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
* .... * ....
* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
* sum += x[srcALen-1] * y[srcBLen-1] * sum += x[srcALen-1] * y[srcBLen-1]
*/ */
/* In this stage the MAC operations are decreased by 1 for every iteration. /* In this stage the MAC operations are decreased by 1 for every iteration.
The blockSize3 variable holds the number of MAC operations performed */ The blockSize3 variable holds the number of MAC operations performed */
blockSize3 = srcBLen - 1u; blockSize3 = srcBLen - 1u;
@ -597,8 +597,8 @@ void arm_conv_q15(
pIn2 = pSrc2 - 1u; pIn2 = pSrc2 - 1u;
py = pIn2; py = pIn2;
/* ------------------- /* -------------------
* Stage3 process * Stage3 process
* ------------------*/ * ------------------*/
/* For loop unrolling by 4, this stage is divided into two. */ /* For loop unrolling by 4, this stage is divided into two. */
@ -616,14 +616,14 @@ void arm_conv_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = blockSize3 >> 2u; k = blockSize3 >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
* with y[srcBLen - 1], y[srcBLen - 2] respectively */ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
* with y[srcBLen - 3], y[srcBLen - 4] respectively */ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum); sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
@ -631,11 +631,11 @@ void arm_conv_q15(
k--; k--;
} }
/* For the next MAC operations, the pointer py is used without SIMD /* For the next MAC operations, the pointer py is used without SIMD
* So, py is incremented by 1 */ * So, py is incremented by 1 */
py = py + 1u; py = py + 1u;
/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = blockSize3 % 0x4u; k = blockSize3 % 0x4u;
@ -662,7 +662,7 @@ void arm_conv_q15(
} }
/* The second part of the stage starts here */ /* The second part of the stage starts here */
/* SIMD is not used for the next MAC operations, /* SIMD is not used for the next MAC operations,
* so pointer py is updated to read only one sample at a time */ * so pointer py is updated to read only one sample at a time */
py = py + 1u; py = py + 1u;
@ -725,10 +725,10 @@ void arm_conv_q15(
pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u); pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
} }
#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/ #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
} }
/** /**
* @} end of Conv group * @} end of Conv group
*/ */

@ -1,24 +1,24 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
* Copyright (C) 2010-2014 ARM Limited. All rights reserved. * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
* *
* $Date: 19. March 2015 * $Date: 26. September 2016
* $Revision: V.1.4.5 * $Revision: V.1.4.5 a
* *
* Project: CMSIS DSP Library * Project: CMSIS DSP Library
* Title: arm_correlate_q15.c * Title: arm_correlate_q15.c
* *
* Description: Correlation of Q15 sequences. * Description: Correlation of Q15 sequences.
* *
* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
* *
* Redistribution and use in source and binary forms, with or without * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions * modification, are permitted provided that the following conditions
* are met: * are met:
* - Redistributions of source code must retain the above copyright * - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer. * notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright * - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in * notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the * the documentation and/or other materials provided with the
* distribution. * distribution.
* - Neither the name of ARM LIMITED nor the names of its contributors * - Neither the name of ARM LIMITED nor the names of its contributors
* may be used to endorse or promote products derived from this * may be used to endorse or promote products derived from this
@ -27,7 +27,7 @@
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@ -35,45 +35,45 @@
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE. * POSSIBILITY OF SUCH DAMAGE.
* -------------------------------------------------------------------- */ * -------------------------------------------------------------------- */
#include "arm_math.h" #include "arm_math.h"
/** /**
* @ingroup groupFilters * @ingroup groupFilters
*/ */
/** /**
* @addtogroup Corr * @addtogroup Corr
* @{ * @{
*/ */
/** /**
* @brief Correlation of Q15 sequences. * @brief Correlation of Q15 sequences.
* @param[in] *pSrcA points to the first input sequence. * @param[in] *pSrcA points to the first input sequence.
* @param[in] srcALen length of the first input sequence. * @param[in] srcALen length of the first input sequence.
* @param[in] *pSrcB points to the second input sequence. * @param[in] *pSrcB points to the second input sequence.
* @param[in] srcBLen length of the second input sequence. * @param[in] srcBLen length of the second input sequence.
* @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
* @return none. * @return none.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
* This approach provides 33 guard bits and there is no risk of overflow.
* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*
* \par
* Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
* *
* \par * @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator.
* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
* This approach provides 33 guard bits and there is no risk of overflow.
* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*
* \par
* Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
*
* \par
* Refer to the function <code>arm_correlate_opt_q15()</code> for a faster implementation of this function using scratch buffers. * Refer to the function <code>arm_correlate_opt_q15()</code> for a faster implementation of this function using scratch buffers.
* *
*/ */
void arm_correlate_q15( void arm_correlate_q15(
@ -84,7 +84,7 @@ void arm_correlate_q15(
q15_t * pDst) q15_t * pDst)
{ {
#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
/* Run the below code for Cortex-M4 and Cortex-M3 */ /* Run the below code for Cortex-M4 and Cortex-M3 */
@ -107,11 +107,11 @@ void arm_correlate_q15(
/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
/* and the destination pointer modifier, inc is set to -1 */ /* and the destination pointer modifier, inc is set to -1 */
/* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
/* But to improve the performance, /* But to improve the performance,
* we include zeroes in the output instead of zero padding either of the inputs */ * we include zeroes in the output instead of zero padding either of the inputs */
/* If srcALen > srcBLen, /* If srcALen > srcBLen,
* (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */ * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
/* If srcALen < srcBLen, /* If srcALen < srcBLen,
* (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */ * (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */
if(srcALen >= srcBLen) if(srcALen >= srcBLen)
{ {
@ -124,9 +124,9 @@ void arm_correlate_q15(
/* Number of output samples is calculated */ /* Number of output samples is calculated */
outBlockSize = (2u * srcALen) - 1u; outBlockSize = (2u * srcALen) - 1u;
/* When srcALen > srcBLen, zero padding is done to srcB /* When srcALen > srcBLen, zero padding is done to srcB
* to make their lengths equal. * to make their lengths equal.
* Instead, (outBlockSize - (srcALen + srcBLen - 1)) * Instead, (outBlockSize - (srcALen + srcBLen - 1))
* number of output samples are made zero */ * number of output samples are made zero */
j = outBlockSize - (srcALen + (srcBLen - 1u)); j = outBlockSize - (srcALen + (srcBLen - 1u));
@ -156,30 +156,30 @@ void arm_correlate_q15(
} }
/* The function is internally /* The function is internally
* divided into three parts according to the number of multiplications that have to * divided into three parts according to the number of multiplications that have to
* take place between inputA samples and inputB samples. In the first part of the * take place between inputA samples and inputB samples. In the first part of the
* algorithm, the multiplications increase by one for every iteration. * algorithm, the multiplications increase by one for every iteration.
* In the second part of the algorithm, srcBLen number of multiplications are done. * In the second part of the algorithm, srcBLen number of multiplications are done.
* In the third part of the algorithm, the multiplications decrease by one * In the third part of the algorithm, the multiplications decrease by one
* for every iteration.*/ * for every iteration.*/
/* The algorithm is implemented in three stages. /* The algorithm is implemented in three stages.
* The loop counters of each stage are initialized here. */ * The loop counters of each stage are initialized here. */
blockSize1 = srcBLen - 1u; blockSize1 = srcBLen - 1u;
blockSize2 = srcALen - (srcBLen - 1u); blockSize2 = srcALen - (srcBLen - 1u);
blockSize3 = blockSize1; blockSize3 = blockSize1;
/* -------------------------- /* --------------------------
* Initializations of stage1 * Initializations of stage1
* -------------------------*/ * -------------------------*/
/* sum = x[0] * y[srcBlen - 1] /* sum = x[0] * y[srcBlen - 1]
* sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
* .... * ....
* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
*/ */
/* In this stage the MAC operations are increased by 1 for every iteration. /* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed */ The count variable holds the number of MAC operations performed */
count = 1u; count = 1u;
@ -190,8 +190,8 @@ void arm_correlate_q15(
pSrc1 = pIn2 + (srcBLen - 1u); pSrc1 = pIn2 + (srcBLen - 1u);
py = pSrc1; py = pSrc1;
/* ------------------------ /* ------------------------
* Stage1 process * Stage1 process
* ----------------------*/ * ----------------------*/
/* The first loop starts here */ /* The first loop starts here */
@ -203,7 +203,7 @@ void arm_correlate_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = count >> 2; k = count >> 2;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -216,7 +216,7 @@ void arm_correlate_q15(
k--; k--;
} }
/* If the count is not a multiple of 4, compute any remaining MACs here. /* If the count is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = count % 0x4u; k = count % 0x4u;
@ -246,14 +246,14 @@ void arm_correlate_q15(
blockSize1--; blockSize1--;
} }
/* -------------------------- /* --------------------------
* Initializations of stage2 * Initializations of stage2
* ------------------------*/ * ------------------------*/
/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
* sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
* .... * ....
* sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
*/ */
/* Working pointer of inputA */ /* Working pointer of inputA */
@ -265,12 +265,12 @@ void arm_correlate_q15(
/* count is the index by which the pointer pIn1 is to be incremented */ /* count is the index by which the pointer pIn1 is to be incremented */
count = 0u; count = 0u;
/* ------------------- /* -------------------
* Stage2 process * Stage2 process
* ------------------*/ * ------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2, * So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
if(srcBLen >= 4u) if(srcBLen >= 4u)
{ {
@ -289,16 +289,16 @@ void arm_correlate_q15(
x0 = *__SIMD32(px); x0 = *__SIMD32(px);
/* read x[1], x[2] samples */ /* read x[1], x[2] samples */
x1 = _SIMD32_OFFSET(px + 1); x1 = _SIMD32_OFFSET(px + 1);
px += 2u; px += 2u;
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2u; k = srcBLen >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
do do
{ {
/* Read the first two inputB samples using SIMD: /* Read the first two inputB samples using SIMD:
* y[0] and y[1] */ * y[0] and y[1] */
c0 = *__SIMD32(py)++; c0 = *__SIMD32(py)++;
@ -335,7 +335,7 @@ void arm_correlate_q15(
/* Read x[5], x[6] */ /* Read x[5], x[6] */
x1 = _SIMD32_OFFSET(px + 3); x1 = _SIMD32_OFFSET(px + 3);
px += 4u; px += 4u;
/* acc2 += x[4] * y[2] + x[5] * y[3] */ /* acc2 += x[4] * y[2] + x[5] * y[3] */
acc2 = __SMLALD(x0, c0, acc2); acc2 = __SMLALD(x0, c0, acc2);
@ -345,7 +345,7 @@ void arm_correlate_q15(
} while(--k); } while(--k);
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = srcBLen % 0x4u; k = srcBLen % 0x4u;
@ -364,7 +364,7 @@ void arm_correlate_q15(
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[7] */ /* Read x[7] */
x3 = *__SIMD32(px); x3 = *__SIMD32(px);
px++; px++;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALD(x0, c0, acc0); acc0 = __SMLALD(x0, c0, acc0);
@ -383,7 +383,7 @@ void arm_correlate_q15(
/* Read x[9] */ /* Read x[9] */
x2 = _SIMD32_OFFSET(px + 1); x2 = _SIMD32_OFFSET(px + 1);
px += 2u; px += 2u;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALD(x0, c0, acc0); acc0 = __SMLALD(x0, c0, acc0);
@ -421,7 +421,7 @@ void arm_correlate_q15(
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */ #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
/* Read x[10] */ /* Read x[10] */
x3 = _SIMD32_OFFSET(px + 2); x3 = _SIMD32_OFFSET(px + 2);
px += 3u; px += 3u;
/* Perform the multiply-accumulates */ /* Perform the multiply-accumulates */
acc0 = __SMLALDX(x1, c0, acc0); acc0 = __SMLALDX(x1, c0, acc0);
@ -455,7 +455,7 @@ void arm_correlate_q15(
blkCnt--; blkCnt--;
} }
/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
blkCnt = blockSize2 % 0x4u; blkCnt = blockSize2 % 0x4u;
@ -467,7 +467,7 @@ void arm_correlate_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = srcBLen >> 2u; k = srcBLen >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -481,7 +481,7 @@ void arm_correlate_q15(
k--; k--;
} }
/* If the srcBLen is not a multiple of 4, compute any remaining MACs here. /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = srcBLen % 0x4u; k = srcBLen % 0x4u;
@ -512,7 +512,7 @@ void arm_correlate_q15(
} }
else else
{ {
/* If srcBLen is less than 4, /* If srcBLen is less than 4,
* the blockSize2 loop cannot be unrolled by 4 */ * the blockSize2 loop cannot be unrolled by 4 */
blkCnt = blockSize2; blkCnt = blockSize2;
@ -550,18 +550,18 @@ void arm_correlate_q15(
} }
} }
/* -------------------------- /* --------------------------
* Initializations of stage3 * Initializations of stage3
* -------------------------*/ * -------------------------*/
/* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
* sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
* .... * ....
* sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
* sum += x[srcALen-1] * y[0] * sum += x[srcALen-1] * y[0]
*/ */
/* In this stage the MAC operations are decreased by 1 for every iteration. /* In this stage the MAC operations are decreased by 1 for every iteration.
The count variable holds the number of MAC operations performed */ The count variable holds the number of MAC operations performed */
count = srcBLen - 1u; count = srcBLen - 1u;
@ -572,8 +572,8 @@ void arm_correlate_q15(
/* Working pointer of inputB */ /* Working pointer of inputB */
py = pIn2; py = pIn2;
/* ------------------- /* -------------------
* Stage3 process * Stage3 process
* ------------------*/ * ------------------*/
while(blockSize3 > 0u) while(blockSize3 > 0u)
@ -584,7 +584,7 @@ void arm_correlate_q15(
/* Apply loop unrolling and compute 4 MACs simultaneously. */ /* Apply loop unrolling and compute 4 MACs simultaneously. */
k = count >> 2u; k = count >> 2u;
/* First part of the processing with loop unrolling. Compute 4 MACs at a time. /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
** a second loop below computes MACs for the remaining 1 to 3 samples. */ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
while(k > 0u) while(k > 0u)
{ {
@ -598,7 +598,7 @@ void arm_correlate_q15(
k--; k--;
} }
/* If the count is not a multiple of 4, compute any remaining MACs here. /* If the count is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */ ** No loop unrolling is used. */
k = count % 0x4u; k = count % 0x4u;
@ -644,14 +644,14 @@ void arm_correlate_q15(
/* But CORR(x, y) is reverse of CORR(y, x) */ /* But CORR(x, y) is reverse of CORR(y, x) */
/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
/* and a variable, inv is set to 1 */ /* and a variable, inv is set to 1 */
/* If lengths are not equal then zero pad has to be done to make the two /* If lengths are not equal then zero pad has to be done to make the two
* inputs of same length. But to improve the performance, we include zeroes * inputs of same length. But to improve the performance, we include zeroes
* in the output instead of zero padding either of the inputs */ * in the output instead of zero padding either of the inputs */
/* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at the /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included at the
* start of the output buffer */ * start of the output buffer */
/* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at the /* If srcALen < srcBLen, (srcBLen - srcALen) zeroes have to be included at the
* end of the output buffer */ * end of the output buffer */
/* Once the zero padding is done, the remainder of the output is calculated /* Once the zero padding is done, the remainder of the output is calculated
* using convolution but with the shorter signal time shifted. */ * using convolution but with the shorter signal time shifted. */
/* Calculate the length of the remaining sequence */ /* Calculate the length of the remaining sequence */
@ -710,10 +710,10 @@ void arm_correlate_q15(
*pDst++ = (q15_t) __SSAT((sum >> 15u), 16u); *pDst++ = (q15_t) __SSAT((sum >> 15u), 16u);
} }
#endif /*#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */ #endif /* #if (defined(ARM_MATH_CM7) || defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
} }
/** /**
* @} end of Corr group * @} end of Corr group
*/ */

Loading…
Cancel
Save