Updated the indentation and added the optimized var function

pull/39/head
Silfurion 4 years ago
parent c6b18e9e92
commit 433ecc8b22

@ -47,85 +47,85 @@
*/
#if defined(ARM_MATH_NEON)
void arm_dot_prod_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary return variable */
float64x2_t sumV ; /* Neon buffer for sum variable */
/* Neon Buffer Initialisation */
sumV = vsetq_lane_f64(0.0f, sumV, 0);
sumV = vsetq_lane_f64(0.0f, sumV, 1);
/* Neon Buffer for sources */
float64x2_t pSrcAV;
float64x2_t pSrcBV;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Load source value in Neon Buffer */
pSrcAV = vld1q_f64(pSrcA);
pSrcBV = vld1q_f64(pSrcB);
/* Calculate dot product and store result in a temporary buffer. */
sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV);
pSrcA+=2;
pSrcB+=2;
/* Decrement loop counter */
blkCnt--;
}
/* Sum both 64 bits part in the float64x2 */
sum = vaddvq_f64(sumV);
/* Tail */
blkCnt = blockSize & 1 ;
while(blkCnt > 0U)
{
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary return variable */
float64x2_t sumV ; /* Neon buffer for sum variable */
/* Neon Buffer Initialisation */
sumV = vsetq_lane_f64(0.0f, sumV, 0);
sumV = vsetq_lane_f64(0.0f, sumV, 1);
/* Neon Buffer for sources */
float64x2_t pSrcAV;
float64x2_t pSrcBV;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Load source value in Neon Buffer */
pSrcAV = vld1q_f64(pSrcA);
pSrcBV = vld1q_f64(pSrcB);
/* Calculate dot product and store result in a temporary buffer. */
sumV = vmlaq_f64(sumV, pSrcAV, pSrcBV);
pSrcA+=2;
pSrcB+=2;
/* Decrement loop counter */
blkCnt--;
}
/* Sum both 64 bits part in the float64x2 */
sum = vaddvq_f64(sumV);
/* Tail */
blkCnt = blockSize & 1 ;
while(blkCnt > 0U)
{
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
#else
void arm_dot_prod_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary return variable */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary return variable */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
#endif

@ -48,49 +48,49 @@
*/
float64_t arm_chebyshev_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
float64_t diff=0., maxVal,tmpA, tmpB;
uint32_t blkCnt;
maxVal = F64_MIN;
float64_t diff=0., maxVal,tmpA, tmpB;
uint32_t blkCnt;
maxVal = F64_MIN;
#if defined(ARM_MATH_NEON)
float64x2_t diffV , tmpAV , tmpBV , maxValV ;
maxValV = vdupq_n_f64(maxVal);
blkCnt = blockSize >> 1U ;
while(blkCnt > 0U)
{
tmpAV = vld1q_f64(pA);
tmpBV = vld1q_f64(pB);
diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV)));
maxValV = vmaxq_f64(maxValV, diffV);
pA+=2;
pB+=2;
blkCnt--;
}
maxVal =vgetq_lane_f64(maxValV, 0);
if(maxVal < vgetq_lane_f64(maxValV, 1))
{
maxVal = vgetq_lane_f64(maxValV, 1);
}
blkCnt = blockSize & 1;
float64x2_t diffV , tmpAV , tmpBV , maxValV ;
maxValV = vdupq_n_f64(maxVal);
blkCnt = blockSize >> 1U ;
while(blkCnt > 0U)
{
tmpAV = vld1q_f64(pA);
tmpBV = vld1q_f64(pB);
diffV = vabsq_f64((vsubq_f64(tmpAV, tmpBV)));
maxValV = vmaxq_f64(maxValV, diffV);
pA+=2;
pB+=2;
blkCnt--;
}
maxVal =vgetq_lane_f64(maxValV, 0);
if(maxVal < vgetq_lane_f64(maxValV, 1))
{
maxVal = vgetq_lane_f64(maxValV, 1);
}
blkCnt = blockSize & 1;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while(blkCnt > 0)
{
tmpA = *pA++;
tmpB = *pB++;
diff = fabs(tmpA - tmpB);
if (diff > maxVal)
{
maxVal = diff;
}
blkCnt --;
}
return(maxVal);
while(blkCnt > 0)
{
tmpA = *pA++;
tmpB = *pB++;
diff = fabs(tmpA - tmpB);
if (diff > maxVal)
{
maxVal = diff;
}
blkCnt --;
}
return(maxVal);
}
/**

@ -47,41 +47,41 @@
*/
float64_t arm_cityblock_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
float64_t accum,tmpA, tmpB;
uint32_t blkCnt;
accum = 0.;
float64_t accum,tmpA, tmpB;
uint32_t blkCnt;
accum = 0.;
#if defined(ARM_MATH_NEON)
float64x2_t tmpAV, tmpBV,accumV , subV;
accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U;
while(blkCnt > 0U)
{
tmpAV = vld1q_f64(pA);
tmpBV = vld1q_f64(pB);
subV = vabdq_f64(tmpAV, tmpBV);
accumV = vaddq_f64(accumV, subV);
pA+=2;
pB+=2;
blkCnt--;
}
accum = vaddvq_f64(accumV);
blkCnt = blockSize & 1 ;
float64x2_t tmpAV, tmpBV,accumV , subV;
accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U;
while(blkCnt > 0U)
{
tmpAV = vld1q_f64(pA);
tmpBV = vld1q_f64(pB);
subV = vabdq_f64(tmpAV, tmpBV);
accumV = vaddq_f64(accumV, subV);
pA+=2;
pB+=2;
blkCnt--;
}
accum = vaddvq_f64(accumV);
blkCnt = blockSize & 1 ;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while(blkCnt > 0)
{
tmpA = *pA++;
tmpB = *pB++;
accum += fabs(tmpA - tmpB);
blkCnt--;
}
return(accum);
while(blkCnt > 0)
{
tmpA = *pA++;
tmpB = *pB++;
accum += fabs(tmpA - tmpB);
blkCnt--;
}
return(accum);
}
/**

@ -49,35 +49,35 @@
*/
float64_t arm_euclidean_distance_f64(const float64_t *pA,const float64_t *pB, uint32_t blockSize)
{
float64_t accum=0.,tmp;
uint32_t blkCnt;
float64_t accum=0.,tmp;
uint32_t blkCnt;
#if defined(ARM_MATH_NEON)
float64x2_t accumV,tmpV , pAV ,pBV;
accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U;
while(blkCnt > 0U)
{
pAV = vld1q_f64(pA);
pBV = vld1q_f64(pB);
tmpV = vsubq_f64(pAV, pBV);
accumV = vmlaq_f64(accumV, tmpV, tmpV);
pA+=2;
pB+=2;
blkCnt--;
}
accum = vaddvq_f64(accumV);
blkCnt = blockSize & 1;
float64x2_t accumV,tmpV , pAV ,pBV;
accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U;
while(blkCnt > 0U)
{
pAV = vld1q_f64(pA);
pBV = vld1q_f64(pB);
tmpV = vsubq_f64(pAV, pBV);
accumV = vmlaq_f64(accumV, tmpV, tmpV);
pA+=2;
pB+=2;
blkCnt--;
}
accum = vaddvq_f64(accumV);
blkCnt = blockSize & 1;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while(blkCnt > 0)
{
tmp = *pA++ - *pB++;
accum += SQ(tmp);
blkCnt --;
}
tmp = sqrt(accum);
return(tmp);
while(blkCnt > 0)
{
tmp = *pA++ - *pB++;
accum += SQ(tmp);
blkCnt --;
}
tmp = sqrt(accum);
return(tmp);
}
/**

@ -33,44 +33,44 @@
#endif
void arm_vexp_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt;
uint32_t blkCnt;
#if defined(ARM_MATH_NEON)
float64x2_t src;
float64x2_t dst;
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
src = vld1q_f64(pSrc);
dst = vexpq_f64(src);
vst1q_f64(pDst, dst);
pSrc += 2;
pDst += 2;
blkCnt--;
}
blkCnt = blockSize & 1;
float64x2_t src;
float64x2_t dst;
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
src = vld1q_f64(pSrc);
dst = vexpq_f64(src);
vst1q_f64(pDst, dst);
pSrc += 2;
pDst += 2;
blkCnt--;
}
blkCnt = blockSize & 1;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
/* C = log(A) */
/* Calculate log and store result in destination buffer. */
*pDst++ = exp(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
while (blkCnt > 0U)
{
/* C = log(A) */
/* Calculate log and store result in destination buffer. */
*pDst++ = exp(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}

@ -34,43 +34,43 @@
#endif
void arm_vlog_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt;
uint32_t blkCnt;
#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
float64x2_t src;
float64x2_t dst;
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
src = vld1q_f64(pSrc);
dst = vlogq_f64(src);
vst1q_f64(pDst, dst);
pSrc += 2;
pDst += 2;
/* Decrement loop counter */
blkCnt--;
}
blkCnt = blockSize & 1;
float64x2_t src;
float64x2_t dst;
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
src = vld1q_f64(pSrc);
dst = vlogq_f64(src);
vst1q_f64(pDst, dst);
pSrc += 2;
pDst += 2;
/* Decrement loop counter */
blkCnt--;
}
blkCnt = blockSize & 1;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
/* C = log(A) */
/* Calculate log and store result in destination buffer. */
*pDst++ = log(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
while (blkCnt > 0U)
{
/* C = log(A) */
/* Calculate log and store result in destination buffer. */
*pDst++ = log(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}

File diff suppressed because it is too large Load Diff

@ -63,22 +63,22 @@
*/
void arm_biquad_cascade_df2T_init_f64(
arm_biquad_cascade_df2T_instance_f64 * S,
uint8_t numStages,
const float64_t * pCoeffs,
float64_t * pState)
arm_biquad_cascade_df2T_instance_f64 * S,
uint8_t numStages,
const float64_t * pCoeffs,
float64_t * pState)
{
/* Assign filter stages */
S->numStages = numStages;
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
/* Clear state buffer and size is always 2 * numStages */
memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t));
/* Assign state pointer */
S->pState = pState;
/* Assign filter stages */
S->numStages = numStages;
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
/* Clear state buffer and size is always 2 * numStages */
memset(pState, 0, (2U * (uint32_t) numStages) * sizeof(float64_t));
/* Assign state pointer */
S->pState = pState;
}
/**

@ -48,384 +48,384 @@
*/
void arm_correlate_f64(
const float64_t * pSrcA,
uint32_t srcALen,
const float64_t * pSrcB,
uint32_t srcBLen,
float64_t * pDst)
const float64_t * pSrcA,
uint32_t srcALen,
const float64_t * pSrcB,
uint32_t srcBLen,
float64_t * pDst)
{
const float64_t *pIn1; /* InputA pointer */
const float64_t *pIn2; /* InputB pointer */
float64_t *pOut = pDst; /* Output pointer */
const float64_t *px; /* Intermediate inputA pointer */
const float64_t *py; /* Intermediate inputB pointer */
const float64_t *pSrc1;
float64_t sum;
uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
uint32_t j, k, count, blkCnt; /* Loop counters */
uint32_t outBlockSize; /* Loop counter */
int32_t inc = 1; /* Destination address modifier */
const float64_t *pIn1; /* InputA pointer */
const float64_t *pIn2; /* InputB pointer */
float64_t *pOut = pDst; /* Output pointer */
const float64_t *px; /* Intermediate inputA pointer */
const float64_t *py; /* Intermediate inputB pointer */
const float64_t *pSrc1;
float64_t sum;
uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
uint32_t j, k, count, blkCnt; /* Loop counters */
uint32_t outBlockSize; /* Loop counter */
int32_t inc = 1; /* Destination address modifier */
#if defined(ARM_MATH_NEON)
float64x2_t sumV,pxV,pyV ;
float64x2_t sumV,pxV,pyV ;
#endif
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
/* But CORR(x, y) is reverse of CORR(y, x) */
/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
/* and the destination pointer modifier, inc is set to -1 */
/* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
/* But to improve the performance,
* we assume zeroes in the output instead of zero padding either of the the inputs*/
/* If srcALen > srcBLen,
* (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
/* If srcALen < srcBLen,
* (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
pIn1 = pSrcA;
/* Initialization of inputB pointer */
pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
/* When srcALen > srcBLen, zero padding has to be done to srcB
* to make their lengths equal.
* Instead, (outBlockSize - (srcALen + srcBLen - 1))
* number of output samples are made zero */
j = outBlockSize - (srcALen + (srcBLen - 1U));
/* Updating the pointer position to non zero value */
pOut += j;
}
else
{
/* Initialization of inputA pointer */
pIn1 = pSrcB;
/* Initialization of inputB pointer */
pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
srcBLen = srcALen;
srcALen = j;
/* CORR(x, y) = Reverse order(CORR(y, x)) */
/* Hence set the destination pointer to point to the last output sample */
pOut = pDst + ((srcALen + srcBLen) - 2U);
/* Destination address modifier is set to -1 */
inc = -1;
}
/* The function is internally
* divided into three stages according to the number of multiplications that has to be
* taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
* In the second stage of the algorithm, srcBLen number of multiplications are done.
* In the third stage of the algorithm, the multiplications decrease by one
* for every iteration. */
/* The algorithm is implemented in three stages.
The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
/* --------------------------
* Initializations of stage1
* -------------------------*/
/* sum = x[0] * y[srcBlen - 1]
* sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
* ....
* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
*/
/* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed */
count = 1U;
/* Working pointer of inputA */
px = pIn1;
/* Working pointer of inputB */
pSrc1 = pIn2 + (srcBLen - 1U);
py = pSrc1;
/* ------------------------
* Stage1 process
* ----------------------*/
/* The first stage starts here */
while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
/* The algorithm implementation is based on the lengths of the inputs. */
/* srcB is always made to slide across srcA. */
/* So srcBLen is always considered as shorter or equal to srcALen */
/* But CORR(x, y) is reverse of CORR(y, x) */
/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
/* and the destination pointer modifier, inc is set to -1 */
/* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
/* But to improve the performance,
* we assume zeroes in the output instead of zero padding either of the the inputs*/
/* If srcALen > srcBLen,
* (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
/* If srcALen < srcBLen,
* (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
if (srcALen >= srcBLen)
{
/* Initialization of inputA pointer */
pIn1 = pSrcA;
/* Initialization of inputB pointer */
pIn2 = pSrcB;
/* Number of output samples is calculated */
outBlockSize = (2U * srcALen) - 1U;
/* When srcALen > srcBLen, zero padding has to be done to srcB
* to make their lengths equal.
* Instead, (outBlockSize - (srcALen + srcBLen - 1))
* number of output samples are made zero */
j = outBlockSize - (srcALen + (srcBLen - 1U));
/* Updating the pointer position to non zero value */
pOut += j;
}
else
{
/* Initialization of inputA pointer */
pIn1 = pSrcB;
/* Initialization of inputB pointer */
pIn2 = pSrcA;
/* srcBLen is always considered as shorter or equal to srcALen */
j = srcBLen;
srcBLen = srcALen;
srcALen = j;
/* CORR(x, y) = Reverse order(CORR(y, x)) */
/* Hence set the destination pointer to point to the last output sample */
pOut = pDst + ((srcALen + srcBLen) - 2U);
/* Destination address modifier is set to -1 */
inc = -1;
}
/* The function is internally
* divided into three stages according to the number of multiplications that has to be
* taken place between inputA samples and inputB samples. In the first stage of the
* algorithm, the multiplications increase by one for every iteration.
* In the second stage of the algorithm, srcBLen number of multiplications are done.
* In the third stage of the algorithm, the multiplications decrease by one
* for every iteration. */
/* The algorithm is implemented in three stages.
The loop counters of each stage is initiated here. */
blockSize1 = srcBLen - 1U;
blockSize2 = srcALen - (srcBLen - 1U);
blockSize3 = blockSize1;
/* --------------------------
* Initializations of stage1
* -------------------------*/
/* sum = x[0] * y[srcBlen - 1]
* sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
* ....
* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
*/
/* In this stage the MAC operations are increased by 1 for every iteration.
The count variable holds the number of MAC operations performed */
count = 1U;
/* Working pointer of inputA */
px = pIn1;
/* Working pointer of inputB */
pSrc1 = pIn2 + (srcBLen - 1U);
py = pSrc1;
/* ------------------------
* Stage1 process
* ----------------------*/
/* The first stage starts here */
while (blockSize1 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
#if defined(ARM_MATH_NEON)
sumV = vdupq_n_f64(0.0f);
k = count >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = count & 1 ;
sumV = vdupq_n_f64(0.0f);
k = count >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = count & 1 ;
#else
/* Initialize k with number of samples */
k = count;
/* Initialize k with number of samples */
k = count;
#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
sum += *px++ * *py++;
/* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Update the inputA and inputB pointers for next MAC calculation */
py = pSrc1 - count;
px = pIn1;
/* Increment MAC count */
count++;
/* Decrement loop counter */
blockSize1--;
}
/* --------------------------
* Initializations of stage2
* ------------------------*/
/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
* sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
* ....
* sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
*/
/* Working pointer of inputA */
px = pIn1;
/* Working pointer of inputB */
py = pIn2;
/* count is index by which the pointer pIn1 to be incremented */
count = 0U;
/* -------------------
* Stage2 process
* ------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
/* Initialize blkCnt with number of samples */
blkCnt = blockSize2;
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
while (k > 0U)
{
/* Perform the multiply-accumulate */
/* x[0] * y[srcBLen - 1] */
sum += *px++ * *py++;
/* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Update the inputA and inputB pointers for next MAC calculation */
py = pSrc1 - count;
px = pIn1;
/* Increment MAC count */
count++;
/* Decrement loop counter */
blockSize1--;
}
/* --------------------------
* Initializations of stage2
* ------------------------*/
/* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
* sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
* ....
* sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
*/
/* Working pointer of inputA */
px = pIn1;
/* Working pointer of inputB */
py = pIn2;
/* count is index by which the pointer pIn1 to be incremented */
count = 0U;
/* -------------------
* Stage2 process
* ------------------*/
/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
* So, to loop unroll over blockSize2,
* srcBLen should be greater than or equal to 4 */
if (srcBLen >= 4U)
{
/* Initialize blkCnt with number of samples */
blkCnt = blockSize2;
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
#if defined(ARM_MATH_NEON)
sumV = vdupq_n_f64(0.0f);
k = srcBLen >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = srcBLen & 1 ;
sumV = vdupq_n_f64(0.0f);
k = srcBLen >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = srcBLen & 1 ;
#else
/* Initialize blkCnt with number of samples */
k = srcBLen;
/* Initialize blkCnt with number of samples */
k = srcBLen;
#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py++;
/* Decrement the loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Increment the pointer pIn1 index, count by 1 */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
/* Decrement the loop counter */
blkCnt--;
}
}
else
{
/* If the srcBLen is not a multiple of 4,
* the blockSize2 loop cannot be unrolled by 4 */
blkCnt = blockSize2;
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py++;
/* Decrement the loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Increment the pointer pIn1 index, count by 1 */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
/* Decrement the loop counter */
blkCnt--;
}
}
else
{
/* If the srcBLen is not a multiple of 4,
* the blockSize2 loop cannot be unrolled by 4 */
blkCnt = blockSize2;
while (blkCnt > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
#if defined(ARM_MATH_NEON)
sumV = vdupq_n_f64(0.0f);
k = srcBLen >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = srcBLen & 1 ;
sumV = vdupq_n_f64(0.0f);
k = srcBLen >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = srcBLen & 1 ;
#else
/* Loop over srcBLen */
k = srcBLen;
/* Loop over srcBLen */
k = srcBLen;
#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py++;
/* Decrement the loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Increment the pointer pIn1 index, count by 1 */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
/* Decrement the loop counter */
blkCnt--;
}
}
/* --------------------------
* Initializations of stage3
* -------------------------*/
/* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
* sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
* ....
* sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
* sum += x[srcALen-1] * y[0]
*/
/* In this stage the MAC operations are decreased by 1 for every iteration.
The count variable holds the number of MAC operations performed */
count = srcBLen - 1U;
/* Working pointer of inputA */
pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
px = pSrc1;
/* Working pointer of inputB */
py = pIn2;
/* -------------------
* Stage3 process
* ------------------*/
while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py++;
/* Decrement the loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Increment the pointer pIn1 index, count by 1 */
count++;
/* Update the inputA and inputB pointers for next MAC calculation */
px = pIn1 + count;
py = pIn2;
/* Decrement the loop counter */
blkCnt--;
}
}
/* --------------------------
* Initializations of stage3
* -------------------------*/
/* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
* sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
* ....
* sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
* sum += x[srcALen-1] * y[0]
*/
/* In this stage the MAC operations are decreased by 1 for every iteration.
The count variable holds the number of MAC operations performed */
count = srcBLen - 1U;
/* Working pointer of inputA */
pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
px = pSrc1;
/* Working pointer of inputB */
py = pIn2;
/* -------------------
* Stage3 process
* ------------------*/
while (blockSize3 > 0U)
{
/* Accumulator is made zero for every iteration */
sum = 0.;
#if defined(ARM_MATH_NEON)
sumV = vdupq_n_f64(0.0f);
k = count >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = count & 1 ;
sumV = vdupq_n_f64(0.0f);
k = count >> 1U ;
while(k > 0U)
{
pxV = vld1q_f64(px);
pyV = vld1q_f64(py);
sumV = vmlaq_f64(sumV, pxV, pyV);
px+=2;
py+=2;
k--;
}
sum =vaddvq_f64(sumV);
k = count & 1 ;
#else
/* Initialize blkCnt with number of samples */
k = count;
/* Initialize blkCnt with number of samples */
k = count;
#endif
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py++;
/* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Update the inputA and inputB pointers for next MAC calculation */
px = ++pSrc1;
py = pIn2;
/* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
}
while (k > 0U)
{
/* Perform the multiply-accumulate */
sum += *px++ * *py++;
/* Decrement loop counter */
k--;
}
/* Store the result in the accumulator in the destination buffer. */
*pOut = sum;
/* Destination pointer is updated according to the address modifier, inc */
pOut += inc;
/* Update the inputA and inputB pointers for next MAC calculation */
px = ++pSrc1;
py = pIn2;
/* Decrement MAC count */
count--;
/* Decrement the loop counter */
blockSize3--;
}
}
/**

@ -47,177 +47,177 @@
*/
#if defined(ARM_MATH_NEON)
void arm_fir_f64(
const arm_fir_instance_f64 * S,
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
const arm_fir_instance_f64 * S,
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
float64_t *pState = S->pState; /* State pointer */
const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float64_t *pStateCurnt; /* Points to the current sample of the state */
float64_t *px; /* Temporary pointer for state buffer */
const float64_t *pb; /* Temporary pointer for coefficient buffer */
float64x2_t pxV;
float64x2_t pbV;
float64x2_t acc0V;
float64_t acc0; /* Accumulator */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, tapCnt, blkCnt; /* Loop counters */
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
/* Initialize blkCnt with number of taps */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0.;
acc0V = vdupq_n_f64(0.0f);
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps >> 1U;
/* Perform the multiply-accumulates */
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
pxV = vld1q_f64(px);
pbV = vld1q_f64(pb);
acc0V = vmlaq_f64(acc0V, pxV, pbV);
px+=2;
pb+=2;
i--;
}
acc0 = vaddvq_f64(acc0V);
i = numTaps%2 ;
while(i >0U)
{
acc0+= *px++ * *pb++ ;
i--;
}
/* Store result in destination buffer. */
*pDst++ = acc0;
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
Now copy the last numTaps - 1 samples to the start of the state buffer.
This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
/* Initialize tapCnt with number of taps */
tapCnt = (numTaps - 1U);
/* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
/* Decrement loop counter */
tapCnt--;
}
float64_t *pState = S->pState; /* State pointer */
const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float64_t *pStateCurnt; /* Points to the current sample of the state */
float64_t *px; /* Temporary pointer for state buffer */
const float64_t *pb; /* Temporary pointer for coefficient buffer */
float64x2_t pxV;
float64x2_t pbV;
float64x2_t acc0V;
float64_t acc0; /* Accumulator */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, tapCnt, blkCnt; /* Loop counters */
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
/* Initialize blkCnt with number of taps */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0.;
acc0V = vdupq_n_f64(0.0f);
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps >> 1U;
/* Perform the multiply-accumulates */
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
pxV = vld1q_f64(px);
pbV = vld1q_f64(pb);
acc0V = vmlaq_f64(acc0V, pxV, pbV);
px+=2;
pb+=2;
i--;
}
acc0 = vaddvq_f64(acc0V);
i = numTaps%2 ;
while(i >0U)
{
acc0+= *px++ * *pb++ ;
i--;
}
/* Store result in destination buffer. */
*pDst++ = acc0;
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
Now copy the last numTaps - 1 samples to the start of the state buffer.
This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
/* Initialize tapCnt with number of taps */
tapCnt = (numTaps - 1U);
/* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
/* Decrement loop counter */
tapCnt--;
}
}
#else
void arm_fir_f64(
const arm_fir_instance_f64 * S,
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
const arm_fir_instance_f64 * S,
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
float64_t *pState = S->pState; /* State pointer */
const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float64_t *pStateCurnt; /* Points to the current sample of the state */
float64_t *px; /* Temporary pointer for state buffer */
const float64_t *pb; /* Temporary pointer for coefficient buffer */
float64_t acc0; /* Accumulator */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, tapCnt, blkCnt; /* Loop counters */
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
/* Initialize blkCnt with number of taps */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0.;
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps;
/* Perform the multiply-accumulates */
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += *px++ * *pb++;
i--;
}
/* Store result in destination buffer. */
*pDst++ = acc0;
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
Now copy the last numTaps - 1 samples to the start of the state buffer.
This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
/* Initialize tapCnt with number of taps */
tapCnt = (numTaps - 1U);
/* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
/* Decrement loop counter */
tapCnt--;
}
float64_t *pState = S->pState; /* State pointer */
const float64_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
float64_t *pStateCurnt; /* Points to the current sample of the state */
float64_t *px; /* Temporary pointer for state buffer */
const float64_t *pb; /* Temporary pointer for coefficient buffer */
float64_t acc0; /* Accumulator */
uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
uint32_t i, tapCnt, blkCnt; /* Loop counters */
/* S->pState points to state array which contains previous frame (numTaps - 1) samples */
/* pStateCurnt points to the location where the new input data should be written */
pStateCurnt = &(S->pState[(numTaps - 1U)]);
/* Initialize blkCnt with number of taps */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* Copy one sample at a time into state buffer */
*pStateCurnt++ = *pSrc++;
/* Set the accumulator to zero */
acc0 = 0.;
/* Initialize state pointer */
px = pState;
/* Initialize Coefficient pointer */
pb = pCoeffs;
i = numTaps;
/* Perform the multiply-accumulates */
while (i > 0U)
{
/* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
acc0 += *px++ * *pb++;
i--;
}
/* Store result in destination buffer. */
*pDst++ = acc0;
/* Advance state pointer by 1 for the next sample */
pState = pState + 1U;
/* Decrement loop counter */
blkCnt--;
}
/* Processing is complete.
Now copy the last numTaps - 1 samples to the start of the state buffer.
This prepares the state buffer for the next function call. */
/* Points to the start of the state buffer */
pStateCurnt = S->pState;
/* Initialize tapCnt with number of taps */
tapCnt = (numTaps - 1U);
/* Copy remaining data */
while (tapCnt > 0U)
{
*pStateCurnt++ = *pState++;
/* Decrement loop counter */
tapCnt--;
}
}
#endif

@ -61,22 +61,22 @@
*/
void arm_fir_init_f64(
arm_fir_instance_f64 * S,
uint16_t numTaps,
const float64_t * pCoeffs,
float64_t * pState,
uint32_t blockSize)
arm_fir_instance_f64 * S,
uint16_t numTaps,
const float64_t * pCoeffs,
float64_t * pState,
uint32_t blockSize)
{
/* Assign filter taps */
S->numTaps = numTaps;
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float64_t));
/* Assign state pointer */
S->pState = pState;
/* Assign filter taps */
S->numTaps = numTaps;
/* Assign coefficient pointer */
S->pCoeffs = pCoeffs;
/* Clear state buffer. The size is always (blockSize + numTaps - 1) */
memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float64_t));
/* Assign state pointer */
S->pState = pState;
}
/**

@ -55,223 +55,223 @@
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_cholesky_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
arm_status status; /* status of matrix inverse */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float64_t invSqrtVj;
float64_t *pA,*pG;
int kCnt;
float64x2_t acc, acc0, acc1, acc2, acc3;
float64x2_t vecGi;
float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
float64x2_t tmp = vdupq_n_f64(0.0f);
float64_t sum=0.0f;
float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
for(j=i ; j+3 < n ; j+=4)
{
pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
acc0 = vdupq_n_f64(0.0f);
acc1 = vdupq_n_f64(0.0f);
acc2 = vdupq_n_f64(0.0f);
acc3 = vdupq_n_f64(0.0f);
kCnt = i >> 1U;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f64(&pG[i * n + k]);
vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
kCnt--;
k+=2;
}
sum0 = vaddvq_f64(acc0);
sum1 = vaddvq_f64(acc1);
sum2 = vaddvq_f64(acc2);
sum3 = vaddvq_f64(acc3);
kCnt = i & 1;
while(kCnt > 0)
{
sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
kCnt--;
k++;
}
pG[(j + 0) * n + i] -= sum0;
pG[(j + 1) * n + i] -= sum1;
pG[(j + 2) * n + i] -= sum2;
pG[(j + 3) * n + i] -= sum3;
}
for(; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
acc = vdupq_n_f64(0.0f);
kCnt = i >> 1U;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f64(&pG[i * n + k]);
vecGj=vld1q_f64(&pG[j * n + k]);
acc = vfmaq_f64(acc, vecGi, vecGj);
kCnt--;
k+=2;
}
sum = vaddvq_f64(acc);
kCnt = i & 1;
while(kCnt > 0)
{
sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
kCnt--;
k++;
}
pG[j * n + i] -= sum;
}
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
SCALE_COL_F64(pDst,i,invSqrtVj,i);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
int i,j,k;
int n = pSrc->numRows;
float64_t invSqrtVj;
float64_t *pA,*pG;
int kCnt;
float64x2_t acc, acc0, acc1, acc2, acc3;
float64x2_t vecGi;
float64x2_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
float64x2_t tmp = vdupq_n_f64(0.0f);
float64_t sum=0.0f;
float64_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
for(j=i ; j+3 < n ; j+=4)
{
pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
acc0 = vdupq_n_f64(0.0f);
acc1 = vdupq_n_f64(0.0f);
acc2 = vdupq_n_f64(0.0f);
acc3 = vdupq_n_f64(0.0f);
kCnt = i >> 1U;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f64(&pG[i * n + k]);
vecGj0=vld1q_f64(&pG[(j + 0) * n + k]);
vecGj1=vld1q_f64(&pG[(j + 1) * n + k]);
vecGj2=vld1q_f64(&pG[(j + 2) * n + k]);
vecGj3=vld1q_f64(&pG[(j + 3) * n + k]);
acc0 = vfmaq_f64(acc0, vecGi, vecGj0);
acc1 = vfmaq_f64(acc1, vecGi, vecGj1);
acc2 = vfmaq_f64(acc2, vecGi, vecGj2);
acc3 = vfmaq_f64(acc3, vecGi, vecGj3);
kCnt--;
k+=2;
}
sum0 = vaddvq_f64(acc0);
sum1 = vaddvq_f64(acc1);
sum2 = vaddvq_f64(acc2);
sum3 = vaddvq_f64(acc3);
kCnt = i & 1;
while(kCnt > 0)
{
sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
kCnt--;
k++;
}
pG[(j + 0) * n + i] -= sum0;
pG[(j + 1) * n + i] -= sum1;
pG[(j + 2) * n + i] -= sum2;
pG[(j + 3) * n + i] -= sum3;
}
for(; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
acc = vdupq_n_f64(0.0f);
kCnt = i >> 1U;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f64(&pG[i * n + k]);
vecGj=vld1q_f64(&pG[j * n + k]);
acc = vfmaq_f64(acc, vecGi, vecGj);
kCnt--;
k+=2;
}
sum = vaddvq_f64(acc);
kCnt = i & 1;
while(kCnt > 0)
{
sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
kCnt--;
k++;
}
pG[j * n + i] -= sum;
}
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
SCALE_COL_F64(pDst,i,invSqrtVj,i);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cholesky_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
arm_status status; /* status of matrix inverse */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float64_t invSqrtVj;
float64_t *pA,*pG;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ; i < n ; i++)
{
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
for(k=0; k < i ; k++)
{
pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
}
}
if (pG[i * n + i] <= 0.0)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0/sqrt(pG[i * n + i]);
SCALE_COL_F64(pDst,i,invSqrtVj,i);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
int i,j,k;
int n = pSrc->numRows;
float64_t invSqrtVj;
float64_t *pA,*pG;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ; i < n ; i++)
{
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
for(k=0; k < i ; k++)
{
pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
}
}
if (pG[i * n + i] <= 0.0)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
invSqrtVj = 1.0/sqrt(pG[i * n + i]);
SCALE_COL_F64(pDst,i,invSqrtVj,i);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif

@ -56,19 +56,19 @@
*/
void arm_mat_init_f64(
arm_matrix_instance_f64 * S,
uint16_t nRows,
uint16_t nColumns,
float64_t * pData)
arm_matrix_instance_f64 * S,
uint16_t nRows,
uint16_t nColumns,
float64_t * pData)
{
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
/* Assign Number of Rows */
S->numRows = nRows;
/* Assign Number of Columns */
S->numCols = nColumns;
/* Assign Data pointer */
S->pData = pData;
}
/**

@ -48,179 +48,179 @@
*/
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float64_t *pX = dst->pData;
float64_t *pLT = lt->pData;
float64_t *pA = a->pData;
float64_t *lt_row;
float64_t *a_col;
float64_t invLT;
float64x2_t vecA;
float64x2_t vecX;
for(i=0; i < n ; i++)
{
for(j=0; j+1 < cols; j += 2)
{
vecA = vld1q_f64(&pA[i * cols + j]);
for(k=0; k < i; k++)
{
vecX = vld1q_f64(&pX[cols*k+j]);
vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
}
if (pLT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invLT = 1.0f / pLT[n*i + i];
vecA = vmulq_f64(vecA,vdupq_n_f64(invLT));
vst1q_f64(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
lt_row = &pLT[n*i];
float64_t tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float64_t *pX = dst->pData;
float64_t *pLT = lt->pData;
float64_t *pA = a->pData;
float64_t *lt_row;
float64_t *a_col;
float64_t invLT;
float64x2_t vecA;
float64x2_t vecX;
for(i=0; i < n ; i++)
{
for(j=0; j+1 < cols; j += 2)
{
vecA = vld1q_f64(&pA[i * cols + j]);
for(k=0; k < i; k++)
{
vecX = vld1q_f64(&pX[cols*k+j]);
vecA = vfmsq_f64(vecA,vdupq_n_f64(pLT[n*i + k]),vecX);
}
if (pLT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invLT = 1.0f / pLT[n*i + i];
vecA = vmulq_f64(vecA,vdupq_n_f64(invLT));
vst1q_f64(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
lt_row = &pLT[n*i];
float64_t tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
arm_status arm_mat_solve_lower_triangular_f64(
const arm_matrix_instance_f64 * lt,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
float64_t *pX = dst->pData;
float64_t *pLT = lt->pData;
float64_t *pA = a->pData;
float64_t *lt_row;
float64_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=0; i < n ; i++)
{
float64_t tmp=a_col[i * cols];
lt_row = &pLT[n*i];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
float64_t *pX = dst->pData;
float64_t *pLT = lt->pData;
float64_t *pA = a->pData;
float64_t *lt_row;
float64_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=0; i < n ; i++)
{
float64_t tmp=a_col[i * cols];
lt_row = &pLT[n*i];
for(k=0; k < i; k++)
{
tmp -= lt_row[k] * pX[cols*k+j];
}
if (lt_row[i]==0.0)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif
/**

@ -48,168 +48,168 @@
*/
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float64_t *pX = dst->pData;
float64_t *pUT = ut->pData;
float64_t *pA = a->pData;
float64_t *ut_row;
float64_t *a_col;
float64_t invUT;
float64x2_t vecA;
float64x2_t vecX;
for(i=n-1; i >= 0 ; i--)
{
for(j=0; j+1 < cols; j +=2)
{
vecA = vld1q_f64(&pA[i * cols + j]);
for(k=n-1; k > i; k--)
{
vecX = vld1q_f64(&pX[cols*k+j]);
vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
}
if (pUT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invUT = 1.0f / pUT[n*i + i];
vecA = vmulq_f64(vecA,vdupq_n_f64(invUT));
vst1q_f64(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
ut_row = &pUT[n*i];
float64_t tmp=a_col[i * cols];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
int i,j,k,n,cols;
n = dst->numRows;
cols = dst->numCols;
float64_t *pX = dst->pData;
float64_t *pUT = ut->pData;
float64_t *pA = a->pData;
float64_t *ut_row;
float64_t *a_col;
float64_t invUT;
float64x2_t vecA;
float64x2_t vecX;
for(i=n-1; i >= 0 ; i--)
{
for(j=0; j+1 < cols; j +=2)
{
vecA = vld1q_f64(&pA[i * cols + j]);
for(k=n-1; k > i; k--)
{
vecX = vld1q_f64(&pX[cols*k+j]);
vecA = vfmsq_f64(vecA,vdupq_n_f64(pUT[n*i + k]),vecX);
}
if (pUT[n*i + i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
invUT = 1.0f / pUT[n*i + i];
vecA = vmulq_f64(vecA,vdupq_n_f64(invUT));
vst1q_f64(&pX[i*cols+j],vecA);
}
for(; j < cols; j ++)
{
a_col = &pA[j];
ut_row = &pUT[n*i];
float64_t tmp=a_col[i * cols];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0f)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
arm_status arm_mat_solve_upper_triangular_f64(
const arm_matrix_instance_f64 * ut,
const arm_matrix_instance_f64 * a,
arm_matrix_instance_f64 * dst)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((ut->numRows != ut->numCols) ||
(ut->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k,n,cols;
float64_t *pX = dst->pData;
float64_t *pUT = ut->pData;
float64_t *pA = a->pData;
float64_t *ut_row;
float64_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=n-1; i >= 0 ; i--)
{
float64_t tmp=a_col[i * cols];
ut_row = &pUT[n*i];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
int i,j,k,n,cols;
float64_t *pX = dst->pData;
float64_t *pUT = ut->pData;
float64_t *pA = a->pData;
float64_t *ut_row;
float64_t *a_col;
n = dst->numRows;
cols = dst->numCols;
for(j=0; j < cols; j ++)
{
a_col = &pA[j];
for(i=n-1; i >= 0 ; i--)
{
float64_t tmp=a_col[i * cols];
ut_row = &pUT[n*i];
for(k=n-1; k > i; k--)
{
tmp -= ut_row[k] * pX[cols*k+j];
}
if (ut_row[i]==0.0)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / ut_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif

@ -57,216 +57,216 @@
#if defined(ARM_MATH_NEON)
arm_status arm_mat_trans_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nColumns = pSrc->numCols; /* number of columns */
uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* Row loop */
rowCnt = row >> 1;
while (rowCnt > 0U)
{
float64_t *row0,*row1;
float64x2x4_t raV;
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */
{
row0 = pIn;
row1 = pIn+nColumns;
pIn+=4;
raV = vld4q_lane_f64(row0, raV, 0);
raV = vld4q_lane_f64(row1, raV, 1);
vst1q_f64(px,raV.val[0]);
px += nRows;
vst1q_f64(px,raV.val[1]);
px += nRows;
vst1q_f64(px,raV.val[2]);
px += nRows;
vst1q_f64(px,raV.val[3]);
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
/* Perform matrix transpose for last 3 samples here. */
blkCnt = nColumns % 0x4U;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px++ = *pIn;
*px++ = *(pIn + 1 * nColumns);
px += (nRows - 2);
pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
i += 2;
pIn += 1 * nColumns;
/* Decrement the row loop counter */
rowCnt--;
} /* Row loop end */
rowCnt = row & 1;
while (rowCnt > 0U)
{
blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
i++;
rowCnt -- ;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
{
/* Matrix transpose by exchanging the rows with columns */
/* Row loop */
rowCnt = row >> 1;
while (rowCnt > 0U)
{
float64_t *row0,*row1;
float64x2x4_t raV;
blkCnt = nColumns >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U) /* Column loop */
{
row0 = pIn;
row1 = pIn+nColumns;
pIn+=4;
raV = vld4q_lane_f64(row0, raV, 0);
raV = vld4q_lane_f64(row1, raV, 1);
vst1q_f64(px,raV.val[0]);
px += nRows;
vst1q_f64(px,raV.val[1]);
px += nRows;
vst1q_f64(px,raV.val[2]);
px += nRows;
vst1q_f64(px,raV.val[3]);
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
/* Perform matrix transpose for last 3 samples here. */
blkCnt = nColumns % 0x4U;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px++ = *pIn;
*px++ = *(pIn + 1 * nColumns);
px += (nRows - 2);
pIn++;
/* Decrement the column loop counter */
blkCnt--;
}
i += 2;
pIn += 1 * nColumns;
/* Decrement the row loop counter */
rowCnt--;
} /* Row loop end */
rowCnt = row & 1;
while (rowCnt > 0U)
{
blkCnt = nColumns ;
/* The pointer px is set to starting address of the column being processed */
px = pOut + i;
while (blkCnt > 0U)
{
/* Read and store the input element in the destination */
*px = *pIn++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += nRows;
/* Decrement the column loop counter */
blkCnt--;
}
i++;
rowCnt -- ;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_trans_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint64_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
float64_t *px; /* Temporary output data matrix pointer */
uint16_t nRows = pSrc->numRows; /* number of rows */
uint16_t nCols = pSrc->numCols; /* number of columns */
uint64_t col, row = nRows, i = 0U; /* Loop counters */
arm_status status; /* status of matrix transpose */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numCols) ||
(pSrc->numCols != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
{
/* Matrix transpose by exchanging the rows with columns */
/* row loop */
do
{
/* Pointer px is set to starting address of column being processed */
px = pOut + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
/* Loop unrolling: Compute 4 outputs at a time */
col = nCols >> 2U;
while (col > 0U) /* column loop */
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
*px = *pIn++;
px += nRows;
/* Decrement column loop counter */
col--;
}
/* Loop unrolling: Compute remaining outputs */
col = nCols % 0x4U;
#else
/* Initialize col with number of samples */
col = nCols;
/* Initialize col with number of samples */
col = nCols;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
while (col > 0U)
{
/* Read and store input element in destination */
*px = *pIn++;
/* Update pointer px to point to next row of transposed matrix */
px += nRows;
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement row loop counter */
row--;
} while (row > 0U); /* row loop end */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif
/**

@ -49,9 +49,9 @@
#if defined(ARM_MATH_NEON)
void arm_absmax_no_idx_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
float64_t maxVal , in; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
@ -112,9 +112,9 @@ void arm_absmax_no_idx_f64(
}
#else
void arm_absmax_no_idx_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
float64_t maxVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */

@ -47,101 +47,101 @@
#if defined(ARM_MATH_NEON)
void arm_absmin_no_idx_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
float64_t minVal , in; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
float64x2_t minV;
float64x2_t pSrcV ;
pSrcV = vld1q_f64(pSrc);
pSrc += 2 ;
minV = vabsq_f64(pSrcV);
/* Load first input value that act as reference value for comparision */
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 2U) >> 1U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
pSrcV = vld1q_f64(pSrc);
minV = vminq_f64(minV, vabsq_f64(pSrcV));
pSrc += 2 ;
/* Decrement loop counter */
blkCnt--;
}
minVal =vgetq_lane_f64(minV, 0);
if(minVal > vgetq_lane_f64(minV, 1))
{
minVal = vgetq_lane_f64(minV, 1);
}
blkCnt = (blockSize - 2U) & 1;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
in = fabs(*pSrc++);
/* compare for the maximum value */
if (minVal > in)
{
/* Update the maximum value and it's index */
minVal = in;
}
/* Decrement loop counter */
blkCnt--;
}
*pResult = minVal;
/* Store the maximum value and it's index into destination pointers */
float64_t minVal , in; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
float64x2_t minV;
float64x2_t pSrcV ;
pSrcV = vld1q_f64(pSrc);
pSrc += 2 ;
minV = vabsq_f64(pSrcV);
/* Load first input value that act as reference value for comparision */
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 2U) >> 1U;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
pSrcV = vld1q_f64(pSrc);
minV = vminq_f64(minV, vabsq_f64(pSrcV));
pSrc += 2 ;
/* Decrement loop counter */
blkCnt--;
}
minVal =vgetq_lane_f64(minV, 0);
if(minVal > vgetq_lane_f64(minV, 1))
{
minVal = vgetq_lane_f64(minV, 1);
}
blkCnt = (blockSize - 2U) & 1;
while (blkCnt > 0U)
{
/* Initialize maxVal to the next consecutive values one by one */
in = fabs(*pSrc++);
/* compare for the maximum value */
if (minVal > in)
{
/* Update the maximum value and it's index */
minVal = in;
}
/* Decrement loop counter */
blkCnt--;
}
*pResult = minVal;
/* Store the maximum value and it's index into destination pointers */
}
#else
void arm_absmin_no_idx_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
float64_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
/* Load first input value that act as reference value for comparision */
out = fabs(*pSrc++);
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal = fabs(*pSrc++);
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
float64_t minVal, out; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
/* Load first input value that act as reference value for comparision */
out = fabs(*pSrc++);
/* Initialize blkCnt with number of samples */
blkCnt = (blockSize - 1U);
while (blkCnt > 0U)
{
/* Initialize minVal to the next consecutive values one by one */
minVal = fabs(*pSrc++);
/* compare for the minimum value */
if (out > minVal)
{
/* Update the minimum value and it's index */
out = minVal;
}
/* Decrement loop counter */
blkCnt--;
}
/* Store the minimum value and it's index into destination pointers */
*pResult = out;
}
#endif
/**

@ -49,43 +49,43 @@
float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
{
const float64_t *pIn;
uint32_t blkCnt;
float64_t accum, p;
pIn = pSrcA;
accum = 0.0;
const float64_t *pIn;
uint32_t blkCnt;
float64_t accum, p;
pIn = pSrcA;
accum = 0.0;
#if defined(ARM_MATH_NEON)
float64x2_t sumV ,pInV ;
sumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U ;
while(blkCnt > 0){
pInV = vld1q_f64(pIn);
sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) );
pIn += 2 ;
blkCnt--;
}
accum = vaddvq_f64(sumV);
blkCnt = blockSize & 1 ;
float64x2_t sumV ,pInV ;
sumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U ;
while(blkCnt > 0){
pInV = vld1q_f64(pIn);
sumV = vmlaq_f64(sumV, pInV,vlogq_f64(pInV) );
pIn += 2 ;
blkCnt--;
}
accum = vaddvq_f64(sumV);
blkCnt = blockSize & 1 ;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while(blkCnt > 0)
{
p = *pIn++;
accum += p * log(p);
blkCnt--;
}
return(-accum);
while(blkCnt > 0)
{
p = *pIn++;
accum += p * log(p);
blkCnt--;
}
return(-accum);
}
/**

@ -50,53 +50,53 @@
float64_t arm_kullback_leibler_f64(const float64_t * pSrcA,const float64_t * pSrcB,uint32_t blockSize)
{
const float64_t *pInA, *pInB;
uint32_t blkCnt;
float64_t accum, pA,pB;
float64x2_t accumV;
float64x2_t tmpVA, tmpVB,tmpV;
pInA = pSrcA;
pInB = pSrcB;
accum = 0.0f;
accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1;
while(blkCnt > 0)
{
tmpVA = vld1q_f64(pInA);
pInA += 2;
tmpVB = vld1q_f64(pInB);
pInB += 2;
tmpV = vinvq_f64(tmpVA);
tmpVB = vmulq_f64(tmpVB, tmpV);
tmpVB = vlogq_f64(tmpVB);
accumV = vmlaq_f64(accumV, tmpVA, tmpVB);
blkCnt--;
}
accum = vaddvq_f64(accumV);
blkCnt = blockSize & 1;
while(blkCnt > 0)
{
pA = *pInA++;
pB = *pInB++;
accum += pA * logf(pB/pA);
blkCnt--;
}
return(-accum);
const float64_t *pInA, *pInB;
uint32_t blkCnt;
float64_t accum, pA,pB;
float64x2_t accumV;
float64x2_t tmpVA, tmpVB,tmpV;
pInA = pSrcA;
pInB = pSrcB;
accum = 0.0f;
accumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1;
while(blkCnt > 0)
{
tmpVA = vld1q_f64(pInA);
pInA += 2;
tmpVB = vld1q_f64(pInB);
pInB += 2;
tmpV = vinvq_f64(tmpVA);
tmpVB = vmulq_f64(tmpVB, tmpV);
tmpVB = vlogq_f64(tmpVB);
accumV = vmlaq_f64(accumV, tmpVA, tmpVB);
blkCnt--;
}
accum = vaddvq_f64(accumV);
blkCnt = blockSize & 1;
while(blkCnt > 0)
{
pA = *pInA++;
pB = *pInB++;
accum += pA * logf(pB/pA);
blkCnt--;
}
return(-accum);
}
#else

@ -47,9 +47,9 @@
*/
#if defined(ARM_MATH_NEON)
void arm_max_no_idx_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
float64_t maxVal , in; /* Temporary variables to store the output value. */
uint32_t blkCnt; /* Loop counter */
@ -110,9 +110,9 @@ void arm_max_no_idx_f64(
}
#else
void arm_max_no_idx_f64(
const float64_t *pSrc,
uint32_t blockSize,
float64_t *pResult)
const float64_t *pSrc,
uint32_t blockSize,
float64_t *pResult)
{
float64_t maxValue = F64_MIN;
float64_t newVal;

@ -49,68 +49,68 @@
#if defined(ARM_MATH_NEON)
void arm_mean_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float64x2_t vSum = vdupq_n_f64(0.0f);
float64_t sum = 0.; /* Temporary result storage */
float64x2_t afterLoad ;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
afterLoad = vld1q_f64(pSrc);
vSum = vaddq_f64(vSum, afterLoad);
pSrc += 2;
/* Decrement loop counter */
blkCnt--;
}
sum = vaddvq_f64(vSum);
blkCnt = blockSize & 1;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
*pResult = (sum/blockSize);
uint32_t blkCnt; /* Loop counter */
float64x2_t vSum = vdupq_n_f64(0.0f);
float64_t sum = 0.; /* Temporary result storage */
float64x2_t afterLoad ;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
afterLoad = vld1q_f64(pSrc);
vSum = vaddq_f64(vSum, afterLoad);
pSrc += 2;
/* Decrement loop counter */
blkCnt--;
}
sum = vaddvq_f64(vSum);
blkCnt = blockSize & 1;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
*pResult = (sum/blockSize);
}
#else
void arm_mean_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (sum / blockSize);
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
/* Store result to destination */
*pResult = (sum / blockSize);
}
#endif
/**

@ -46,51 +46,51 @@
@return none
*/
void arm_min_no_idx_f64(
const float64_t *pSrc,
uint32_t blockSize,
float64_t *pResult)
const float64_t *pSrc,
uint32_t blockSize,
float64_t *pResult)
{
float64_t minValue = F64_MAX;
float64_t newVal;
uint32_t blkCnt ;
float64_t minValue = F64_MAX;
float64_t newVal;
uint32_t blkCnt ;
#if defined(ARM_MATH_NEON)
float64x2_t minValueV , newValV ;
minValueV = vdupq_n_f64(F64_MAX);
blkCnt = blockSize >> 1U;
while(blkCnt > 0)
{
newValV = vld1q_f64(pSrc);
minValueV = vminq_f64(minValueV, newValV);
pSrc += 2 ;
blkCnt--;
}
minValue =vgetq_lane_f64(minValueV, 0);
if(minValue > vgetq_lane_f64(minValueV, 1))
{
minValue = vgetq_lane_f64(minValueV, 1);
}
blkCnt = blockSize & 1 ;
float64x2_t minValueV , newValV ;
minValueV = vdupq_n_f64(F64_MAX);
blkCnt = blockSize >> 1U;
while(blkCnt > 0)
{
newValV = vld1q_f64(pSrc);
minValueV = vminq_f64(minValueV, newValV);
pSrc += 2 ;
blkCnt--;
}
minValue =vgetq_lane_f64(minValueV, 0);
if(minValue > vgetq_lane_f64(minValueV, 1))
{
minValue = vgetq_lane_f64(minValueV, 1);
}
blkCnt = blockSize & 1 ;
#else
blkCnt = blockSize;
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
newVal = *pSrc++;
/* compare for the minimum value */
if (minValue > newVal)
{
/* Update the minimum value and it's index */
minValue = newVal;
}
blkCnt --;
}
*pResult = minValue;
while (blkCnt > 0U)
{
newVal = *pSrc++;
/* compare for the minimum value */
if (minValue > newVal)
{
/* Update the minimum value and it's index */
minValue = newVal;
}
blkCnt --;
}
*pResult = minValue;
}
/**

@ -47,84 +47,84 @@
*/
void arm_mse_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
{
uint32_t blkCnt; /* Loop counter */
float64_t inA, inB;
float64_t sum = 0.0;
uint32_t blkCnt; /* Loop counter */
float64_t inA, inB;
float64_t sum = 0.0;
#if defined (ARM_MATH_NEON)
float64x2_t inAV , inBV , subV, sumV;
sumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U ;
while (blkCnt > 0U)
{
inAV = vld1q_f64(pSrcA);
pSrcA+=2;
inBV = vld1q_f64(pSrcB);
pSrcB+=2;
subV = vsubq_f64(inAV, inBV);
sumV = vmlaq_f64(sumV, subV, subV);
blkCnt--;
}
sum = vaddvq_f64(sumV);
blkCnt = (blockSize) & 1;
float64x2_t inAV , inBV , subV, sumV;
sumV = vdupq_n_f64(0.0f);
blkCnt = blockSize >> 1U ;
while (blkCnt > 0U)
{
inAV = vld1q_f64(pSrcA);
pSrcA+=2;
inBV = vld1q_f64(pSrcB);
pSrcB+=2;
subV = vsubq_f64(inAV, inBV);
sumV = vmlaq_f64(sumV, subV, subV);
blkCnt--;
}
sum = vaddvq_f64(sumV);
blkCnt = (blockSize) & 1;
#else
/* Temporary return variable */
/* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
blkCnt = (blockSize) >> 1;
blkCnt = (blockSize) >> 1;
#pragma clang loop vectorize(enable)
while (blkCnt > 0U)
{
inA = *pSrcA++;
inB = *pSrcB++;
inA = inA - inB;
sum += inA * inA;
inA = *pSrcA++;
inB = *pSrcB++;
inA = inA - inB;
sum += inA * inA;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize) & 1;
while (blkCnt > 0U)
{
inA = *pSrcA++;
inB = *pSrcB++;
inA = inA - inB;
sum += inA * inA;
inA = *pSrcA++;
inB = *pSrcB++;
inA = inA - inB;
sum += inA * inA;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = (blockSize) & 1;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
#endif
//#pragma clang loop vectorize(enable) unroll(disable)
while (blkCnt > 0U)
{
inA = *pSrcA++;
inB = *pSrcB++;
inA = inA - inB;
sum += inA * inA;
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum / blockSize;
//#pragma clang loop vectorize(enable) unroll(disable)
while (blkCnt > 0U)
{
inA = *pSrcA++;
inB = *pSrcB++;
inA = inA - inB;
sum += inA * inA;
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum / blockSize;
}

@ -46,81 +46,81 @@
*/
#if defined(ARM_MATH_NEON)
void arm_power_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
float64x2_t sumV ; /* Temporary variable to store input value */
sumV = vdupq_n_f64(0.0f);
float64x2_t pSrcV;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
pSrcV = vld1q_f64(pSrc);
sumV = vmlaq_f64(sumV, pSrcV, pSrcV);
pSrc+= 2 ;
/* Decrement loop counter */
blkCnt--;
}
sum = vaddvq_f64(sumV);
float64_t in;
blkCnt = blockSize & 1;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Store result to destination */
*pResult = sum;
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
float64x2_t sumV ; /* Temporary variable to store input value */
sumV = vdupq_n_f64(0.0f);
float64x2_t pSrcV;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize >> 1U;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
pSrcV = vld1q_f64(pSrc);
sumV = vmlaq_f64(sumV, pSrcV, pSrcV);
pSrc+= 2 ;
/* Decrement loop counter */
blkCnt--;
}
sum = vaddvq_f64(sumV);
float64_t in;
blkCnt = blockSize & 1;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Store result to destination */
*pResult = sum;
}
#else
void arm_power_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
float64_t in; /* Temporary variable to store input value */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Store result to destination */
*pResult = sum;
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
float64_t in; /* Temporary variable to store input value */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
/* Compute Power and store result in a temporary variable, sum. */
in = *pSrc++;
sum += in * in;
/* Decrement loop counter */
blkCnt--;
}
/* Store result to destination */
*pResult = sum;
}
#endif
/**

@ -45,56 +45,58 @@
@return none
*/
void arm_var_f64(
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
const float64_t * pSrc,
uint32_t blockSize,
float64_t * pResult)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
float64_t fSum = 0.;
float64_t fMean, fValue;
const float64_t * pInput = pSrc;
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
sum += *pInput++;
/* Decrement loop counter */
blkCnt--;
}
/* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
fMean = sum / (float64_t) blockSize;
pInput = pSrc;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
/* Decrement loop counter */
blkCnt--;
}
/* Variance */
*pResult = fSum / (float64_t)(blockSize - 1.);
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary result storage */
float64_t fSum = 0.;
float64_t fMean, fValue;
const float64_t * pInput = pSrc;
if (blockSize <= 1U)
{
*pResult = 0;
return;
}
arm_mean_f64(pInput, blockSize, &fMean);
#if defined(ARM_MATH_NEON)
float64x2_t fValueV ,fsumV , pInputV , fMeanV;
fsumV = vdupq_n_f64(0.0f);
fMeanV = vdupq_n_f64(fMean);
blkCnt = blockSize >> 1U;
while(blkCnt > 0U)
{
pInputV = vld1q_f64(pInput);
fValueV = vsubq_f64(pInputV, fMeanV);
fsumV = vmlaq_f64(fsumV, fValueV, fValueV);
pInput += 2 ;
blkCnt--;
}
fSum = vaddvq_f64(fsumV);
blkCnt = blockSize & 1 ;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
fValue = *pInput++ - fMean;
fSum += fValue * fValue;
/* Decrement loop counter */
blkCnt--;
}
/* Variance */
*pResult = fSum / (float64_t)(blockSize - 1.);
}
/**
@} end of variance group
@} end of variance group
*/

Loading…
Cancel
Save