Optimizations for QR decomposition.

pull/42/head
Christophe Favergeon 4 years ago
parent 6e95cfef9d
commit 2bd5292468

@ -36,6 +36,7 @@
#include <math.h> #include <math.h>
/** /**
@ingroup groupMatrix @ingroup groupMatrix
*/ */
@ -54,8 +55,12 @@
@param[out] pOut points to the output vector. @param[out] pOut points to the output vector.
@return beta return the scaling factor beta @return beta return the scaling factor beta
*/ */
#if defined(ARM_FLOAT16_SUPPORTED) #if defined(ARM_FLOAT16_SUPPORTED)
float16_t arm_householder_f16( float16_t arm_householder_f16(
const float16_t * pSrc, const float16_t * pSrc,
const float16_t threshold, const float16_t threshold,

@ -35,6 +35,7 @@
#include <math.h> #include <math.h>
/** /**
@ingroup groupMatrix @ingroup groupMatrix
*/ */
@ -129,6 +130,9 @@
@return beta return the scaling factor beta @return beta return the scaling factor beta
*/ */
float32_t arm_householder_f32( float32_t arm_householder_f32(
const float32_t * pSrc, const float32_t * pSrc,
const float32_t threshold, const float32_t threshold,

@ -35,6 +35,7 @@
#include <math.h> #include <math.h>
/** /**
@ingroup groupMatrix @ingroup groupMatrix
*/ */
@ -54,6 +55,9 @@
@return beta return the scaling factor beta @return beta return the scaling factor beta
*/ */
float64_t arm_householder_f64( float64_t arm_householder_f64(
const float64_t * pSrc, const float64_t * pSrc,
const float64_t threshold, const float64_t threshold,

@ -30,6 +30,12 @@
#include "dsp/matrix_utils.h" #include "dsp/matrix_utils.h"
#if !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVE_FLOAT16)
#include "arm_helium_utils.h"
#endif
#endif
/** /**
@ingroup groupMatrix @ingroup groupMatrix
*/ */
@ -66,7 +72,9 @@
refer to the \ref MatrixHouseholder documentation refer to the \ref MatrixHouseholder documentation
*/ */
#if defined(ARM_FLOAT16_SUPPORTED)
#if !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVE_FLOAT16)
arm_status arm_mat_qr_f16( arm_status arm_mat_qr_f16(
const arm_matrix_instance_f16 * pSrc, const arm_matrix_instance_f16 * pSrc,
@ -79,16 +87,477 @@ arm_status arm_mat_qr_f16(
) )
{ {
int32_t col=0;
int32_t nb,pos;
float16_t *pa,*pc;
float16_t beta;
float16_t *pv;
float16_t *pdst;
float16_t *p;
if (pSrc->numRows < pSrc->numCols)
{
return(ARM_MATH_SIZE_MISMATCH);
}
memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float16_t));
pOutR->numCols = pSrc->numCols;
pOutR->numRows = pSrc->numRows;
p = pOutR->pData;
pc = pOutTau;
for(col=0 ; col < pSrc->numCols; col++)
{
int32_t j,k,blkCnt,blkCnt2;
float16_t *pa0,*pa1,*pa2,*pa3,*ptemp;
float16_t temp;
float16x8_t v1,v2,vtemp;
COPY_COL_F16(pOutR,col,col,pTmpA);
beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA);
*pc++ = beta;
pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
temp = *pv;
blkCnt = (pSrc->numCols-col) >> 3;
while (blkCnt > 0)
{
v1 = vld1q_f16(pa);
v2 = vmulq_n_f16(v1,temp);
vst1q_f16(pdst,v2);
pa += 8;
pdst += 8;
blkCnt--;
}
blkCnt = (pSrc->numCols-col) & 7;
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt);
v1 = vld1q_f16(pa);
v2 = vmulq_n_f16(v1,temp);
vst1q_p_f16(pdst,v2,p0);
pa += blkCnt;
}
pa += col;
pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pSrc->numCols;
pa2 = pa1 + pSrc->numCols;
pa3 = pa2 + pSrc->numCols;
/* Unrolled loop */
blkCnt = (pSrc->numRows-col - 1) >> 2;
k=1;
while(blkCnt > 0)
{
vtemp=vld1q_f16(pv);
blkCnt2 = (pSrc->numCols-col) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pdst);
v2 = vld1q_f16(pa0);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0));
v2 = vld1q_f16(pa1);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1));
v2 = vld1q_f16(pa2);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2));
v2 = vld1q_f16(pa3);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3));
vst1q_f16(pdst,v1);
pdst += 8;
pa0 += 8;
pa1 += 8;
pa2 += 8;
pa3 += 8;
blkCnt2--;
}
blkCnt2 = (pSrc->numCols-col) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pdst);
v2 = vld1q_f16(pa0);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0));
v2 = vld1q_f16(pa1);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1));
v2 = vld1q_f16(pa2);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2));
v2 = vld1q_f16(pa3);
v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3));
vst1q_p_f16(pdst,v1,p0);
pa0 += blkCnt2;
pa1 += blkCnt2;
pa2 += blkCnt2;
pa3 += blkCnt2;
}
pa0 += col + 3*pSrc->numCols;
pa1 += col + 3*pSrc->numCols;
pa2 += col + 3*pSrc->numCols;
pa3 += col + 3*pSrc->numCols;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pSrc->numRows-col; k++)
{
temp = *pv;
blkCnt2 = (pSrc->numCols-col) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pa);
v2 = vld1q_f16(pdst);
v2 = vfmaq_n_f16(v2,v1,temp);
vst1q_f16(pdst,v2);
pa += 8;
pdst += 8;
blkCnt2--;
}
blkCnt2 = (pSrc->numCols-col) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pa);
v2 = vld1q_f16(pdst);
v2 = vfmaq_n_f16(v2,v1,temp);
vst1q_p_f16(pdst,v2,p0);
pa += blkCnt2;
}
pa += col;
pv++;
pdst = pTmpB;
}
/* A(col:,col:) - beta v tmpb */
pa = p;
for(j=0;j<pSrc->numRows-col; j++)
{
float16_t f = -(_Float16)beta * (_Float16)pTmpA[j];
ptemp = pTmpB;
blkCnt2 = (pSrc->numCols-col) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pa);
v2 = vld1q_f16(ptemp);
v1 = vfmaq_n_f16(v1,v2,f);
vst1q_f16(pa,v1);
pa += 8;
ptemp += 8;
blkCnt2--;
}
blkCnt2 = (pSrc->numCols-col) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pa);
v2 = vld1q_f16(ptemp);
v1 = vfmaq_n_f16(v1,v2,f);
vst1q_p_f16(pa,v1,p0);
pa += blkCnt2;
}
pa += col;
}
/* Copy Householder reflectors into R matrix */
pa = p + pOutR->numCols;
for(k=0;k<pSrc->numRows-col-1; k++)
{
*pa = pTmpA[k+1];
pa += pOutR->numCols;
}
p += 1 + pOutR->numCols;
}
/* Generate Q if requested by user matrix */
if (pOutQ != NULL)
{
/* Initialize Q matrix to identity */
memset(pOutQ->pData,0,sizeof(float16_t)*pOutQ->numRows*pOutQ->numRows);
pa = pOutQ->pData;
for(col=0 ; col < pOutQ->numCols; col++)
{
*pa = 1.0f16;
pa += pOutQ->numCols+1;
}
nb = pOutQ->numRows - pOutQ->numCols + 1;
pc = pOutTau + pOutQ->numCols - 1;
for(col=0 ; col < pOutQ->numCols; col++)
{
int32_t j,k, blkCnt, blkCnt2;
float16_t *pa0,*pa1,*pa2,*pa3,*ptemp;
float16_t temp;
float16x8_t v1,v2,vtemp;
pos = pSrc->numRows - nb;
p = pOutQ->pData + pos + pOutQ->numCols*pos ;
COPY_COL_F16(pOutR,pos,pos,pTmpA);
pTmpA[0] = 1.0f16;
pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
temp = *pv;
blkCnt2 = (pOutQ->numRows-pos) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pa);
v1 = vmulq_n_f16(v1, temp);
vst1q_f16(pdst,v1);
pa += 8;
pdst += 8;
blkCnt2--;
}
blkCnt2 = (pOutQ->numRows-pos) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pa);
v1 = vmulq_n_f16(v1, temp);
vst1q_p_f16(pdst,v1,p0);
pa += blkCnt2;
}
pa += pos;
pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pOutQ->numRows;
pa2 = pa1 + pOutQ->numRows;
pa3 = pa2 + pOutQ->numRows;
/* Unrolled loop */
blkCnt = (pOutQ->numRows-pos - 1) >> 2;
k=1;
while(blkCnt > 0)
{
vtemp = vld1q_f16(pv);
blkCnt2 = (pOutQ->numRows-pos) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pdst);
v2 = vld1q_f16(pa0);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0));
v2 = vld1q_f16(pa1);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1));
v2 = vld1q_f16(pa2);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2));
v2 = vld1q_f16(pa3);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3));
vst1q_f16(pdst,v1);
pa0 += 8;
pa1 += 8;
pa2 += 8;
pa3 += 8;
pdst += 8;
blkCnt2--;
}
blkCnt2 = (pOutQ->numRows-pos) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pdst);
v2 = vld1q_f16(pa0);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0));
v2 = vld1q_f16(pa1);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1));
v2 = vld1q_f16(pa2);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2));
v2 = vld1q_f16(pa3);
v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3));
vst1q_p_f16(pdst,v1,p0);
pa0 += blkCnt2;
pa1 += blkCnt2;
pa2 += blkCnt2;
pa3 += blkCnt2;
}
pa0 += pos + 3*pOutQ->numRows;
pa1 += pos + 3*pOutQ->numRows;
pa2 += pos + 3*pOutQ->numRows;
pa3 += pos + 3*pOutQ->numRows;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pOutQ->numRows-pos; k++)
{
temp = *pv;
blkCnt2 = (pOutQ->numRows-pos) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pdst);
v2 = vld1q_f16(pa);
v1 = vfmaq_n_f16(v1, v2, temp);
vst1q_f16(pdst,v1);
pdst += 8;
pa += 8;
blkCnt2--;
}
blkCnt2 = (pOutQ->numRows-pos) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pdst);
v2 = vld1q_f16(pa);
v1 = vfmaq_n_f16(v1, v2, temp);
vst1q_p_f16(pdst,v1,p0);
pa += blkCnt2;
}
pa += pos;
pv++;
pdst = pTmpB;
}
pa = p;
beta = *pc--;
for(j=0;j<pOutQ->numRows-pos; j++)
{
float16_t f = -(_Float16)beta * (_Float16)pTmpA[j];
ptemp = pTmpB;
blkCnt2 = (pOutQ->numCols-pos) >> 3;
while (blkCnt2 > 0)
{
v1 = vld1q_f16(pa);
v2 = vld1q_f16(ptemp);
v1 = vfmaq_n_f16(v1,v2,f);
vst1q_f16(pa,v1);
pa += 8;
ptemp += 8;
blkCnt2--;
}
blkCnt2 = (pOutQ->numCols-pos) & 7;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt2);
v1 = vld1q_f16(pa);
v2 = vld1q_f16(ptemp);
v1 = vfmaq_n_f16(v1,v2,f);
vst1q_p_f16(pa,v1,p0);
uint32_t col=0; pa += blkCnt2;
}
pa += pos;
}
nb++;
}
}
arm_status status = ARM_MATH_SUCCESS;
/* Return to application */
return (status);
}
#endif /*#if !defined(ARM_MATH_MVEF)*/
#endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
#if defined(ARM_FLOAT16_SUPPORTED)
#if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_qr_f16(
const arm_matrix_instance_f16 * pSrc,
const float16_t threshold,
arm_matrix_instance_f16 * pOutR,
arm_matrix_instance_f16 * pOutQ,
float16_t * pOutTau,
float16_t *pTmpA,
float16_t *pTmpB
)
{
int32_t col=0;
int32_t nb,pos; int32_t nb,pos;
float16_t *pa,*pc; float16_t *pa,*pc;
float16_t beta; float16_t beta;
float16_t *pv; float16_t *pv;
float16_t *pdst; float16_t *pdst;
float16_t *p; float16_t *p;
float16_t sum;
if (pSrc->numRows < pSrc->numCols) if (pSrc->numRows < pSrc->numCols)
{ {
@ -104,7 +573,8 @@ arm_status arm_mat_qr_f16(
pc = pOutTau; pc = pOutTau;
for(col=0 ; col < pSrc->numCols; col++) for(col=0 ; col < pSrc->numCols; col++)
{ {
uint32_t i,j,k; int32_t i,j,k,blkCnt;
float16_t *pa0,*pa1,*pa2,*pa3;
COPY_COL_F16(pOutR,col,col,pTmpA); COPY_COL_F16(pOutR,col,col,pTmpA);
beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA); beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA);
@ -113,26 +583,70 @@ arm_status arm_mat_qr_f16(
pdst = pTmpB; pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */ /* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
for(j=0;j<pSrc->numCols-col; j++) for(j=0;j<pSrc->numCols-col; j++)
{ {
pa = p+j; *pdst++ = (_Float16)*pv * (_Float16)*pa++;
pv = pTmpA; }
sum = 0.0f16; pa += col;
for(k=0;k<pSrc->numRows-col; k++) pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pSrc->numCols;
pa2 = pa1 + pSrc->numCols;
pa3 = pa2 + pSrc->numCols;
/* Unrolled loop */
blkCnt = (pSrc->numRows-col - 1) >> 2;
k=1;
while(blkCnt > 0)
{
float16_t sum;
for(j=0;j<pSrc->numCols-col; j++)
{
sum = *pdst;
sum += (_Float16)pv[0] * (_Float16)*pa0++;
sum += (_Float16)pv[1] * (_Float16)*pa1++;
sum += (_Float16)pv[2] * (_Float16)*pa2++;
sum += (_Float16)pv[3] * (_Float16)*pa3++;
*pdst++ = sum;
}
pa0 += col + 3*pSrc->numCols;
pa1 += col + 3*pSrc->numCols;
pa2 += col + 3*pSrc->numCols;
pa3 += col + 3*pSrc->numCols;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pSrc->numRows-col; k++)
{
for(j=0;j<pSrc->numCols-col; j++)
{ {
sum += (_Float16)*pv++ * (_Float16)*pa; *pdst++ += (_Float16)*pv * (_Float16)*pa++;
pa += pOutR->numCols;
} }
*pdst++ = sum; pa += col;
pv++;
pdst = pTmpB;
} }
/* A(col:,col:) - beta v tmpb */ /* A(col:,col:) - beta v tmpb */
pa = p; pa = p;
for(j=0;j<pSrc->numRows-col; j++) for(j=0;j<pSrc->numRows-col; j++)
{ {
float16_t f = (_Float16)beta * (_Float16)pTmpA[j];
for(i=0;i<pSrc->numCols-col; i++) for(i=0;i<pSrc->numCols-col; i++)
{ {
*pa = (_Float16)*pa - (_Float16)beta * (_Float16)pTmpA[j] * (_Float16)pTmpB[i] ; *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ;
pa++; pa++;
} }
pa += col; pa += col;
@ -168,7 +682,8 @@ arm_status arm_mat_qr_f16(
pc = pOutTau + pOutQ->numCols - 1; pc = pOutTau + pOutQ->numCols - 1;
for(col=0 ; col < pOutQ->numCols; col++) for(col=0 ; col < pOutQ->numCols; col++)
{ {
int32_t i,j,k; int32_t i,j,k, blkCnt;
float16_t *pa0,*pa1,*pa2,*pa3;
pos = pSrc->numRows - nb; pos = pSrc->numRows - nb;
p = pOutQ->pData + pos + pOutQ->numCols*pos ; p = pOutQ->pData + pos + pOutQ->numCols*pos ;
@ -178,26 +693,70 @@ arm_status arm_mat_qr_f16(
pdst = pTmpB; pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */ /* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
for(j=0;j<pOutQ->numRows-pos; j++) for(j=0;j<pOutQ->numRows-pos; j++)
{ {
pa = p+j; *pdst++ = (_Float16)*pv * (_Float16)*pa++;
pv = pTmpA; }
sum = 0.0f16; pa += pos;
for(k=0;k<pOutQ->numRows-pos; k++) pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pOutQ->numRows;
pa2 = pa1 + pOutQ->numRows;
pa3 = pa2 + pOutQ->numRows;
/* Unrolled loop */
blkCnt = (pOutQ->numRows-pos - 1) >> 2;
k=1;
while(blkCnt > 0)
{
float16_t sum;
for(j=0;j<pOutQ->numRows-pos; j++)
{ {
sum += (_Float16)*pv++ * (_Float16)*pa; sum = *pdst;
pa += pOutQ->numCols;
sum += (_Float16)pv[0] * (_Float16)*pa0++;
sum += (_Float16)pv[1] * (_Float16)*pa1++;
sum += (_Float16)pv[2] * (_Float16)*pa2++;
sum += (_Float16)pv[3] * (_Float16)*pa3++;
*pdst++ = sum;
} }
*pdst++ = sum; pa0 += pos + 3*pOutQ->numRows;
pa1 += pos + 3*pOutQ->numRows;
pa2 += pos + 3*pOutQ->numRows;
pa3 += pos + 3*pOutQ->numRows;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pOutQ->numRows-pos; k++)
{
for(j=0;j<pOutQ->numRows-pos; j++)
{
*pdst++ += (_Float16)*pv * (_Float16)*pa++;
}
pa += pos;
pv++;
pdst = pTmpB;
} }
pa = p; pa = p;
beta = *pc--; beta = *pc--;
for(j=0;j<pOutQ->numRows-pos; j++) for(j=0;j<pOutQ->numRows-pos; j++)
{ {
float16_t f = (_Float16)beta * (_Float16)pTmpA[j];
for(i=0;i<pOutQ->numCols-pos; i++) for(i=0;i<pOutQ->numCols-pos; i++)
{ {
*pa = (_Float16)*pa - (_Float16)beta * (_Float16)pTmpA[j] * (_Float16)pTmpB[i] ; *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ;
pa++; pa++;
} }
pa += pos; pa += pos;
@ -213,6 +772,7 @@ arm_status arm_mat_qr_f16(
return (status); return (status);
} }
#endif /* end of test for Helium or Neon availability */
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
/** /**

@ -30,6 +30,12 @@
#include "dsp/matrix_utils.h" #include "dsp/matrix_utils.h"
#if !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVEF)
#include "arm_helium_utils.h"
#endif
#endif
/** /**
@ingroup groupMatrix @ingroup groupMatrix
*/ */
@ -140,6 +146,9 @@
*/ */
#if !defined(ARM_MATH_AUTOVECTORIZE)
#if defined(ARM_MATH_MVEF)
arm_status arm_mat_qr_f32( arm_status arm_mat_qr_f32(
const arm_matrix_instance_f32 * pSrc, const arm_matrix_instance_f32 * pSrc,
const float32_t threshold, const float32_t threshold,
@ -151,16 +160,475 @@ arm_status arm_mat_qr_f32(
) )
{ {
int32_t col=0;
int32_t nb,pos;
float32_t *pa,*pc;
float32_t beta;
float32_t *pv;
float32_t *pdst;
float32_t *p;
if (pSrc->numRows < pSrc->numCols)
{
return(ARM_MATH_SIZE_MISMATCH);
}
memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
pOutR->numCols = pSrc->numCols;
pOutR->numRows = pSrc->numRows;
p = pOutR->pData;
pc = pOutTau;
for(col=0 ; col < pSrc->numCols; col++)
{
int32_t j,k,blkCnt,blkCnt2;
float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
float32_t temp;
float32x4_t v1,v2,vtemp;
COPY_COL_F32(pOutR,col,col,pTmpA);
beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
*pc++ = beta;
pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
temp = *pv;
blkCnt = (pSrc->numCols-col) >> 2;
while (blkCnt > 0)
{
v1 = vld1q_f32(pa);
v2 = vmulq_n_f32(v1,temp);
vst1q_f32(pdst,v2);
pa += 4;
pdst += 4;
blkCnt--;
}
blkCnt = (pSrc->numCols-col) & 3;
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt);
v1 = vld1q_f32(pa);
v2 = vmulq_n_f32(v1,temp);
vst1q_p_f32(pdst,v2,p0);
pa += blkCnt;
}
pa += col;
pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pSrc->numCols;
pa2 = pa1 + pSrc->numCols;
pa3 = pa2 + pSrc->numCols;
/* Unrolled loop */
blkCnt = (pSrc->numRows-col - 1) >> 2;
k=1;
while(blkCnt > 0)
{
vtemp=vld1q_f32(pv);
blkCnt2 = (pSrc->numCols-col) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pdst);
v2 = vld1q_f32(pa0);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
v2 = vld1q_f32(pa1);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
v2 = vld1q_f32(pa2);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
v2 = vld1q_f32(pa3);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
vst1q_f32(pdst,v1);
pdst += 4;
pa0 += 4;
pa1 += 4;
pa2 += 4;
pa3 += 4;
blkCnt2--;
}
blkCnt2 = (pSrc->numCols-col) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
v1 = vld1q_f32(pdst);
v2 = vld1q_f32(pa0);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));
v2 = vld1q_f32(pa1);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));
v2 = vld1q_f32(pa2);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));
v2 = vld1q_f32(pa3);
v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));
vst1q_p_f32(pdst,v1,p0);
pa0 += blkCnt2;
pa1 += blkCnt2;
pa2 += blkCnt2;
pa3 += blkCnt2;
}
pa0 += col + 3*pSrc->numCols;
pa1 += col + 3*pSrc->numCols;
pa2 += col + 3*pSrc->numCols;
pa3 += col + 3*pSrc->numCols;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pSrc->numRows-col; k++)
{
temp = *pv;
blkCnt2 = (pSrc->numCols-col) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pa);
v2 = vld1q_f32(pdst);
v2 = vfmaq_n_f32(v2,v1,temp);
vst1q_f32(pdst,v2);
pa += 4;
pdst += 4;
blkCnt2--;
}
blkCnt2 = (pSrc->numCols-col) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
v1 = vld1q_f32(pa);
v2 = vld1q_f32(pdst);
v2 = vfmaq_n_f32(v2,v1,temp);
vst1q_p_f32(pdst,v2,p0);
pa += blkCnt2;
}
pa += col;
pv++;
pdst = pTmpB;
}
/* A(col:,col:) - beta v tmpb */
pa = p;
for(j=0;j<pSrc->numRows-col; j++)
{
float32_t f = -beta * pTmpA[j];
ptemp = pTmpB;
blkCnt2 = (pSrc->numCols-col) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pa);
v2 = vld1q_f32(ptemp);
v1 = vfmaq_n_f32(v1,v2,f);
vst1q_f32(pa,v1);
pa += 4;
ptemp += 4;
blkCnt2--;
}
blkCnt2 = (pSrc->numCols-col) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
v1 = vld1q_f32(pa);
v2 = vld1q_f32(ptemp);
v1 = vfmaq_n_f32(v1,v2,f);
vst1q_p_f32(pa,v1,p0);
pa += blkCnt2;
}
pa += col;
}
/* Copy Householder reflectors into R matrix */
pa = p + pOutR->numCols;
for(k=0;k<pSrc->numRows-col-1; k++)
{
*pa = pTmpA[k+1];
pa += pOutR->numCols;
}
p += 1 + pOutR->numCols;
}
/* Generate Q if requested by user matrix */
if (pOutQ != NULL)
{
/* Initialize Q matrix to identity */
memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
pa = pOutQ->pData;
for(col=0 ; col < pOutQ->numCols; col++)
{
*pa = 1.0f;
pa += pOutQ->numCols+1;
}
nb = pOutQ->numRows - pOutQ->numCols + 1;
pc = pOutTau + pOutQ->numCols - 1;
for(col=0 ; col < pOutQ->numCols; col++)
{
int32_t j,k, blkCnt, blkCnt2;
float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
float32_t temp;
float32x4_t v1,v2,vtemp;
pos = pSrc->numRows - nb;
p = pOutQ->pData + pos + pOutQ->numCols*pos ;
COPY_COL_F32(pOutR,pos,pos,pTmpA);
pTmpA[0] = 1.0f;
pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
temp = *pv;
blkCnt2 = (pOutQ->numRows-pos) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pa);
v1 = vmulq_n_f32(v1, temp);
vst1q_f32(pdst,v1);
pa += 4;
pdst += 4;
blkCnt2--;
}
blkCnt2 = (pOutQ->numRows-pos) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
v1 = vld1q_f32(pa);
v1 = vmulq_n_f32(v1, temp);
vst1q_p_f32(pdst,v1,p0);
pa += blkCnt2;
}
pa += pos;
pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pOutQ->numRows;
pa2 = pa1 + pOutQ->numRows;
pa3 = pa2 + pOutQ->numRows;
/* Unrolled loop */
blkCnt = (pOutQ->numRows-pos - 1) >> 2;
k=1;
while(blkCnt > 0)
{
vtemp = vld1q_f32(pv);
blkCnt2 = (pOutQ->numRows-pos) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pdst);
v2 = vld1q_f32(pa0);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
v2 = vld1q_f32(pa1);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
v2 = vld1q_f32(pa2);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
v2 = vld1q_f32(pa3);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
vst1q_f32(pdst,v1);
pa0 += 4;
pa1 += 4;
pa2 += 4;
pa3 += 4;
pdst += 4;
blkCnt2--;
}
blkCnt2 = (pOutQ->numRows-pos) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
uint32_t col=0; v1 = vld1q_f32(pdst);
v2 = vld1q_f32(pa0);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));
v2 = vld1q_f32(pa1);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));
v2 = vld1q_f32(pa2);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));
v2 = vld1q_f32(pa3);
v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));
vst1q_p_f32(pdst,v1,p0);
pa0 += blkCnt2;
pa1 += blkCnt2;
pa2 += blkCnt2;
pa3 += blkCnt2;
}
pa0 += pos + 3*pOutQ->numRows;
pa1 += pos + 3*pOutQ->numRows;
pa2 += pos + 3*pOutQ->numRows;
pa3 += pos + 3*pOutQ->numRows;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pOutQ->numRows-pos; k++)
{
temp = *pv;
blkCnt2 = (pOutQ->numRows-pos) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pdst);
v2 = vld1q_f32(pa);
v1 = vfmaq_n_f32(v1, v2, temp);
vst1q_f32(pdst,v1);
pdst += 4;
pa += 4;
blkCnt2--;
}
blkCnt2 = (pOutQ->numRows-pos) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
v1 = vld1q_f32(pdst);
v2 = vld1q_f32(pa);
v1 = vfmaq_n_f32(v1, v2, temp);
vst1q_p_f32(pdst,v1,p0);
pa += blkCnt2;
}
pa += pos;
pv++;
pdst = pTmpB;
}
pa = p;
beta = *pc--;
for(j=0;j<pOutQ->numRows-pos; j++)
{
float32_t f = -beta * pTmpA[j];
ptemp = pTmpB;
blkCnt2 = (pOutQ->numCols-pos) >> 2;
while (blkCnt2 > 0)
{
v1 = vld1q_f32(pa);
v2 = vld1q_f32(ptemp);
v1 = vfmaq_n_f32(v1,v2,f);
vst1q_f32(pa,v1);
pa += 4;
ptemp += 4;
blkCnt2--;
}
blkCnt2 = (pOutQ->numCols-pos) & 3;
if (blkCnt2 > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt2);
v1 = vld1q_f32(pa);
v2 = vld1q_f32(ptemp);
v1 = vfmaq_n_f32(v1,v2,f);
vst1q_p_f32(pa,v1,p0);
pa += blkCnt2;
}
pa += pos;
}
nb++;
}
}
arm_status status = ARM_MATH_SUCCESS;
/* Return to application */
return (status);
}
#endif /*#if !defined(ARM_MATH_MVEF)*/
#endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
#if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_qr_f32(
const arm_matrix_instance_f32 * pSrc,
const float32_t threshold,
arm_matrix_instance_f32 * pOutR,
arm_matrix_instance_f32 * pOutQ,
float32_t * pOutTau,
float32_t *pTmpA,
float32_t *pTmpB
)
{
int32_t col=0;
int32_t nb,pos; int32_t nb,pos;
float32_t *pa,*pc; float32_t *pa,*pc;
float32_t beta; float32_t beta;
float32_t *pv; float32_t *pv;
float32_t *pdst; float32_t *pdst;
float32_t *p; float32_t *p;
float32_t sum;
if (pSrc->numRows < pSrc->numCols) if (pSrc->numRows < pSrc->numCols)
{ {
@ -176,7 +644,8 @@ arm_status arm_mat_qr_f32(
pc = pOutTau; pc = pOutTau;
for(col=0 ; col < pSrc->numCols; col++) for(col=0 ; col < pSrc->numCols; col++)
{ {
uint32_t i,j,k; int32_t i,j,k,blkCnt;
float32_t *pa0,*pa1,*pa2,*pa3;
COPY_COL_F32(pOutR,col,col,pTmpA); COPY_COL_F32(pOutR,col,col,pTmpA);
beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA); beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
@ -185,26 +654,70 @@ arm_status arm_mat_qr_f32(
pdst = pTmpB; pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */ /* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
for(j=0;j<pSrc->numCols-col; j++) for(j=0;j<pSrc->numCols-col; j++)
{ {
pa = p+j; *pdst++ = *pv * *pa++;
pv = pTmpA; }
sum = 0.0f; pa += col;
for(k=0;k<pSrc->numRows-col; k++) pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pSrc->numCols;
pa2 = pa1 + pSrc->numCols;
pa3 = pa2 + pSrc->numCols;
/* Unrolled loop */
blkCnt = (pSrc->numRows-col - 1) >> 2;
k=1;
while(blkCnt > 0)
{
float32_t sum;
for(j=0;j<pSrc->numCols-col; j++)
{
sum = *pdst;
sum += pv[0] * *pa0++;
sum += pv[1] * *pa1++;
sum += pv[2] * *pa2++;
sum += pv[3] * *pa3++;
*pdst++ = sum;
}
pa0 += col + 3*pSrc->numCols;
pa1 += col + 3*pSrc->numCols;
pa2 += col + 3*pSrc->numCols;
pa3 += col + 3*pSrc->numCols;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pSrc->numRows-col; k++)
{
for(j=0;j<pSrc->numCols-col; j++)
{ {
sum += *pv++ * *pa; *pdst++ += *pv * *pa++;
pa += pOutR->numCols;
} }
*pdst++ = sum; pa += col;
pv++;
pdst = pTmpB;
} }
/* A(col:,col:) - beta v tmpb */ /* A(col:,col:) - beta v tmpb */
pa = p; pa = p;
for(j=0;j<pSrc->numRows-col; j++) for(j=0;j<pSrc->numRows-col; j++)
{ {
float32_t f = beta * pTmpA[j];
for(i=0;i<pSrc->numCols-col; i++) for(i=0;i<pSrc->numCols-col; i++)
{ {
*pa = *pa - beta * pTmpA[j] * pTmpB[i] ; *pa = *pa - f * pTmpB[i] ;
pa++; pa++;
} }
pa += col; pa += col;
@ -240,7 +753,8 @@ arm_status arm_mat_qr_f32(
pc = pOutTau + pOutQ->numCols - 1; pc = pOutTau + pOutQ->numCols - 1;
for(col=0 ; col < pOutQ->numCols; col++) for(col=0 ; col < pOutQ->numCols; col++)
{ {
int32_t i,j,k; int32_t i,j,k, blkCnt;
float32_t *pa0,*pa1,*pa2,*pa3;
pos = pSrc->numRows - nb; pos = pSrc->numRows - nb;
p = pOutQ->pData + pos + pOutQ->numCols*pos ; p = pOutQ->pData + pos + pOutQ->numCols*pos ;
@ -250,26 +764,70 @@ arm_status arm_mat_qr_f32(
pdst = pTmpB; pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */ /* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
for(j=0;j<pOutQ->numRows-pos; j++) for(j=0;j<pOutQ->numRows-pos; j++)
{ {
pa = p+j; *pdst++ = *pv * *pa++;
pv = pTmpA; }
sum = 0.0f; pa += pos;
for(k=0;k<pOutQ->numRows-pos; k++) pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pOutQ->numRows;
pa2 = pa1 + pOutQ->numRows;
pa3 = pa2 + pOutQ->numRows;
/* Unrolled loop */
blkCnt = (pOutQ->numRows-pos - 1) >> 2;
k=1;
while(blkCnt > 0)
{
float32_t sum;
for(j=0;j<pOutQ->numRows-pos; j++)
{ {
sum += *pv++ * *pa; sum = *pdst;
pa += pOutQ->numCols;
sum += pv[0] * *pa0++;
sum += pv[1] * *pa1++;
sum += pv[2] * *pa2++;
sum += pv[3] * *pa3++;
*pdst++ = sum;
} }
*pdst++ = sum; pa0 += pos + 3*pOutQ->numRows;
pa1 += pos + 3*pOutQ->numRows;
pa2 += pos + 3*pOutQ->numRows;
pa3 += pos + 3*pOutQ->numRows;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pOutQ->numRows-pos; k++)
{
for(j=0;j<pOutQ->numRows-pos; j++)
{
*pdst++ += *pv * *pa++;
}
pa += pos;
pv++;
pdst = pTmpB;
} }
pa = p; pa = p;
beta = *pc--; beta = *pc--;
for(j=0;j<pOutQ->numRows-pos; j++) for(j=0;j<pOutQ->numRows-pos; j++)
{ {
float32_t f = beta * pTmpA[j];
for(i=0;i<pOutQ->numCols-pos; i++) for(i=0;i<pOutQ->numCols-pos; i++)
{ {
*pa = *pa - beta * pTmpA[j] * pTmpB[i] ; *pa = *pa - f * pTmpB[i] ;
pa++; pa++;
} }
pa += pos; pa += pos;
@ -285,6 +843,7 @@ arm_status arm_mat_qr_f32(
return (status); return (status);
} }
#endif /* end of test for Helium or Neon availability */
/** /**
@} end of MatrixQR group @} end of MatrixQR group

@ -30,6 +30,7 @@
#include "dsp/matrix_utils.h" #include "dsp/matrix_utils.h"
/** /**
@ingroup groupMatrix @ingroup groupMatrix
*/ */
@ -65,6 +66,9 @@
*/ */
arm_status arm_mat_qr_f64( arm_status arm_mat_qr_f64(
const arm_matrix_instance_f64 * pSrc, const arm_matrix_instance_f64 * pSrc,
const float64_t threshold, const float64_t threshold,
@ -76,16 +80,13 @@ arm_status arm_mat_qr_f64(
) )
{ {
int32_t col=0;
uint32_t col=0;
int32_t nb,pos; int32_t nb,pos;
float64_t *pa,*pc; float64_t *pa,*pc;
float64_t beta; float64_t beta;
float64_t *pv; float64_t *pv;
float64_t *pdst; float64_t *pdst;
float64_t *p; float64_t *p;
float64_t sum;
if (pSrc->numRows < pSrc->numCols) if (pSrc->numRows < pSrc->numCols)
{ {
@ -101,7 +102,8 @@ arm_status arm_mat_qr_f64(
pc = pOutTau; pc = pOutTau;
for(col=0 ; col < pSrc->numCols; col++) for(col=0 ; col < pSrc->numCols; col++)
{ {
uint32_t i,j,k; int32_t i,j,k,blkCnt;
float64_t *pa0,*pa1,*pa2,*pa3;
COPY_COL_F64(pOutR,col,col,pTmpA); COPY_COL_F64(pOutR,col,col,pTmpA);
beta = arm_householder_f64(pTmpA,threshold,pSrc->numRows - col,pTmpA); beta = arm_householder_f64(pTmpA,threshold,pSrc->numRows - col,pTmpA);
@ -110,26 +112,70 @@ arm_status arm_mat_qr_f64(
pdst = pTmpB; pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */ /* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
for(j=0;j<pSrc->numCols-col; j++) for(j=0;j<pSrc->numCols-col; j++)
{ {
pa = p+j; *pdst++ = *pv * *pa++;
pv = pTmpA; }
sum = 0.0; pa += col;
for(k=0;k<pSrc->numRows-col; k++) pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pSrc->numCols;
pa2 = pa1 + pSrc->numCols;
pa3 = pa2 + pSrc->numCols;
/* Unrolled loop */
blkCnt = (pSrc->numRows-col - 1) >> 2;
k=1;
while(blkCnt > 0)
{
float64_t sum;
for(j=0;j<pSrc->numCols-col; j++)
{ {
sum += *pv++ * *pa; sum = *pdst;
pa += pOutR->numCols;
sum += pv[0] * *pa0++;
sum += pv[1] * *pa1++;
sum += pv[2] * *pa2++;
sum += pv[3] * *pa3++;
*pdst++ = sum;
} }
*pdst++ = sum; pa0 += col + 3*pSrc->numCols;
pa1 += col + 3*pSrc->numCols;
pa2 += col + 3*pSrc->numCols;
pa3 += col + 3*pSrc->numCols;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pSrc->numRows-col; k++)
{
for(j=0;j<pSrc->numCols-col; j++)
{
*pdst++ += *pv * *pa++;
}
pa += col;
pv++;
pdst = pTmpB;
} }
/* A(col:,col:) - beta v tmpb */ /* A(col:,col:) - beta v tmpb */
pa = p; pa = p;
for(j=0;j<pSrc->numRows-col; j++) for(j=0;j<pSrc->numRows-col; j++)
{ {
float64_t f = beta * pTmpA[j];
for(i=0;i<pSrc->numCols-col; i++) for(i=0;i<pSrc->numCols-col; i++)
{ {
*pa = *pa - beta * pTmpA[j] * pTmpB[i] ; *pa = *pa - f * pTmpB[i] ;
pa++; pa++;
} }
pa += col; pa += col;
@ -165,7 +211,8 @@ arm_status arm_mat_qr_f64(
pc = pOutTau + pOutQ->numCols - 1; pc = pOutTau + pOutQ->numCols - 1;
for(col=0 ; col < pOutQ->numCols; col++) for(col=0 ; col < pOutQ->numCols; col++)
{ {
int32_t i,j,k; int32_t i,j,k, blkCnt;
float64_t *pa0,*pa1,*pa2,*pa3;
pos = pSrc->numRows - nb; pos = pSrc->numRows - nb;
p = pOutQ->pData + pos + pOutQ->numCols*pos ; p = pOutQ->pData + pos + pOutQ->numCols*pos ;
@ -175,26 +222,70 @@ arm_status arm_mat_qr_f64(
pdst = pTmpB; pdst = pTmpB;
/* v.T A(col:,col:) -> tmpb */ /* v.T A(col:,col:) -> tmpb */
pv = pTmpA;
pa = p;
for(j=0;j<pOutQ->numRows-pos; j++) for(j=0;j<pOutQ->numRows-pos; j++)
{ {
pa = p+j; *pdst++ = *pv * *pa++;
pv = pTmpA; }
sum = 0.0; pa += pos;
for(k=0;k<pOutQ->numRows-pos; k++) pv++;
pdst = pTmpB;
pa0 = pa;
pa1 = pa0 + pOutQ->numRows;
pa2 = pa1 + pOutQ->numRows;
pa3 = pa2 + pOutQ->numRows;
/* Unrolled loop */
blkCnt = (pOutQ->numRows-pos - 1) >> 2;
k=1;
while(blkCnt > 0)
{
float64_t sum;
for(j=0;j<pOutQ->numRows-pos; j++)
{ {
sum += *pv++ * *pa; sum = *pdst;
pa += pOutQ->numCols;
sum += pv[0] * *pa0++;
sum += pv[1] * *pa1++;
sum += pv[2] * *pa2++;
sum += pv[3] * *pa3++;
*pdst++ = sum;
}
pa0 += pos + 3*pOutQ->numRows;
pa1 += pos + 3*pOutQ->numRows;
pa2 += pos + 3*pOutQ->numRows;
pa3 += pos + 3*pOutQ->numRows;
pv += 4;
pdst = pTmpB;
k += 4;
blkCnt--;
}
pa = pa0;
for(;k<pOutQ->numRows-pos; k++)
{
for(j=0;j<pOutQ->numRows-pos; j++)
{
*pdst++ += *pv * *pa++;
} }
*pdst++ = sum; pa += pos;
pv++;
pdst = pTmpB;
} }
pa = p; pa = p;
beta = *pc--; beta = *pc--;
for(j=0;j<pOutQ->numRows-pos; j++) for(j=0;j<pOutQ->numRows-pos; j++)
{ {
float64_t f = beta * pTmpA[j];
for(i=0;i<pOutQ->numCols-pos; i++) for(i=0;i<pOutQ->numCols-pos; i++)
{ {
*pa = *pa - beta * pTmpA[j] * pTmpB[i] ; *pa = *pa - f * pTmpB[i] ;
pa++; pa++;
} }
pa += pos; pa += pos;

@ -40,7 +40,7 @@ Comparisons for inverse
/* Not very accurate for big matrix. /* Not very accurate for big matrix.
But big matrix needed for checking the vectorized code */ But big matrix needed for checking the vectorized code */
#define SNR_THRESHOLD_INV 100 #define SNR_THRESHOLD_INV 99
#define REL_ERROR_INV (3.0e-5) #define REL_ERROR_INV (3.0e-5)
#define ABS_ERROR_INV (1.0e-5) #define ABS_ERROR_INV (1.0e-5)

@ -832,7 +832,7 @@ class CodeGen:
self._currentPaths = oldPath.copy() self._currentPaths = oldPath.copy()
self._currentParamPaths = oldParamPath.copy() self._currentParamPaths = oldParamPath.copy()
def genCodeForTree(self,root,benchMode): def genCodeForTree(self,genFolder,root,benchMode):
""" Generate all files from the trees of tests """ Generate all files from the trees of tests
Args: Args:
@ -845,14 +845,16 @@ class CodeGen:
# Get a list of all suites contained in the tree # Get a list of all suites contained in the tree
suites = self.getSuites(root,[]) suites = self.getSuites(root,[])
src = "GeneratedSource" src = os.path.join(genFolder,"GeneratedSource")
header = "GeneratedInclude" header = os.path.join(genFolder,"GeneratedInclude")
if benchMode: if benchMode:
src += "Bench" src += "Bench"
header += "Bench" header += "Bench"
# Generate .cpp and .h files neded to run the tests # Generate .cpp and .h files neded to run the tests
with open("%s/TestDesc.cpp" % src,"w") as sourceFile: testDescCPath = os.path.join(src,"TestDesc.cpp")
with open("%s/TestDesc.h" % header,"w") as headerFile: testDescHeaderPath = os.path.join(header,"TestDesc.h")
with open(testDescCPath,"w") as sourceFile:
with open(testDescHeaderPath,"w") as headerFile:
headerFile.write("#include \"Test.h\"\n") headerFile.write("#include \"Test.h\"\n")
headerFile.write("#include \"Pattern.h\"\n") headerFile.write("#include \"Pattern.h\"\n")
@ -871,18 +873,20 @@ class CodeGen:
# the pattern files. # the pattern files.
# Driver file is similar in this case but different from semihosting # Driver file is similar in this case but different from semihosting
# one. # one.
testDriveHeaderPath = os.path.join(header,"TestDrive.h")
patternHeaderPath = os.path.join(header,"Patterns.h")
if not self._fpga: if not self._fpga:
with open("%s/TestDrive.h" % header,"w") as driverFile: with open(testDriveHeaderPath,"w") as driverFile:
driverFile.write("// Empty driver include in semihosting mode") driverFile.write("// Empty driver include in semihosting mode")
with open("%s/Patterns.h" % header,"w") as includeFile: with open(patternHeaderPath,"w") as includeFile:
includeFile.write("// Empty pattern include in semihosting mode") includeFile.write("// Empty pattern include in semihosting mode")
else: else:
with open("%s/TestDrive.h" % header,"w") as driverFile: with open(testDriveHeaderPath,"w") as driverFile:
driverFile.write("#ifndef _DRIVER_H_\n") driverFile.write("#ifndef _DRIVER_H_\n")
driverFile.write("#define _DRIVER_H_\n") driverFile.write("#define _DRIVER_H_\n")
driverFile.write("__ALIGNED(8) const char testDesc[]={\n") driverFile.write("__ALIGNED(8) const char testDesc[]={\n")
self._offset=0 self._offset=0
with open("%s/Patterns.h" % header,"w") as includeFile: with open(patternHeaderPath,"w") as includeFile:
includeFile.write("#ifndef _PATTERNS_H_\n") includeFile.write("#ifndef _PATTERNS_H_\n")
includeFile.write("#define _PATTERNS_H_\n") includeFile.write("#define _PATTERNS_H_\n")
includeFile.write("__ALIGNED(8) const char patterns[]={\n") includeFile.write("__ALIGNED(8) const char patterns[]={\n")

@ -1,462 +0,0 @@
import subprocess
import colorama
from colorama import init,Fore, Back, Style
import argparse
import os
import os.path
from contextlib import contextmanager
import shutil
import glob
from pathlib import Path
import sys
DEBUGMODE = False
KEEPBUILDFOLDER = True
DEBUGLIST=[
"-DBASICMATH=ON",
"-DCOMPLEXMATH=OFF",
"-DCONTROLLER=OFF",
"-DFASTMATH=OFF",
"-DFILTERING=OFF",
"-DMATRIX=OFF",
"-DSTATISTICS=OFF",
"-DSUPPORT=OFF",
"-DTRANSFORM=OFF",
"-DSVM=OFF",
"-DBAYES=OFF",
"-DDISTANCE=OFF"]
NOTESTFAILED = 0
MAKEFAILED = 1
TESTFAILED = 2
FLOWFAILURE = 3
CALLFAILURE = 4
def setDebugMode():
global DEBUGMODE
DEBUGMODE=True
def isDebugMode():
global DEBUGMODE
return(DEBUGMODE)
def setNokeepBuildFolder():
global KEEPBUILDFOLDER
KEEPBUILDFOLDER=False
def isKeepMode():
global KEEPBUILDFOLDER
return(KEEPBUILDFOLDER)
def joinit(iterable, delimiter):
it = iter(iterable)
yield next(it)
for x in it:
yield delimiter
yield x
def addToDb(db,desc,runid):
msg("Add %s to summary database\n" % desc)
completed = subprocess.run([sys.executable, "addToDB.py","-o",db,"-r",str(runid),desc], timeout=3600)
check(completed)
def addToRegDb(db,desc,runid):
msg("Add %s to regression database\n" % desc)
completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,"-r",str(runid),desc], timeout=3600)
check(completed)
class TestFlowFailure(Exception):
def __init__(self,completed):
self._errorcode = completed.returncode
def errorCode(self):
return(self._errorcode)
class CallFailure(Exception):
pass
def check(n):
#print(n)
if n is not None:
if n.returncode != 0:
raise TestFlowFailure(n)
else:
raise CallFailure()
def msg(t):
print(Fore.CYAN + t + Style.RESET_ALL)
def errorMsg(t):
print(Fore.RED + t + Style.RESET_ALL)
def fullTestFolder(rootFolder):
return(os.path.join(rootFolder,"CMSIS","DSP","Testing","fulltests"))
class BuildConfig:
def __init__(self,toUnset,toSet,rootFolder,buildFolder,compiler,toolchain,core,cmake):
self._toUnset = toUnset
self._toSet = toSet
self._buildFolder = buildFolder
self._rootFolder = os.path.abspath(rootFolder)
self._dspFolder = os.path.join(self._rootFolder,"CMSIS","DSP")
self._testingFolder = os.path.join(self._dspFolder,"Testing")
self._fullTests = os.path.join(self._testingFolder,"fulltests")
self._compiler = compiler
self._toolchain = toolchain
self._core = core
self._cmake = cmake
self._savedEnv = {}
def compiler(self):
return(self._compiler)
def toolChainFile(self):
return(self._toolchain)
def core(self):
return(self._core)
def path(self):
return(os.path.join(self._fullTests,self._buildFolder))
def archivePath(self):
return(os.path.join(self._fullTests,"archive",self._buildFolder))
def archiveResultPath(self):
return(os.path.join(self._fullTests,"archive",self._buildFolder,"results"))
def archiveLogPath(self):
return(os.path.join(self._fullTests,"archive",self._buildFolder,"logs"))
def archiveErrorPath(self):
return(os.path.join(self._fullTests,"archive",self._buildFolder,"errors"))
def toolChainPath(self):
return(self._dspFolder)
def cmakeFilePath(self):
return(self._testingFolder)
def buildFolderName(self):
return(self._buildFolder)
def saveEnv(self):
if self._toUnset is not None:
for v in self._toUnset:
if v in os.environ:
self._savedEnv[v] = os.environ[v]
else:
self._savedEnv[v] = None
del os.environ[v]
if self._toSet is not None:
for v in self._toSet:
for varName in v:
if varName in os.environ:
self._savedEnv[varName] = os.environ[varName]
else:
self._savedEnv[varName] = None
os.environ[varName] = v[varName]
def restoreEnv(self):
for v in self._savedEnv:
if self._savedEnv[v] is not None:
os.environ[v] = self._savedEnv[v]
else:
if v in os.environ:
del os.environ[v]
self._savedEnv = {}
# Build for a folder
# We need to be able to detect failed build
def build(self,test):
completed=None
# Save and unset some environment variables
self.saveEnv()
with self.buildFolder() as b:
msg(" Build %s\n" % self.buildFolderName())
with open(os.path.join(self.archiveLogPath(),"makelog_%s.txt" % test),"w") as makelog:
with open(os.path.join(self.archiveErrorPath(),"makeerror_%s.txt" % test),"w") as makeerr:
if DEBUGMODE:
completed=subprocess.run(["make","-j4","VERBOSE=1"],timeout=3600)
else:
completed=subprocess.run(["make","-j4","VERBOSE=1"],stdout=makelog,stderr=makeerr,timeout=3600)
# Restore environment variables
self.restoreEnv()
check(completed)
def getTest(self,test):
return(Test(self,test))
# Launch cmake command.
def createCMake(self,configid,flags,benchMode,platform):
with self.buildFolder() as b:
self.saveEnv()
if benchMode:
msg("Benchmark mode\n")
msg("Create cmake for %s\n" % self.buildFolderName())
toolchainCmake = os.path.join(self.toolChainPath(),self.toolChainFile())
cmd = [self._cmake]
cmd += ["-DCMAKE_PREFIX_PATH=%s" % self.compiler(),
"-DCMAKE_TOOLCHAIN_FILE=%s" % toolchainCmake,
"-DARM_CPU=%s" % self.core(),
"-DPLATFORM=%s" % platform,
"-DCONFIGID=%s" % configid
]
cmd += flags
if DEBUGMODE:
cmd += DEBUGLIST
if benchMode:
cmd += ["-DBENCHMARK=ON"]
cmd += ["-DWRAPPER=ON"]
else:
cmd += ["-DBENCHMARK=OFF"]
cmd += ["-DWRAPPER=OFF"]
cmd += ["-DROOT=%s" % self._rootFolder,
"-DCMAKE_BUILD_TYPE=Release",
"-G", "Unix Makefiles" ,"%s" % self.cmakeFilePath()]
if DEBUGMODE:
print(cmd)
with open(os.path.join(self.archiveLogPath(),"cmakecmd.txt"),"w") as cmakecmd:
cmakecmd.write("".join(joinit(cmd," ")))
with open(os.path.join(self.archiveLogPath(),"cmakelog.txt"),"w") as cmakelog:
with open(os.path.join(self.archiveErrorPath(),"cmakeerror.txt"),"w") as cmakeerr:
completed=subprocess.run(cmd, stdout=cmakelog,stderr=cmakeerr, timeout=3600)
self.restoreEnv()
check(completed)
# Create the build folder if missing
def createFolder(self):
os.makedirs(self.path(),exist_ok=True)
def createArchive(self, flags):
os.makedirs(self.archivePath(),exist_ok=True)
os.makedirs(self.archiveResultPath(),exist_ok=True)
os.makedirs(self.archiveErrorPath(),exist_ok=True)
os.makedirs(self.archiveLogPath(),exist_ok=True)
with open(os.path.join(self.archivePath(),"flags.txt"),"w") as f:
for flag in flags:
f.write(flag)
f.write("\n")
# Delete the build folder
def cleanFolder(self):
print("Delete %s\n" % self.path())
#DEBUG
if not isDebugMode() and not isKeepMode():
shutil.rmtree(self.path())
# Archive results and currentConfig.csv to another folder
def archiveResults(self):
results=glob.glob(os.path.join(self.path(),"results_*"))
for result in results:
dst=os.path.join(self.archiveResultPath(),os.path.basename(result))
shutil.copy(result,dst)
src = os.path.join(self.path(),"currentConfig.csv")
dst = os.path.join(self.archiveResultPath(),os.path.basename(src))
shutil.copy(src,dst)
@contextmanager
def buildFolder(self):
current=os.getcwd()
try:
os.chdir(self.path() )
yield self.path()
finally:
os.chdir(current)
@contextmanager
def archiveFolder(self):
current=os.getcwd()
try:
os.chdir(self.archivePath() )
yield self.archivePath()
finally:
os.chdir(current)
@contextmanager
def resultFolder(self):
current=os.getcwd()
try:
os.chdir(self.archiveResultPath())
yield self.archiveResultPath()
finally:
os.chdir(current)
@contextmanager
def logFolder(self):
current=os.getcwd()
try:
os.chdir(self.archiveLogPath())
yield self.archiveLogPath()
finally:
os.chdir(current)
@contextmanager
def errorFolder(self):
current=os.getcwd()
try:
os.chdir(self.archiveErrorPath())
yield self.archiveErrorPath()
finally:
os.chdir(current)
class Test:
def __init__(self,build,test):
self._test = test
self._buildConfig = build
def buildConfig(self):
return(self._buildConfig)
def testName(self):
return(self._test)
# Process a test from the test description file
def processTest(self,patternConfig):
if isDebugMode():
if patternConfig:
completed=subprocess.run([sys.executable,"processTests.py","-p",patternConfig["patterns"],"-d",patternConfig["parameters"],"-e",self.testName(),"1"],timeout=3600)
else:
completed=subprocess.run([sys.executable,"processTests.py","-e",self.testName(),"1"],timeout=3600)
check(completed)
else:
if patternConfig:
completed=subprocess.run([sys.executable,"processTests.py","-p",patternConfig["patterns"],"-d",patternConfig["parameters"],"-e",self.testName()],timeout=3600)
else:
completed=subprocess.run([sys.executable,"processTests.py","-e",self.testName()],timeout=3600)
check(completed)
def getResultPath(self):
return(os.path.join(self.buildConfig().path() ,self.resultName()))
def resultName(self):
return("results_%s.txt" % self.testName())
# Run a specific test in the current folder
# A specific results.txt file is created in
# the build folder for this test
#
# We need a timeout and detect failed run
def run(self,fvp,benchmode):
timeoutVal=3600
if benchmode:
timeoutVal = 3600 * 4
completed = None
with self.buildConfig().buildFolder() as b:
msg(" Run %s\n" % self.testName() )
with open(self.resultName(),"w") as results:
if isDebugMode():
print(os.getcwd())
print(fvp.split())
completed=subprocess.run(fvp.split(),timeout=timeoutVal)
else:
completed=subprocess.run(fvp.split(),stdout=results,timeout=timeoutVal)
check(completed)
# Process results of the given tests
# in given build folder
# We need to detect failed tests
def processResult(self):
msg(" Parse result for %s\n" % self.testName())
with open(os.path.join(self.buildConfig().archiveResultPath(),"processedResult_%s.txt" % self.testName()),"w") as presult:
completed=subprocess.run([sys.executable,"processResult.py","-e","-r",self.getResultPath()],stdout=presult,timeout=3600)
# When a test fail, the regression is continuing but we
# track that a test has failed
if completed.returncode==0:
return(NOTESTFAILED)
else:
return(TESTFAILED)
# Compute the regression data
def computeSummaryStat(self):
msg(" Compute regressions for %s\n" % self.testName())
completed=subprocess.run([sys.executable,"summaryBench.py","-r",self.getResultPath(),self.testName()],timeout=3600)
# When a test fail, the regression is continuing but we
# track that a test has failed
if completed.returncode==0:
return(NOTESTFAILED)
else:
return(TESTFAILED)
def runAndProcess(self,patternConfig,compiler,fvp,sim,benchmode,db,regdb,benchid,regid):
# If we can't parse test description we fail all tests
self.processTest(patternConfig)
# Otherwise if only building or those tests are failing, we continue
# with other tests
try:
self.buildConfig().build(self.testName())
except:
return(MAKEFAILED)
# We run tests only for AC6
# For other compilers only build is tests
# Since full build is no more possible because of huge pattersn,
# build is done per test suite.
if sim:
if fvp is not None:
if isDebugMode():
print(fvp)
self.run(fvp,benchmode)
error=self.processResult()
if benchmode and (error == NOTESTFAILED):
error = self.computeSummaryStat()
if db is not None:
addToDb(db,self.testName(),benchid)
if regdb is not None:
addToRegDb(regdb,self.testName(),regid)
return(error)
else:
msg("No FVP available")
return(NOTESTFAILED)
else:
return(NOTESTFAILED)
# Preprocess the test description
def preprocess(desc):
msg("Process test description file %s\n" % desc)
completed = subprocess.run([sys.executable, "preprocess.py","-f",desc],timeout=3600)
check(completed)
# Generate all missing C code by using all classes in the
# test description file
def generateAllCCode(patternConfig):
msg("Generate all missing C files\n")
if patternConfig:
completed = subprocess.run([sys.executable,"processTests.py",
"-p",patternConfig["patterns"],"-d",patternConfig["parameters"],"-e"],timeout=3600)
else:
completed = subprocess.run([sys.executable,"processTests.py", "-e"],timeout=3600)
check(completed)
# Create db
def createDb(sqlite,desc):
msg("Create database %s\n" % desc)
with open("createDb.sql") as db:
completed = subprocess.run([sqlite, desc],stdin=db, timeout=3600)
check(completed)

@ -9,6 +9,7 @@ parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="P
parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path") parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path")
parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path") parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path")
parser.add_argument('-gen', nargs='?',type = str, default=".", help="Folder for generated C sources")
# -e true when no semihosting # -e true when no semihosting
# Input is include files # Input is include files
@ -34,6 +35,6 @@ if args.f is not None:
d.deprecate(root,args.others) d.deprecate(root,args.others)
#print(root) #print(root)
# Generate code with the tree of tests # Generate code with the tree of tests
c.genCodeForTree(root,args.b) c.genCodeForTree(args.gen,root,args.b)
else: else:
parser.print_help() parser.print_help()
Loading…
Cancel
Save