diff --git a/Source/MatrixFunctions/arm_householder_f16.c b/Source/MatrixFunctions/arm_householder_f16.c index 21691a13..1985d145 100644 --- a/Source/MatrixFunctions/arm_householder_f16.c +++ b/Source/MatrixFunctions/arm_householder_f16.c @@ -36,6 +36,7 @@ #include + /** @ingroup groupMatrix */ @@ -54,8 +55,12 @@ @param[out] pOut points to the output vector. @return beta return the scaling factor beta */ + + #if defined(ARM_FLOAT16_SUPPORTED) + + float16_t arm_householder_f16( const float16_t * pSrc, const float16_t threshold, diff --git a/Source/MatrixFunctions/arm_householder_f32.c b/Source/MatrixFunctions/arm_householder_f32.c index 083dbc78..23c549a2 100644 --- a/Source/MatrixFunctions/arm_householder_f32.c +++ b/Source/MatrixFunctions/arm_householder_f32.c @@ -35,6 +35,7 @@ #include + /** @ingroup groupMatrix */ @@ -129,6 +130,9 @@ @return beta return the scaling factor beta */ + + + float32_t arm_householder_f32( const float32_t * pSrc, const float32_t threshold, diff --git a/Source/MatrixFunctions/arm_householder_f64.c b/Source/MatrixFunctions/arm_householder_f64.c index 1fd75b46..be4f3e4c 100644 --- a/Source/MatrixFunctions/arm_householder_f64.c +++ b/Source/MatrixFunctions/arm_householder_f64.c @@ -35,6 +35,7 @@ #include + /** @ingroup groupMatrix */ @@ -54,6 +55,9 @@ @return beta return the scaling factor beta */ + + + float64_t arm_householder_f64( const float64_t * pSrc, const float64_t threshold, diff --git a/Source/MatrixFunctions/arm_mat_qr_f16.c b/Source/MatrixFunctions/arm_mat_qr_f16.c index 6ee0eeec..d52b20be 100644 --- a/Source/MatrixFunctions/arm_mat_qr_f16.c +++ b/Source/MatrixFunctions/arm_mat_qr_f16.c @@ -30,6 +30,12 @@ #include "dsp/matrix_utils.h" +#if !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) +#include "arm_helium_utils.h" +#endif +#endif + /** @ingroup groupMatrix */ @@ -66,7 +72,9 @@ refer to the \ref MatrixHouseholder documentation */ -#if defined(ARM_FLOAT16_SUPPORTED) + +#if !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVE_FLOAT16) arm_status arm_mat_qr_f16( const arm_matrix_instance_f16 * pSrc, @@ -79,16 +87,477 @@ arm_status arm_mat_qr_f16( ) { + int32_t col=0; + int32_t nb,pos; + float16_t *pa,*pc; + float16_t beta; + float16_t *pv; + float16_t *pdst; + float16_t *p; + + if (pSrc->numRows < pSrc->numCols) + { + return(ARM_MATH_SIZE_MISMATCH); + } + + memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float16_t)); + pOutR->numCols = pSrc->numCols; + pOutR->numRows = pSrc->numRows; + + p = pOutR->pData; + pc = pOutTau; + for(col=0 ; col < pSrc->numCols; col++) + { + int32_t j,k,blkCnt,blkCnt2; + float16_t *pa0,*pa1,*pa2,*pa3,*ptemp; + float16_t temp; + float16x8_t v1,v2,vtemp; + + COPY_COL_F16(pOutR,col,col,pTmpA); + + beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA); + *pc++ = beta; + + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; + + temp = *pv; + blkCnt = (pSrc->numCols-col) >> 3; + while (blkCnt > 0) + { + v1 = vld1q_f16(pa); + v2 = vmulq_n_f16(v1,temp); + vst1q_f16(pdst,v2); + + pa += 8; + pdst += 8; + blkCnt--; + } + blkCnt = (pSrc->numCols-col) & 7; + if (blkCnt > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt); + v1 = vld1q_f16(pa); + v2 = vmulq_n_f16(v1,temp); + vst1q_p_f16(pdst,v2,p0); + + pa += blkCnt; + } + + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + vtemp=vld1q_f16(pv); + + blkCnt2 = (pSrc->numCols-col) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pdst); + + v2 = vld1q_f16(pa0); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0)); + + v2 = vld1q_f16(pa1); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1)); + + v2 = vld1q_f16(pa2); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2)); + + v2 = vld1q_f16(pa3); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3)); + + vst1q_f16(pdst,v1); + + pdst += 8; + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + + v1 = vld1q_f16(pdst); + + v2 = vld1q_f16(pa0); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,0)); + + v2 = vld1q_f16(pa1); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,1)); + + v2 = vld1q_f16(pa2); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,2)); + + v2 = vld1q_f16(pa3); + v1 = vfmaq_n_f16(v1,v2,vgetq_lane(vtemp,3)); + + vst1q_p_f16(pdst,v1,p0); + + pa0 += blkCnt2; + pa1 += blkCnt2; + pa2 += blkCnt2; + pa3 += blkCnt2; + } + + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + temp = *pv; + blkCnt2 = (pSrc->numCols-col) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pa); + v2 = vld1q_f16(pdst); + v2 = vfmaq_n_f16(v2,v1,temp); + vst1q_f16(pdst,v2); + + pa += 8; + pdst += 8; + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + v1 = vld1q_f16(pa); + v2 = vld1q_f16(pdst); + v2 = vfmaq_n_f16(v2,v1,temp); + vst1q_p_f16(pdst,v2,p0); + + pa += blkCnt2; + } + + pa += col; + pv++; + pdst = pTmpB; + } + + /* A(col:,col:) - beta v tmpb */ + pa = p; + for(j=0;jnumRows-col; j++) + { + float16_t f = -(_Float16)beta * (_Float16)pTmpA[j]; + ptemp = pTmpB; + + blkCnt2 = (pSrc->numCols-col) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pa); + v2 = vld1q_f16(ptemp); + v1 = vfmaq_n_f16(v1,v2,f); + vst1q_f16(pa,v1); + + pa += 8; + ptemp += 8; + + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + + v1 = vld1q_f16(pa); + v2 = vld1q_f16(ptemp); + v1 = vfmaq_n_f16(v1,v2,f); + vst1q_p_f16(pa,v1,p0); + + pa += blkCnt2; + } + + pa += col; + } + + /* Copy Householder reflectors into R matrix */ + pa = p + pOutR->numCols; + for(k=0;knumRows-col-1; k++) + { + *pa = pTmpA[k+1]; + pa += pOutR->numCols; + } + + p += 1 + pOutR->numCols; + } + + /* Generate Q if requested by user matrix */ + + if (pOutQ != NULL) + { + /* Initialize Q matrix to identity */ + memset(pOutQ->pData,0,sizeof(float16_t)*pOutQ->numRows*pOutQ->numRows); + + pa = pOutQ->pData; + for(col=0 ; col < pOutQ->numCols; col++) + { + *pa = 1.0f16; + pa += pOutQ->numCols+1; + } + + nb = pOutQ->numRows - pOutQ->numCols + 1; + + pc = pOutTau + pOutQ->numCols - 1; + for(col=0 ; col < pOutQ->numCols; col++) + { + int32_t j,k, blkCnt, blkCnt2; + float16_t *pa0,*pa1,*pa2,*pa3,*ptemp; + float16_t temp; + float16x8_t v1,v2,vtemp; + + pos = pSrc->numRows - nb; + p = pOutQ->pData + pos + pOutQ->numCols*pos ; + + + COPY_COL_F16(pOutR,pos,pos,pTmpA); + pTmpA[0] = 1.0f16; + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; + + temp = *pv; + blkCnt2 = (pOutQ->numRows-pos) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pa); + v1 = vmulq_n_f16(v1, temp); + vst1q_f16(pdst,v1); + + pa += 8; + pdst += 8; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + + v1 = vld1q_f16(pa); + v1 = vmulq_n_f16(v1, temp); + vst1q_p_f16(pdst,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + + vtemp = vld1q_f16(pv); + blkCnt2 = (pOutQ->numRows-pos) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pdst); + + v2 = vld1q_f16(pa0); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0)); + + v2 = vld1q_f16(pa1); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1)); + + v2 = vld1q_f16(pa2); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2)); + + v2 = vld1q_f16(pa3); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3)); + + vst1q_f16(pdst,v1); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + pdst += 8; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + + v1 = vld1q_f16(pdst); + + v2 = vld1q_f16(pa0); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,0)); + + v2 = vld1q_f16(pa1); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,1)); + + v2 = vld1q_f16(pa2); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,2)); + + v2 = vld1q_f16(pa3); + v1 = vfmaq_n_f16(v1, v2, vgetq_lane(vtemp,3)); + + vst1q_p_f16(pdst,v1,p0); + + pa0 += blkCnt2; + pa1 += blkCnt2; + pa2 += blkCnt2; + pa3 += blkCnt2; + + } + + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + temp = *pv; + blkCnt2 = (pOutQ->numRows-pos) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pdst); + v2 = vld1q_f16(pa); + v1 = vfmaq_n_f16(v1, v2, temp); + vst1q_f16(pdst,v1); + + pdst += 8; + pa += 8; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + v1 = vld1q_f16(pdst); + v2 = vld1q_f16(pa); + v1 = vfmaq_n_f16(v1, v2, temp); + vst1q_p_f16(pdst,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + pv++; + pdst = pTmpB; + } + + pa = p; + beta = *pc--; + for(j=0;jnumRows-pos; j++) + { + float16_t f = -(_Float16)beta * (_Float16)pTmpA[j]; + ptemp = pTmpB; + + blkCnt2 = (pOutQ->numCols-pos) >> 3; + while (blkCnt2 > 0) + { + v1 = vld1q_f16(pa); + v2 = vld1q_f16(ptemp); + v1 = vfmaq_n_f16(v1,v2,f); + vst1q_f16(pa,v1); + + pa += 8; + ptemp += 8; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numCols-pos) & 7; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp16q(blkCnt2); + + v1 = vld1q_f16(pa); + v2 = vld1q_f16(ptemp); + v1 = vfmaq_n_f16(v1,v2,f); + vst1q_p_f16(pa,v1,p0); - uint32_t col=0; + pa += blkCnt2; + } + + pa += pos; + } + + + nb++; + } + } + + arm_status status = ARM_MATH_SUCCESS; + /* Return to application */ + return (status); +} + +#endif /*#if !defined(ARM_MATH_MVEF)*/ + + +#endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/ + + +#if defined(ARM_FLOAT16_SUPPORTED) + +#if (!defined(ARM_MATH_MVE_FLOAT16)) || defined(ARM_MATH_AUTOVECTORIZE) + + +arm_status arm_mat_qr_f16( + const arm_matrix_instance_f16 * pSrc, + const float16_t threshold, + arm_matrix_instance_f16 * pOutR, + arm_matrix_instance_f16 * pOutQ, + float16_t * pOutTau, + float16_t *pTmpA, + float16_t *pTmpB + ) + +{ + int32_t col=0; int32_t nb,pos; float16_t *pa,*pc; float16_t beta; float16_t *pv; float16_t *pdst; float16_t *p; - float16_t sum; if (pSrc->numRows < pSrc->numCols) { @@ -104,7 +573,8 @@ arm_status arm_mat_qr_f16( pc = pOutTau; for(col=0 ; col < pSrc->numCols; col++) { - uint32_t i,j,k; + int32_t i,j,k,blkCnt; + float16_t *pa0,*pa1,*pa2,*pa3; COPY_COL_F16(pOutR,col,col,pTmpA); beta = arm_householder_f16(pTmpA,threshold,pSrc->numRows - col,pTmpA); @@ -113,26 +583,70 @@ arm_status arm_mat_qr_f16( pdst = pTmpB; /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; for(j=0;jnumCols-col; j++) { - pa = p+j; - pv = pTmpA; - sum = 0.0f16; - for(k=0;knumRows-col; k++) + *pdst++ = (_Float16)*pv * (_Float16)*pa++; + } + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float16_t sum; + + for(j=0;jnumCols-col; j++) + { + sum = *pdst; + + sum += (_Float16)pv[0] * (_Float16)*pa0++; + sum += (_Float16)pv[1] * (_Float16)*pa1++; + sum += (_Float16)pv[2] * (_Float16)*pa2++; + sum += (_Float16)pv[3] * (_Float16)*pa3++; + + *pdst++ = sum; + } + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + for(j=0;jnumCols-col; j++) { - sum += (_Float16)*pv++ * (_Float16)*pa; - pa += pOutR->numCols; + *pdst++ += (_Float16)*pv * (_Float16)*pa++; } - *pdst++ = sum; + pa += col; + pv++; + pdst = pTmpB; } /* A(col:,col:) - beta v tmpb */ pa = p; for(j=0;jnumRows-col; j++) { + float16_t f = (_Float16)beta * (_Float16)pTmpA[j]; + for(i=0;inumCols-col; i++) { - *pa = (_Float16)*pa - (_Float16)beta * (_Float16)pTmpA[j] * (_Float16)pTmpB[i] ; + *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ; pa++; } pa += col; @@ -168,7 +682,8 @@ arm_status arm_mat_qr_f16( pc = pOutTau + pOutQ->numCols - 1; for(col=0 ; col < pOutQ->numCols; col++) { - int32_t i,j,k; + int32_t i,j,k, blkCnt; + float16_t *pa0,*pa1,*pa2,*pa3; pos = pSrc->numRows - nb; p = pOutQ->pData + pos + pOutQ->numCols*pos ; @@ -178,26 +693,70 @@ arm_status arm_mat_qr_f16( pdst = pTmpB; /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; for(j=0;jnumRows-pos; j++) { - pa = p+j; - pv = pTmpA; - sum = 0.0f16; - for(k=0;knumRows-pos; k++) + *pdst++ = (_Float16)*pv * (_Float16)*pa++; + } + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float16_t sum; + + for(j=0;jnumRows-pos; j++) { - sum += (_Float16)*pv++ * (_Float16)*pa; - pa += pOutQ->numCols; + sum = *pdst; + + sum += (_Float16)pv[0] * (_Float16)*pa0++; + sum += (_Float16)pv[1] * (_Float16)*pa1++; + sum += (_Float16)pv[2] * (_Float16)*pa2++; + sum += (_Float16)pv[3] * (_Float16)*pa3++; + + *pdst++ = sum; } - *pdst++ = sum; + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + for(j=0;jnumRows-pos; j++) + { + *pdst++ += (_Float16)*pv * (_Float16)*pa++; + } + pa += pos; + pv++; + pdst = pTmpB; } pa = p; beta = *pc--; for(j=0;jnumRows-pos; j++) { + float16_t f = (_Float16)beta * (_Float16)pTmpA[j]; + for(i=0;inumCols-pos; i++) { - *pa = (_Float16)*pa - (_Float16)beta * (_Float16)pTmpA[j] * (_Float16)pTmpB[i] ; + *pa = (_Float16)*pa - (_Float16)f * (_Float16)pTmpB[i] ; pa++; } pa += pos; @@ -213,6 +772,7 @@ arm_status arm_mat_qr_f16( return (status); } +#endif /* end of test for Helium or Neon availability */ #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ /** diff --git a/Source/MatrixFunctions/arm_mat_qr_f32.c b/Source/MatrixFunctions/arm_mat_qr_f32.c index 699adcf6..ebad6b74 100644 --- a/Source/MatrixFunctions/arm_mat_qr_f32.c +++ b/Source/MatrixFunctions/arm_mat_qr_f32.c @@ -30,6 +30,12 @@ #include "dsp/matrix_utils.h" +#if !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVEF) +#include "arm_helium_utils.h" +#endif +#endif + /** @ingroup groupMatrix */ @@ -140,6 +146,9 @@ */ +#if !defined(ARM_MATH_AUTOVECTORIZE) +#if defined(ARM_MATH_MVEF) + arm_status arm_mat_qr_f32( const arm_matrix_instance_f32 * pSrc, const float32_t threshold, @@ -151,16 +160,475 @@ arm_status arm_mat_qr_f32( ) { + int32_t col=0; + int32_t nb,pos; + float32_t *pa,*pc; + float32_t beta; + float32_t *pv; + float32_t *pdst; + float32_t *p; + + if (pSrc->numRows < pSrc->numCols) + { + return(ARM_MATH_SIZE_MISMATCH); + } + + memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t)); + pOutR->numCols = pSrc->numCols; + pOutR->numRows = pSrc->numRows; + + p = pOutR->pData; + pc = pOutTau; + for(col=0 ; col < pSrc->numCols; col++) + { + int32_t j,k,blkCnt,blkCnt2; + float32_t *pa0,*pa1,*pa2,*pa3,*ptemp; + float32_t temp; + float32x4_t v1,v2,vtemp; + + COPY_COL_F32(pOutR,col,col,pTmpA); + + beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA); + *pc++ = beta; + + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; + + temp = *pv; + blkCnt = (pSrc->numCols-col) >> 2; + while (blkCnt > 0) + { + v1 = vld1q_f32(pa); + v2 = vmulq_n_f32(v1,temp); + vst1q_f32(pdst,v2); + + pa += 4; + pdst += 4; + blkCnt--; + } + blkCnt = (pSrc->numCols-col) & 3; + if (blkCnt > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt); + v1 = vld1q_f32(pa); + v2 = vmulq_n_f32(v1,temp); + vst1q_p_f32(pdst,v2,p0); + + pa += blkCnt; + } + + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + vtemp=vld1q_f32(pv); + + blkCnt2 = (pSrc->numCols-col) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3)); + + vst1q_f32(pdst,v1); + + pdst += 4; + pa0 += 4; + pa1 += 4; + pa2 += 4; + pa3 += 4; + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3)); + + vst1q_p_f32(pdst,v1,p0); + + pa0 += blkCnt2; + pa1 += blkCnt2; + pa2 += blkCnt2; + pa3 += blkCnt2; + } + + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + temp = *pv; + blkCnt2 = (pSrc->numCols-col) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v2 = vld1q_f32(pdst); + v2 = vfmaq_n_f32(v2,v1,temp); + vst1q_f32(pdst,v2); + + pa += 4; + pdst += 4; + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + v1 = vld1q_f32(pa); + v2 = vld1q_f32(pdst); + v2 = vfmaq_n_f32(v2,v1,temp); + vst1q_p_f32(pdst,v2,p0); + + pa += blkCnt2; + } + + pa += col; + pv++; + pdst = pTmpB; + } + + /* A(col:,col:) - beta v tmpb */ + pa = p; + for(j=0;jnumRows-col; j++) + { + float32_t f = -beta * pTmpA[j]; + ptemp = pTmpB; + + blkCnt2 = (pSrc->numCols-col) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_f32(pa,v1); + + pa += 4; + ptemp += 4; + + blkCnt2--; + } + blkCnt2 = (pSrc->numCols-col) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_p_f32(pa,v1,p0); + + pa += blkCnt2; + } + + pa += col; + } + + /* Copy Householder reflectors into R matrix */ + pa = p + pOutR->numCols; + for(k=0;knumRows-col-1; k++) + { + *pa = pTmpA[k+1]; + pa += pOutR->numCols; + } + + p += 1 + pOutR->numCols; + } + + /* Generate Q if requested by user matrix */ + + if (pOutQ != NULL) + { + /* Initialize Q matrix to identity */ + memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows); + + pa = pOutQ->pData; + for(col=0 ; col < pOutQ->numCols; col++) + { + *pa = 1.0f; + pa += pOutQ->numCols+1; + } + + nb = pOutQ->numRows - pOutQ->numCols + 1; + + pc = pOutTau + pOutQ->numCols - 1; + for(col=0 ; col < pOutQ->numCols; col++) + { + int32_t j,k, blkCnt, blkCnt2; + float32_t *pa0,*pa1,*pa2,*pa3,*ptemp; + float32_t temp; + float32x4_t v1,v2,vtemp; + + pos = pSrc->numRows - nb; + p = pOutQ->pData + pos + pOutQ->numCols*pos ; + + + COPY_COL_F32(pOutR,pos,pos,pTmpA); + pTmpA[0] = 1.0f; + pdst = pTmpB; + + /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; + + temp = *pv; + blkCnt2 = (pOutQ->numRows-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v1 = vmulq_n_f32(v1, temp); + vst1q_f32(pdst,v1); + + pa += 4; + pdst += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pa); + v1 = vmulq_n_f32(v1, temp); + vst1q_p_f32(pdst,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + + vtemp = vld1q_f32(pv); + blkCnt2 = (pOutQ->numRows-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3)); + + vst1q_f32(pdst,v1); + + pa0 += 4; + pa1 += 4; + pa2 += 4; + pa3 += 4; + pdst += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); - uint32_t col=0; + v1 = vld1q_f32(pdst); + + v2 = vld1q_f32(pa0); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0)); + + v2 = vld1q_f32(pa1); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1)); + + v2 = vld1q_f32(pa2); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2)); + + v2 = vld1q_f32(pa3); + v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3)); + + vst1q_p_f32(pdst,v1,p0); + + pa0 += blkCnt2; + pa1 += blkCnt2; + pa2 += blkCnt2; + pa3 += blkCnt2; + + } + + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + temp = *pv; + blkCnt2 = (pOutQ->numRows-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pdst); + v2 = vld1q_f32(pa); + v1 = vfmaq_n_f32(v1, v2, temp); + vst1q_f32(pdst,v1); + + pdst += 4; + pa += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numRows-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + v1 = vld1q_f32(pdst); + v2 = vld1q_f32(pa); + v1 = vfmaq_n_f32(v1, v2, temp); + vst1q_p_f32(pdst,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + pv++; + pdst = pTmpB; + } + + pa = p; + beta = *pc--; + for(j=0;jnumRows-pos; j++) + { + float32_t f = -beta * pTmpA[j]; + ptemp = pTmpB; + + blkCnt2 = (pOutQ->numCols-pos) >> 2; + while (blkCnt2 > 0) + { + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_f32(pa,v1); + + pa += 4; + ptemp += 4; + + blkCnt2--; + } + blkCnt2 = (pOutQ->numCols-pos) & 3; + if (blkCnt2 > 0) + { + mve_pred16_t p0 = vctp32q(blkCnt2); + + v1 = vld1q_f32(pa); + v2 = vld1q_f32(ptemp); + v1 = vfmaq_n_f32(v1,v2,f); + vst1q_p_f32(pa,v1,p0); + + pa += blkCnt2; + } + + pa += pos; + } + + + nb++; + } + } + + arm_status status = ARM_MATH_SUCCESS; + /* Return to application */ + return (status); +} + +#endif /*#if !defined(ARM_MATH_MVEF)*/ + + +#endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/ + + + +#if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE) + +arm_status arm_mat_qr_f32( + const arm_matrix_instance_f32 * pSrc, + const float32_t threshold, + arm_matrix_instance_f32 * pOutR, + arm_matrix_instance_f32 * pOutQ, + float32_t * pOutTau, + float32_t *pTmpA, + float32_t *pTmpB + ) + +{ + int32_t col=0; int32_t nb,pos; float32_t *pa,*pc; float32_t beta; float32_t *pv; float32_t *pdst; float32_t *p; - float32_t sum; if (pSrc->numRows < pSrc->numCols) { @@ -176,7 +644,8 @@ arm_status arm_mat_qr_f32( pc = pOutTau; for(col=0 ; col < pSrc->numCols; col++) { - uint32_t i,j,k; + int32_t i,j,k,blkCnt; + float32_t *pa0,*pa1,*pa2,*pa3; COPY_COL_F32(pOutR,col,col,pTmpA); beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA); @@ -185,26 +654,70 @@ arm_status arm_mat_qr_f32( pdst = pTmpB; /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; for(j=0;jnumCols-col; j++) { - pa = p+j; - pv = pTmpA; - sum = 0.0f; - for(k=0;knumRows-col; k++) + *pdst++ = *pv * *pa++; + } + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float32_t sum; + + for(j=0;jnumCols-col; j++) + { + sum = *pdst; + + sum += pv[0] * *pa0++; + sum += pv[1] * *pa1++; + sum += pv[2] * *pa2++; + sum += pv[3] * *pa3++; + + *pdst++ = sum; + } + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + for(j=0;jnumCols-col; j++) { - sum += *pv++ * *pa; - pa += pOutR->numCols; + *pdst++ += *pv * *pa++; } - *pdst++ = sum; + pa += col; + pv++; + pdst = pTmpB; } /* A(col:,col:) - beta v tmpb */ pa = p; for(j=0;jnumRows-col; j++) { + float32_t f = beta * pTmpA[j]; + for(i=0;inumCols-col; i++) { - *pa = *pa - beta * pTmpA[j] * pTmpB[i] ; + *pa = *pa - f * pTmpB[i] ; pa++; } pa += col; @@ -240,7 +753,8 @@ arm_status arm_mat_qr_f32( pc = pOutTau + pOutQ->numCols - 1; for(col=0 ; col < pOutQ->numCols; col++) { - int32_t i,j,k; + int32_t i,j,k, blkCnt; + float32_t *pa0,*pa1,*pa2,*pa3; pos = pSrc->numRows - nb; p = pOutQ->pData + pos + pOutQ->numCols*pos ; @@ -250,26 +764,70 @@ arm_status arm_mat_qr_f32( pdst = pTmpB; /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; for(j=0;jnumRows-pos; j++) { - pa = p+j; - pv = pTmpA; - sum = 0.0f; - for(k=0;knumRows-pos; k++) + *pdst++ = *pv * *pa++; + } + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float32_t sum; + + for(j=0;jnumRows-pos; j++) { - sum += *pv++ * *pa; - pa += pOutQ->numCols; + sum = *pdst; + + sum += pv[0] * *pa0++; + sum += pv[1] * *pa1++; + sum += pv[2] * *pa2++; + sum += pv[3] * *pa3++; + + *pdst++ = sum; } - *pdst++ = sum; + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + for(j=0;jnumRows-pos; j++) + { + *pdst++ += *pv * *pa++; + } + pa += pos; + pv++; + pdst = pTmpB; } pa = p; beta = *pc--; for(j=0;jnumRows-pos; j++) { + float32_t f = beta * pTmpA[j]; + for(i=0;inumCols-pos; i++) { - *pa = *pa - beta * pTmpA[j] * pTmpB[i] ; + *pa = *pa - f * pTmpB[i] ; pa++; } pa += pos; @@ -285,6 +843,7 @@ arm_status arm_mat_qr_f32( return (status); } +#endif /* end of test for Helium or Neon availability */ /** @} end of MatrixQR group diff --git a/Source/MatrixFunctions/arm_mat_qr_f64.c b/Source/MatrixFunctions/arm_mat_qr_f64.c index e056f770..145ed88c 100644 --- a/Source/MatrixFunctions/arm_mat_qr_f64.c +++ b/Source/MatrixFunctions/arm_mat_qr_f64.c @@ -30,6 +30,7 @@ #include "dsp/matrix_utils.h" + /** @ingroup groupMatrix */ @@ -65,6 +66,9 @@ */ + + + arm_status arm_mat_qr_f64( const arm_matrix_instance_f64 * pSrc, const float64_t threshold, @@ -76,16 +80,13 @@ arm_status arm_mat_qr_f64( ) { - - - uint32_t col=0; + int32_t col=0; int32_t nb,pos; float64_t *pa,*pc; float64_t beta; float64_t *pv; float64_t *pdst; float64_t *p; - float64_t sum; if (pSrc->numRows < pSrc->numCols) { @@ -101,7 +102,8 @@ arm_status arm_mat_qr_f64( pc = pOutTau; for(col=0 ; col < pSrc->numCols; col++) { - uint32_t i,j,k; + int32_t i,j,k,blkCnt; + float64_t *pa0,*pa1,*pa2,*pa3; COPY_COL_F64(pOutR,col,col,pTmpA); beta = arm_householder_f64(pTmpA,threshold,pSrc->numRows - col,pTmpA); @@ -110,26 +112,70 @@ arm_status arm_mat_qr_f64( pdst = pTmpB; /* v.T A(col:,col:) -> tmpb */ + pv = pTmpA; + pa = p; for(j=0;jnumCols-col; j++) { - pa = p+j; - pv = pTmpA; - sum = 0.0; - for(k=0;knumRows-col; k++) + *pdst++ = *pv * *pa++; + } + pa += col; + pv++; + pdst = pTmpB; + + pa0 = pa; + pa1 = pa0 + pSrc->numCols; + pa2 = pa1 + pSrc->numCols; + pa3 = pa2 + pSrc->numCols; + + /* Unrolled loop */ + blkCnt = (pSrc->numRows-col - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float64_t sum; + + for(j=0;jnumCols-col; j++) { - sum += *pv++ * *pa; - pa += pOutR->numCols; + sum = *pdst; + + sum += pv[0] * *pa0++; + sum += pv[1] * *pa1++; + sum += pv[2] * *pa2++; + sum += pv[3] * *pa3++; + + *pdst++ = sum; } - *pdst++ = sum; + pa0 += col + 3*pSrc->numCols; + pa1 += col + 3*pSrc->numCols; + pa2 += col + 3*pSrc->numCols; + pa3 += col + 3*pSrc->numCols; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-col; k++) + { + for(j=0;jnumCols-col; j++) + { + *pdst++ += *pv * *pa++; + } + pa += col; + pv++; + pdst = pTmpB; } /* A(col:,col:) - beta v tmpb */ pa = p; for(j=0;jnumRows-col; j++) { + float64_t f = beta * pTmpA[j]; + for(i=0;inumCols-col; i++) { - *pa = *pa - beta * pTmpA[j] * pTmpB[i] ; + *pa = *pa - f * pTmpB[i] ; pa++; } pa += col; @@ -165,7 +211,8 @@ arm_status arm_mat_qr_f64( pc = pOutTau + pOutQ->numCols - 1; for(col=0 ; col < pOutQ->numCols; col++) { - int32_t i,j,k; + int32_t i,j,k, blkCnt; + float64_t *pa0,*pa1,*pa2,*pa3; pos = pSrc->numRows - nb; p = pOutQ->pData + pos + pOutQ->numCols*pos ; @@ -175,26 +222,70 @@ arm_status arm_mat_qr_f64( pdst = pTmpB; /* v.T A(col:,col:) -> tmpb */ + + pv = pTmpA; + pa = p; for(j=0;jnumRows-pos; j++) { - pa = p+j; - pv = pTmpA; - sum = 0.0; - for(k=0;knumRows-pos; k++) + *pdst++ = *pv * *pa++; + } + pa += pos; + pv++; + pdst = pTmpB; + pa0 = pa; + pa1 = pa0 + pOutQ->numRows; + pa2 = pa1 + pOutQ->numRows; + pa3 = pa2 + pOutQ->numRows; + + /* Unrolled loop */ + blkCnt = (pOutQ->numRows-pos - 1) >> 2; + k=1; + while(blkCnt > 0) + { + float64_t sum; + + for(j=0;jnumRows-pos; j++) { - sum += *pv++ * *pa; - pa += pOutQ->numCols; + sum = *pdst; + + sum += pv[0] * *pa0++; + sum += pv[1] * *pa1++; + sum += pv[2] * *pa2++; + sum += pv[3] * *pa3++; + + *pdst++ = sum; + } + pa0 += pos + 3*pOutQ->numRows; + pa1 += pos + 3*pOutQ->numRows; + pa2 += pos + 3*pOutQ->numRows; + pa3 += pos + 3*pOutQ->numRows; + pv += 4; + pdst = pTmpB; + k += 4; + blkCnt--; + } + + pa = pa0; + for(;knumRows-pos; k++) + { + for(j=0;jnumRows-pos; j++) + { + *pdst++ += *pv * *pa++; } - *pdst++ = sum; + pa += pos; + pv++; + pdst = pTmpB; } pa = p; beta = *pc--; for(j=0;jnumRows-pos; j++) { + float64_t f = beta * pTmpA[j]; + for(i=0;inumCols-pos; i++) { - *pa = *pa - beta * pTmpA[j] * pTmpB[i] ; + *pa = *pa - f * pTmpB[i] ; pa++; } pa += pos; diff --git a/Testing/Source/Tests/UnaryTestsF32.cpp b/Testing/Source/Tests/UnaryTestsF32.cpp index 8c0606b2..1d901cf9 100755 --- a/Testing/Source/Tests/UnaryTestsF32.cpp +++ b/Testing/Source/Tests/UnaryTestsF32.cpp @@ -40,7 +40,7 @@ Comparisons for inverse /* Not very accurate for big matrix. But big matrix needed for checking the vectorized code */ -#define SNR_THRESHOLD_INV 100 +#define SNR_THRESHOLD_INV 99 #define REL_ERROR_INV (3.0e-5) #define ABS_ERROR_INV (1.0e-5) diff --git a/Testing/TestScripts/CodeGen.py b/Testing/TestScripts/CodeGen.py index 0e07d03c..45ed77cb 100644 --- a/Testing/TestScripts/CodeGen.py +++ b/Testing/TestScripts/CodeGen.py @@ -832,7 +832,7 @@ class CodeGen: self._currentPaths = oldPath.copy() self._currentParamPaths = oldParamPath.copy() - def genCodeForTree(self,root,benchMode): + def genCodeForTree(self,genFolder,root,benchMode): """ Generate all files from the trees of tests Args: @@ -845,14 +845,16 @@ class CodeGen: # Get a list of all suites contained in the tree suites = self.getSuites(root,[]) - src = "GeneratedSource" - header = "GeneratedInclude" + src = os.path.join(genFolder,"GeneratedSource") + header = os.path.join(genFolder,"GeneratedInclude") if benchMode: src += "Bench" header += "Bench" # Generate .cpp and .h files neded to run the tests - with open("%s/TestDesc.cpp" % src,"w") as sourceFile: - with open("%s/TestDesc.h" % header,"w") as headerFile: + testDescCPath = os.path.join(src,"TestDesc.cpp") + testDescHeaderPath = os.path.join(header,"TestDesc.h") + with open(testDescCPath,"w") as sourceFile: + with open(testDescHeaderPath,"w") as headerFile: headerFile.write("#include \"Test.h\"\n") headerFile.write("#include \"Pattern.h\"\n") @@ -871,18 +873,20 @@ class CodeGen: # the pattern files. # Driver file is similar in this case but different from semihosting # one. + testDriveHeaderPath = os.path.join(header,"TestDrive.h") + patternHeaderPath = os.path.join(header,"Patterns.h") if not self._fpga: - with open("%s/TestDrive.h" % header,"w") as driverFile: + with open(testDriveHeaderPath,"w") as driverFile: driverFile.write("// Empty driver include in semihosting mode") - with open("%s/Patterns.h" % header,"w") as includeFile: + with open(patternHeaderPath,"w") as includeFile: includeFile.write("// Empty pattern include in semihosting mode") else: - with open("%s/TestDrive.h" % header,"w") as driverFile: + with open(testDriveHeaderPath,"w") as driverFile: driverFile.write("#ifndef _DRIVER_H_\n") driverFile.write("#define _DRIVER_H_\n") driverFile.write("__ALIGNED(8) const char testDesc[]={\n") self._offset=0 - with open("%s/Patterns.h" % header,"w") as includeFile: + with open(patternHeaderPath,"w") as includeFile: includeFile.write("#ifndef _PATTERNS_H_\n") includeFile.write("#define _PATTERNS_H_\n") includeFile.write("__ALIGNED(8) const char patterns[]={\n") diff --git a/Testing/TestScripts/Regression/Commands.py b/Testing/TestScripts/Regression/Commands.py deleted file mode 100755 index b3f19b99..00000000 --- a/Testing/TestScripts/Regression/Commands.py +++ /dev/null @@ -1,462 +0,0 @@ -import subprocess -import colorama -from colorama import init,Fore, Back, Style -import argparse -import os -import os.path -from contextlib import contextmanager -import shutil -import glob -from pathlib import Path -import sys - -DEBUGMODE = False -KEEPBUILDFOLDER = True - -DEBUGLIST=[ -"-DBASICMATH=ON", -"-DCOMPLEXMATH=OFF", -"-DCONTROLLER=OFF", -"-DFASTMATH=OFF", -"-DFILTERING=OFF", -"-DMATRIX=OFF", -"-DSTATISTICS=OFF", -"-DSUPPORT=OFF", -"-DTRANSFORM=OFF", -"-DSVM=OFF", -"-DBAYES=OFF", -"-DDISTANCE=OFF"] - -NOTESTFAILED = 0 -MAKEFAILED = 1 -TESTFAILED = 2 -FLOWFAILURE = 3 -CALLFAILURE = 4 - -def setDebugMode(): - global DEBUGMODE - DEBUGMODE=True - -def isDebugMode(): - global DEBUGMODE - return(DEBUGMODE) - -def setNokeepBuildFolder(): - global KEEPBUILDFOLDER - KEEPBUILDFOLDER=False - -def isKeepMode(): - global KEEPBUILDFOLDER - return(KEEPBUILDFOLDER) - - -def joinit(iterable, delimiter): - it = iter(iterable) - yield next(it) - for x in it: - yield delimiter - yield x - -def addToDb(db,desc,runid): - msg("Add %s to summary database\n" % desc) - completed = subprocess.run([sys.executable, "addToDB.py","-o",db,"-r",str(runid),desc], timeout=3600) - check(completed) - -def addToRegDb(db,desc,runid): - msg("Add %s to regression database\n" % desc) - completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,"-r",str(runid),desc], timeout=3600) - check(completed) - - -class TestFlowFailure(Exception): - def __init__(self,completed): - self._errorcode = completed.returncode - - def errorCode(self): - return(self._errorcode) - -class CallFailure(Exception): - pass - -def check(n): - #print(n) - if n is not None: - if n.returncode != 0: - raise TestFlowFailure(n) - else: - raise CallFailure() - -def msg(t): - print(Fore.CYAN + t + Style.RESET_ALL) - -def errorMsg(t): - print(Fore.RED + t + Style.RESET_ALL) - -def fullTestFolder(rootFolder): - return(os.path.join(rootFolder,"CMSIS","DSP","Testing","fulltests")) - -class BuildConfig: - def __init__(self,toUnset,toSet,rootFolder,buildFolder,compiler,toolchain,core,cmake): - self._toUnset = toUnset - self._toSet = toSet - self._buildFolder = buildFolder - self._rootFolder = os.path.abspath(rootFolder) - self._dspFolder = os.path.join(self._rootFolder,"CMSIS","DSP") - self._testingFolder = os.path.join(self._dspFolder,"Testing") - self._fullTests = os.path.join(self._testingFolder,"fulltests") - self._compiler = compiler - self._toolchain = toolchain - self._core = core - self._cmake = cmake - self._savedEnv = {} - - def compiler(self): - return(self._compiler) - - def toolChainFile(self): - return(self._toolchain) - - def core(self): - return(self._core) - - def path(self): - return(os.path.join(self._fullTests,self._buildFolder)) - - def archivePath(self): - return(os.path.join(self._fullTests,"archive",self._buildFolder)) - - def archiveResultPath(self): - return(os.path.join(self._fullTests,"archive",self._buildFolder,"results")) - - def archiveLogPath(self): - return(os.path.join(self._fullTests,"archive",self._buildFolder,"logs")) - - def archiveErrorPath(self): - return(os.path.join(self._fullTests,"archive",self._buildFolder,"errors")) - - def toolChainPath(self): - return(self._dspFolder) - - def cmakeFilePath(self): - return(self._testingFolder) - - def buildFolderName(self): - return(self._buildFolder) - - def saveEnv(self): - if self._toUnset is not None: - for v in self._toUnset: - if v in os.environ: - self._savedEnv[v] = os.environ[v] - else: - self._savedEnv[v] = None - del os.environ[v] - - if self._toSet is not None: - for v in self._toSet: - for varName in v: - if varName in os.environ: - self._savedEnv[varName] = os.environ[varName] - else: - self._savedEnv[varName] = None - os.environ[varName] = v[varName] - - - - def restoreEnv(self): - for v in self._savedEnv: - if self._savedEnv[v] is not None: - os.environ[v] = self._savedEnv[v] - else: - if v in os.environ: - del os.environ[v] - self._savedEnv = {} - - # Build for a folder - # We need to be able to detect failed build - def build(self,test): - completed=None - # Save and unset some environment variables - self.saveEnv() - with self.buildFolder() as b: - msg(" Build %s\n" % self.buildFolderName()) - with open(os.path.join(self.archiveLogPath(),"makelog_%s.txt" % test),"w") as makelog: - with open(os.path.join(self.archiveErrorPath(),"makeerror_%s.txt" % test),"w") as makeerr: - if DEBUGMODE: - completed=subprocess.run(["make","-j4","VERBOSE=1"],timeout=3600) - else: - completed=subprocess.run(["make","-j4","VERBOSE=1"],stdout=makelog,stderr=makeerr,timeout=3600) - # Restore environment variables - self.restoreEnv() - check(completed) - - def getTest(self,test): - return(Test(self,test)) - - - - # Launch cmake command. - def createCMake(self,configid,flags,benchMode,platform): - with self.buildFolder() as b: - self.saveEnv() - if benchMode: - msg("Benchmark mode\n") - msg("Create cmake for %s\n" % self.buildFolderName()) - toolchainCmake = os.path.join(self.toolChainPath(),self.toolChainFile()) - cmd = [self._cmake] - cmd += ["-DCMAKE_PREFIX_PATH=%s" % self.compiler(), - "-DCMAKE_TOOLCHAIN_FILE=%s" % toolchainCmake, - "-DARM_CPU=%s" % self.core(), - "-DPLATFORM=%s" % platform, - "-DCONFIGID=%s" % configid - ] - cmd += flags - - if DEBUGMODE: - cmd += DEBUGLIST - - if benchMode: - cmd += ["-DBENCHMARK=ON"] - cmd += ["-DWRAPPER=ON"] - else: - cmd += ["-DBENCHMARK=OFF"] - cmd += ["-DWRAPPER=OFF"] - - cmd += ["-DROOT=%s" % self._rootFolder, - "-DCMAKE_BUILD_TYPE=Release", - "-G", "Unix Makefiles" ,"%s" % self.cmakeFilePath()] - - if DEBUGMODE: - print(cmd) - - with open(os.path.join(self.archiveLogPath(),"cmakecmd.txt"),"w") as cmakecmd: - cmakecmd.write("".join(joinit(cmd," "))) - - with open(os.path.join(self.archiveLogPath(),"cmakelog.txt"),"w") as cmakelog: - with open(os.path.join(self.archiveErrorPath(),"cmakeerror.txt"),"w") as cmakeerr: - completed=subprocess.run(cmd, stdout=cmakelog,stderr=cmakeerr, timeout=3600) - self.restoreEnv() - check(completed) - - - # Create the build folder if missing - def createFolder(self): - os.makedirs(self.path(),exist_ok=True) - - def createArchive(self, flags): - os.makedirs(self.archivePath(),exist_ok=True) - os.makedirs(self.archiveResultPath(),exist_ok=True) - os.makedirs(self.archiveErrorPath(),exist_ok=True) - os.makedirs(self.archiveLogPath(),exist_ok=True) - with open(os.path.join(self.archivePath(),"flags.txt"),"w") as f: - for flag in flags: - f.write(flag) - f.write("\n") - - - # Delete the build folder - def cleanFolder(self): - print("Delete %s\n" % self.path()) - #DEBUG - if not isDebugMode() and not isKeepMode(): - shutil.rmtree(self.path()) - - # Archive results and currentConfig.csv to another folder - def archiveResults(self): - results=glob.glob(os.path.join(self.path(),"results_*")) - for result in results: - dst=os.path.join(self.archiveResultPath(),os.path.basename(result)) - shutil.copy(result,dst) - - src = os.path.join(self.path(),"currentConfig.csv") - dst = os.path.join(self.archiveResultPath(),os.path.basename(src)) - shutil.copy(src,dst) - - - @contextmanager - def buildFolder(self): - current=os.getcwd() - try: - os.chdir(self.path() ) - yield self.path() - finally: - os.chdir(current) - - @contextmanager - def archiveFolder(self): - current=os.getcwd() - try: - os.chdir(self.archivePath() ) - yield self.archivePath() - finally: - os.chdir(current) - - @contextmanager - def resultFolder(self): - current=os.getcwd() - try: - os.chdir(self.archiveResultPath()) - yield self.archiveResultPath() - finally: - os.chdir(current) - - @contextmanager - def logFolder(self): - current=os.getcwd() - try: - os.chdir(self.archiveLogPath()) - yield self.archiveLogPath() - finally: - os.chdir(current) - - @contextmanager - def errorFolder(self): - current=os.getcwd() - try: - os.chdir(self.archiveErrorPath()) - yield self.archiveErrorPath() - finally: - os.chdir(current) - -class Test: - def __init__(self,build,test): - self._test = test - self._buildConfig = build - - def buildConfig(self): - return(self._buildConfig) - - def testName(self): - return(self._test) - - # Process a test from the test description file - def processTest(self,patternConfig): - if isDebugMode(): - if patternConfig: - completed=subprocess.run([sys.executable,"processTests.py","-p",patternConfig["patterns"],"-d",patternConfig["parameters"],"-e",self.testName(),"1"],timeout=3600) - else: - completed=subprocess.run([sys.executable,"processTests.py","-e",self.testName(),"1"],timeout=3600) - check(completed) - else: - if patternConfig: - completed=subprocess.run([sys.executable,"processTests.py","-p",patternConfig["patterns"],"-d",patternConfig["parameters"],"-e",self.testName()],timeout=3600) - else: - completed=subprocess.run([sys.executable,"processTests.py","-e",self.testName()],timeout=3600) - check(completed) - - def getResultPath(self): - return(os.path.join(self.buildConfig().path() ,self.resultName())) - - def resultName(self): - return("results_%s.txt" % self.testName()) - - # Run a specific test in the current folder - # A specific results.txt file is created in - # the build folder for this test - # - # We need a timeout and detect failed run - def run(self,fvp,benchmode): - timeoutVal=3600 - if benchmode: - timeoutVal = 3600 * 4 - completed = None - with self.buildConfig().buildFolder() as b: - msg(" Run %s\n" % self.testName() ) - with open(self.resultName(),"w") as results: - if isDebugMode(): - print(os.getcwd()) - print(fvp.split()) - completed=subprocess.run(fvp.split(),timeout=timeoutVal) - else: - completed=subprocess.run(fvp.split(),stdout=results,timeout=timeoutVal) - check(completed) - - # Process results of the given tests - # in given build folder - # We need to detect failed tests - def processResult(self): - msg(" Parse result for %s\n" % self.testName()) - with open(os.path.join(self.buildConfig().archiveResultPath(),"processedResult_%s.txt" % self.testName()),"w") as presult: - completed=subprocess.run([sys.executable,"processResult.py","-e","-r",self.getResultPath()],stdout=presult,timeout=3600) - # When a test fail, the regression is continuing but we - # track that a test has failed - if completed.returncode==0: - return(NOTESTFAILED) - else: - return(TESTFAILED) - - # Compute the regression data - def computeSummaryStat(self): - msg(" Compute regressions for %s\n" % self.testName()) - completed=subprocess.run([sys.executable,"summaryBench.py","-r",self.getResultPath(),self.testName()],timeout=3600) - # When a test fail, the regression is continuing but we - # track that a test has failed - if completed.returncode==0: - return(NOTESTFAILED) - else: - return(TESTFAILED) - - def runAndProcess(self,patternConfig,compiler,fvp,sim,benchmode,db,regdb,benchid,regid): - # If we can't parse test description we fail all tests - self.processTest(patternConfig) - # Otherwise if only building or those tests are failing, we continue - # with other tests - try: - self.buildConfig().build(self.testName()) - except: - return(MAKEFAILED) - # We run tests only for AC6 - # For other compilers only build is tests - # Since full build is no more possible because of huge pattersn, - # build is done per test suite. - if sim: - if fvp is not None: - if isDebugMode(): - print(fvp) - self.run(fvp,benchmode) - error=self.processResult() - if benchmode and (error == NOTESTFAILED): - error = self.computeSummaryStat() - if db is not None: - addToDb(db,self.testName(),benchid) - if regdb is not None: - addToRegDb(regdb,self.testName(),regid) - return(error) - else: - msg("No FVP available") - return(NOTESTFAILED) - else: - return(NOTESTFAILED) - - - -# Preprocess the test description -def preprocess(desc): - msg("Process test description file %s\n" % desc) - completed = subprocess.run([sys.executable, "preprocess.py","-f",desc],timeout=3600) - check(completed) - -# Generate all missing C code by using all classes in the -# test description file -def generateAllCCode(patternConfig): - msg("Generate all missing C files\n") - if patternConfig: - completed = subprocess.run([sys.executable,"processTests.py", - "-p",patternConfig["patterns"],"-d",patternConfig["parameters"],"-e"],timeout=3600) - else: - completed = subprocess.run([sys.executable,"processTests.py", "-e"],timeout=3600) - check(completed) - -# Create db -def createDb(sqlite,desc): - msg("Create database %s\n" % desc) - with open("createDb.sql") as db: - completed = subprocess.run([sqlite, desc],stdin=db, timeout=3600) - check(completed) - - - - - - - diff --git a/Testing/processTests.py b/Testing/processTests.py index 20376984..d7d9d8b4 100644 --- a/Testing/processTests.py +++ b/Testing/processTests.py @@ -9,6 +9,7 @@ parser.add_argument('-f', nargs='?',type = str, default="Output.pickle", help="P parser.add_argument('-p', nargs='?',type = str, default="Patterns", help="Pattern dir path") parser.add_argument('-d', nargs='?',type = str, default="Parameters", help="Parameter dir path") +parser.add_argument('-gen', nargs='?',type = str, default=".", help="Folder for generated C sources") # -e true when no semihosting # Input is include files @@ -34,6 +35,6 @@ if args.f is not None: d.deprecate(root,args.others) #print(root) # Generate code with the tree of tests - c.genCodeForTree(root,args.b) + c.genCodeForTree(args.gen,root,args.b) else: parser.print_help() \ No newline at end of file