DSP: Cleanup of type mismatches (#1327)

The IAR compiler appears to emit more warnings about type mismatches than the other compilers. This change cleans a lot of them up, mostly by adding explicit casts at the affected intrinsic call sites.
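
For illustration, here is a minimal sketch of the pattern most of these fixes follow. It assumes an MVE-enabled target and <arm_mve.h>; the function and variable names are made up for this example and are not taken from the library.

    #include <arm_mve.h>

    /* Hypothetical helper, for illustration only: load eight 16-bit
       table entries, scale them, and store the result through a
       signed vector pointer. */
    void scale_offsets(const uint16_t *tab, int16x8_t *out)
    {
        uint16x8_t offsets = vldrhq_u16(tab);   /* intrinsic returns uint16x8_t */

        /* Some compilers accept the implicit conversion between the
           unsigned intrinsic result and the signed destination; IAR
           reports it as a type mismatch, so an explicit cast is added,
           as in this patch. */
        *out = (int16x8_t) vshlq_n_u16(offsets, 2);
    }

The diff below applies the same idea at the call sites of the MVE shift, gather/scatter and conversion intrinsics.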

Signed-off-by: TTornblom <thomas.tornblom@iar.com>
commit 2a7714ced7
parent b4822217d0
Author: Thomas Törnblom (committed via GitHub)

@@ -150,20 +150,20 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
     bitRevTabOff = vldrhq_u16(pBitRevTab);
     pBitRevTab += 8;
-    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+    bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+    bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+    bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
     blkCnt = (bitRevLen / 16);
     while (blkCnt > 0) {
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;
-        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
-        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+        bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
+        bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
@@ -175,10 +175,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;
-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
@@ -209,10 +209,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
@@ -251,13 +251,13 @@ __STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32
     while (blkCnt > 0) {
         uint64x2_t vecIn;
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
         idxOffs0 = idxOffs0 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
         idxOffs1 = idxOffs1 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;

@@ -32,9 +32,9 @@
 #if defined(ARM_FLOAT16_SUPPORTED)
 /* Degree of the polynomial approximation */
 #define NB_DEG_LOGF16 3
 /*
    Related to the Log2 of the number of approximations.
    For instance, with 3 there are 1 + 2^3 polynomials
 */
@@ -55,7 +55,7 @@ nb = 3;
 deg = 3;
 lut = Table[
     MiniMaxApproximation[
         Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
         MaxIterations -> 1000][[2, 1]], {i, 1, 2, (1.0/2^nb)}];
 coefs = Chop@Flatten[CoefficientList[lut, x]];
@@ -75,8 +75,8 @@ static float16_t lut_logf16[NB_LUT_LOGF16]={
 float16_t logf16_scalar(float16_t x)
 {
     int16_t i = arm_typecast_s16_f16(x);
     int32_t vecExpUnBiased = (i >> 10) - 15;
     i = i - (vecExpUnBiased << 10);
     float16_t vecTmpFlt1 = arm_typecast_f16_s16(i);
@@ -85,7 +85,7 @@ float16_t logf16_scalar(float16_t x)
     float16_t tmp,v;
     tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16);
-    n = floor((double)tmp);
+    n = (int)floor((double)tmp);
     v = (_Float16)tmp - (_Float16)n;
     lut = lut_logf16 + n * (1+NB_DEG_LOGF16);
@@ -110,9 +110,9 @@ float16_t logf16_scalar(float16_t x)
 float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 {
     int16x8_t i = vreinterpretq_s16_f16(vecIn);
     int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
     i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10));
     float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i);
@@ -132,12 +132,12 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
     offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16));
     offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1);
-    res = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+    res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
     offset = vsubq_n_s16(offset,1);
     for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
     {
-        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
         res = vfmaq_f16(lutV,v,res);
         offset = vsubq_n_s16(offset,1);
@@ -150,7 +150,7 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 }
 #endif
 /**
   @ingroup groupFastMath
@@ -175,9 +175,9 @@ void arm_vlog_f16(
   float16_t * pDst,
   uint32_t blockSize)
 {
     uint32_t blkCnt;
 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
     f16x8_t src;
     f16x8_t dst;
@@ -203,10 +203,10 @@ void arm_vlog_f16(
     while (blkCnt > 0U)
     {
         /* C = log(A) */
         /* Calculate log and store result in destination buffer. */
         *pDst++ = logf16_scalar(*pSrc++);
         /* Decrement loop counter */
         blkCnt--;
     }
@@ -219,4 +219,4 @@ void arm_vlog_f16(
  */
 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@@ -32,9 +32,9 @@
 #define LOG_Q15_ACCURACY 15
 /* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q15_ACCURACY]] of the previous value.
    The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q15_ACCURACY
@@ -90,7 +90,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
 /* Compute the Log2. Result is in q11 instead of q16
    because we know 0 <= y < 1.0 but
    we want a result allowing to do a
    product on int16 rather than having to go
    through int32
@@ -98,7 +98,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     for(i = 0; i < LOG_Q15_ACCURACY ; i++)
     {
         x = (((int32_t)x*x)) >> (LOG_Q15_ACCURACY - 1);
         if (x >= LOQ_Q15_THRESHOLD)
         {
             y += inc ;
@@ -108,10 +108,10 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q11 */
@@ -120,7 +120,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     /* q4.11 */
     y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     return(y);
 }
@@ -146,11 +146,11 @@ q15x8_t vlogq_q15(q15x8_t src)
     /* q11 */
     uint16x8_t y = vdupq_n_u16(0);
     /* q11 */
     int16x8_t vtmp;
     mve_pred16_t p;
@@ -158,11 +158,11 @@ q15x8_t vlogq_q15(q15x8_t src)
     vtmp = vsubq_n_s16(c,1);
-    x = vshlq_u16(src,vtmp);
+    x = vshlq_u16((uint16x8_t)src,vtmp);
     /* Compute the Log2. Result is in q11 instead of q16
        because we know 0 <= y < 1.0 but
        we want a result allowing to do a
        product on int16 rather than having to go
        through int32
@@ -172,7 +172,7 @@ q15x8_t vlogq_q15(q15x8_t src)
         x = vmulhq_u16(x,x);
         x = vshlq_n_u16(x,2);
         p = vcmphiq_u16(x,vdupq_n_u16(LOQ_Q15_THRESHOLD));
         y = vaddq_m_n_u16(y, y,inc,p);
         x = vshrq_m_n_u16(x,x,1,p);
@@ -181,26 +181,26 @@ q15x8_t vlogq_q15(q15x8_t src)
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q11 */
     // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
     vtmp = vshlq_n_s16(normalization,LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART);
-    vtmp = vsubq_s16(y,vtmp);
+    vtmp = vsubq_s16((int16x8_t)y,vtmp);
     /* q4.11 */
     // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     vtmp = vqdmulhq_n_s16(vtmp,LOG_Q15_INVLOG2EXP);
     return(vtmp);
 }
 #endif
 /**
   @ingroup groupFastMath
@@ -248,7 +248,7 @@ void arm_vlog_q15(
     blkCnt = blockSize & 7;
 #else
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {

@@ -30,9 +30,9 @@
 #define LOG_Q31_ACCURACY 31
 /* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q31_ACCURACY]] of the previous value.
    The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q31_ACCURACY
@@ -55,7 +55,7 @@
 static uint32_t arm_scalar_log_q31(uint32_t src)
 {
     int32_t i;
     int32_t c = __CLZ(src);
     int32_t normalization=0;
@@ -84,7 +84,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
     }
     normalization = c;
     /* Compute the Log2. Result is in q26
        because we know 0 <= y < 1.0 but
        do not want to use q32 to allow
        following computation with less instructions.
@@ -101,10 +101,10 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
         inc = inc >> 1;
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q26 */
@@ -114,7 +114,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
     /* q5.26 */
     y = ((int64_t)tmp * LOG_Q31_INVLOG2EXP) >> 31;
     return(y);
@@ -125,7 +125,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
 q31x4_t vlogq_q31(q31x4_t src)
 {
     int32_t i;
     int32x4_t c = vclzq_s32(src);
@@ -141,11 +141,11 @@ q31x4_t vlogq_q31(q31x4_t src)
     /* q11 */
     uint32x4_t y = vdupq_n_u32(0);
     /* q11 */
     int32x4_t vtmp;
     mve_pred16_t p;
@@ -153,10 +153,10 @@ q31x4_t vlogq_q31(q31x4_t src)
     vtmp = vsubq_n_s32(c,1);
-    x = vshlq_u32(src,vtmp);
+    x = vshlq_u32((uint32x4_t)src,vtmp);
     /* Compute the Log2. Result is in Q26
        because we know 0 <= y < 1.0 but
        do not want to use Q32 to allow
        following computation with less instructions.
@@ -166,7 +166,7 @@ q31x4_t vlogq_q31(q31x4_t src)
         x = vmulhq_u32(x,x);
         x = vshlq_n_u32(x,2);
         p = vcmphiq_u32(x,vdupq_n_u32(LOQ_Q31_THRESHOLD));
         y = vaddq_m_n_u32(y, y,inc,p);
         x = vshrq_m_n_u32(x,x,1,p);
@@ -175,19 +175,19 @@ q31x4_t vlogq_q31(q31x4_t src)
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q11 */
     // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
     vtmp = vshlq_n_s32(normalization,LOG_Q31_ACCURACY - LOG_Q31_INTEGER_PART);
-    vtmp = vsubq_s32(y,vtmp);
+    vtmp = vsubq_s32((int32x4_t)y,vtmp);
     /* q4.11 */
     // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     vtmp = vqdmulhq_n_s32(vtmp,LOG_Q31_INVLOG2EXP);
@@ -242,7 +242,7 @@ void arm_vlog_q31(
     blkCnt = blockSize & 3;
 #else
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
@@ -250,7 +250,7 @@ void arm_vlog_q31(
         blkCnt--;
     }
 }
 /**

@@ -96,7 +96,7 @@ arm_status arm_mat_ldlt_f32(
 {
     arm_status status;                             /* status of matrix inverse */
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -142,7 +142,7 @@ arm_status arm_mat_ldlt_f32(
     {
         /* Find pivot */
         float32_t m=F32_MIN,a;
         int j=k;
         for(int r=k;r<n;r++)
@@ -204,25 +204,25 @@ arm_status arm_mat_ldlt_f32(
         //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
-        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
         vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
         vecA = vldrwq_z_f32(&pA[(w + 0)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW0, vecX, p0);
         vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);
         vecA = vldrwq_z_f32(&pA[(w + 1)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW1, vecX, p0);
         vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);
         vecA = vldrwq_z_f32(&pA[(w + 2)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW2, vecX, p0);
         vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);
         vecA = vldrwq_z_f32(&pA[(w + 3)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW3, vecX, p0);
         vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);
         cnt -= 4;
     }
@@ -246,13 +246,13 @@ arm_status arm_mat_ldlt_f32(
         //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
         vecA = vldrwq_z_f32(&pA[w*n+x],p0);
-        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
         vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
         vecA = vfmsq_m(vecA, vecW, vecX, p0);
         vstrwq_p(&pA[w*n+x], vecA, p0);
         cnt -= 4;
     }
@@ -263,7 +263,7 @@ arm_status arm_mat_ldlt_f32(
         pA[w*n+k] = pA[w*n+k] * invA;
     }
 }
@@ -275,15 +275,15 @@ arm_status arm_mat_ldlt_f32(
     diag--;
     for(int row=0; row < n;row++)
     {
         mve_pred16_t p0;
         int cnt= n-k;
         f32x4_t zero=vdupq_n_f32(0.0f);
         for(int col=k; col < n;col += 4)
         {
             p0 = vctp32q(cnt);
             vstrwq_p(&pl->pData[row*n+col], zero, p0);
             cnt -= 4;
         }
@@ -292,15 +292,15 @@ arm_status arm_mat_ldlt_f32(
     for(int row=0; row < n;row++)
     {
         mve_pred16_t p0;
         int cnt= n-row-1;
         f32x4_t zero=vdupq_n_f32(0.0f);
         for(int col=row+1; col < n;col+=4)
         {
             p0 = vctp32q(cnt);
             vstrwq_p(&pl->pData[row*n+col], zero, p0);
             cnt -= 4;
         }
@@ -311,12 +311,12 @@ arm_status arm_mat_ldlt_f32(
         pd->pData[d*n+d] = pl->pData[d*n+d];
         pl->pData[d*n+d] = 1.0;
     }
     status = ARM_MATH_SUCCESS;
   }
   /* Return to application */
   return (status);
 }
@@ -350,7 +350,7 @@ arm_status arm_mat_ldlt_f32(
   @addtogroup MatrixChol
   @{
 */
 /**
  * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
  * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
@@ -373,7 +373,7 @@ arm_status arm_mat_ldlt_f32(
 {
     arm_status status;                             /* status of matrix inverse */
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -410,7 +410,7 @@ arm_status arm_mat_ldlt_f32(
     {
         /* Find pivot */
         float32_t m=F32_MIN,a;
         int j=k;
         int r;
@@ -457,7 +457,7 @@ arm_status arm_mat_ldlt_f32(
         pA[w*n+k] = pA[w*n+k] / a;
     }
 }
@@ -491,12 +491,12 @@ arm_status arm_mat_ldlt_f32(
     pd->pData[d*n+d] = pl->pData[d*n+d];
     pl->pData[d*n+d] = 1.0;
   }
   status = ARM_MATH_SUCCESS;
   }
   /* Return to application */
   return (status);
 }

@@ -65,16 +65,16 @@ void arm_q15_to_float(
     q15x8_t vecDst;
     q15_t const *pSrcVec;
     pSrcVec = (q15_t const *) pSrc;
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /* C = (float32_t) A / 32768 */
         /* convert from q15 to float and then store the results in the destination buffer */
         vecDst = vldrhq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
@@ -129,7 +129,7 @@ void arm_q15_to_float(
     outV = vcvtq_n_f32_s32(inV1,15);
     vst1q_f32(pDst, outV);
     pDst += 4;
     /* Decrement the loop counter */
     blkCnt--;
 }

@@ -72,7 +72,7 @@ void arm_q7_to_float(
         /* convert from q7 to float and then store the results in the destination buffer */
         vecDst = vldrbq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter

@@ -184,16 +184,16 @@ static void _arm_radix4_butterfly_q15_mve(
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
         blkCnt--;
     }
@@ -419,16 +419,16 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
         blkCnt--;
     }
