diff --git a/PrivateInclude/arm_vec_fft.h b/PrivateInclude/arm_vec_fft.h
index 17e62aa8..fdf6498c 100755
--- a/PrivateInclude/arm_vec_fft.h
+++ b/PrivateInclude/arm_vec_fft.h
@@ -150,20 +150,20 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;

-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

         blkCnt = (bitRevLen / 16);
         while (blkCnt > 0) {
             bitRevTabOff = vldrhq_u16(pBitRevTab);
             pBitRevTab += 8;

-            bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
-            bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
-            bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
-            bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+            bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
+            bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);

             inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
             inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);

@@ -175,10 +175,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
             bitRevTabOff = vldrhq_u16(pBitRevTab);
             pBitRevTab += 8;

-            bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-            bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-            bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-            bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+            bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+            bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

             inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
             inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);

@@ -209,10 +209,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);

@@ -251,13 +251,13 @@ __STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32
     while (blkCnt > 0) {
         uint64x2_t vecIn;

-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
         idxOffs0 = idxOffs0 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);

-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
         idxOffs1 = idxOffs1 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
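
Note on the casts in arm_vec_fft.h: all MVE vector types (uint16x8_t, uint32x4_t, uint64x2_t, ...) live in the same 128-bit Q registers, so a cast between them is a free re-typing of the bit pattern. These hunks appear to target compilers that reject the implicit conversion (C++ front ends, or -Wvector-conversion promoted to an error). A minimal sketch of the pattern, assuming an MVE target such as Cortex-M55; the helper name is illustrative, not part of the library:

    #include <arm_mve.h>

    /* Offsets are kept as uint32x4_t because the gathers and scatters
       take 32-bit offsets, but the arithmetic on them runs on 16-bit
       lanes of the same register. */
    static inline uint32x4_t scale_offsets(uint32x4_t off)
    {
        /* return vshrq_n_u16(off, 3);   implicit vector conversion,
           rejected under strict type checking */
        return (uint32x4_t) vshrq_n_u16((uint16x8_t) off, 3);
    }

The named intrinsic vreinterpretq_u32_u16() expresses the same re-typing and also works on compilers that forbid even explicit vector casts.
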
diff --git a/Source/FastMathFunctions/arm_vlog_f16.c b/Source/FastMathFunctions/arm_vlog_f16.c
index 0493c23a..0821d5ab 100755
--- a/Source/FastMathFunctions/arm_vlog_f16.c
+++ b/Source/FastMathFunctions/arm_vlog_f16.c
@@ -32,9 +32,9 @@
 #if defined(ARM_FLOAT16_SUPPORTED)

 /* Degree of the polynomial approximation */
-#define NB_DEG_LOGF16 3
+#define NB_DEG_LOGF16 3

-/*
+/*
 Related to the Log2 of the number of approximations.
 For instance, with 3 there are 1 + 2^3 polynomials
 */
@@ -55,7 +55,7 @@
 nb = 3;
 deg = 3;
 lut = Table[MiniMaxApproximation[
-    Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
+    Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
     MaxIterations -> 1000][[2, 1]],
    {i, 1, 2, (1.0/2^nb)}];
 coefs = Chop@Flatten[CoefficientList[lut, x]];
@@ -75,8 +75,8 @@ static float16_t lut_logf16[NB_LUT_LOGF16]={
 float16_t logf16_scalar(float16_t x)
 {
     int16_t i = arm_typecast_s16_f16(x);
-
-    int32_t vecExpUnBiased = (i >> 10) - 15;
+
+    int32_t vecExpUnBiased = (i >> 10) - 15;
     i = i - (vecExpUnBiased << 10);

     float16_t vecTmpFlt1 = arm_typecast_f16_s16(i);
@@ -85,7 +85,7 @@ float16_t logf16_scalar(float16_t x)
     float16_t tmp,v;

     tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16);
-    n = floor((double)tmp);
+    n = (int)floor((double)tmp);
     v = (_Float16)tmp - (_Float16)n;

     lut = lut_logf16 + n * (1+NB_DEG_LOGF16);
@@ -110,9 +110,9 @@ float16_t logf16_scalar(float16_t x)

 float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 {
-    int16x8_t i = vreinterpretq_s16_f16(vecIn);
-
-    int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
+    int16x8_t i = vreinterpretq_s16_f16(vecIn);
+
+    int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
     i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10));

     float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i);
@@ -132,12 +132,12 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
     offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16));
     offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1);

-    res = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+    res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
     offset = vsubq_n_s16(offset,1);

     for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
     {
-        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
         res = vfmaq_f16(lutV,v,res);

         offset = vsubq_n_s16(offset,1);
@@ -150,7 +150,7 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 }


-#endif
+#endif

 /**
   @ingroup groupFastMath
@@ -175,9 +175,9 @@ void arm_vlog_f16(
   float16_t * pDst,
   uint32_t blockSize)
 {
-   uint32_t blkCnt;
+   uint32_t blkCnt;

-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
   f16x8_t src;
   f16x8_t dst;

@@ -203,10 +203,10 @@ void arm_vlog_f16(
   while (blkCnt > 0U)
   {
      /* C = log(A) */
-
+
      /* Calculate log and store result in destination buffer. */
      *pDst++ = logf16_scalar(*pSrc++);
-
+
      /* Decrement loop counter */
      blkCnt--;
   }
@@ -219,4 +219,4 @@ void arm_vlog_f16(
  */

-#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
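
The arm_vlog_f16.c changes are whitespace cleanups plus two substantive fixes: an explicit (int) conversion of the floor() result and uint16x8_t casts on the gather offsets. The underlying algorithm splits off the float16 exponent and evaluates one of 1 + 2^NB_DIV_LOGF16 minimax polynomials on the mantissa segment. A float32 analogue of the scalar flow, as a sketch only: log_segmented, NB_DIV and the coef layout are assumptions for illustration, not the library's exact table:

    #include <math.h>

    #define NB_DIV 3    /* 2^3 segments, plus one boundary entry */

    /* coef holds (1 + 2^NB_DIV) * (deg+1) entries, one ascending
       coefficient list c0 + c1*v + ... + cdeg*v^deg per segment. */
    static float log_segmented(float x, const float *coef, int deg)
    {
        int e;
        float m = frexpf(x, &e);     /* x = m * 2^e with 0.5 <= m < 1 */
        m *= 2.0f; e -= 1;           /* renormalize so 1 <= m < 2 */

        float t = (m - 1.0f) * (1 << NB_DIV);
        int   n = (int)floorf(t);    /* segment index */
        float v = t - (float)n;      /* position inside the segment */

        const float *lut = coef + n * (deg + 1);
        float r = lut[deg];
        for (int j = deg - 1; j >= 0; j--)   /* Horner evaluation */
            r = r * v + lut[j];

        return r + (float)e * 0.6931472f;    /* add e * ln(2) */
    }

The MVE version performs the same Horner recurrence on eight lanes, fetching each round of coefficients with one gather per step.
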
diff --git a/Source/FastMathFunctions/arm_vlog_q15.c b/Source/FastMathFunctions/arm_vlog_q15.c
index 131133cc..896988dc 100755
--- a/Source/FastMathFunctions/arm_vlog_q15.c
+++ b/Source/FastMathFunctions/arm_vlog_q15.c
@@ -32,9 +32,9 @@

 #define LOG_Q15_ACCURACY 15

-/* Bit to represent the normalization factor
+/* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q15_ACCURACY]] of the
    previous value.
-   The Log2 algorithm is assuming that the value x is
+   The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q15_ACCURACY
@@ -90,7 +90,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)

     /* Compute the Log2. Result is in q11 instead of q16
        because we know 0 <= y < 1.0 but
-       we want a result allowing to do a
+       we want a result allowing to do a
        product on int16 rather than having to go
        through int32
     */
@@ -98,7 +98,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     for(i = 0; i < LOG_Q15_ACCURACY ; i++)
     {
         x = (((int32_t)x*x)) >> (LOG_Q15_ACCURACY - 1);
-
+
         if (x >= LOQ_Q15_THRESHOLD)
         {
             y += inc ;
@@ -108,10 +108,10 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     }

-    /*
+    /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-
+
     */

     /* q11 */
@@ -120,7 +120,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)

     /* q4.11 */
     y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
-
+
     return(y);
 }
@@ -146,11 +146,11 @@ q15x8_t vlogq_q15(q15x8_t src)

     /* q11 */
     uint16x8_t y = vdupq_n_u16(0);
-
+
     /* q11 */
     int16x8_t vtmp;
-
+
     mve_pred16_t p;
@@ -158,11 +158,11 @@ q15x8_t vlogq_q15(q15x8_t src)

     vtmp = vsubq_n_s16(c,1);
-    x = vshlq_u16(src,vtmp);
+    x = vshlq_u16((uint16x8_t)src,vtmp);

     /* Compute the Log2. Result is in q11 instead of q16
        because we know 0 <= y < 1.0 but
-       we want a result allowing to do a
+       we want a result allowing to do a
        product on int16 rather than having to go
        through int32
@@ -172,7 +172,7 @@ q15x8_t vlogq_q15(q15x8_t src)

         x = vmulhq_u16(x,x);
         x = vshlq_n_u16(x,2);
-
+
         p = vcmphiq_u16(x,vdupq_n_u16(LOQ_Q15_THRESHOLD));
         y = vaddq_m_n_u16(y, y,inc,p);
         x = vshrq_m_n_u16(x,x,1,p);
@@ -181,26 +181,26 @@ q15x8_t vlogq_q15(q15x8_t src)
     }

-    /*
+    /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-
+
     */

     /* q11 */
     // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
     vtmp = vshlq_n_s16(normalization,LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART);
-    vtmp = vsubq_s16(y,vtmp);
+    vtmp = vsubq_s16((int16x8_t)y,vtmp);
+
+
-
-
     /* q4.11 */
     // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     vtmp = vqdmulhq_n_s16(vtmp,LOG_Q15_INVLOG2EXP);

     return(vtmp);
 }
-#endif
+#endif

 /**
   @ingroup groupFastMath
@@ -248,7 +248,7 @@ void arm_vlog_q15(
     blkCnt = blockSize & 7;
   #else
     blkCnt = blockSize;
-  #endif
+  #endif

   while (blkCnt > 0U)
   {
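
The patched loop in arm_vlog_q15.c is the classic bit-at-a-time Log2 recurrence: for 1 <= x < 2, squaring x doubles log2(x), so whether each squaring crosses 2 yields the next fraction bit of the result. A plain-C model of the recurrence, as an expository sketch; the Q15 code replaces the compare against 2.0 with LOQ_Q15_THRESHOLD and the halving with a right shift:

    /* Compute log2(x) for 1 <= x < 2 to nbits fraction bits. */
    static double log2_bitwise(double x, int nbits)
    {
        double y = 0.0, inc = 0.5;        /* first fraction bit is worth 1/2 */
        for (int i = 0; i < nbits; i++)
        {
            x = x * x;                    /* log2(x) doubles */
            if (x >= 2.0) { y += inc; x *= 0.5; }   /* emit a 1 bit, renormalize */
            inc *= 0.5;
        }
        return y;
    }

The MVE variant runs the same recurrence on eight lanes at once: vcmphiq_u16 builds a predicate from the threshold compare, and vaddq_m_n_u16 / vshrq_m_n_u16 update only the lanes whose square crossed it.
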
diff --git a/Source/FastMathFunctions/arm_vlog_q31.c b/Source/FastMathFunctions/arm_vlog_q31.c
index ad31ddef..00538594 100755
--- a/Source/FastMathFunctions/arm_vlog_q31.c
+++ b/Source/FastMathFunctions/arm_vlog_q31.c
@@ -30,9 +30,9 @@

 #define LOG_Q31_ACCURACY 31

-/* Bit to represent the normalization factor
+/* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q31_ACCURACY]] of the
    previous value.
-   The Log2 algorithm is assuming that the value x is
+   The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q31_ACCURACY
@@ -55,7 +55,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
 {
     int32_t i;
-
+
     int32_t c = __CLZ(src);
     int32_t normalization=0;
@@ -84,7 +84,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
     }
     normalization = c;

-    /* Compute the Log2. Result is in q26
+    /* Compute the Log2. Result is in q26
        because we know 0 <= y < 1.0 but
        do not want to use q32 to allow
        following computation with less instructions.
@@ -101,10 +101,10 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
         inc = inc >> 1;
     }

-    /*
+    /*
      Convert the Log2 to Log and apply normalization.
      We compute (y - normalisation) * (1 / Log2[e]).
-
+
    */

    /* q26 */
@@ -114,7 +114,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)

    /* q5.26 */
    y = ((int64_t)tmp * LOG_Q31_INVLOG2EXP) >> 31;
-
+
    return(y);

@@ -125,7 +125,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)

 q31x4_t vlogq_q31(q31x4_t src)
 {
-
+
     int32_t i;

     int32x4_t c = vclzq_s32(src);
@@ -141,11 +141,11 @@ q31x4_t vlogq_q31(q31x4_t src)

     /* q11 */
     uint32x4_t y = vdupq_n_u32(0);
-
+
     /* q11 */
     int32x4_t vtmp;
-
+
     mve_pred16_t p;
@@ -153,10 +153,10 @@ q31x4_t vlogq_q31(q31x4_t src)

     vtmp = vsubq_n_s32(c,1);
-    x = vshlq_u32(src,vtmp);
+    x = vshlq_u32((uint32x4_t)src,vtmp);

-    /* Compute the Log2. Result is in Q26
+    /* Compute the Log2. Result is in Q26
        because we know 0 <= y < 1.0 but
        do not want to use Q32 to allow
        following computation with less instructions.
@@ -166,7 +166,7 @@ q31x4_t vlogq_q31(q31x4_t src)

         x = vmulhq_u32(x,x);
         x = vshlq_n_u32(x,2);
-
+
         p = vcmphiq_u32(x,vdupq_n_u32(LOQ_Q31_THRESHOLD));
         y = vaddq_m_n_u32(y, y,inc,p);
         x = vshrq_m_n_u32(x,x,1,p);
@@ -175,19 +175,19 @@ q31x4_t vlogq_q31(q31x4_t src)
     }

-    /*
+    /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-
+
     */

    /* q11 */
    // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
    vtmp = vshlq_n_s32(normalization,LOG_Q31_ACCURACY - LOG_Q31_INTEGER_PART);
-   vtmp = vsubq_s32(y,vtmp);
+   vtmp = vsubq_s32((int32x4_t)y,vtmp);
+
+
-
-
    /* q4.11 */
    // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
    vtmp = vqdmulhq_n_s32(vtmp,LOG_Q31_INVLOG2EXP);
@@ -242,7 +242,7 @@ void arm_vlog_q31(
     blkCnt = blockSize & 3;
   #else
     blkCnt = blockSize;
-  #endif
+  #endif

   while (blkCnt > 0U)
   {
@@ -250,7 +250,7 @@ void arm_vlog_q31(
     blkCnt--;
   }

-
+
 }

 /**
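
arm_vlog_q31.c mirrors the Q15 version with a q5.26 intermediate. The one non-obvious equivalence is the final multiply: vqdmulhq_n_s32 computes the saturating (2*a*b) >> 32 per lane, which equals the scalar path's ((int64_t)a * b) >> 31 for every operand pair except a == b == INT32_MIN, a case that cannot arise here. A scalar check of that identity; the operand values and the q31 encoding of ln(2) are illustrative, not taken from the library:

    #include <stdint.h>
    #include <stdio.h>

    static int32_t qdmulh_s32(int32_t a, int32_t b)  /* lane model of vqdmulhq */
    {
        return (int32_t)((((int64_t)a * b) << 1) >> 32);
    }

    int main(void)
    {
        int32_t y_q26   = 123456789;     /* some Log2 result in q5.26 */
        int32_t ln2_q31 = 1488522236;    /* ln(2) = 1/Log2[e] in q31 (assumed) */
        printf("%ld %ld\n",
               (long)qdmulh_s32(y_q26, ln2_q31),
               (long)(((int64_t)y_q26 * ln2_q31) >> 31));   /* same value */
        return 0;
    }
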
diff --git a/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
index f8d10cb4..7e5ecbdd 100755
--- a/Source/MatrixFunctions/arm_mat_ldlt_f32.c
+++ b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -96,7 +96,7 @@ arm_status arm_mat_ldlt_f32(
 {

   arm_status status;                             /* status of matrix inverse */
-
+
 #ifdef ARM_MATH_MATRIX_CHECK

@@ -142,7 +142,7 @@ arm_status arm_mat_ldlt_f32(
     {
        /* Find pivot */
        float32_t m=F32_MIN,a;
-       int j=k;
+       int j=k;

        for(int r=k;r<n;r++)

-
-           vstrwq_p(&pl->pData[row*n+col], zero, p0);
+
+           vstrwq_p(&pl->pData[row*n+col], zero, p0);
            cnt -= 4;
         }
@@ -292,15 +292,15 @@ arm_status arm_mat_ldlt_f32(

      for(int row=0; row < n;row++)
      {
-        mve_pred16_t p0;
+        mve_pred16_t p0;
         int cnt= n-row-1;
         f32x4_t zero=vdupq_n_f32(0.0f);
-
+
         for(int col=row+1; col < n;col+=4)
         {
            p0 = vctp32q(cnt);
-
-           vstrwq_p(&pl->pData[row*n+col], zero, p0);
+
+           vstrwq_p(&pl->pData[row*n+col], zero, p0);
            cnt -= 4;
         }
@@ -311,12 +311,12 @@ arm_status arm_mat_ldlt_f32(
       pd->pData[d*n+d] = pl->pData[d*n+d];
       pl->pData[d*n+d] = 1.0;
    }
-
+
    status = ARM_MATH_SUCCESS;

  }

-
+
   /* Return to application */
   return (status);
 }
@@ -350,7 +350,7 @@ arm_status arm_mat_ldlt_f32(
   @addtogroup MatrixChol
   @{
  */
-
+
 /**
   * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
@@ -373,7 +373,7 @@ arm_status arm_mat_ldlt_f32(
 {

   arm_status status;                             /* status of matrix inverse */
-
+
 #ifdef ARM_MATH_MATRIX_CHECK

@@ -410,7 +410,7 @@ arm_status arm_mat_ldlt_f32(
     {
        /* Find pivot */
        float32_t m=F32_MIN,a;
-       int j=k;
+       int j=k;
       int r;

@@ -457,7 +457,7 @@ arm_status arm_mat_ldlt_f32(
          pA[w*n+k] = pA[w*n+k] / a;
       }

-
+
    }

@@ -491,12 +491,12 @@ arm_status arm_mat_ldlt_f32(
       pd->pData[d*n+d] = pl->pData[d*n+d];
       pl->pData[d*n+d] = 1.0;
    }
-
+
    status = ARM_MATH_SUCCESS;

  }

-
+
   /* Return to application */
   return (status);
 }
diff --git a/Source/SupportFunctions/arm_q15_to_float.c b/Source/SupportFunctions/arm_q15_to_float.c
index 902d302e..1a20defe 100644
--- a/Source/SupportFunctions/arm_q15_to_float.c
+++ b/Source/SupportFunctions/arm_q15_to_float.c
@@ -65,16 +65,16 @@ void arm_q15_to_float(
     q15x8_t vecDst;
     q15_t const *pSrcVec;
-
+
     pSrcVec = (q15_t const *) pSrc;
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /* C = (float32_t) A / 32768 */
         /* convert from q15 to float and then store the results in the destination buffer */
-        vecDst = vldrhq_s32(pSrcVec);
+        vecDst = vldrhq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
@@ -129,7 +129,7 @@ void arm_q15_to_float(
       outV = vcvtq_n_f32_s32(inV1,15);
       vst1q_f32(pDst, outV);
       pDst += 4;
-
+
       /* Decrement the loop counter */
       blkCnt--;
     }
diff --git a/Source/SupportFunctions/arm_q7_to_float.c b/Source/SupportFunctions/arm_q7_to_float.c
index 776a5383..9cbf8ad0 100644
--- a/Source/SupportFunctions/arm_q7_to_float.c
+++ b/Source/SupportFunctions/arm_q7_to_float.c
@@ -72,7 +72,7 @@ void arm_q7_to_float(
         /* convert from q7 to float and then store the results in the destination buffer */
         vecDst = vldrbq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
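
In the two conversion kernels the cast only renames the vector: vldrhq_s32 and vldrbq_s32 are widening loads that already produce four sign-extended 32-bit lanes, so (int32x4_t)vecDst matches the type vcvtq_n_f32_s32 expects without changing any bits. Per lane, the fixed-point-to-float conversion with n fractional bits is just a scale; a sketch of the Q15 case (the Q7 case divides by 128 instead):

    #include <stdint.h>

    /* What vcvtq_n_f32_s32(v, 15) does to each lane. */
    static float q15_lane_to_float(int32_t lane)   /* sign-extended Q15 value */
    {
        return (float)lane / 32768.0f;             /* == lane * 2^-15 */
    }
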
diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c
index 69211c43..1e997adb 100644
--- a/Source/TransformFunctions/arm_cfft_q15.c
+++ b/Source/TransformFunctions/arm_cfft_q15.c
@@ -184,16 +184,16 @@ static void _arm_radix4_butterfly_q15_mve(
             vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

             vecTmp0 = vhaddq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

             vecTmp0 = vhsubq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

             blkCnt--;
         }
@@ -419,16 +419,16 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
             vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

             vecTmp0 = vhaddq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

             vecTmp0 = vhsubq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

             blkCnt--;
         }
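
The arm_cfft_q15.c hunks apply the same re-typing to scatter stores: a q15 complex sample is a (real, imag) pair of int16, exactly one 32-bit word, so a q15x8_t holding four complex samples scatters naturally as an int32x4_t through vstrwq_scatter_base_s32, with the immediate offsets (-64, -64 + 4, ...) placing the four butterfly outputs relative to the per-lane gather addresses. A scalar picture of the reinterpretation; the struct and helper are illustrative, not library types:

    #include <stdint.h>

    typedef struct { int16_t re, im; } cplx_q15;   /* one complex sample */

    static int32_t as_word(cplx_q15 c)   /* one complex pair -> one 32-bit word */
    {
        union { cplx_q15 c; int32_t w; } u;
        u.c = c;
        return u.w;
    }

Because the cast keeps each (re, im) pair together in a single word, the 32-bit scatter moves complex samples atomically, which is why only the vector type, not the data layout, had to change.
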