DSP: Cleanup of type mismatches (#1327)

The IAR compiler appears to emit more warnings about type mismatches than the other compilers. This change cleans a lot of them up, mostly by adding explicit casts at the affected intrinsic call sites.
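
For illustration, here is a minimal sketch of the pattern most of these fixes follow. It assumes an MVE-enabled target and <arm_mve.h>; the function and variable names are made up for this example and are not taken from the library.

    #include <arm_mve.h>

    /* Hypothetical helper, for illustration only: load eight 16-bit
       table entries, scale them, and store the result through a
       signed vector pointer. */
    void scale_offsets(const uint16_t *tab, int16x8_t *out)
    {
        uint16x8_t offsets = vldrhq_u16(tab);   /* intrinsic returns uint16x8_t */

        /* Some compilers accept the implicit conversion between the
           unsigned intrinsic result and the signed destination; IAR
           reports it as a type mismatch, so an explicit cast is added,
           as in this patch. */
        *out = (int16x8_t) vshlq_n_u16(offsets, 2);
    }

The diff below applies the same idea at the call sites of the MVE shift, gather/scatter and conversion intrinsics.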

Signed-off-by: TTornblom <thomas.tornblom@iar.com>
commit 2a7714ced7
parent b4822217d0
Author: Thomas Törnblom (committed via GitHub)

@@ -150,20 +150,20 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
     bitRevTabOff = vldrhq_u16(pBitRevTab);
     pBitRevTab += 8;
-    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+    bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+    bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+    bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
     blkCnt = (bitRevLen / 16);
     while (blkCnt > 0) {
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;
-        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
-        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+        bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
+        bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
@@ -175,10 +175,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;
-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
@@ -209,10 +209,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
@@ -251,13 +251,13 @@ __STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32
     while (blkCnt > 0) {
         uint64x2_t vecIn;
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
         idxOffs0 = idxOffs0 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
         idxOffs1 = idxOffs1 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;

@@ -32,9 +32,9 @@
 #if defined(ARM_FLOAT16_SUPPORTED)
 /* Degree of the polynomial approximation */
 #define NB_DEG_LOGF16 3
 /*
    Related to the Log2 of the number of approximations.
    For instance, with 3 there are 1 + 2^3 polynomials
 */
@@ -55,7 +55,7 @@ nb = 3;
 deg = 3;
 lut = Table[
     MiniMaxApproximation[
         Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
         MaxIterations -> 1000][[2, 1]], {i, 1, 2, (1.0/2^nb)}];
 coefs = Chop@Flatten[CoefficientList[lut, x]];
@@ -75,8 +75,8 @@ static float16_t lut_logf16[NB_LUT_LOGF16]={
 float16_t logf16_scalar(float16_t x)
 {
     int16_t i = arm_typecast_s16_f16(x);
     int32_t vecExpUnBiased = (i >> 10) - 15;
     i = i - (vecExpUnBiased << 10);
     float16_t vecTmpFlt1 = arm_typecast_f16_s16(i);
@@ -85,7 +85,7 @@ float16_t logf16_scalar(float16_t x)
     float16_t tmp,v;
     tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16);
-    n = floor((double)tmp);
+    n = (int)floor((double)tmp);
     v = (_Float16)tmp - (_Float16)n;
     lut = lut_logf16 + n * (1+NB_DEG_LOGF16);
@@ -110,9 +110,9 @@ float16_t logf16_scalar(float16_t x)
 float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 {
     int16x8_t i = vreinterpretq_s16_f16(vecIn);
     int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
     i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10));
     float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i);
@@ -132,12 +132,12 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
     offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16));
     offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1);
-    res = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+    res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
     offset = vsubq_n_s16(offset,1);
     for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
     {
-        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
         res = vfmaq_f16(lutV,v,res);
         offset = vsubq_n_s16(offset,1);
@@ -150,7 +150,7 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 }
 #endif
 /**
   @ingroup groupFastMath
@@ -175,9 +175,9 @@ void arm_vlog_f16(
   float16_t * pDst,
   uint32_t blockSize)
 {
     uint32_t blkCnt;
 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
     f16x8_t src;
     f16x8_t dst;
@@ -203,10 +203,10 @@ void arm_vlog_f16(
     while (blkCnt > 0U)
     {
         /* C = log(A) */
         /* Calculate log and store result in destination buffer. */
         *pDst++ = logf16_scalar(*pSrc++);
         /* Decrement loop counter */
         blkCnt--;
     }
@@ -219,4 +219,4 @@ void arm_vlog_f16(
  */
 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

@@ -32,9 +32,9 @@
 #define LOG_Q15_ACCURACY 15
 /* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q15_ACCURACY]] of the previous value.
    The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q15_ACCURACY
@@ -90,7 +90,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
 /* Compute the Log2. Result is in q11 instead of q16
    because we know 0 <= y < 1.0 but
    we want a result allowing to do a
    product on int16 rather than having to go
    through int32
@@ -98,7 +98,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     for(i = 0; i < LOG_Q15_ACCURACY ; i++)
     {
         x = (((int32_t)x*x)) >> (LOG_Q15_ACCURACY - 1);
         if (x >= LOQ_Q15_THRESHOLD)
         {
             y += inc ;
@@ -108,10 +108,10 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q11 */
@@ -120,7 +120,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     /* q4.11 */
     y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     return(y);
 }
@@ -146,11 +146,11 @@ q15x8_t vlogq_q15(q15x8_t src)
     /* q11 */
     uint16x8_t y = vdupq_n_u16(0);
     /* q11 */
     int16x8_t vtmp;
     mve_pred16_t p;
@@ -158,11 +158,11 @@ q15x8_t vlogq_q15(q15x8_t src)
     vtmp = vsubq_n_s16(c,1);
-    x = vshlq_u16(src,vtmp);
+    x = vshlq_u16((uint16x8_t)src,vtmp);
     /* Compute the Log2. Result is in q11 instead of q16
        because we know 0 <= y < 1.0 but
        we want a result allowing to do a
        product on int16 rather than having to go
        through int32
@@ -172,7 +172,7 @@ q15x8_t vlogq_q15(q15x8_t src)
         x = vmulhq_u16(x,x);
         x = vshlq_n_u16(x,2);
         p = vcmphiq_u16(x,vdupq_n_u16(LOQ_Q15_THRESHOLD));
         y = vaddq_m_n_u16(y, y,inc,p);
         x = vshrq_m_n_u16(x,x,1,p);
@@ -181,26 +181,26 @@ q15x8_t vlogq_q15(q15x8_t src)
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q11 */
     // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
     vtmp = vshlq_n_s16(normalization,LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART);
-    vtmp = vsubq_s16(y,vtmp);
+    vtmp = vsubq_s16((int16x8_t)y,vtmp);
     /* q4.11 */
     // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     vtmp = vqdmulhq_n_s16(vtmp,LOG_Q15_INVLOG2EXP);
     return(vtmp);
 }
 #endif
 /**
   @ingroup groupFastMath
@@ -248,7 +248,7 @@ void arm_vlog_q15(
     blkCnt = blockSize & 7;
 #else
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {

@@ -30,9 +30,9 @@
 #define LOG_Q31_ACCURACY 31
 /* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q31_ACCURACY]] of the previous value.
    The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q31_ACCURACY
@@ -55,7 +55,7 @@
 static uint32_t arm_scalar_log_q31(uint32_t src)
 {
     int32_t i;
     int32_t c = __CLZ(src);
     int32_t normalization=0;
@@ -84,7 +84,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
     }
     normalization = c;
     /* Compute the Log2. Result is in q26
        because we know 0 <= y < 1.0 but
        do not want to use q32 to allow
        following computation with less instructions.
@@ -101,10 +101,10 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
         inc = inc >> 1;
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q26 */
@@ -114,7 +114,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
     /* q5.26 */
     y = ((int64_t)tmp * LOG_Q31_INVLOG2EXP) >> 31;
     return(y);
@@ -125,7 +125,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
 q31x4_t vlogq_q31(q31x4_t src)
 {
     int32_t i;
     int32x4_t c = vclzq_s32(src);
@@ -141,11 +141,11 @@ q31x4_t vlogq_q31(q31x4_t src)
     /* q11 */
     uint32x4_t y = vdupq_n_u32(0);
     /* q11 */
     int32x4_t vtmp;
     mve_pred16_t p;
@@ -153,10 +153,10 @@ q31x4_t vlogq_q31(q31x4_t src)
     vtmp = vsubq_n_s32(c,1);
-    x = vshlq_u32(src,vtmp);
+    x = vshlq_u32((uint32x4_t)src,vtmp);
     /* Compute the Log2. Result is in Q26
        because we know 0 <= y < 1.0 but
        do not want to use Q32 to allow
        following computation with less instructions.
@@ -166,7 +166,7 @@ q31x4_t vlogq_q31(q31x4_t src)
         x = vmulhq_u32(x,x);
         x = vshlq_n_u32(x,2);
         p = vcmphiq_u32(x,vdupq_n_u32(LOQ_Q31_THRESHOLD));
         y = vaddq_m_n_u32(y, y,inc,p);
         x = vshrq_m_n_u32(x,x,1,p);
@@ -175,19 +175,19 @@ q31x4_t vlogq_q31(q31x4_t src)
     }
     /*
        Convert the Log2 to Log and apply normalization.
        We compute (y - normalisation) * (1 / Log2[e]).
     */
     /* q11 */
     // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
     vtmp = vshlq_n_s32(normalization,LOG_Q31_ACCURACY - LOG_Q31_INTEGER_PART);
-    vtmp = vsubq_s32(y,vtmp);
+    vtmp = vsubq_s32((int32x4_t)y,vtmp);
     /* q4.11 */
     // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     vtmp = vqdmulhq_n_s32(vtmp,LOG_Q31_INVLOG2EXP);
@@ -242,7 +242,7 @@ void arm_vlog_q31(
     blkCnt = blockSize & 3;
 #else
     blkCnt = blockSize;
 #endif
     while (blkCnt > 0U)
     {
@@ -250,7 +250,7 @@ void arm_vlog_q31(
         blkCnt--;
     }
 }
 /**

@@ -96,7 +96,7 @@ arm_status arm_mat_ldlt_f32(
 {
     arm_status status;                             /* status of matrix inverse */
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -142,7 +142,7 @@ arm_status arm_mat_ldlt_f32(
     {
         /* Find pivot */
         float32_t m=F32_MIN,a;
         int j=k;
         for(int r=k;r<n;r++)
@@ -204,25 +204,25 @@ arm_status arm_mat_ldlt_f32(
         //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
-        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
         vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
         vecA = vldrwq_z_f32(&pA[(w + 0)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW0, vecX, p0);
         vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);
         vecA = vldrwq_z_f32(&pA[(w + 1)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW1, vecX, p0);
         vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);
         vecA = vldrwq_z_f32(&pA[(w + 2)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW2, vecX, p0);
         vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);
         vecA = vldrwq_z_f32(&pA[(w + 3)*n+x],p0);
         vecA = vfmsq_m(vecA, vecW3, vecX, p0);
         vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);
         cnt -= 4;
     }
@@ -246,13 +246,13 @@ arm_status arm_mat_ldlt_f32(
         //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
         vecA = vldrwq_z_f32(&pA[w*n+x],p0);
-        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+        vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
         vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
         vecA = vfmsq_m(vecA, vecW, vecX, p0);
         vstrwq_p(&pA[w*n+x], vecA, p0);
         cnt -= 4;
     }
@@ -263,7 +263,7 @@ arm_status arm_mat_ldlt_f32(
         pA[w*n+k] = pA[w*n+k] * invA;
     }
 }
@@ -275,15 +275,15 @@ arm_status arm_mat_ldlt_f32(
     diag--;
     for(int row=0; row < n;row++)
     {
         mve_pred16_t p0;
         int cnt= n-k;
         f32x4_t zero=vdupq_n_f32(0.0f);
         for(int col=k; col < n;col += 4)
         {
             p0 = vctp32q(cnt);
             vstrwq_p(&pl->pData[row*n+col], zero, p0);
             cnt -= 4;
         }
@@ -292,15 +292,15 @@ arm_status arm_mat_ldlt_f32(
     for(int row=0; row < n;row++)
     {
         mve_pred16_t p0;
         int cnt= n-row-1;
         f32x4_t zero=vdupq_n_f32(0.0f);
         for(int col=row+1; col < n;col+=4)
         {
             p0 = vctp32q(cnt);
             vstrwq_p(&pl->pData[row*n+col], zero, p0);
             cnt -= 4;
         }
@@ -311,12 +311,12 @@ arm_status arm_mat_ldlt_f32(
         pd->pData[d*n+d] = pl->pData[d*n+d];
         pl->pData[d*n+d] = 1.0;
     }
     status = ARM_MATH_SUCCESS;
   }
   /* Return to application */
   return (status);
 }
@@ -350,7 +350,7 @@ arm_status arm_mat_ldlt_f32(
   @addtogroup MatrixChol
   @{
 */
 /**
  * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
  * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
@@ -373,7 +373,7 @@ arm_status arm_mat_ldlt_f32(
 {
     arm_status status;                             /* status of matrix inverse */
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -410,7 +410,7 @@ arm_status arm_mat_ldlt_f32(
     {
         /* Find pivot */
         float32_t m=F32_MIN,a;
         int j=k;
         int r;
@@ -457,7 +457,7 @@ arm_status arm_mat_ldlt_f32(
         pA[w*n+k] = pA[w*n+k] / a;
     }
 }
@@ -491,12 +491,12 @@ arm_status arm_mat_ldlt_f32(
     pd->pData[d*n+d] = pl->pData[d*n+d];
     pl->pData[d*n+d] = 1.0;
   }
   status = ARM_MATH_SUCCESS;
   }
   /* Return to application */
   return (status);
 }

@@ -65,16 +65,16 @@ void arm_q15_to_float(
     q15x8_t vecDst;
     q15_t const *pSrcVec;
     pSrcVec = (q15_t const *) pSrc;
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /* C = (float32_t) A / 32768 */
         /* convert from q15 to float and then store the results in the destination buffer */
         vecDst = vldrhq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
@@ -129,7 +129,7 @@ void arm_q15_to_float(
     outV = vcvtq_n_f32_s32(inV1,15);
     vst1q_f32(pDst, outV);
     pDst += 4;
     /* Decrement the loop counter */
     blkCnt--;
 }

@@ -72,7 +72,7 @@ void arm_q7_to_float(
         /* convert from q7 to float and then store the results in the destination buffer */
         vecDst = vldrbq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter

@@ -184,16 +184,16 @@ static void _arm_radix4_butterfly_q15_mve(
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
         blkCnt--;
     }
@@ -419,16 +419,16 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
         blkCnt--;
     }
