diff --git a/PrivateInclude/arm_vec_fft.h b/PrivateInclude/arm_vec_fft.h
index 17e62aa8..fdf6498c 100755
--- a/PrivateInclude/arm_vec_fft.h
+++ b/PrivateInclude/arm_vec_fft.h
@@ -150,20 +150,20 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;

-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

         blkCnt = (bitRevLen / 16);
         while (blkCnt > 0) {
             bitRevTabOff = vldrhq_u16(pBitRevTab);
             pBitRevTab += 8;

-            bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
-            bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
-            bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
-            bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+            bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
+            bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);

             inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
             inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);

@@ -175,10 +175,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
             bitRevTabOff = vldrhq_u16(pBitRevTab);
             pBitRevTab += 8;

-            bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-            bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-            bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-            bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+            bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+            bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+            bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

             inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
             inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);

@@ -209,10 +209,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);

-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);

         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);

@@ -251,13 +251,13 @@ __STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32
     while (blkCnt > 0) {
         uint64x2_t vecIn;

-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
         idxOffs0 = idxOffs0 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);

-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
         idxOffs1 = idxOffs1 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
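
Note on the casts in arm_vec_fft.h: all MVE vector types (uint16x8_t, uint32x4_t, uint64x2_t, ...) live in the same 128-bit Q registers, so a cast between them is a free re-typing of the bit pattern. These hunks appear to target compilers that reject the implicit conversion (C++ front ends, or -Wvector-conversion promoted to an error). A minimal sketch of the pattern, assuming an MVE target such as Cortex-M55; the helper name is illustrative, not part of the library:

    #include <arm_mve.h>

    /* Offsets are kept as uint32x4_t because the gathers and scatters
       take 32-bit offsets, but the arithmetic on them runs on 16-bit
       lanes of the same register. */
    static inline uint32x4_t scale_offsets(uint32x4_t off)
    {
        /* return vshrq_n_u16(off, 3);   implicit vector conversion,
           rejected under strict type checking */
        return (uint32x4_t) vshrq_n_u16((uint16x8_t) off, 3);
    }

The named intrinsic vreinterpretq_u32_u16() expresses the same re-typing and also works on compilers that forbid even explicit vector casts.
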
diff --git a/Source/FastMathFunctions/arm_vlog_f16.c b/Source/FastMathFunctions/arm_vlog_f16.c
index 0493c23a..0821d5ab 100755
--- a/Source/FastMathFunctions/arm_vlog_f16.c
+++ b/Source/FastMathFunctions/arm_vlog_f16.c
@@ -32,9 +32,9 @@
 #if defined(ARM_FLOAT16_SUPPORTED)

 /* Degree of the polynomial approximation */
-#define NB_DEG_LOGF16 3
+#define NB_DEG_LOGF16 3

-/*
+/*
 Related to the Log2 of the number of approximations.
 For instance, with 3 there are 1 + 2^3 polynomials
 */
@@ -55,7 +55,7 @@
 nb = 3;
 deg = 3;
 lut = Table[MiniMaxApproximation[
-    Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
+    Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
     MaxIterations -> 1000][[2, 1]],
    {i, 1, 2, (1.0/2^nb)}];
 coefs = Chop@Flatten[CoefficientList[lut, x]];
@@ -75,8 +75,8 @@ static float16_t lut_logf16[NB_LUT_LOGF16]={
 float16_t logf16_scalar(float16_t x)
 {
     int16_t i = arm_typecast_s16_f16(x);
-
-    int32_t vecExpUnBiased = (i >> 10) - 15;
+
+    int32_t vecExpUnBiased = (i >> 10) - 15;
     i = i - (vecExpUnBiased << 10);

     float16_t vecTmpFlt1 = arm_typecast_f16_s16(i);
@@ -85,7 +85,7 @@ float16_t logf16_scalar(float16_t x)
     float16_t tmp,v;

     tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16);
-    n = floor((double)tmp);
+    n = (int)floor((double)tmp);
     v = (_Float16)tmp - (_Float16)n;

     lut = lut_logf16 + n * (1+NB_DEG_LOGF16);
@@ -110,9 +110,9 @@ float16_t logf16_scalar(float16_t x)

 float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 {
-    int16x8_t i = vreinterpretq_s16_f16(vecIn);
-
-    int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
+    int16x8_t i = vreinterpretq_s16_f16(vecIn);
+
+    int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
     i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10));

     float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i);
@@ -132,12 +132,12 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
     offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16));
     offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1);

-    res = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+    res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
     offset = vsubq_n_s16(offset,1);

     for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
     {
-        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+        lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
         res = vfmaq_f16(lutV,v,res);

         offset = vsubq_n_s16(offset,1);
@@ -150,7 +150,7 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 }


-#endif
+#endif

 /**
   @ingroup groupFastMath
@@ -175,9 +175,9 @@ void arm_vlog_f16(
   float16_t * pDst,
   uint32_t blockSize)
 {
-   uint32_t blkCnt;
+   uint32_t blkCnt;

-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
   f16x8_t src;
   f16x8_t dst;

@@ -203,10 +203,10 @@ void arm_vlog_f16(
   while (blkCnt > 0U)
   {
      /* C = log(A) */
-
+
      /* Calculate log and store result in destination buffer. */
      *pDst++ = logf16_scalar(*pSrc++);
-
+
      /* Decrement loop counter */
      blkCnt--;
   }
@@ -219,4 +219,4 @@ void arm_vlog_f16(
  */

-#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
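
The arm_vlog_f16.c changes are whitespace cleanups plus two substantive fixes: an explicit (int) conversion of the floor() result and uint16x8_t casts on the gather offsets. The underlying algorithm splits off the float16 exponent and evaluates one of 1 + 2^NB_DIV_LOGF16 minimax polynomials on the mantissa segment. A float32 analogue of the scalar flow, as a sketch only: log_segmented, NB_DIV and the coef layout are assumptions for illustration, not the library's exact table:

    #include <math.h>

    #define NB_DIV 3    /* 2^3 segments, plus one boundary entry */

    /* coef holds (1 + 2^NB_DIV) * (deg+1) entries, one ascending
       coefficient list c0 + c1*v + ... + cdeg*v^deg per segment. */
    static float log_segmented(float x, const float *coef, int deg)
    {
        int e;
        float m = frexpf(x, &e);     /* x = m * 2^e with 0.5 <= m < 1 */
        m *= 2.0f; e -= 1;           /* renormalize so 1 <= m < 2 */

        float t = (m - 1.0f) * (1 << NB_DIV);
        int   n = (int)floorf(t);    /* segment index */
        float v = t - (float)n;      /* position inside the segment */

        const float *lut = coef + n * (deg + 1);
        float r = lut[deg];
        for (int j = deg - 1; j >= 0; j--)   /* Horner evaluation */
            r = r * v + lut[j];

        return r + (float)e * 0.6931472f;    /* add e * ln(2) */
    }

The MVE version performs the same Horner recurrence on eight lanes, fetching each round of coefficients with one gather per step.
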
diff --git a/Source/FastMathFunctions/arm_vlog_q15.c b/Source/FastMathFunctions/arm_vlog_q15.c
index 131133cc..896988dc 100755
--- a/Source/FastMathFunctions/arm_vlog_q15.c
+++ b/Source/FastMathFunctions/arm_vlog_q15.c
@@ -32,9 +32,9 @@

 #define LOG_Q15_ACCURACY 15

-/* Bit to represent the normalization factor
+/* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q15_ACCURACY]] of the
    previous value.
-   The Log2 algorithm is assuming that the value x is
+   The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q15_ACCURACY
@@ -90,7 +90,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)

     /* Compute the Log2. Result is in q11 instead of q16
        because we know 0 <= y < 1.0 but
-       we want a result allowing to do a
+       we want a result allowing to do a
        product on int16 rather than having to go
        through int32
     */
@@ -98,7 +98,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     for(i = 0; i < LOG_Q15_ACCURACY ; i++)
     {
         x = (((int32_t)x*x)) >> (LOG_Q15_ACCURACY - 1);
-
+
         if (x >= LOQ_Q15_THRESHOLD)
         {
             y += inc ;
@@ -108,10 +108,10 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
     }

-    /*
+    /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-
+
     */

     /* q11 */
@@ -120,7 +120,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)

     /* q4.11 */
     y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
-
+
     return(y);
 }
@@ -146,11 +146,11 @@ q15x8_t vlogq_q15(q15x8_t src)

     /* q11 */
     uint16x8_t y = vdupq_n_u16(0);
-
+
     /* q11 */
     int16x8_t vtmp;
-
+
     mve_pred16_t p;
@@ -158,11 +158,11 @@ q15x8_t vlogq_q15(q15x8_t src)

     vtmp = vsubq_n_s16(c,1);
-    x = vshlq_u16(src,vtmp);
+    x = vshlq_u16((uint16x8_t)src,vtmp);

     /* Compute the Log2. Result is in q11 instead of q16
        because we know 0 <= y < 1.0 but
-       we want a result allowing to do a
+       we want a result allowing to do a
        product on int16 rather than having to go
        through int32
@@ -172,7 +172,7 @@ q15x8_t vlogq_q15(q15x8_t src)

         x = vmulhq_u16(x,x);
         x = vshlq_n_u16(x,2);
-
+
         p = vcmphiq_u16(x,vdupq_n_u16(LOQ_Q15_THRESHOLD));
         y = vaddq_m_n_u16(y, y,inc,p);
         x = vshrq_m_n_u16(x,x,1,p);
@@ -181,26 +181,26 @@ q15x8_t vlogq_q15(q15x8_t src)
     }

-    /*
+    /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-
+
     */

     /* q11 */
     // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
     vtmp = vshlq_n_s16(normalization,LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART);
-    vtmp = vsubq_s16(y,vtmp);
+    vtmp = vsubq_s16((int16x8_t)y,vtmp);
+
+
-
-
     /* q4.11 */
     // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
     vtmp = vqdmulhq_n_s16(vtmp,LOG_Q15_INVLOG2EXP);

     return(vtmp);
 }
-#endif
+#endif

 /**
   @ingroup groupFastMath
@@ -248,7 +248,7 @@ void arm_vlog_q15(
     blkCnt = blockSize & 7;
   #else
     blkCnt = blockSize;
-  #endif
+  #endif

   while (blkCnt > 0U)
   {
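
The patched loop in arm_vlog_q15.c is the classic bit-at-a-time Log2 recurrence: for 1 <= x < 2, squaring x doubles log2(x), so whether each squaring crosses 2 yields the next fraction bit of the result. A plain-C model of the recurrence, as an expository sketch; the Q15 code replaces the compare against 2.0 with LOQ_Q15_THRESHOLD and the halving with a right shift:

    /* Compute log2(x) for 1 <= x < 2 to nbits fraction bits. */
    static double log2_bitwise(double x, int nbits)
    {
        double y = 0.0, inc = 0.5;        /* first fraction bit is worth 1/2 */
        for (int i = 0; i < nbits; i++)
        {
            x = x * x;                    /* log2(x) doubles */
            if (x >= 2.0) { y += inc; x *= 0.5; }   /* emit a 1 bit, renormalize */
            inc *= 0.5;
        }
        return y;
    }

The MVE variant runs the same recurrence on eight lanes at once: vcmphiq_u16 builds a predicate from the threshold compare, and vaddq_m_n_u16 / vshrq_m_n_u16 update only the lanes whose square crossed it.
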
diff --git a/Source/FastMathFunctions/arm_vlog_q31.c b/Source/FastMathFunctions/arm_vlog_q31.c
index ad31ddef..00538594 100755
--- a/Source/FastMathFunctions/arm_vlog_q31.c
+++ b/Source/FastMathFunctions/arm_vlog_q31.c
@@ -30,9 +30,9 @@

 #define LOG_Q31_ACCURACY 31

-/* Bit to represent the normalization factor
+/* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q31_ACCURACY]] of the
    previous value.
-   The Log2 algorithm is assuming that the value x is
+   The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
    But input value could be as small a 2^-LOG_Q31_ACCURACY
@@ -55,7 +55,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
 {
     int32_t i;
-
+
     int32_t c = __CLZ(src);
     int32_t normalization=0;
@@ -84,7 +84,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
     }
     normalization = c;

-    /* Compute the Log2. Result is in q26
+    /* Compute the Log2. Result is in q26
        because we know 0 <= y < 1.0 but
        do not want to use q32 to allow
        following computation with less instructions.
@@ -101,10 +101,10 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
         inc = inc >> 1;
     }

-    /*
+    /*
      Convert the Log2 to Log and apply normalization.
      We compute (y - normalisation) * (1 / Log2[e]).
-
+
    */

    /* q26 */
@@ -114,7 +114,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)

    /* q5.26 */
    y = ((int64_t)tmp * LOG_Q31_INVLOG2EXP) >> 31;
-
+
    return(y);

@@ -125,7 +125,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)

 q31x4_t vlogq_q31(q31x4_t src)
 {
-
+
     int32_t i;

     int32x4_t c = vclzq_s32(src);
@@ -141,11 +141,11 @@ q31x4_t vlogq_q31(q31x4_t src)

     /* q11 */
     uint32x4_t y = vdupq_n_u32(0);
-
+
     /* q11 */
     int32x4_t vtmp;
-
+
     mve_pred16_t p;
@@ -153,10 +153,10 @@ q31x4_t vlogq_q31(q31x4_t src)

     vtmp = vsubq_n_s32(c,1);
-    x = vshlq_u32(src,vtmp);
+    x = vshlq_u32((uint32x4_t)src,vtmp);

-    /* Compute the Log2. Result is in Q26
+    /* Compute the Log2. Result is in Q26
        because we know 0 <= y < 1.0 but
        do not want to use Q32 to allow
        following computation with less instructions.
@@ -166,7 +166,7 @@ q31x4_t vlogq_q31(q31x4_t src)

         x = vmulhq_u32(x,x);
         x = vshlq_n_u32(x,2);
-
+
         p = vcmphiq_u32(x,vdupq_n_u32(LOQ_Q31_THRESHOLD));
         y = vaddq_m_n_u32(y, y,inc,p);
         x = vshrq_m_n_u32(x,x,1,p);
@@ -175,19 +175,19 @@ q31x4_t vlogq_q31(q31x4_t src)
     }

-    /*
+    /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-
+
     */

    /* q11 */
    // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
    vtmp = vshlq_n_s32(normalization,LOG_Q31_ACCURACY - LOG_Q31_INTEGER_PART);
-   vtmp = vsubq_s32(y,vtmp);
+   vtmp = vsubq_s32((int32x4_t)y,vtmp);
+
+
-
-
    /* q4.11 */
    // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
    vtmp = vqdmulhq_n_s32(vtmp,LOG_Q31_INVLOG2EXP);
@@ -242,7 +242,7 @@ void arm_vlog_q31(
     blkCnt = blockSize & 3;
   #else
     blkCnt = blockSize;
-  #endif
+  #endif

   while (blkCnt > 0U)
   {
@@ -250,7 +250,7 @@ void arm_vlog_q31(
     blkCnt--;
   }

-
+
 }

 /**
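
arm_vlog_q31.c mirrors the Q15 version with a q5.26 intermediate. The one non-obvious equivalence is the final multiply: vqdmulhq_n_s32 computes the saturating (2*a*b) >> 32 per lane, which equals the scalar path's ((int64_t)a * b) >> 31 for every operand pair except a == b == INT32_MIN, a case that cannot arise here. A scalar check of that identity; the operand values and the q31 encoding of ln(2) are illustrative, not taken from the library:

    #include <stdint.h>
    #include <stdio.h>

    static int32_t qdmulh_s32(int32_t a, int32_t b)  /* lane model of vqdmulhq */
    {
        return (int32_t)((((int64_t)a * b) << 1) >> 32);
    }

    int main(void)
    {
        int32_t y_q26   = 123456789;     /* some Log2 result in q5.26 */
        int32_t ln2_q31 = 1488522236;    /* ln(2) = 1/Log2[e] in q31 (assumed) */
        printf("%ld %ld\n",
               (long)qdmulh_s32(y_q26, ln2_q31),
               (long)(((int64_t)y_q26 * ln2_q31) >> 31));   /* same value */
        return 0;
    }
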
diff --git a/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
index f8d10cb4..7e5ecbdd 100755
--- a/Source/MatrixFunctions/arm_mat_ldlt_f32.c
+++ b/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -96,7 +96,7 @@ arm_status arm_mat_ldlt_f32(
 {

   arm_status status;                             /* status of matrix inverse */
-
+
 #ifdef ARM_MATH_MATRIX_CHECK

@@ -142,7 +142,7 @@ arm_status arm_mat_ldlt_f32(
     {
        /* Find pivot */
        float32_t m=F32_MIN,a;
-       int j=k;
+       int j=k;

        for(int r=k;r<n;r++)

-
-           vstrwq_p(&pl->pData[row*n+col], zero, p0);
+
+           vstrwq_p(&pl->pData[row*n+col], zero, p0);
            cnt -= 4;
         }
@@ -292,15 +292,15 @@ arm_status arm_mat_ldlt_f32(

      for(int row=0; row < n;row++)
      {
-        mve_pred16_t p0;
+        mve_pred16_t p0;
         int cnt= n-row-1;
         f32x4_t zero=vdupq_n_f32(0.0f);
-
+
         for(int col=row+1; col < n;col+=4)
         {
            p0 = vctp32q(cnt);
-
-           vstrwq_p(&pl->pData[row*n+col], zero, p0);
+
+           vstrwq_p(&pl->pData[row*n+col], zero, p0);
            cnt -= 4;
         }
@@ -311,12 +311,12 @@ arm_status arm_mat_ldlt_f32(
       pd->pData[d*n+d] = pl->pData[d*n+d];
       pl->pData[d*n+d] = 1.0;
    }
-
+
    status = ARM_MATH_SUCCESS;

  }

-
+
   /* Return to application */
   return (status);
 }
@@ -350,7 +350,7 @@ arm_status arm_mat_ldlt_f32(
   @addtogroup MatrixChol
   @{
  */
-
+
 /**
   * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
   * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
@@ -373,7 +373,7 @@ arm_status arm_mat_ldlt_f32(
 {

   arm_status status;                             /* status of matrix inverse */
-
+
 #ifdef ARM_MATH_MATRIX_CHECK

@@ -410,7 +410,7 @@ arm_status arm_mat_ldlt_f32(
     {
        /* Find pivot */
        float32_t m=F32_MIN,a;
-       int j=k;
+       int j=k;
       int r;

@@ -457,7 +457,7 @@ arm_status arm_mat_ldlt_f32(
          pA[w*n+k] = pA[w*n+k] / a;
       }

-
+
    }

@@ -491,12 +491,12 @@ arm_status arm_mat_ldlt_f32(
       pd->pData[d*n+d] = pl->pData[d*n+d];
       pl->pData[d*n+d] = 1.0;
    }
-
+
    status = ARM_MATH_SUCCESS;

  }

-
+
   /* Return to application */
   return (status);
 }
diff --git a/Source/SupportFunctions/arm_q15_to_float.c b/Source/SupportFunctions/arm_q15_to_float.c
index 902d302e..1a20defe 100644
--- a/Source/SupportFunctions/arm_q15_to_float.c
+++ b/Source/SupportFunctions/arm_q15_to_float.c
@@ -65,16 +65,16 @@ void arm_q15_to_float(
     q15x8_t vecDst;
     q15_t const *pSrcVec;
-
+
     pSrcVec = (q15_t const *) pSrc;
     blkCnt = blockSize >> 2;
     while (blkCnt > 0U)
     {
         /* C = (float32_t) A / 32768 */
         /* convert from q15 to float and then store the results in the destination buffer */
-        vecDst = vldrhq_s32(pSrcVec);
+        vecDst = vldrhq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
@@ -129,7 +129,7 @@ void arm_q15_to_float(
       outV = vcvtq_n_f32_s32(inV1,15);
       vst1q_f32(pDst, outV);
       pDst += 4;
-
+
       /* Decrement the loop counter */
       blkCnt--;
     }
diff --git a/Source/SupportFunctions/arm_q7_to_float.c b/Source/SupportFunctions/arm_q7_to_float.c
index 776a5383..9cbf8ad0 100644
--- a/Source/SupportFunctions/arm_q7_to_float.c
+++ b/Source/SupportFunctions/arm_q7_to_float.c
@@ -72,7 +72,7 @@ void arm_q7_to_float(
         /* convert from q7 to float and then store the results in the destination buffer */
         vecDst = vldrbq_s32(pSrcVec);
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));
         pDst += 4;
         /*
          * Decrement the blockSize loop counter
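
In the two conversion kernels the cast only renames the vector: vldrhq_s32 and vldrbq_s32 are widening loads that already produce four sign-extended 32-bit lanes, so (int32x4_t)vecDst matches the type vcvtq_n_f32_s32 expects without changing any bits. Per lane, the fixed-point-to-float conversion with n fractional bits is just a scale; a sketch of the Q15 case (the Q7 case divides by 128 instead):

    #include <stdint.h>

    /* What vcvtq_n_f32_s32(v, 15) does to each lane. */
    static float q15_lane_to_float(int32_t lane)   /* sign-extended Q15 value */
    {
        return (float)lane / 32768.0f;             /* == lane * 2^-15 */
    }
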
diff --git a/Source/TransformFunctions/arm_cfft_q15.c b/Source/TransformFunctions/arm_cfft_q15.c
index 69211c43..1e997adb 100644
--- a/Source/TransformFunctions/arm_cfft_q15.c
+++ b/Source/TransformFunctions/arm_cfft_q15.c
@@ -184,16 +184,16 @@ static void _arm_radix4_butterfly_q15_mve(
             vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

             vecTmp0 = vhaddq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

             vecTmp0 = vhsubq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

             blkCnt--;
         }
@@ -419,16 +419,16 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
             vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

             vecTmp0 = vhaddq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

             vecTmp0 = vhsubq(vecSum0, vecSum1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

             vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+            vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

             blkCnt--;
         }
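
The arm_cfft_q15.c hunks apply the same re-typing to scatter stores: a q15 complex sample is a (real, imag) pair of int16, exactly one 32-bit word, so a q15x8_t holding four complex samples scatters naturally as an int32x4_t through vstrwq_scatter_base_s32, with the immediate offsets (-64, -64 + 4, ...) placing the four butterfly outputs relative to the per-lane gather addresses. A scalar picture of the reinterpretation; the struct and helper are illustrative, not library types:

    #include <stdint.h>

    typedef struct { int16_t re, im; } cplx_q15;   /* one complex sample */

    static int32_t as_word(cplx_q15 c)   /* one complex pair -> one 32-bit word */
    {
        union { cplx_q15 c; int32_t w; } u;
        u.c = c;
        return u.w;
    }

Because the cast keeps each (re, im) pair together in a single word, the 32-bit scatter moves complex samples atomically, which is why only the vector type, not the data layout, had to change.
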