From efd47b9da66f2b95872d7e71805f7feacc8fcd0a Mon Sep 17 00:00:00 2001
From: Christophe Favergeon <Christophe.Favergeon@arm.com>
Date: Tue, 28 Jan 2020 13:52:42 +0100
Subject: [PATCH] CMSIS-DSP: Improvement for issue 809

---
 Source/StatisticsFunctions/arm_std_f32.c | 123 ++---------------------
 Testing/Source/Tests/StatsTestsF32.cpp   |  46 +++++++++
 Testing/desc.txt                         |   2 +
 3 files changed, 57 insertions(+), 114 deletions(-)
diff --git a/Source/StatisticsFunctions/arm_std_f32.c b/Source/StatisticsFunctions/arm_std_f32.c
index ea160d42..b584dcdf 100644
--- a/Source/StatisticsFunctions/arm_std_f32.c
+++ b/Source/StatisticsFunctions/arm_std_f32.c
@@ -36,7 +36,15 @@
   @defgroup STD Standard deviation
 
   Calculates the standard deviation of the elements in the input vector.
-  The underlying algorithm is used:
+
+  The float implementation is relying on arm_var_f32 which is using a two-pass algorithm
+  to avoid problem of numerical instabilities and cancellation errors.
+
+  Fixed point versions are using the standard textbook algorithm since the fixed point
+  numerical behavior is different from the float one.
+
+  Algorithm for fixed point versions is summarized below:
+
 
   <pre>
       Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
@@ -60,7 +68,6 @@
   @param[out]    pResult    standard deviation value returned here
   @return        none
  */
-#if (defined(ARM_MATH_NEON_EXPERIMENTAL) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
 void arm_std_f32(
   const float32_t * pSrc,
         uint32_t blockSize,
@@ -70,118 +77,6 @@ void arm_std_f32(
   arm_var_f32(pSrc,blockSize,&var);
   arm_sqrt_f32(var, pResult);
 }
-#else
-void arm_std_f32(
-  const float32_t * pSrc,
-        uint32_t blockSize,
-        float32_t * pResult)
-{
-        uint32_t blkCnt;                               /* Loop counter */
-        float32_t sum = 0.0f;                          /* Temporary result storage */
-        float32_t sumOfSquares = 0.0f;                 /* Sum of squares */
-        float32_t in;                                  /* Temporary variable to store input value */
-
-#ifndef ARM_MATH_CM0_FAMILY
-        float32_t meanOfSquares, mean, squareOfMean;   /* Temporary variables */
-#else
-        float32_t squareOfSum;                         /* Square of Sum */
-        float32_t var;                                 /* Temporary varaince storage */
-#endif
-
-  if (blockSize <= 1U)
-  {
-    *pResult = 0;
-    return;
-  }
-
-#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
-
-  /* Loop unrolling: Compute 4 outputs at a time */
-  blkCnt = blockSize >> 2U;
-
-  while (blkCnt > 0U)
-  {
-    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-    /* C = A[0] + A[1] + ... + A[blockSize-1] */
-
-    in = *pSrc++;
-    /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
-    sumOfSquares += in * in;
-    /* Compute sum and store result in a temporary variable, sum. */
-    sum += in;
-
-    in = *pSrc++;
-    sumOfSquares += in * in;
-    sum += in;
-
-    in = *pSrc++;
-    sumOfSquares += in * in;
-    sum += in;
-
-    in = *pSrc++;
-    sumOfSquares += in * in;
-    sum += in;
-
-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-  /* Loop unrolling: Compute remaining outputs */
-  blkCnt = blockSize % 0x4U;
-
-#else
-
-  /* Initialize blkCnt with number of samples */
-  blkCnt = blockSize;
-
-#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
-
-  while (blkCnt > 0U)
-  {
-    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
-    /* C = A[0] + A[1] + ... + A[blockSize-1] */
-
-    in = *pSrc++;
-    /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
-    sumOfSquares += ( in * in);
-    /* Compute sum and store result in a temporary variable, sum. */
-    sum += in;
-
-    /* Decrement loop counter */
-    blkCnt--;
-  }
-
-#ifndef ARM_MATH_CM0_FAMILY
-
-  /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
-  meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
-
-  /* Compute mean of all input values */
-  mean = sum / (float32_t) blockSize;
-
-  /* Compute square of mean */
-  squareOfMean = (mean * mean) * (((float32_t) blockSize) /
-                                  ((float32_t) blockSize - 1.0f));
-
-  /* Compute standard deviation and store result to destination */
-  arm_sqrt_f32((meanOfSquares - squareOfMean), pResult);
-
-#else
-  /* Run the below code for Cortex-M0 */
-
-  /* Compute square of sum */
-  squareOfSum = ((sum * sum) / (float32_t) blockSize);
-
-  /* Compute variance */
-  var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
-
-  /* Compute standard deviation and store result in destination */
-  arm_sqrt_f32(var, pResult);
-
-#endif /* #ifndef ARM_MATH_CM0_FAMILY */
-
-}
-#endif /* #if defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)*/
 
 /**
   @} end of STD group
diff --git a/Testing/Source/Tests/StatsTestsF32.cpp b/Testing/Source/Tests/StatsTestsF32.cpp
index ec4d72c1..a2d1a4e9 100755
--- a/Testing/Source/Tests/StatsTestsF32.cpp
+++ b/Testing/Source/Tests/StatsTestsF32.cpp
@@ -196,6 +196,52 @@ a double precision computation.
 
     }
 
+
+    void StatsTestsF32::test_std_stability_f32()
+    {
+      /*
+
+      With the textbook algorithm, those values will produce a negative
+      value for the variance.
+
+      The CMSIS-DSP variance algorithm is the two pass one so will work
+      with those values.
+
+      So, it should be possible to compute the square root for the standard
+      deviation.
+
+      */
+      float32_t in[4]={4.0f, 7.0f, 13.0f, 16.0f};
+      float32_t result;
+      int i;
+      arm_status status;
+
+      /*
+
+      Add bigger offset so that average is much bigger than standard deviation.
+
+      */
+      for(i=0 ; i < 4; i++)
+      {
+        in[i] += 3e4;
+      }
+
+      arm_std_f32(in,4,&result);
+
+      /*
+
+      If variance is giving a negative value, the square root
+      should return zero.
+
+      We check it is not happening here.
+
+
+      */
+
+      ASSERT_TRUE(fabs(5.47723f - result) < 1.0e-4);
+
+    }
+
     void StatsTestsF32::test_entropy_f32()
     {
       const float32_t *inp  = inputA.ptr();
diff --git a/Testing/desc.txt b/Testing/desc.txt
index 51d1dd0c..35af5b64 100644
--- a/Testing/desc.txt
+++ b/Testing/desc.txt
@@ -89,6 +89,8 @@ group Root {
                 Test long  arm_rms_f32:test_rms_f32
                 Test long  arm_std_f32:test_std_f32
                 Test long  arm_var_f32:test_var_f32
+
+                Test stability  arm_std_f32:test_std_stability_f32
               }
 
            }