From 169877344c3d93785c45f48c2f8b9389ca4228a5 Mon Sep 17 00:00:00 2001
From: ClaudioMartino <claudio.martino@arm.com>
Date: Tue, 7 Jan 2020 11:38:36 +0100
Subject: [PATCH] Corrected spline interpolation and merge sort

---
 Include/arm_math.h                            | 43 ++++++++--
 PrivateInclude/arm_sorting.h                  | 12 ---
 Source/SupportFunctions/arm_merge_sort_f32.c  | 27 +++++--
 .../arm_merge_sort_init_f32.c                 | 35 +++++++++
 Source/SupportFunctions/arm_sort_f32.c        |  4 -
 .../SupportFunctions/arm_spline_interp_f32.c  | 78 ++++++++-----------
 .../arm_spline_interp_init_f32.c              | 13 ++--
 Testing/Include/Tests/SupportTestsF32.h       |  7 +-
 Testing/Source/Tests/SupportTestsF32.cpp      | 33 ++++----
 9 files changed, 154 insertions(+), 98 deletions(-)
 create mode 100644 Source/SupportFunctions/arm_merge_sort_init_f32.c

diff --git a/Include/arm_math.h b/Include/arm_math.h
index 0cd2ee7d..2cd6375a 100644
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@@ -2264,11 +2264,9 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
              /**< Heap sort      */
     ARM_SORT_INSERTION = 3,
              /**< Insertion sort */
-    ARM_SORT_MERGE     = 4,
-             /**< Merge sort     */
-    ARM_SORT_QUICK     = 5,
+    ARM_SORT_QUICK     = 4,
              /**< Quick sort     */
-    ARM_SORT_SELECTION = 6
+    ARM_SORT_SELECTION = 5
              /**< Selection sort */
   } arm_sort_alg;
 
@@ -2314,6 +2312,37 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
     arm_sort_alg alg, 
     arm_sort_dir dir); 
 
+  /**
+   * @brief Instance structure for the sorting algorithms.
+   */
+  typedef struct            
+  {
+    arm_sort_dir dir;        /**< Sorting order (direction)  */
+    float32_t * buffer;      /**< Working buffer */
+  } arm_merge_sort_instance_f32;  
+
+  /**
+   * @param[in]      S          points to an instance of the sorting structure.
+   * @param[in,out]  pSrc       points to the block of input data.
+   * @param[out]     pDst       points to the block of output data
+   * @param[in]      blockSize  number of samples to process.
+   */
+  void arm_merge_sort_f32(
+    const arm_merge_sort_instance_f32 * S,
+          float32_t *pSrc,
+          float32_t *pDst,
+          uint32_t blockSize);
+
+  /**
+   * @param[in,out]  S            points to an instance of the sorting structure.
+   * @param[in]      dir          Sorting order.
+   * @param[in]      buffer       Working buffer.
+   */
+  void arm_merge_sort_init_f32(
+    arm_merge_sort_instance_f32 * S,
+    arm_sort_dir dir,
+    float32_t * buffer);
+
   /**
    * @brief Struct for specifying cubic spline type
    */
@@ -2330,6 +2359,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
   {
     uint32_t n_x;              /**< Number of known data points */
     arm_spline_type type;      /**< Type (boundary conditions) */
+    float32_t * buffer;
   } arm_spline_instance_f32;
 
   /**
@@ -2347,7 +2377,7 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
     const float32_t * y,
     const float32_t * xq,
           float32_t * pDst,
-	  uint32_t blockSize);
+          uint32_t blockSize);
 
   /**
    * @brief Initialization function for the floating-point cubic spline interpolation.
@@ -2358,7 +2388,8 @@ __STATIC_INLINE q31_t arm_div_q63_to_q31(q63_t num, q31_t den)
   void arm_spline_init_f32(
     arm_spline_instance_f32 * S,
     uint32_t n,
-    arm_spline_type type);
+    arm_spline_type type,
+    float32_t * buffer);
 
   /**
    * @brief Instance structure for the floating-point matrix structure.
diff --git a/PrivateInclude/arm_sorting.h b/PrivateInclude/arm_sorting.h
index 4a405402..ec002b2f 100644
--- a/PrivateInclude/arm_sorting.h
+++ b/PrivateInclude/arm_sorting.h
@@ -68,18 +68,6 @@ extern "C"
           float32_t* pDst, 
     uint32_t blockSize);
 
-  /**
-   * @param[in]  S          points to an instance of the sorting structure.
-   * @param[in]  pSrc       points to the block of input data.
-   * @param[out] pDst       points to the block of output data
-   * @param[in]  blockSize  number of samples to process.
-   */
-  void arm_merge_sort_f32(
-    const arm_sort_instance_f32 * S, 
-          float32_t *pSrc, 
-          float32_t *pDst, 
-    uint32_t blockSize);
-
   /**
    * @param[in]  S          points to an instance of the sorting structure.
    * @param[in]  pSrc       points to the block of input data.
diff --git a/Source/SupportFunctions/arm_merge_sort_f32.c b/Source/SupportFunctions/arm_merge_sort_f32.c
index 7e3180b8..2e026a39 100644
--- a/Source/SupportFunctions/arm_merge_sort_f32.c
+++ b/Source/SupportFunctions/arm_merge_sort_f32.c
@@ -92,24 +92,35 @@ static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end
    *               divide the input array in sublists and merge them to produce
    *               longer sorted sublists until there is only one list remaining.
    *
-   * @par          A work array is always needed, hence pSrc and pDst cannot be
-   *               equal and the results will be stored in pDst.
+   * @par          A work array is always needed. It must be allocated by the user 
+   *               linked to the instance at initialization time.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed
    */
 
 
 void arm_merge_sort_f32(
-  const arm_sort_instance_f32 * S, 
+  const arm_merge_sort_instance_f32 * S, 
         float32_t *pSrc, 
         float32_t *pDst, 
         uint32_t blockSize)
 {
-    // A work array is needed: pDst
-    if(pSrc != pDst) // out-of-place only
-    {
-	memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+    float32_t * pA;
 
-	arm_merge_sort_core_f32(pSrc, 0, blockSize, pDst, S->dir);
+    /* Out-of-place */ 
+    if(pSrc != pDst)
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+        pA = pDst;
     }
+    else
+        pA = pSrc;
+
+    /* A working buffer is needed */
+    memcpy(S->buffer, pSrc, blockSize*sizeof(float32_t));
+
+    arm_merge_sort_core_f32(S->buffer, 0, blockSize, pA, S->dir);
 }
 /**
   @} end of Sorting group
diff --git a/Source/SupportFunctions/arm_merge_sort_init_f32.c b/Source/SupportFunctions/arm_merge_sort_init_f32.c
new file mode 100644
index 00000000..3a2ff07c
--- /dev/null
+++ b/Source/SupportFunctions/arm_merge_sort_init_f32.c
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_merge_sort_init_f32.c
+ * Description:  Floating point merge sort initialization function
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+void arm_merge_sort_init_f32(arm_merge_sort_instance_f32 * S, arm_sort_dir dir, float32_t * buffer)
+{
+    S->dir    = dir;
+    S->buffer = buffer;
+}
diff --git a/Source/SupportFunctions/arm_sort_f32.c b/Source/SupportFunctions/arm_sort_f32.c
index fea7aad4..bb1cf7e4 100644
--- a/Source/SupportFunctions/arm_sort_f32.c
+++ b/Source/SupportFunctions/arm_sort_f32.c
@@ -66,10 +66,6 @@ void arm_sort_f32(
         arm_insertion_sort_f32(S, pSrc, pDst, blockSize);
         break;
 
-        case ARM_SORT_MERGE:
-        arm_merge_sort_f32(S, pSrc, pDst, blockSize);
-        break;
-
         case ARM_SORT_QUICK:
         arm_quick_sort_f32(S, pSrc, pDst, blockSize);
         break;
diff --git a/Source/SupportFunctions/arm_spline_interp_f32.c b/Source/SupportFunctions/arm_spline_interp_f32.c
index e3a8a375..2954b570 100644
--- a/Source/SupportFunctions/arm_spline_interp_f32.c
+++ b/Source/SupportFunctions/arm_spline_interp_f32.c
@@ -163,28 +163,16 @@ void arm_spline_f32(
   const float32_t * y,
   const float32_t * xq,
         float32_t * pDst,
-	uint32_t blockSize)
+        uint32_t blockSize)
 {
-
-    /* 
-
-    As explained in arm_spline_interp_init_f32.c, this must be > 1
-    
-    */
     int32_t n = S->n_x;
     arm_spline_type type = S->type;
 
     float32_t hi, hm1;
-
-    /* 
-
-    Temporary variables for system AX=B.
-    This will be replaced by new arguments for the function.
-
-     */
-    float32_t u[MAX_DATA_POINTS], z[MAX_DATA_POINTS], c[MAX_DATA_POINTS];
+    float32_t * u = (S->buffer);     /* (n-1)-long buffer for u elements */
+    float32_t * z = (S->buffer)+(n-1);   /* n-long buffer for z elements */
+    float32_t * c = (S->buffer)+(n+n-1); /* n-long buffer for c elements */
     float32_t Bi, li;
-
     float32_t bi, di;
     float32_t x_sc;
 
@@ -231,7 +219,7 @@ void arm_spline_f32(
         hi = x[i+1]-x[i];
         Bi = 3*(y[i+1]-y[i])/hi - 3*(y[i]-y[i-1])/hm1;
         /* l(i) = a(ii)-a(i,i-1)*u(i-1) = 2[h(i-1)+h(i)]-h(i-1)*u(i-1) */
-	li = 2*(hi+hm1) - hm1*u[i-1];
+        li = 2*(hi+hm1) - hm1*u[i-1];
         /* u(i) = a(i,i+1)/l(i) = h(i)/l(i) */
         u[i] = hi/li;
         /* z(i) = [B(i)-h(i-1)*z(i-1)]/l(i) */
@@ -264,12 +252,12 @@ void arm_spline_f32(
     /* === Compute b(i) and d(i) from c(i) and create output for x(i)<x<x(i+1) === */
     for (i=0; i<n-1; i++)
     {
-	hi = x[i+1]-x[i];
+        hi = x[i+1]-x[i];
         bi = (y[i+1]-y[i])/hi-hi*(c[i+1]+2*c[i])/3;
         di = (c[i+1]-c[i])/(3*hi);
 
 #ifdef ARM_MATH_NEON
-	xiv = vdupq_n_f32(x[i]);
+        xiv = vdupq_n_f32(x[i]);
 
         aiv = vdupq_n_f32(y[i]);
         biv = vdupq_n_f32(bi);
@@ -277,42 +265,42 @@ void arm_spline_f32(
         div = vdupq_n_f32(di);
 
         while( *(pXq+4) <= x[i+1] && blkCnt > 4 )
-	{
-	    /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+        {
+            /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
             xqv = vld1q_f32(pXq);
-	    pXq+=4;
-
-	    /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+            pXq+=4;
+        
+            /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
             diff = vsubq_f32(xqv, xiv);
-	    temp = diff;
-
-	    /* y(i) = a(i) + ... */
+            temp = diff;
+        
+            /* y(i) = a(i) + ... */
             yv = aiv;
-	    /* ... + b(i)*(x-x(i)) + ... */
-	    yv = vmlaq_f32(yv, biv, temp);
-	    /* ... + c(i)*(x-x(i))^2 + ... */
-	    temp = vmulq_f32(temp, diff);
-	    yv = vmlaq_f32(yv, civ, temp);
+            /* ... + b(i)*(x-x(i)) + ... */
+            yv = vmlaq_f32(yv, biv, temp);
+            /* ... + c(i)*(x-x(i))^2 + ... */
+            temp = vmulq_f32(temp, diff);
+            yv = vmlaq_f32(yv, civ, temp);
             /* ... + d(i)*(x-x(i))^3 */
-	    temp = vmulq_f32(temp, diff);
-	    yv = vmlaq_f32(yv, div, temp);
-
+            temp = vmulq_f32(temp, diff);
+            yv = vmlaq_f32(yv, div, temp);
+        
             /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
-	    vst1q_f32(pDst, yv);
-	    pDst+=4;
-
-	    blkCnt-=4;
-	}
+            vst1q_f32(pDst, yv);
+            pDst+=4;
+        
+            blkCnt-=4;
+        }
 #endif
-        while( *pXq <= x[i+1] && blkCnt > 0 )	
-	{
-	    x_sc = *pXq++;
+        while( *pXq <= x[i+1] && blkCnt > 0 )
+        {
+            x_sc = *pXq++;
 
             *pDst = y[i]+bi*(x_sc-x[i])+c[i]*(x_sc-x[i])*(x_sc-x[i])+di*(x_sc-x[i])*(x_sc-x[i])*(x_sc-x[i]);
 
             pDst++;
-	    blkCnt--;
-	}
+            blkCnt--;
+        }
     }
 
     /* == Create output for remaining samples (x>=x(n)) == */
diff --git a/Source/SupportFunctions/arm_spline_interp_init_f32.c b/Source/SupportFunctions/arm_spline_interp_init_f32.c
index e014f201..d5065d92 100644
--- a/Source/SupportFunctions/arm_spline_interp_init_f32.c
+++ b/Source/SupportFunctions/arm_spline_interp_init_f32.c
@@ -47,15 +47,16 @@
 void arm_spline_init_f32(
     arm_spline_instance_f32 * S, 
     uint32_t n, 
-    arm_spline_type type)
+    arm_spline_type type,
+    float32_t * buffer)
 {
-    S->n_x    = n;
-    S->type   = type;
-
+    S->n_x  = n;
+    S->type = type;
     /* Type (boundary conditions):
-     *   0: Natural spline          ( S1''(x1)=0 ; Sn''(xn)=0 )
-     *   1: Parabolic runout spline ( S1''(x1)=S2''(x2) ; Sn-1''(xn-1)=Sn''(xn) )
+     *  - Natural spline          ( S1''(x1)=0 ; Sn''(xn)=0 )
+     *  - Parabolic runout spline ( S1''(x1)=S2''(x2) ; Sn-1''(xn-1)=Sn''(xn) )
      */
+    S->buffer = buffer;
 }
 
 /**
diff --git a/Testing/Include/Tests/SupportTestsF32.h b/Testing/Include/Tests/SupportTestsF32.h
index 2830f02f..0fe693cd 100755
--- a/Testing/Include/Tests/SupportTestsF32.h
+++ b/Testing/Include/Tests/SupportTestsF32.h
@@ -12,9 +12,10 @@ class SupportTestsF32:public Client::Suite
             Client::Pattern<float32_t> input;
             Client::Pattern<float32_t> coefs;
 
-	    Client::Pattern<float32_t> inputX;
-	    Client::Pattern<float32_t> inputY;
-	    Client::Pattern<float32_t> outputX;
+            Client::Pattern<float32_t> inputX;
+            Client::Pattern<float32_t> inputY;
+            Client::Pattern<float32_t> outputX;
+            Client::Pattern<float32_t> buffer;
 
             Client::LocalPattern<float32_t> output;
             Client::LocalPattern<q15_t> outputQ15;
diff --git a/Testing/Source/Tests/SupportTestsF32.cpp b/Testing/Source/Tests/SupportTestsF32.cpp
index 58468372..8f9ea251 100755
--- a/Testing/Source/Tests/SupportTestsF32.cpp
+++ b/Testing/Source/Tests/SupportTestsF32.cpp
@@ -297,11 +297,12 @@
     {
        float32_t *inp = input.ptr();
        float32_t *outp = output.ptr();
-       arm_sort_instance_f32 S;
-
-       arm_sort_init_f32(&S, ARM_SORT_MERGE, ARM_SORT_ASCENDING);
+       float32_t *buf = buffer.ptr();
+       buf = (float32_t *)malloc((this->nbSamples)*sizeof(float32_t) );
+       arm_merge_sort_instance_f32 S;
 
-       arm_sort_f32(&S,inp,outp,this->nbSamples);
+       arm_merge_sort_init_f32(&S, ARM_SORT_ASCENDING, buf);
+       arm_merge_sort_f32(&S,inp,outp,this->nbSamples);
         
        ASSERT_EMPTY_TAIL(output);
 
@@ -313,11 +314,12 @@
     {
        float32_t *inp = input.ptr();
        float32_t *outp = output.ptr();
-       arm_sort_instance_f32 S;
-
-       arm_sort_init_f32(&S, ARM_SORT_MERGE, ARM_SORT_ASCENDING);
+       float32_t *buf = buffer.ptr();
+       buf = (float32_t *)malloc((this->nbSamples)*sizeof(float32_t) );
+       arm_merge_sort_instance_f32 S;
 
-       arm_sort_f32(&S,inp,outp,this->nbSamples);
+       arm_merge_sort_init_f32(&S, ARM_SORT_ASCENDING, buf);
+       arm_merge_sort_f32(&S,inp,outp,this->nbSamples);
         
        ASSERT_EMPTY_TAIL(output);
 
@@ -424,10 +426,11 @@
        const float32_t *inpY = inputY.ptr();
        const float32_t *outX = outputX.ptr();
        float32_t *outp = output.ptr();
-
+       float32_t *buf = buffer.ptr();
+       buf=(float32_t*)malloc((3*4-1)*sizeof(float32_t));
        arm_spline_instance_f32 S;
 
-       arm_spline_init_f32(&S, 4, ARM_SPLINE_PARABOLIC_RUNOUT);
+       arm_spline_init_f32(&S, 4, ARM_SPLINE_PARABOLIC_RUNOUT, buf);
        arm_spline_f32(&S, inpX, inpY, outX, outp, 20);
 
        ASSERT_EMPTY_TAIL(output);
@@ -440,10 +443,11 @@
        const float32_t *inpY = inputY.ptr();
        const float32_t *outX = outputX.ptr();
        float32_t *outp = output.ptr();
-
+       float32_t *buf = buffer.ptr();
+       buf=(float32_t*)malloc((3*9-1)*sizeof(float32_t));
        arm_spline_instance_f32 S;
 
-       arm_spline_init_f32(&S, 9, ARM_SPLINE_NATURAL);
+       arm_spline_init_f32(&S, 9, ARM_SPLINE_NATURAL, buf);
        arm_spline_f32(&S, inpX, inpY, outX, outp, 33);
 
        ASSERT_EMPTY_TAIL(output);
@@ -456,10 +460,11 @@
        const float32_t *inpY = inputY.ptr();
        const float32_t *outX = outputX.ptr();
        float32_t *outp = output.ptr();
-
+       float32_t *buf = buffer.ptr();
+       buf=(float32_t*)malloc((3*3-1)*sizeof(float32_t));
        arm_spline_instance_f32 S;
 
-       arm_spline_init_f32(&S, 3, ARM_SPLINE_PARABOLIC_RUNOUT);
+       arm_spline_init_f32(&S, 3, ARM_SPLINE_PARABOLIC_RUNOUT, buf);
        arm_spline_f32(&S, inpX, inpY, outX, outp, 30);
 
        ASSERT_EMPTY_TAIL(output);