Updated FAQ of compute graph documentation.

Corrected issue in GenericNodes.h
3 years ago · 175545244d
parent 46a732c158
commit 175545244d
2 changed files with 119 additions and 53 deletions
--- a/ComputeGraph/FAQ.md
+++ b/ComputeGraph/FAQ.md
@ -257,67 +257,133 @@ public:

 The `input` and `output` arrays, used in the sink / source, are defined as extern. The source is reading from `input` and the sink is writing to `output`.

-If we look at the asm code generated with `-Ofast` with armclang `AC6` and for one iteration of the schedule, we get:
+The generated scheduler is:

-```txt
-PUSH     {r4-r6,lr}
-MOVW     r5,#0x220
-MOVW     r1,#0x620
-MOVT     r5,#0x3000
-MOV      r4,r0
-MOVT     r1,#0x3000
-MOV      r0,r5
-MOV      r2,#0x200
-BL       __aeabi_memcpy4 ; 0x10000a94
-MOVW     r6,#0x420
-MOV      r0,r5
-MOVT     r6,#0x3000
-MOVS     r2,#0x80
-VMOV.F32 s0,#0.5
-MOV      r1,r6
-BL       arm_offset_f32 ; 0x10002cd0
-MOV      r0,#0x942c
-MOV      r1,r6
-MOVT     r0,#0x3000
-MOV      r2,#0x200
-BL       __aeabi_memcpy4 ; 0x10000a94
-MOVS     r1,#0
-MOVS     r0,#1
-STR      r1,[r4,#0]
-POP      {r4-r6,pc}
-```
-
-It is the code you would get if you was manually writing a call to the corresponding CMSIS-DSP function. All the C++ templates have disappeared. The switch / case used to implement the scheduler has also been removed.
+```C++
+uint32_t scheduler(int *error)
+{
+    int cgStaticError=0;
+    uint32_t nbSchedule=0;
+    int32_t debugCounter=1;
+
+    CG_BEFORE_FIFO_INIT;
+    /*
+    Create FIFOs objects
+    */
+    FIFO<float32_t,FIFOSIZE0,1,0> fifo0(buf0);
+    FIFO<float32_t,FIFOSIZE1,1,0> fifo1(buf1);
+
+    CG_BEFORE_NODE_INIT;
+    /* 
+    Create node objects
+    */
+    ProcessingNode<float32_t,128,float32_t,128> proc(fifo0,fifo1);
+    Sink<float32_t,128> sink(fifo1);
+    Source<float32_t,128> source(fifo0);
+
+    /* Run several schedule iterations */
+    CG_BEFORE_SCHEDULE;
+    while((cgStaticError==0) && (debugCounter > 0))
+    {
+        /* Run a schedule iteration */
+        CG_BEFORE_ITERATION;
+        for(unsigned long id=0 ; id < 3; id++)
+        {
+            CG_BEFORE_NODE_EXECUTION;

-The code was generated with `memoryOptimization` enabled and the Python script detected in this case that the FIFOs are used as arrays. As consequence, there is no FIFO update code. They are used as normal arrays.
+            switch(schedule[id])
+            {
+                case 0:
+                {
+                   cgStaticError = proc.run();
+                }
+                break;

-The generated code is as efficient as something manually coded.
+                case 1:
+                {
+                   cgStaticError = sink.run();
+                }
+                break;

-The sink and the sources have been replaced by a `memcpy`. The call to the CMSIS-DSP function is just loading the registers and branching to the CMSIS-DSP function.
+                case 2:
+                {
+                   cgStaticError = source.run();
+                }
+                break;

-The input buffer `input` is at address `0x30000620`.
+                default:
+                break;
+            }
+            CG_AFTER_NODE_EXECUTION;
+            CHECKERROR;
+        }
+       debugCounter--;
+       CG_AFTER_ITERATION;
+       nbSchedule++;
+    }

-The `output` buffer is at address `0x3000942c`.
+errorHandling:
+    CG_AFTER_SCHEDULE;
+    *error=cgStaticError;
+    return(nbSchedule);
+}
+```

-We can see in the code:
+If we look at the asm of the scheduler generated for a Cortex-M7 with `-Ofast` with armclang `AC6.19` and for **one** iteration of the schedule, we get (disassembly is from uVision IDE):

 ```txt
-MOVW     r1,#0x620
-...
-MOVT     r1,#0x3000
+0x000004B0 B570      PUSH          {r4-r6,lr}
+    97:             b[i] = input[i]; 
+0x000004B2 F2402518  MOVW          r5,#0x218
+0x000004B6 F2406118  MOVW          r1,#0x618
+0x000004BA F2C20500  MOVT          r5,#0x2000
+0x000004BE 4604      MOV           r4,r0
+0x000004C0 F2C20100  MOVT          r1,#0x2000
+0x000004C4 F44F7200  MOV           r2,#0x200
+0x000004C8 4628      MOV           r0,r5
+0x000004CA F00BF8E6  BL.W          0x0000B69A __aeabi_memcpy4
+0x000004CE EEB60A00  VMOV.F32      s0,#0.5
+   131:         arm_offset_f32(a,0.5,b,inputSize); 
+0x000004D2 F2404618  MOVW          r6,#0x418
+0x000004D6 F2C20600  MOVT          r6,#0x2000
+0x000004DA 2280      MOVS          r2,#0x80
+0x000004DC 4628      MOV           r0,r5
+0x000004DE 4631      MOV           r1,r6
+0x000004E0 F002FC5E  BL.W          0x00002DA0 arm_offset_f32
+    63:             output[i] = b[i]; 
+0x000004E4 F648705C  MOVW          r0,#0x8F5C
+0x000004E8 F44F7200  MOV           r2,#0x200
+0x000004EC F2C20000  MOVT          r0,#0x2000
+0x000004F0 4631      MOV           r1,r6
+0x000004F2 F00BF8D2  BL.W          0x0000B69A __aeabi_memcpy4
+   163:        CG_AFTER_ITERATION; 
+   164:        nbSchedule++; 
+   165:     } 
+   166:  
+   167: errorHandling: 
+   168:     CG_AFTER_SCHEDULE; 
+   169:     *error=cgStaticError; 
+   170:     return(nbSchedule); 
+0x000004F6 F2402014  MOVW          r0,#0x214
+0x000004FA F2C20000  MOVT          r0,#0x2000
+0x000004FE 6801      LDR           r1,[r0,#0x00]
+0x00000500 3101      ADDS          r1,r1,#0x01
+0x00000502 6001      STR           r1,[r0,#0x00]
+   171: } 
+0x00000504 2001      MOVS          r0,#0x01
+0x00000506 2100      MOVS          r1,#0x00
+   169:     *error=cgStaticError; 
+0x00000508 6021      STR           r1,[r4,#0x00]
+0x0000050A BD70      POP           {r4-r6,pc}
 ```

-or
+It is the code you would get if you was manually writing a call to the corresponding CMSIS-DSP functions. All the C++ templates have disappeared. The switch / case used to implement the scheduler has also been removed.

-```
-MOV      r0,#0x942c
-...
-MOVT     r0,#0x3000
-```
-
-just before the `memcpy`
+The code was generated with `memoryOptimization` enabled and the Python script detected in this case that the FIFOs are used as arrays. As consequence, there is no FIFO update code. They are used as normal arrays.

+The generated code is as efficient as something manually coded.

+The sink and the sources have been replaced by a `memcpy`. The call to the CMSIS-DSP function is just loading the registers and branching to the CMSIS-DSP function.

 It is not always as ideal as in this example. But it demonstrates that the use of C++ templates and a Python code generator is enabling a low overhead solution to the problem of streaming and compute graph.

--- a/ComputeGraph/cg/src/GenericNodes.h
+++ b/ComputeGraph/cg/src/GenericNodes.h
@ -156,12 +156,12 @@ class FIFO<T,length,1,0>: public FIFOBase<T>
        bool willOverflowWith(int nb) const final {return false;};
        int nbSamplesInFIFO() const final {return 0;};

-        T * getWriteBuffer(int nb) const final
+        T * getWriteBuffer(int nb) final
        {
            return(mBuffer);
        };

-        T* getReadBuffer(int nb) const final
+        T* getReadBuffer(int nb) final
        {
            return(mBuffer);
        }