Updated FAQ of compute graph documentation.

Corrected issue in GenericNodes.h
pull/94/head
Christophe Favergeon 3 years ago
parent 46a732c158
commit 175545244d

@@ -257,67 +257,133 @@ public:
The `input` and `output` arrays, used in the sink / source, are defined as extern. The source is reading from `input` and the sink is writing to `output`.

The generated scheduler is:

```C++
uint32_t scheduler(int *error)
{
    int cgStaticError=0;
    uint32_t nbSchedule=0;
    int32_t debugCounter=1;

    CG_BEFORE_FIFO_INIT;
    /*
    Create FIFOs objects
    */
    FIFO<float32_t,FIFOSIZE0,1,0> fifo0(buf0);
    FIFO<float32_t,FIFOSIZE1,1,0> fifo1(buf1);

    CG_BEFORE_NODE_INIT;
    /*
    Create node objects
    */
    ProcessingNode<float32_t,128,float32_t,128> proc(fifo0,fifo1);
    Sink<float32_t,128> sink(fifo1);
    Source<float32_t,128> source(fifo0);

    /* Run several schedule iterations */
    CG_BEFORE_SCHEDULE;
    while((cgStaticError==0) && (debugCounter > 0))
    {
        /* Run a schedule iteration */
        CG_BEFORE_ITERATION;
        for(unsigned long id=0 ; id < 3; id++)
        {
            CG_BEFORE_NODE_EXECUTION;

            switch(schedule[id])
            {
                case 0:
                {
                    cgStaticError = proc.run();
                }
                break;

                case 1:
                {
                    cgStaticError = sink.run();
                }
                break;

                case 2:
                {
                    cgStaticError = source.run();
                }
                break;

                default:
                break;
            }
            CG_AFTER_NODE_EXECUTION;
            CHECKERROR;
        }
        debugCounter--;
        CG_AFTER_ITERATION;
        nbSchedule++;
    }

errorHandling:
    CG_AFTER_SCHEDULE;
    *error=cgStaticError;
    return(nbSchedule);
}
```
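
The `schedule[]` array indexed by the loop is generated together with this function but is not reproduced in the FAQ. Since the source must run before the processing node, which must run before the sink, it would presumably look like the following sketch (the exact name, type and contents are assumptions about the generated file, not a quote from it):

```C++
/* Hypothetical reconstruction of the generated schedule table.
   The indexes match the switch above: 0 = proc, 1 = sink, 2 = source,
   and the dataflow order is source -> proc -> sink. */
static const unsigned int schedule[3] = { 2, 0, 1 };
```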

If we look at the assembly of the scheduler, generated for a Cortex-M7 with `-Ofast` and armclang `AC6.19`, for **one** iteration of the schedule we get the following (the disassembly is taken from the uVision IDE):

```txt
0x000004B0 B570      PUSH     {r4-r6,lr}
    97:         b[i] = input[i];
0x000004B2 F2402518  MOVW     r5,#0x218
0x000004B6 F2406118  MOVW     r1,#0x618
0x000004BA F2C20500  MOVT     r5,#0x2000
0x000004BE 4604      MOV      r4,r0
0x000004C0 F2C20100  MOVT     r1,#0x2000
0x000004C4 F44F7200  MOV      r2,#0x200
0x000004C8 4628      MOV      r0,r5
0x000004CA F00BF8E6  BL.W     0x0000B69A  __aeabi_memcpy4
0x000004CE EEB60A00  VMOV.F32 s0,#0.5
   131:     arm_offset_f32(a,0.5,b,inputSize);
0x000004D2 F2404618  MOVW     r6,#0x418
0x000004D6 F2C20600  MOVT     r6,#0x2000
0x000004DA 2280      MOVS     r2,#0x80
0x000004DC 4628      MOV      r0,r5
0x000004DE 4631      MOV      r1,r6
0x000004E0 F002FC5E  BL.W     0x00002DA0  arm_offset_f32
    63:         output[i] = b[i];
0x000004E4 F648705C  MOVW     r0,#0x8F5C
0x000004E8 F44F7200  MOV      r2,#0x200
0x000004EC F2C20000  MOVT     r0,#0x2000
0x000004F0 4631      MOV      r1,r6
0x000004F2 F00BF8D2  BL.W     0x0000B69A  __aeabi_memcpy4
   163:        CG_AFTER_ITERATION;
   164:        nbSchedule++;
   165:     }
   166:
   167: errorHandling:
   168:     CG_AFTER_SCHEDULE;
   169:     *error=cgStaticError;
   170:     return(nbSchedule);
0x000004F6 F2402014  MOVW     r0,#0x214
0x000004FA F2C20000  MOVT     r0,#0x2000
0x000004FE 6801      LDR      r1,[r0,#0x00]
0x00000500 3101      ADDS     r1,r1,#0x01
0x00000502 6001      STR      r1,[r0,#0x00]
   171: }
0x00000504 2001      MOVS     r0,#0x01
0x00000506 2100      MOVS     r1,#0x00
   169:     *error=cgStaticError;
0x00000508 6021      STR      r1,[r4,#0x00]
0x0000050A BD70      POP      {r4-r6,pc}
```

It is the code you would get by manually writing the calls to the corresponding CMSIS-DSP functions. All the C++ templates have disappeared. The switch / case used to implement the scheduler has also been removed.
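
For comparison, here is a rough sketch of what one hand-written iteration would look like, based on the disassembly above. `buf0` and `buf1` are the FIFO buffers from the generated scheduler, and the 128-sample block size and 0.5 offset come from the example graph; this is an illustration, not code taken from the repository:

```C++
#include <string.h>
#include "arm_math.h"

extern float32_t input[];              /* extern source buffer, as described above */
extern float32_t output[];             /* extern sink buffer, as described above   */
static float32_t buf0[128], buf1[128]; /* FIFO buffers, used here as plain arrays  */

void one_iteration(void)
{
    memcpy(buf0, input, 128 * sizeof(float32_t));   /* Source node      */
    arm_offset_f32(buf0, 0.5f, buf1, 128);          /* Processing node  */
    memcpy(output, buf1, 128 * sizeof(float32_t));  /* Sink node        */
}
```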

The code was generated with `memoryOptimization` enabled, and the Python script detected that in this case the FIFOs are used as arrays. As a consequence, there is no FIFO update code: the FIFOs are accessed as plain arrays.

The generated code is as efficient as something manually coded.

The sink and the source have each been reduced to a `memcpy`. The processing node is reduced to loading the argument registers and branching to the corresponding CMSIS-DSP function.
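
To see why the sink collapses to `__aeabi_memcpy4`, here is a plausible sketch of the Sink node used in this example (the Source node is symmetric). The class name and template parameters come from the generated scheduler above; the `GenericSink` base class, its constructor signature and its `getReadBuffer()` accessor are assumed to follow `GenericNodes.h`, so treat this as an illustration rather than the repository code:

```C++
template<typename IN, int inputSize>
class Sink: public GenericSink<IN, inputSize>
{
public:
    Sink(FIFOBase<IN> &src) : GenericSink<IN, inputSize>(src) {};

    int run() final
    {
        IN *b = this->getReadBuffer();  /* FIFO in array mode: returns the buffer directly */
        for (int i = 0; i < inputSize; i++)
        {
            output[i] = b[i];           /* plain copy loop, compiled into __aeabi_memcpy4 */
        }
        return(0);
    };
};
```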

It is not always as ideal as in this example, but it demonstrates that the use of C++ templates and a Python code generator enables a low-overhead solution to the problem of streaming compute graphs.

@@ -156,12 +156,12 @@ class FIFO<T,length,1,0>: public FIFOBase<T>
 bool willOverflowWith(int nb) const final {return false;};
 int nbSamplesInFIFO() const final {return 0;};

-T * getWriteBuffer(int nb) const final
+T * getWriteBuffer(int nb) final
 {
     return(mBuffer);
 };

-T* getReadBuffer(int nb) const final
+T* getReadBuffer(int nb) final
 {
     return(mBuffer);
 }
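
These buffer accessors are what node implementations call from their `run()` method. As an illustration, here is a minimal sketch of the ProcessingNode used in the FAQ example above; the `GenericNode` base class, its constructor signature and its `getReadBuffer()` / `getWriteBuffer()` helpers are assumed from `GenericNodes.h`, so the real code may differ in detail:

```C++
template<typename IN, int inputSize, typename OUT, int outputSize>
class ProcessingNode: public GenericNode<IN, inputSize, OUT, outputSize>
{
public:
    ProcessingNode(FIFOBase<IN> &src, FIFOBase<OUT> &dst)
        : GenericNode<IN, inputSize, OUT, outputSize>(src, dst) {};

    int run() final
    {
        IN  *a = this->getReadBuffer();   /* input FIFO buffer (mBuffer in array mode)  */
        OUT *b = this->getWriteBuffer();  /* output FIFO buffer (mBuffer in array mode) */
        arm_offset_f32(a, 0.5f, b, inputSize);
        return(0);
    };
};
```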
