Updated FAQ of compute graph documentation.

Corrected issue in GenericNodes.h
pull/94/head
Christophe Favergeon 3 years ago
parent 46a732c158
commit 175545244d

@@ -257,67 +257,133 @@ public:
The `input` and `output` arrays, used in the sink / source, are defined as extern. The source is reading from `input` and the sink is writing to `output`.

The generated scheduler is:

```C++
uint32_t scheduler(int *error)
{
    int cgStaticError=0;
    uint32_t nbSchedule=0;
    int32_t debugCounter=1;

    CG_BEFORE_FIFO_INIT;
    /*
    Create FIFOs objects
    */
    FIFO<float32_t,FIFOSIZE0,1,0> fifo0(buf0);
    FIFO<float32_t,FIFOSIZE1,1,0> fifo1(buf1);

    CG_BEFORE_NODE_INIT;
    /*
    Create node objects
    */
    ProcessingNode<float32_t,128,float32_t,128> proc(fifo0,fifo1);
    Sink<float32_t,128> sink(fifo1);
    Source<float32_t,128> source(fifo0);

    /* Run several schedule iterations */
    CG_BEFORE_SCHEDULE;
    while((cgStaticError==0) && (debugCounter > 0))
    {
        /* Run a schedule iteration */
        CG_BEFORE_ITERATION;
        for(unsigned long id=0 ; id < 3; id++)
        {
            CG_BEFORE_NODE_EXECUTION;

            switch(schedule[id])
            {
                case 0:
                {
                    cgStaticError = proc.run();
                }
                break;

                case 1:
                {
                    cgStaticError = sink.run();
                }
                break;

                case 2:
                {
                    cgStaticError = source.run();
                }
                break;

                default:
                break;
            }
            CG_AFTER_NODE_EXECUTION;
            CHECKERROR;
        }
        debugCounter--;
        CG_AFTER_ITERATION;
        nbSchedule++;
    }

errorHandling:
    CG_AFTER_SCHEDULE;
    *error=cgStaticError;
    return(nbSchedule);
}
```
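
The `schedule[]` array indexed by the loop is generated together with this function but is not reproduced in the FAQ. Since the source must run before the processing node, which must run before the sink, it would presumably look like the following sketch (the exact name, type and contents are assumptions about the generated file, not a quote from it):

```C++
/* Hypothetical reconstruction of the generated schedule table.
   The indexes match the switch above: 0 = proc, 1 = sink, 2 = source,
   and the dataflow order is source -> proc -> sink. */
static const unsigned int schedule[3] = { 2, 0, 1 };
```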

If we look at the assembly of the scheduler, generated for a Cortex-M7 with `-Ofast` and armclang `AC6.19`, for **one** iteration of the schedule we get the following (the disassembly is taken from the uVision IDE):

```txt
0x000004B0 B570      PUSH     {r4-r6,lr}
    97:         b[i] = input[i];
0x000004B2 F2402518  MOVW     r5,#0x218
0x000004B6 F2406118  MOVW     r1,#0x618
0x000004BA F2C20500  MOVT     r5,#0x2000
0x000004BE 4604      MOV      r4,r0
0x000004C0 F2C20100  MOVT     r1,#0x2000
0x000004C4 F44F7200  MOV      r2,#0x200
0x000004C8 4628      MOV      r0,r5
0x000004CA F00BF8E6  BL.W     0x0000B69A  __aeabi_memcpy4
0x000004CE EEB60A00  VMOV.F32 s0,#0.5
   131:     arm_offset_f32(a,0.5,b,inputSize);
0x000004D2 F2404618  MOVW     r6,#0x418
0x000004D6 F2C20600  MOVT     r6,#0x2000
0x000004DA 2280      MOVS     r2,#0x80
0x000004DC 4628      MOV      r0,r5
0x000004DE 4631      MOV      r1,r6
0x000004E0 F002FC5E  BL.W     0x00002DA0  arm_offset_f32
    63:         output[i] = b[i];
0x000004E4 F648705C  MOVW     r0,#0x8F5C
0x000004E8 F44F7200  MOV      r2,#0x200
0x000004EC F2C20000  MOVT     r0,#0x2000
0x000004F0 4631      MOV      r1,r6
0x000004F2 F00BF8D2  BL.W     0x0000B69A  __aeabi_memcpy4
   163:        CG_AFTER_ITERATION;
   164:        nbSchedule++;
   165:     }
   166:
   167: errorHandling:
   168:     CG_AFTER_SCHEDULE;
   169:     *error=cgStaticError;
   170:     return(nbSchedule);
0x000004F6 F2402014  MOVW     r0,#0x214
0x000004FA F2C20000  MOVT     r0,#0x2000
0x000004FE 6801      LDR      r1,[r0,#0x00]
0x00000500 3101      ADDS     r1,r1,#0x01
0x00000502 6001      STR      r1,[r0,#0x00]
   171: }
0x00000504 2001      MOVS     r0,#0x01
0x00000506 2100      MOVS     r1,#0x00
   169:     *error=cgStaticError;
0x00000508 6021      STR      r1,[r4,#0x00]
0x0000050A BD70      POP      {r4-r6,pc}
```

It is the code you would get by manually writing the calls to the corresponding CMSIS-DSP functions. All the C++ templates have disappeared. The switch / case used to implement the scheduler has also been removed.
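
For comparison, here is a rough sketch of what one hand-written iteration would look like, based on the disassembly above. `buf0` and `buf1` are the FIFO buffers from the generated scheduler, and the 128-sample block size and 0.5 offset come from the example graph; this is an illustration, not code taken from the repository:

```C++
#include <string.h>
#include "arm_math.h"

extern float32_t input[];              /* extern source buffer, as described above */
extern float32_t output[];             /* extern sink buffer, as described above   */
static float32_t buf0[128], buf1[128]; /* FIFO buffers, used here as plain arrays  */

void one_iteration(void)
{
    memcpy(buf0, input, 128 * sizeof(float32_t));   /* Source node      */
    arm_offset_f32(buf0, 0.5f, buf1, 128);          /* Processing node  */
    memcpy(output, buf1, 128 * sizeof(float32_t));  /* Sink node        */
}
```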

The code was generated with `memoryOptimization` enabled, and the Python script detected that in this case the FIFOs are used as arrays. As a consequence, there is no FIFO update code: the FIFOs are accessed as plain arrays.

The generated code is as efficient as something manually coded.

The sink and the source have each been reduced to a `memcpy`. The processing node is reduced to loading the argument registers and branching to the corresponding CMSIS-DSP function.
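
To see why the sink collapses to `__aeabi_memcpy4`, here is a plausible sketch of the Sink node used in this example (the Source node is symmetric). The class name and template parameters come from the generated scheduler above; the `GenericSink` base class, its constructor signature and its `getReadBuffer()` accessor are assumed to follow `GenericNodes.h`, so treat this as an illustration rather than the repository code:

```C++
template<typename IN, int inputSize>
class Sink: public GenericSink<IN, inputSize>
{
public:
    Sink(FIFOBase<IN> &src) : GenericSink<IN, inputSize>(src) {};

    int run() final
    {
        IN *b = this->getReadBuffer();  /* FIFO in array mode: returns the buffer directly */
        for (int i = 0; i < inputSize; i++)
        {
            output[i] = b[i];           /* plain copy loop, compiled into __aeabi_memcpy4 */
        }
        return(0);
    };
};
```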

It is not always as ideal as in this example, but it demonstrates that the use of C++ templates and a Python code generator enables a low-overhead solution to the problem of streaming compute graphs.

@@ -156,12 +156,12 @@ class FIFO<T,length,1,0>: public FIFOBase<T>
 bool willOverflowWith(int nb) const final {return false;};
 int nbSamplesInFIFO() const final {return 0;};

-T * getWriteBuffer(int nb) const final
+T * getWriteBuffer(int nb) final
 {
     return(mBuffer);
 };

-T* getReadBuffer(int nb) const final
+T* getReadBuffer(int nb) final
 {
     return(mBuffer);
 }
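
These buffer accessors are what node implementations call from their `run()` method. As an illustration, here is a minimal sketch of the ProcessingNode used in the FAQ example above; the `GenericNode` base class, its constructor signature and its `getReadBuffer()` / `getWriteBuffer()` helpers are assumed from `GenericNodes.h`, so the real code may differ in detail:

```C++
template<typename IN, int inputSize, typename OUT, int outputSize>
class ProcessingNode: public GenericNode<IN, inputSize, OUT, outputSize>
{
public:
    ProcessingNode(FIFOBase<IN> &src, FIFOBase<OUT> &dst)
        : GenericNode<IN, inputSize, OUT, outputSize>(src, dst) {};

    int run() final
    {
        IN  *a = this->getReadBuffer();   /* input FIFO buffer (mBuffer in array mode)  */
        OUT *b = this->getWriteBuffer();  /* output FIFO buffer (mBuffer in array mode) */
        arm_offset_f32(a, 0.5f, b, inputSize);
        return(0);
    };
};
```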
