Updated FAQ of compute graph documentation.

Corrected issue in GenericNodes.h
pull/94/head
Christophe Favergeon 3 years ago
parent 46a732c158
commit 175545244d

@ -257,67 +257,133 @@ public:
The `input` and `output` arrays, used in the sink / source, are defined as extern. The source is reading from `input` and the sink is writing to `output`.
If we look at the asm code generated with `-Ofast` with armclang `AC6` and for one iteration of the schedule, we get:
The generated scheduler is:
```txt
PUSH {r4-r6,lr}
MOVW r5,#0x220
MOVW r1,#0x620
MOVT r5,#0x3000
MOV r4,r0
MOVT r1,#0x3000
MOV r0,r5
MOV r2,#0x200
BL __aeabi_memcpy4 ; 0x10000a94
MOVW r6,#0x420
MOV r0,r5
MOVT r6,#0x3000
MOVS r2,#0x80
VMOV.F32 s0,#0.5
MOV r1,r6
BL arm_offset_f32 ; 0x10002cd0
MOV r0,#0x942c
MOV r1,r6
MOVT r0,#0x3000
MOV r2,#0x200
BL __aeabi_memcpy4 ; 0x10000a94
MOVS r1,#0
MOVS r0,#1
STR r1,[r4,#0]
POP {r4-r6,pc}
```C++
uint32_t scheduler(int *error)
{
int cgStaticError=0;
uint32_t nbSchedule=0;
int32_t debugCounter=1;
CG_BEFORE_FIFO_INIT;
/*
Create FIFOs objects
*/
FIFO<float32_t,FIFOSIZE0,1,0> fifo0(buf0);
FIFO<float32_t,FIFOSIZE1,1,0> fifo1(buf1);
CG_BEFORE_NODE_INIT;
/*
Create node objects
*/
ProcessingNode<float32_t,128,float32_t,128> proc(fifo0,fifo1);
Sink<float32_t,128> sink(fifo1);
Source<float32_t,128> source(fifo0);
/* Run several schedule iterations */
CG_BEFORE_SCHEDULE;
while((cgStaticError==0) && (debugCounter > 0))
{
/* Run a schedule iteration */
CG_BEFORE_ITERATION;
for(unsigned long id=0 ; id < 3; id++)
{
CG_BEFORE_NODE_EXECUTION;
switch(schedule[id])
{
case 0:
{
cgStaticError = proc.run();
}
break;
case 1:
{
cgStaticError = sink.run();
}
break;
case 2:
{
cgStaticError = source.run();
}
break;
default:
break;
}
CG_AFTER_NODE_EXECUTION;
CHECKERROR;
}
debugCounter--;
CG_AFTER_ITERATION;
nbSchedule++;
}
errorHandling:
CG_AFTER_SCHEDULE;
*error=cgStaticError;
return(nbSchedule);
}
```
It is the code you would get if you was manually writing a call to the corresponding CMSIS-DSP function. All the C++ templates have disappeared. The switch / case used to implement the scheduler has also been removed.
The code was generated with `memoryOptimization` enabled and the Python script detected in this case that the FIFOs are used as arrays. As consequence, there is no FIFO update code. They are used as normal arrays.
The generated code is as efficient as something manually coded.
The sink and the sources have been replaced by a `memcpy`. The call to the CMSIS-DSP function is just loading the registers and branching to the CMSIS-DSP function.
The input buffer `input` is at address `0x30000620`.
The `output` buffer is at address `0x3000942c`.
We can see in the code:
If we look at the asm of the scheduler generated for a Cortex-M7 with `-Ofast` with armclang `AC6.19` and for **one** iteration of the schedule, we get (disassembly is from uVision IDE):
```txt
MOVW r1,#0x620
...
MOVT r1,#0x3000
0x000004B0 B570 PUSH {r4-r6,lr}
97: b[i] = input[i];
0x000004B2 F2402518 MOVW r5,#0x218
0x000004B6 F2406118 MOVW r1,#0x618
0x000004BA F2C20500 MOVT r5,#0x2000
0x000004BE 4604 MOV r4,r0
0x000004C0 F2C20100 MOVT r1,#0x2000
0x000004C4 F44F7200 MOV r2,#0x200
0x000004C8 4628 MOV r0,r5
0x000004CA F00BF8E6 BL.W 0x0000B69A __aeabi_memcpy4
0x000004CE EEB60A00 VMOV.F32 s0,#0.5
131: arm_offset_f32(a,0.5,b,inputSize);
0x000004D2 F2404618 MOVW r6,#0x418
0x000004D6 F2C20600 MOVT r6,#0x2000
0x000004DA 2280 MOVS r2,#0x80
0x000004DC 4628 MOV r0,r5
0x000004DE 4631 MOV r1,r6
0x000004E0 F002FC5E BL.W 0x00002DA0 arm_offset_f32
63: output[i] = b[i];
0x000004E4 F648705C MOVW r0,#0x8F5C
0x000004E8 F44F7200 MOV r2,#0x200
0x000004EC F2C20000 MOVT r0,#0x2000
0x000004F0 4631 MOV r1,r6
0x000004F2 F00BF8D2 BL.W 0x0000B69A __aeabi_memcpy4
163: CG_AFTER_ITERATION;
164: nbSchedule++;
165: }
166:
167: errorHandling:
168: CG_AFTER_SCHEDULE;
169: *error=cgStaticError;
170: return(nbSchedule);
0x000004F6 F2402014 MOVW r0,#0x214
0x000004FA F2C20000 MOVT r0,#0x2000
0x000004FE 6801 LDR r1,[r0,#0x00]
0x00000500 3101 ADDS r1,r1,#0x01
0x00000502 6001 STR r1,[r0,#0x00]
171: }
0x00000504 2001 MOVS r0,#0x01
0x00000506 2100 MOVS r1,#0x00
169: *error=cgStaticError;
0x00000508 6021 STR r1,[r4,#0x00]
0x0000050A BD70 POP {r4-r6,pc}
```
or
```
MOV r0,#0x942c
...
MOVT r0,#0x3000
```
It is the code you would get if you was manually writing a call to the corresponding CMSIS-DSP functions. All the C++ templates have disappeared. The switch / case used to implement the scheduler has also been removed.
just before the `memcpy`
The code was generated with `memoryOptimization` enabled and the Python script detected in this case that the FIFOs are used as arrays. As consequence, there is no FIFO update code. They are used as normal arrays.
The generated code is as efficient as something manually coded.
The sink and the sources have been replaced by a `memcpy`. The call to the CMSIS-DSP function is just loading the registers and branching to the CMSIS-DSP function.
It is not always as ideal as in this example. But it demonstrates that the use of C++ templates and a Python code generator is enabling a low overhead solution to the problem of streaming and compute graph.

@ -156,12 +156,12 @@ class FIFO<T,length,1,0>: public FIFOBase<T>
bool willOverflowWith(int nb) const final {return false;};
int nbSamplesInFIFO() const final {return 0;};
T * getWriteBuffer(int nb) const final
T * getWriteBuffer(int nb) final
{
return(mBuffer);
};
T* getReadBuffer(int nb) const final
T* getReadBuffer(int nb) final
{
return(mBuffer);
}

Loading…
Cancel
Save