Added a new simpler example for the compute graph.

Moved readme for the example in the example folders.
pull/94/head
Christophe Favergeon 3 years ago
parent d7e4dea51a
commit 95dc3f3807

@ -128,11 +128,11 @@ If you have declared new nodes in `graph.py` then you'll need to provide an impl
More details and explanations can be found in the documentation for the examples. The first example is a deep dive giving all the details about the Python and C++ sides of the tool: More details and explanations can be found in the documentation for the examples. The first example is a deep dive giving all the details about the Python and C++ sides of the tool:
* [Example 1 : how to describe a simple graph](documentation/example1.md) * [Example 1 : how to describe a simple graph](documentation/examples/example1/README.md)
* [Example 2 : More complex example with delay and CMSIS-DSP](documentation/example2.md) * [Example 2 : More complex example with delay and CMSIS-DSP](documentation/examples/example2/README.md)
* [Example 3 : Working example with CMSIS-DSP and FFT](documentation/example3.md) * [Example 3 : Working example with CMSIS-DSP and FFT](documentation/examples/example3/README.md)
* [Example 4 : Same as example 3 but with the CMSIS-DSP Python wrapper](documentation/example4.md) * [Example 4 : Same as example 3 but with the CMSIS-DSP Python wrapper](documentation/examples/example4/README.md)
* [Example 10 : The asynchronous mode](documentation/example10.md) * [Example 10 : The asynchronous mode](documentation/examples/example10/README.md)
Examples 5 and 6 are showing how to use the CMSIS-DSP MFCC with a synchronous data flow. Examples 5 and 6 are showing how to use the CMSIS-DSP MFCC with a synchronous data flow.

@ -5,22 +5,22 @@ set(Python_FIND_REGISTRY "LAST")
find_package (Python COMPONENTS Interpreter) find_package (Python COMPONENTS Interpreter)
function(sdf TARGET) function(sdf TARGET SCRIPT DOTNAME)
if (DOT) if (DOT)
add_custom_command(TARGET ${TARGET} PRE_BUILD add_custom_command(TARGET ${TARGET} PRE_BUILD
BYPRODUCTS ${CMAKE_CURRENT_SOURCE_DIR}/test.pdf BYPRODUCTS ${CMAKE_CURRENT_SOURCE_DIR}/${DOTNAME}.pdf
COMMAND ${DOT} -Tpdf -o ${CMAKE_CURRENT_SOURCE_DIR}/test.pdf ${CMAKE_CURRENT_SOURCE_DIR}/test.dot COMMAND ${DOT} -Tpdf -o ${CMAKE_CURRENT_SOURCE_DIR}/${DOTNAME}.pdf ${CMAKE_CURRENT_SOURCE_DIR}/${DOTNAME}.dot
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/test.dot DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${DOTNAME}.dot
VERBATIM VERBATIM
) )
endif() endif()
add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated/scheduler.cpp add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated/scheduler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test.dot ${CMAKE_CURRENT_SOURCE_DIR}/${DOTNAME}.dot
COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/graph.py COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${SCRIPT}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/graph.py DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${SCRIPT}
VERBATIM VERBATIM
) )
target_sources(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/generated/scheduler.cpp) target_sources(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/generated/scheduler.cpp)
@ -73,6 +73,7 @@ add_subdirectory(example6 bin_example6)
add_subdirectory(example8 bin_example8) add_subdirectory(example8 bin_example8)
add_subdirectory(example9 bin_example9) add_subdirectory(example9 bin_example9)
add_subdirectory(example10 bin_example10) add_subdirectory(example10 bin_example10)
add_subdirectory(simple bin_simple)
# Python examples # Python examples
add_subdirectory(example4 bin_example4) add_subdirectory(example4 bin_example4)

@ -6,7 +6,7 @@ project(Example1)
add_executable(example1 main.cpp) add_executable(example1 main.cpp)
sdf(example1) sdf(example1 graph.py test)
add_sdf_dir(example1) add_sdf_dir(example1)
target_include_directories(example1 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example1 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -2,7 +2,7 @@
In this example we will see how to describe the following graph: In this example we will see how to describe the following graph:
<img src="graph1.PNG" alt="graph1" style="zoom:50%;" /> <img src="docassets/graph1.PNG" alt="graph1" style="zoom:50%;" />
The framework is coming with some default blocks. But for this example, we will create new blocks. The blocks that you to create need must be described with a simple Python class and a corresponding simple C++ class. The framework is coming with some default blocks. But for this example, we will create new blocks. The blocks that you to create need must be described with a simple Python class and a corresponding simple C++ class.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

@ -6,7 +6,7 @@ project(Example10)
add_executable(example10 main.cpp) add_executable(example10 main.cpp)
sdf(example10) sdf(example10 graph.py test)
add_sdf_dir(example10) add_sdf_dir(example10)
target_include_directories(example10 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example10 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -12,7 +12,7 @@ The FIFO sizes are doubled with:
The graph implemented in this example is: The graph implemented in this example is:
![graph10](graph10.png) ![graph10](docassets/graph10.png)
There is a global iteration count corresponding to one execution of the schedule. There is a global iteration count corresponding to one execution of the schedule.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 24 KiB

@ -6,7 +6,7 @@ project(Example2)
add_executable(example2 main.cpp) add_executable(example2 main.cpp)
sdf(example2) sdf(example2 graph.py test)
add_sdf_dir(example2) add_sdf_dir(example2)
target_include_directories(example2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -10,7 +10,7 @@ In this example. we are just analyzing a much more complex example to see some n
The graph is: The graph is:
![graph2](graph2.PNG) ![graph2](docassets/graph2.PNG)
It is much more complex: It is much more complex:

Before

Width:  |  Height:  |  Size: 28 KiB

After

Width:  |  Height:  |  Size: 28 KiB

@ -6,7 +6,7 @@ project(Example3)
add_executable(example3 main.cpp custom.cpp) add_executable(example3 main.cpp custom.cpp)
sdf(example3) sdf(example3 graph.py test)
add_sdf_dir(example3) add_sdf_dir(example3)
target_include_directories(example3 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example3 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -2,7 +2,7 @@
This example is implementing a working example with FFT. The graph is: This example is implementing a working example with FFT. The graph is:
![graph3](graph3.PNG) ![graph3](docassets/graph3.PNG)
The example is: The example is:

Before

Width:  |  Height:  |  Size: 22 KiB

After

Width:  |  Height:  |  Size: 22 KiB

@ -6,7 +6,7 @@ project(Example6)
add_executable(example6 main.cpp mfccConfigData.c) add_executable(example6 main.cpp mfccConfigData.c)
sdf(example6) sdf(example6 graph.py test)
add_sdf_dir(example6) add_sdf_dir(example6)
target_include_directories(example6 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example6 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -6,7 +6,7 @@ project(Example8)
add_executable(example8 main.cpp) add_executable(example8 main.cpp)
sdf(example8) sdf(example8 graph.py test)
add_sdf_dir(example8) add_sdf_dir(example8)
target_include_directories(example8 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example8 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -6,7 +6,7 @@ project(Example9)
add_executable(example9 main.cpp) add_executable(example9 main.cpp)
sdf(example9) sdf(example9 graph.py test)
add_sdf_dir(example9) add_sdf_dir(example9)
target_include_directories(example9 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(example9 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: AppNodes.h
* Description: Application nodes for Example 1
*
* $Date: 29 July 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _APPNODES_H_
#define _APPNODES_H_
#include <iostream>
/* Sink node: consumes inputSize samples of type IN per execution and
   prints each value (cast to int) on its own line, preceded by "Sink". */
template<typename IN, int inputSize>
class Sink: public GenericSink<IN, inputSize>
{
public:
    Sink(FIFOBase<IN> &src):GenericSink<IN,inputSize>(src){};

    /* Asynchronous-mode check: report a skip when the input FIFO does
       not hold enough samples for one execution. */
    int prepareForRunning() final
    {
        return this->willUnderflow() ? CG_SKIP_EXECUTION_ID_CODE : 0;
    };

    /* Read one packet from the input FIFO and print it. */
    int run() final
    {
        IN *input = this->getReadBuffer();
        printf("Sink\n");
        for(int k = 0; k < inputSize; k++)
        {
            std::cout << (int)input[k] << std::endl;
        }
        return(0);
    };
};
/* Source node: produces outputSize samples of type OUT per execution,
   filling the packet with the values 0 .. outputSize-1. */
template<typename OUT,int outputSize>
class Source: public GenericSource<OUT,outputSize>
{
public:
    Source(FIFOBase<OUT> &dst):GenericSource<OUT,outputSize>(dst){};

    /* Asynchronous-mode check: report a skip when the output FIFO has
       no room for one more packet. */
    int prepareForRunning() final
    {
        return this->willOverflow() ? CG_SKIP_EXECUTION_ID_CODE : 0;
    };

    /* Write one packet of increasing values into the output FIFO. */
    int run() final
    {
        OUT *output = this->getWriteBuffer();
        printf("Source\n");
        for(int k = 0; k < outputSize; k++)
        {
            output[k] = (OUT)k;
        }
        return(0);
    };
};
/* Processing node: adds 1 to each sample of its input packet.
   Only the specialization below (same type and same size on input and
   output) is implemented: any other instantiation fails to build,
   which enforces the matched type/size constraint at compile time. */
template<typename IN, int inputSize,typename OUT,int outputSize>
class ProcessingNode;

template<typename IN, int inputOutputSize>
class ProcessingNode<IN,inputOutputSize,IN,inputOutputSize>:
public GenericNode<IN,inputOutputSize,IN,inputOutputSize>
{
public:
    ProcessingNode(FIFOBase<IN> &src,
                   FIFOBase<IN> &dst):
        GenericNode<IN,inputOutputSize,IN,inputOutputSize>(src,dst){};

    /* Asynchronous-mode check: report a skip when the input lacks
       samples or the output lacks room. */
    int prepareForRunning() final
    {
        const bool cannotRun = this->willOverflow() || this->willUnderflow();
        return cannotRun ? CG_SKIP_EXECUTION_ID_CODE : 0;
    };

    /* Copy the input packet to the output, adding 1 to every sample. */
    int run() final
    {
        printf("ProcessingNode\n");
        IN *input = this->getReadBuffer();
        IN *output = this->getWriteBuffer();
        for(int k = 0; k < inputOutputSize; k++)
        {
            output[k] = input[k] + 1;
        }
        return(0);
    };
};
#endif

@ -0,0 +1,13 @@
cmake_minimum_required (VERSION 3.14)
include(CMakePrintHelpers)

project(Simple)

add_executable(simple main.cpp)

# Generate the scheduler from create.py and track simple.dot/simple.pdf
# (the sdf/add_sdf_dir helpers are defined in the parent CMakeLists).
sdf(simple create.py simple)
add_sdf_dir(simple)

# Headers live next to the sources (AppNodes.h, custom.h) and in the
# folder where the Python script emits scheduler.h.
target_include_directories(simple PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/generated
)

@ -0,0 +1,18 @@
# Makefile for MSVC compiler on Windows
SHELL = cmd
# MSVC compiler driver and the Windows delete command
CC = cl.exe
RM = del /Q /F
# Header search paths: generated scheduler.h, the compute-graph support
# classes (GenericNodes.h), and the current folder (AppNodes.h, custom.h)
INCLUDES = /Igenerated /I../../cg/src /I.
# Debug-style build flags (no optimization, runtime checks, debug CRT)
WINFLAGS = /DWIN32 /D_WINDOWS /EHsc /Zi /Ob0 /Od /RTC1 -MDd
CFLAGS = $(INCLUDES) $(WINFLAGS)
# Build simple.exe from the generated scheduler and the application main
all:
	$(CC) /Fesimple.exe $(CFLAGS) generated/scheduler.cpp main.cpp
# Remove build artifacts
clean:
	$(RM) main.obj
	$(RM) scheduler.obj
	$(RM) simple.ilk
	$(RM) simple.exe
	$(RM) *.pdb

@ -0,0 +1,521 @@
# README
This example explains how to create a very simple synchronous compute graph with 3 nodes:
![simple](docassets/simple.png)
The nodes are:
* A source generating 5 floating point values (0,1,2,3,4) each time it is run
* A processing node adding 1 to those values
* A sink printing its input values (1,2,3,4,5)
The graph generates an infinite stream of values : 1,2,3,4,5,1,2,3,4,5,1,2,3,4,5 ... For this example, the number of iterations will be limited so that it does not run forever.
Each node is using the floating point data type for the values.
The sink and the sources are working on packets of 5 values.
The processing node is working on packets of 7 values.
## Principle of operation
The graph is described with a Python script `create.py` and this document will explain how to write this Python script to define the nodes and their connections.
When this Python script is executed, it will compute a static schedule and generate a C++ implementation. This implementation is using some C++ nodes that must have been defined somewhere. This document will explain how to write those nodes and make them available to the C++ scheduler.
To run the script you first must install the CMSIS-DSP Python package:
`pip install cmsisdsp`
Once the CMSIS-DSP python package has been installed, you can run the script with:
`python create.py`
This will generate the following files:
* `generated/scheduler.cpp`
* `generated/scheduler.h`
* `simple.dot` (the graphviz representation of the graph)
A graphical representation of the graph is generated in graphviz dot format. If you have graphviz installed, you can generate a `png` file representing the graph with:
`dot -Tpng -o simple.png simple.dot`
## How to write the Python script
Let's look at the required steps in reverse order starting first with how to generate some C++ code.
### Generating the C++ code and the Graphviz representation
The file `create.py` will generate the C++ scheduler when run. This file is assuming that the nodes and the graph have already been defined somewhere else. The first lines of this script are including the nodes and graph definitions:
```python
from nodes import *
from graph import *
```
For this example, we don't want the schedule to run forever which should be the case for a stream computation. The first thing the script is doing is limiting the number of schedule iterations to 1 by setting a field in the configuration object:
```python
conf = Configuration()
conf.debugLimit = 1
```
This corresponds to one iteration of the schedule but the schedule may contain several executions of the different nodes in one iteration.
This example is not using any datatype from CMSIS-DSP, so we don't need to include its header and recompile the library. We can disable the CMSIS-DSP header inclusion with:
```python
conf.CMSISDSP = False
```
Then, with all those configurations and a graph description, we are ready to compute the static schedule:
```python
scheduling = the_graph.computeSchedule(config = conf)
```
`the_graph` has been imported from another file. It is the description of the compute graph.
Once the schedule has been computed, we can print some statistics:
```python
print("Schedule length = %d" % scheduling.scheduleLength)
print("Memory usage %d bytes" % scheduling.memory)
```
The scheduling length is the number of node executions required for one scheduling iteration.
The memory usage is the space required by all the FIFOs expressed in bytes.
With this example, the output should be:
```
Schedule length = 19
Memory usage 88 bytes
```
We can see that the schedule length is a bit long for 3 nodes : 19. And it is because the sink / source are processing packets of 5 samples and the processing node is processing packets of 7 samples. So, a scheduling iteration will require several executions of the different nodes in order to implement a periodic scheduling that can be iterated to process the whole stream.
We can see in the picture of the graph that the connections between the nodes are requiring 11 samples.
That's why we have 88 bytes of memory used. Each sample is 4 bytes long (a float):
![simple](docassets/simple.png)
Now that we have computed the scheduling, we are ready to generate the C++ implementation:
```python
scheduling.ccode("generated",conf)
```
`"generated" ` is the name of the folder where the files are generated (relative to the working directory of the script). It is possible to customize the naming of the generated files using the `Configuration` object `conf` we created to limit the number of iterations.
We can also generate a `graphviz` file that can then be processed with the `dot` tool to generate a picture of the graph:
```python
with open("simple.dot","w") as f:
scheduling.graphviz(f)
```
Those lines are creating a `"simple.dot"` file from the graph **and** the computed schedule. The computed schedule is used to display the length of the connections between the nodes.
### Describing the graph
Now that we know how to compute a schedule and generate a C++ implementation, let's see how to describe the graph.
The file `graph.py` is containing the Python code to describe the graph. The first lines of the script are loading the node definitions and some standard definitions from the CMSIS-DSP Python wrapper :
```python
from cmsisdsp.cg.scheduler import *
from nodes import *
```
We need the definitions from the CMSIS-DSP Python wrapper to define the datatypes used by the nodes. By default only basic datatypes are provided : float32, int16 ... It is also possible to define a C struct datatype.
```python
floatType = CType(F32)
```
The nodes are created like any other Python object. The API is not standardized. The compute graph should be able to work with any library of standard components. In this example, the node APIs are first listing the input, then the outputs. And for each IO, we define the data type and the number of samples produced or consumed.
#### How to instantiate the source:
```python
src = Source("source",floatType,5)
```
A Python object `src` is created from the Python class `Source`. In the generated code, and in the pictures of the graph, this node will be named "source". This name must thus be a valid C variable name.
The datatype is the second argument of the constructor. It is the float datatype we defined just before. The last argument is the number of samples produced by the node at each execution : 5 samples.
#### How to instantiate the processing node:
```python
processing = ProcessingNode("processing",floatType,7,7)
```
It is very similar to the source. We just need to specify two sizes : the number of samples consumed and the number of samples produced. This node is using the same data type for both input and output.
As we will see later, the C++ implementation of the node is only supporting the case where the number of samples produced is equal to the number of samples consumed. If it is not the case, the solution won't build. It is caught at the type system level. This constraint could have been enforced at the Python level.
It demonstrates that a Python description of a node can be very generic and anticipate on future use cases and implementation without introducing problem at runtime since some validation is occurring on the C++ side.
#### How to instantiate the sink:
```python
sink = Sink("sink",floatType,5)
```
It is very similar to the source.
#### How to connect the nodes:
Now that we have defined the nodes, they need to be connected to create a graph. First a `Graph` object must be created:
```python
the_graph = Graph()
```
This `Graph` class is defined in `cmsisdsp.cg.scheduler`
Now we can connect the nodes:
```python
the_graph.connect(src.o,processing.i)
the_graph.connect(processing.o,sink.i)
```
The properties `i` and `o` of the nodes have been created during description of the nodes. It is not imposed by the framework. The input / output can be named as you want if they do not conflict with existing properties of the Python objects.
### Describing the nodes
Let's now detail how to create the nodes. Those descriptions can be seen as a datatype for a node. They define:
* The number of IOs
* The datatype used for each IO
* The number of samples produced or consumed on each IO
The script `nodes.py` is defining the nodes needed for this example. The first line is importing some definitions from the CMSIS-DSP Python wrapper:
```python
from cmsisdsp.cg.scheduler import GenericNode,GenericSink,GenericSource
```
#### The source
The source is defined with:
```python
class Source(GenericSource):
def __init__(self,name,theType,outLength):
GenericSource.__init__(self,name)
self.addOutput("o",theType,outLength)
@property
def typeName(self):
return "Source"
```
It is a lot but it is not complex. Let's detail each part of this definition:
```python
class Source(GenericSource):
```
The `Source` class is inheriting from the `GenericSource` class. CMSIS-DSP Python Wrapper is defining `GenericSource`, `GenericSink` and `GenericNode`.
Then, we define the constructor of this class:
```python
def __init__(self,name,theType,outLength):
GenericSource.__init__(self,name)
self.addOutput("o",theType,outLength)
```
The constructor is first initializing the super class `GenericSource`. Then, it is defining the input or output. In a `GenericSource` we only have access to the `addOutput` function.
This function is taking three arguments:
* First argument : the name `name` of the IO. It will become a property of the object and then can be used like any other Python property
* The type `theType` of the IO. In our example it is the `floatType` passed in argument of the constructor
* The number of samples `outLength` produced on this IO
As we can see : the API is defined by the constructor `__init__`. So the API is not enforced by the compute graph. The developer of the nodes can choose whatever API is the best for a given use case.
There is a last part in the definition of the node:
```python
@property
def typeName(self):
return "Source"
```
This defines the name of the C++ class implementing the node.
#### The processing node
The processing node is defined in the same way but with input and outputs:
```python
class ProcessingNode(GenericNode):
def __init__(self,name,theType,inLength,outLength):
GenericNode.__init__(self,name)
self.addInput("i",theType,inLength)
self.addOutput("o",theType,outLength)
@property
def typeName(self):
return "ProcessingNode"
```
#### The sink
```python
class Sink(GenericSink):
def __init__(self,name,theType,inLength):
GenericSink.__init__(self,name)
self.addInput("i",theType,inLength)
@property
def typeName(self):
return "Sink"
```
## How to write the C++ nodes
For each node datatype defined in the Python side, we need to provide an implementation on the C++ side.
The C++ class templates that we will define are just wrappers around algorithms. In this example, since the algorithms are very simple, they have been implemented directly in the wrappers. It does not have to be the case for a more complex algorithms. The C++ template are serving the same purposes as the Python definitions : defining the datatype of a node:
* The number of IOs
* Their datatype
* The number of samples consumed or produced on each IO
The C++ template is also providing some entry points to enable the scheduler to do its work :
* Access to the FIFOs
* Running of the code
Those C++ templates should thus be very light.
Those templates are defined in a file `AppNodes.h` included by the scheduler (it is possible to change the name from the Python script). This file must be provided by the user of the ComputeGraph framework.
### The source
First, like with Python, we need to define the datatype:
* Number of IOs
* Their type
* The number of samples
It is done through arguments of C++ templates.
```C++
template<typename OUT,int outputSize>
class Source;
```
The previous line is defining a new class template with two arguments:
* A datatype `OUT`
* The number of samples `outputSize`
This template can be used to implement different kinds of `Source` classes : with different datatypes or number of samples. We can also (when it makes sense) define a `Source` implementation that can work with any datatype and any number of samples.
You don't need to be knowledgeable in C++ template to start using them in the context of the compute graph. They are just here to define the plumbing.
Now, when you have declared a C++ template, you need to implement it. There are two ways to do it:
* You can define a generic implementation
* And/or you can define specialized implementations for specific datatypes or sizes.
For the `Source` we have defined a generic implementation so we need (like in Python case) to inherit from `GenericSource`:
```C++
template<typename OUT,int outputSize>
class Source: GenericSource<OUT,outputSize>
```
Then, like in the Python case, we need to define a constructor. But contrary to the Python case, here we are defining an implementation. The constructor is not defining the IOs. The IOs are coming from the `GenericSource` template and its arguments.
```C++
public:
Source(FIFOBase<OUT> &dst):public GenericSource<OUT,outputSize>(dst){};
```
Our `Source` has only one IO : the output. It needs the FIFO for this output. The first argument, `dst`, of the `Source` constructor is the FIFO. This FIFO is coming from the scheduler.
We also need to initialize the `GenericSource` parent since we are inheriting from it. `GenericSource` constructor is called with the `FIFO` argument `dst`.
The constructor is here doing nothing more than initializing the parent and the implementation is empty `{}`
Then, the implementation needs to provide an entry point to be usable from the scheduler. It is the `run` function. As said before, since the algorithm is very simple it has been implemented in `run`. In general, `run` is just calling an external function with the buffers coming from the FIFOs.
```C++
int run() final {
OUT *b=this->getWriteBuffer();
printf("Source\n");
for(int i=0;i<outputSize;i++)
{
b[i] = (OUT)i;
}
return(0);
};
```
The first line is the important one:
```C++
OUT *b=this->getWriteBuffer();
```
We get a pointer to be able to write in the output FIFO. This pointer has the datatype OUT coming from the template so can be anything.
The code in the loop is casting an `int` (the loop index) into the `OUT` datatype. If it is not possible it won't typecheck and build.
So, although we have not provided a specific implementation of the template, this template can only work with specific `OUT` datatypes.
The return of the function is to inform the scheduler that no error occurred. In synchronous mode, errors (like underflow or overflow) cannot occur due to the scheduling but only because of a broken real time. So any error returned by a node will stop the scheduling.
### The processing node
It is similar but now we have one input and one output. The template is:
```C++
template<typename IN, int inputSize,typename OUT,int outputSize>
class ProcessingNode;
```
In this example we have decided to implement only a specific version of the processing node. We want to enforce the constraint that the output datatype must be equal to the input datatype and that the number of samples produced must be equal to the number of samples consumed. If it is not the case, it won't type check and the solution won't build.
Remember from the Python definition that this constraint has not been enforced in the Python description of the processing node.
Here is how we implement a specialized version of the template.
First we define the arguments of the template. It is no more generic. We have to give all the arguments:
```C++
class ProcessingNode<IN,inputOutputSize,IN,inputOutputSize>
```
This enforces that the `OUT` datatype is equal to the `IN` datatype since `IN` is used in both arguments.
It also enforces that the input and output sizes are the same since `inputOutputSize` is used in the two arguments for the size.
Since the arguments of the template are still not fully specified and there is some remaining degree of freedom, we need to continue to define some template parameters:
```C++
template<typename IN, int inputOutputSize>
class ProcessingNode<IN,inputOutputSize,IN,inputOutputSize>
```
And finally, like before, we inherit from `GenericNode` using the same template arguments:
```C++
template<typename IN, int inputOutputSize>
class ProcessingNode<IN,inputOutputSize,IN,inputOutputSize>:
public GenericNode<IN,inputOutputSize,IN,inputOutputSize>
```
To be compared with the generic implementation:
```C++
template<typename IN, int inputSize, typename OUT, int outputSize>
class ProcessingNode:
public GenericNode<IN,inputSize,OUT,outputSize>
```
It is possible to have several specializations of the same class.
One could also have another specialization like:
```C++
template<int inputOutputSize>
class ProcessingNode<q15_t,inputOutputSize,q15_t,inputOutputSize>:
public GenericNode<q15_t,inputOutputSize,q15_t,inputOutputSize>
```
A version working only with the `q15_t` datatype.
The `run` function of the processing node has access to `getReadBuffer` and `getWriteBuffer` to access to the FIFO buffers.
### The sink
The definition of the `Sink` should be clear now:
```C++
template<typename IN, int inputSize>
class Sink: public GenericSink<IN, inputSize>
{
public:
Sink(FIFOBase<IN> &src):GenericSink<IN,inputSize>(src){};
```
## How to call the C++ scheduler
The API to the scheduler is:
```C
extern uint32_t scheduler(int *error);
```
It is a C API that can be used from C code.
In case of error, the function is returning :
* the number of schedule iterations computed since the start
* an error code.
It is possible, from the Python script, to add arguments to this API when there is the need to pass additional information to the nodes.
## How to build and run the example
There is a very simple `Makefile` in the folder. It is for `MSVC` compiler on Windows but can be easily adapted. There are only 2 files to compile:
* `generated/scheduler.cpp`
* `main.cpp`
The directory to use for headers are:
* `generated`
* `../../cg/src`
* `.` the current directory
The headers required by the software are:
* `generated/scheduler.h`
* This is the C API to the scheduler function
* `AppNodes.h`
* `AppNodes.h` is where the implementation of the nodes is defined. This file could also just include nodes from a standard library.
* `custom.h`
* This is the first include in the `scheduler.cpp` and this file can contain whatever is needed or just be empty
* In this example, the datatype `float32_t` is defined in `custom.h` so that we don't need to build the CMSIS-DSP for such a simple example
* `GenericNodes.h`
* It is coming from the `../../cg/src` folder.
* It provides the basic definitions needed by the framework like `GenericNode`, `GenericSink`,`GenericSource`, `FIFO` ...

@ -0,0 +1,31 @@
# Build and schedule the simple example graph, then emit the C++
# scheduler implementation.

# Node datatype definitions (Source / ProcessingNode / Sink)
from nodes import *
# Graph topology (provides the_graph)
from graph import *

# Code-generation configuration
conf = Configuration()

# Limit the scheduler to a single iteration so the example does not
# stream forever
conf.debugLimit = 1

# No CMSIS-DSP datatypes are used, so skip the CMSIS-DSP header and
# avoid having to rebuild the library for this simple example
conf.CMSISDSP = False

# Compute the static schedule; FIFO sizes are derived at the same time
scheduling = the_graph.computeSchedule(config=conf)

# Report the number of node executions per iteration and the total
# FIFO memory footprint
print(f"Schedule length = {scheduling.scheduleLength}")
print(f"Memory usage {scheduling.memory} bytes")

# Emit the C++ scheduler into the "generated" folder
scheduling.ccode("generated", conf)

# Emit a graphviz description of the scheduled graph
with open("simple.dot", "w") as dotfile:
    scheduling.graphviz(dotfile)

@ -0,0 +1,5 @@
/* Minimal stand-in for the CMSIS-DSP types header: defines float32_t so
   this example builds without pulling in the CMSIS-DSP library. */
#ifndef _CUSTOM_H_
/* Bug fix: the guard macro was never defined, so the include guard was
   ineffective and the header body was re-parsed on every inclusion. */
#define _CUSTOM_H_

typedef float float32_t;

#endif

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

@ -0,0 +1,170 @@
/*
Generated with CMSIS-DSP Compute Graph Scripts.
The generated code is not covered by CMSIS-DSP license.
The support classes and code is covered by CMSIS-DSP license.
*/

/* custom.h provides float32_t so CMSIS-DSP does not need to be built */
#include "custom.h"
/* Framework support classes (GenericNode/GenericSink/GenericSource, FIFO) */
#include "GenericNodes.h"
/* Application-specific node implementations used by this graph */
#include "AppNodes.h"
#include "scheduler.h"

/* Stop the schedule on the first node error (negative return value) */
#if !defined(CHECKERROR)
#define CHECKERROR if (cgStaticError < 0) \
{\
goto errorHandling;\
}
#endif

/* The CG_* macros below are user-overridable hooks injected at fixed
   points of the generated scheduler; each defaults to a no-op when the
   application has not defined it before including this file. */
#if !defined(CG_BEFORE_ITERATION)
#define CG_BEFORE_ITERATION
#endif
#if !defined(CG_AFTER_ITERATION)
#define CG_AFTER_ITERATION
#endif
#if !defined(CG_BEFORE_SCHEDULE)
#define CG_BEFORE_SCHEDULE
#endif
#if !defined(CG_AFTER_SCHEDULE)
#define CG_AFTER_SCHEDULE
#endif
#if !defined(CG_BEFORE_BUFFER)
#define CG_BEFORE_BUFFER
#endif
#if !defined(CG_BEFORE_FIFO_BUFFERS)
#define CG_BEFORE_FIFO_BUFFERS
#endif
#if !defined(CG_BEFORE_FIFO_INIT)
#define CG_BEFORE_FIFO_INIT
#endif
#if !defined(CG_BEFORE_NODE_INIT)
#define CG_BEFORE_NODE_INIT
#endif
#if !defined(CG_AFTER_INCLUDES)
#define CG_AFTER_INCLUDES
#endif
#if !defined(CG_BEFORE_SCHEDULER_FUNCTION)
#define CG_BEFORE_SCHEDULER_FUNCTION
#endif
#if !defined(CG_BEFORE_NODE_EXECUTION)
#define CG_BEFORE_NODE_EXECUTION
#endif
#if !defined(CG_AFTER_NODE_EXECUTION)
#define CG_AFTER_NODE_EXECUTION
#endif
CG_AFTER_INCLUDES
/*
Description of the scheduling.
*/
/* Static execution order; each value selects a case in scheduler():
   0 = processing, 1 = sink, 2 = source */
static unsigned int schedule[19]=
{
2,2,0,1,2,0,1,2,2,0,1,1,2,0,1,2,0,1,1,
};
CG_BEFORE_FIFO_BUFFERS
/***********
FIFO buffers
************/
/* FIFO capacities (in samples) computed by the Python scheduling tool */
#define FIFOSIZE0 11
#define FIFOSIZE1 11
#define BUFFERSIZE1 11
CG_BEFORE_BUFFER
float32_t buf1[BUFFERSIZE1]={0};
#define BUFFERSIZE2 11
CG_BEFORE_BUFFER
float32_t buf2[BUFFERSIZE2]={0};
CG_BEFORE_SCHEDULER_FUNCTION
/* Run the static schedule of the graph.
   Writes the last node error code (0 on success, negative on failure)
   to *error and returns the number of completed schedule iterations. */
uint32_t scheduler(int *error)
{
/* NOTE: CHECKERROR (defined above) refers to cgStaticError by name */
int cgStaticError=0;
uint32_t nbSchedule=0;
/* Iteration budget; 1 here because the Python side set debugLimit=1 */
int32_t debugCounter=1;
CG_BEFORE_FIFO_INIT;
/*
Create FIFOs objects
*/
/* FIFOs wrap the statically allocated buffers buf1/buf2 */
FIFO<float32_t,FIFOSIZE0,0,0> fifo0(buf1);
FIFO<float32_t,FIFOSIZE1,0,0> fifo1(buf2);
CG_BEFORE_NODE_INIT;
/*
Create node objects
*/
/* processing: consumes 7 samples from fifo0, produces 7 into fifo1 */
ProcessingNode<float32_t,7,float32_t,7> processing(fifo0,fifo1);
/* sink: consumes 5 samples from fifo1 per execution */
Sink<float32_t,5> sink(fifo1);
/* source: produces 5 samples into fifo0 per execution */
Source<float32_t,5> source(fifo0);
/* Run several schedule iterations */
CG_BEFORE_SCHEDULE;
while((cgStaticError==0) && (debugCounter > 0))
{
/* Run a schedule iteration */
CG_BEFORE_ITERATION;
/* Execute the 19 steps of the static schedule in order */
for(unsigned long id=0 ; id < 19; id++)
{
CG_BEFORE_NODE_EXECUTION;
switch(schedule[id])
{
case 0:
{
cgStaticError = processing.run();
}
break;
case 1:
{
cgStaticError = sink.run();
}
break;
case 2:
{
cgStaticError = source.run();
}
break;
default:
break;
}
CG_AFTER_NODE_EXECUTION;
/* Jumps to errorHandling when the node returned a negative code */
CHECKERROR;
}
debugCounter--;
CG_AFTER_ITERATION;
nbSchedule++;
}
errorHandling:
CG_AFTER_SCHEDULE;
*error=cgStaticError;
return(nbSchedule);
}

@ -0,0 +1,26 @@
/*
Generated with CMSIS-DSP Compute Graph Scripts.
The generated code is not covered by CMSIS-DSP license.
The support classes and code is covered by CMSIS-DSP license.
*/
#ifndef _SCHEDULER_H_
#define _SCHEDULER_H_

/* Needed for uint32_t: makes this header self-contained instead of
   relying on the includer to pull in <cstdint>/<stdint.h> first. */
#include <stdint.h>

#ifdef __cplusplus
extern "C"
{
#endif

/* Runs the static schedule; stores the last error code in *error and
   returns the number of completed schedule iterations. */
extern uint32_t scheduler(int *error);

#ifdef __cplusplus
}
#endif

#endif

@ -0,0 +1,39 @@
# Graph description for the simple example.
# All top-level names here (src, processing, sink, the_graph) are the
# module's public surface: the scheduling script does `from graph import *`
# and uses `the_graph`.
# Include definitions from the Python package to
# define datatype for the IOs and to have access to the
# Graph class
from cmsisdsp.cg.scheduler import *
# Include definition of the nodes
from nodes import *
# Define the datatype we are using for all the IOs in this
# example (32-bit float)
floatType=CType(F32)
# Instantiate a Source node with a float datatype and
# working with packet of 5 samples (each execution of the
# source in the C code will generate 5 samples)
# "source" is the name of the C variable that will identify
# this node
src=Source("source",floatType,5)
# Instantiate a Processing node using a float data type for
# both the input and output. The number of samples consumed
# on the input and produced on the output is 7 each time
# the node is executed in the C code
# "processing" is the name of the C variable that will identify
# this node
processing=ProcessingNode("processing",floatType,7,7)
# Instantiate a Sink node with a float datatype and consuming
# 5 samples each time the node is executed in the C code
# "sink" is the name of the C variable that will identify
# this node
sink=Sink("sink",floatType,5)
# Create a Graph object
the_graph = Graph()
# Connect the source to the processing node
the_graph.connect(src.o,processing.i)
# Connect the processing node to the sink
the_graph.connect(processing.o,sink.i)

@ -0,0 +1,11 @@
#include <cstdio>
#include <cstdint>
#include "scheduler.h"

/* Entry point: run the generated static scheduler and report the result.
   Returns 0 on success, 1 when the scheduler reported an error. */
int main(int argc, char const *argv[])
{
    (void)argc;  /* command-line arguments are unused */
    (void)argv;

    int error = 0;
    printf("Start\n");
    uint32_t nbSched = scheduler(&error);
    /* Report how many schedule iterations completed */
    printf("Number of schedule iterations = %u\n", (unsigned int)nbSched);
    /* Propagate a scheduler failure through the process exit code
       instead of silently returning 0 */
    if (error != 0)
    {
        printf("Error code = %d\n", error);
        return 1;
    }
    return 0;
}

@ -0,0 +1,77 @@
# Include definitions from the Python package
from cmsisdsp.cg.scheduler import GenericNode,GenericSink,GenericSource
### Define new types of Nodes
class ProcessingNode(GenericNode):
    """
    Processing node of the example graph: one input, one output.

    Parameters
    ----------
    name : str
        C variable name identifying this node in the generated code
    theType : CGStaticType
        Sample datatype shared by the input and the output
    inLength : int
        Samples consumed on input "i" at each execution
    outLength : int
        Samples produced on output "o" at each execution
    """
    def __init__(self, name, theType, inLength, outLength):
        super().__init__(name)
        self.addInput("i", theType, inLength)
        self.addOutput("o", theType, outLength)

    @property
    def typeName(self):
        """Name of the C++ class implementing this node."""
        return "ProcessingNode"
class Sink(GenericSink):
    """
    Sink node of the example graph: consumes samples, produces nothing.

    Parameters
    ----------
    name : str
        C variable name identifying this node in the generated code
    theType : CGStaticType
        Sample datatype of the input
    inLength : int
        Samples consumed on input "i" at each execution
    """
    def __init__(self, name, theType, inLength):
        super().__init__(name)
        self.addInput("i", theType, inLength)

    @property
    def typeName(self):
        """Name of the C++ class implementing this node."""
        return "Sink"
class Source(GenericSource):
    """
    Source node of the example graph: produces samples, consumes nothing.

    Parameters
    ----------
    name : str
        C variable name identifying this node in the generated code
    theType : CGStaticType
        Sample datatype of the output
    outLength : int
        Samples produced on output "o" at each execution
    """
    def __init__(self, name, theType, outLength):
        super().__init__(name)
        self.addOutput("o", theType, outLength)

    @property
    def typeName(self):
        """Name of the C++ class implementing this node."""
        return "Source"

@ -0,0 +1,48 @@
// Graphviz picture of the compute graph: source -> processing -> sink.
// Edge labels show the FIFO datatype and capacity, e.g. f32(11);
// head/tail labels (blue) show samples consumed/produced per execution.
digraph structs {
// Nodes are drawn as HTML-like tables; layout flows left to right.
node [shape=plaintext]
rankdir=LR
edge [arrowsize=0.5]
fontname="times"
processing [label=<
<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="4">
<TR>
<TD ALIGN="CENTER" PORT="i">processing<BR/>(ProcessingNode)</TD>
</TR>
</TABLE>>];
sink [label=<
<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="4">
<TR>
<TD ALIGN="CENTER" PORT="i">sink<BR/>(Sink)</TD>
</TR>
</TABLE>>];
source [label=<
<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="4">
<TR>
<TD ALIGN="CENTER" PORT="i">source<BR/>(Source)</TD>
</TR>
</TABLE>>];
// source produces 5 samples per run; processing consumes 7 (FIFO of 11)
source:i -> processing:i [label="f32(11)"
,headlabel=<<TABLE BORDER="0" CELLPADDING="2"><TR><TD><FONT COLOR="blue" POINT-SIZE="12.0" >7</FONT>
</TD></TR></TABLE>>
,taillabel=<<TABLE BORDER="0" CELLPADDING="2"><TR><TD><FONT COLOR="blue" POINT-SIZE="12.0" >5</FONT>
</TD></TR></TABLE>>]
// processing produces 7 samples per run; sink consumes 5 (FIFO of 11)
processing:i -> sink:i [label="f32(11)"
,headlabel=<<TABLE BORDER="0" CELLPADDING="2"><TR><TD><FONT COLOR="blue" POINT-SIZE="12.0" >5</FONT>
</TD></TR></TABLE>>
,taillabel=<<TABLE BORDER="0" CELLPADDING="2"><TR><TD><FONT COLOR="blue" POINT-SIZE="12.0" >7</FONT>
</TD></TR></TABLE>>]
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

Loading…
Cancel
Save