26 commits
63675c6
Conv2D Bias Adaptation
diaconuccalin Jul 29, 2025
549a052
Added PULPOpen support for Conv2D and partially working DW Conv2D. Fix…
diaconuccalin Aug 7, 2025
b32a357
DW 2D Float Conv for PULPOpen platform now working. Updated im2col bu…
diaconuccalin Sep 18, 2025
340d058
Optimized the PULPOpen DW 2D fp32 Convolution and fixed the bias vers…
diaconuccalin Sep 19, 2025
00da44e
Updated float reshape with skip connection test to a smaller one
diaconuccalin Sep 19, 2025
a7f0fc0
Fixed generic platform alias_of bug
diaconuccalin Sep 22, 2025
10b22ae
Fixed the PULPOpen FloatGemmTemplate (identical issue to the generic …
diaconuccalin Sep 22, 2025
58a01ab
Working TinyViT Demo test. Added it to the CI pipeline. Added float s…
diaconuccalin Sep 22, 2025
77b055f
Added GEMM batched fix to MatMul template
diaconuccalin Sep 23, 2025
3079743
Fixed formatting
diaconuccalin Sep 23, 2025
ff9a903
Fixes to avoid warnings
diaconuccalin Sep 23, 2025
ad13fe2
Merge remote-tracking branch 'origin/devel' into TinyViT_Siracusa
diaconuccalin Sep 23, 2025
3a17aa4
Fix formatting
diaconuccalin Sep 23, 2025
8f3f74d
Merge fix
diaconuccalin Sep 23, 2025
63f122d
Dynamic buffer calculation fix. Other fixes
diaconuccalin Sep 24, 2025
f012b1a
Reformat
diaconuccalin Sep 24, 2025
de2752b
Added back CI tests removed by merge
diaconuccalin Sep 24, 2025
11d5293
Updated changelog file
diaconuccalin Sep 24, 2025
ce7e4c8
Applied fixes suggested in the PR review
diaconuccalin Sep 24, 2025
4165983
Merge branch 'devel' into TinyViT_Siracusa
diaconuccalin Oct 13, 2025
af78d75
Post-merge fixes
diaconuccalin Oct 13, 2025
bc2119e
Quickfix
diaconuccalin Oct 13, 2025
4db4978
Merge branch 'pulp-platform:devel' into TinyViT_Siracusa
diaconuccalin Oct 14, 2025
48b2ff5
PR fixes
diaconuccalin Oct 17, 2025
7cba0cf
Addressed PR review. Minor fix for aliasing in reshape parser
diaconuccalin Oct 21, 2025
da162b0
Minor fix based on PR review
diaconuccalin Oct 21, 2025
10 changes: 10 additions & 0 deletions .github/workflows/ci-platform-siracusa.yml
@@ -53,7 +53,15 @@ jobs:
testBacktracking
testFloatAdder
testFloatGEMM

testFloat2DConvolution
testFloat2DConvolutionBias
testFloat2DConvolutionZeroBias

testFloat2DDWConvolution
testFloat2DDWConvolutionBias
testFloat2DDWConvolutionZeroBias

testFloatLayerNorm
testFloatRelu
testFloatMaxPool
@@ -64,6 +72,7 @@
Quant
Dequant
testFloatReduceSum
testFloatReshapeWithSkipConnection
testFloatSoftmaxGrad
testFloatSoftmaxCrossEntropy
testFloatSoftmaxCrossEntropyGrad
@@ -87,4 +96,5 @@
CCT/CCT_1_16_16_8
CCT/CCT_2_32_32_128_Opset20
testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8
testFloatDemoTinyViT
num-cores: 8
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
## Unreleased (Planned Release Target: v0.2.1)

### List of Pull Requests
- TinyViT on non-tiled Siracusa [#117](https://github.com/pulp-platform/Deeploy/pull/117)
- Refactor Logging for Improved Debugging [#115](https://github.com/pulp-platform/Deeploy/pull/115)
- Add reuse-tool as an SPDX license header linter [#113](https://github.com/pulp-platform/Deeploy/pull/113)
- Bug fixes, API Cleanup and Reduce Compiler Warning on PULP [#112](https://github.com/pulp-platform/Deeploy/pull/112)
@@ -17,6 +18,13 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Fix `Unsqueeze` Op. when using ONNX opset 13 or higher (from attribute to input) [#119](https://github.com/pulp-platform/Deeploy/pull/119)

### Added
- PULP 2D FP DW conv Im2Col template and kernel, with bias support.
- Bias support for PULP 2D FP regular conv Im2Col in template & kernel.
- PULP FP DW conv 2D parser.
- FP conv 2D (simple & DW), reshape & skip connection, and TinyViT demo tests to the non-tiled Siracusa CI pipeline.
- FP bindings and mappings for PULP slice, DW conv 2D, and reduce mean operations.
- FP PULP DW conv lowering optimization pass, similar to the existing one for the integer version.
- RemoveEmptyConvBiasPass to the PULP optimizer.
- Add manual type inference feature (CLI: `--input-type-map`/`--input-offset-map`) to resolve ambiguities when test inputs are not representative enough
- Added a `testTypeInferenceDifferentTypes` test case to validate type inference for different input types
- Added `_mangleNodeNames` function to avoid duplicate node mappings
@@ -48,6 +56,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Memory/I/O summaries and input/output logging in deployers

### Changed
- Reduced size of the reshape & skip connection test for non-tiled Siracusa memory compatibility.
- Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`.
- mchan HAL is now reduced to bare-bones
- refactor of the IntrospectiveCodeTransformation to work on the Mako template
@@ -75,6 +84,10 @@ This file contains the changelog for the Deeploy project. The changelog is divid
- Deployer workflow now uses `prepare(...)` instead of `generateFunction(...)`.

### Fixed
- Fixed a bug in alias_of node parameter handling, which takes care of the lifetime of buffers in skip-connection situations.
- Fixed a bug for non-batched elements in the PULPOpen FP GEMM and MatMul templates.
- Added an underscore to the beginning of closure names to avoid naming issues when they start with unsupported characters (like digits).
- Fixed data types in the PULPOpen FP add and mul templates.
- Prevent node duplication for graphs generated via GraphSurgeon
- Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step.
- Fix license CI check and prevent potential issues with `jq` installation
@@ -155,7 +155,8 @@ def apply(self,
executionBlock: ExecutionBlock,
name: str,
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
self.closureName = name + self.closureSuffix
# Add underscore to avoid name issues when beginning with problematic characters (like numbers)
self.closureName = "_" + name + self.closureSuffix
self.functionCall = executionBlock.generate(ctxt)
self._generateClosureStruct(ctxt, executionBlock)
ctxt = self._generateClosureCtxt(ctxt, name)
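For illustration, a brief sketch of the problem this one-character change avoids: C identifiers must start with a letter or an underscore, so a closure name derived from a mangled node name that begins with a digit would not compile. The node names and the check below are illustrative, not Deeploy's actual naming code.

```python
import re

# A C identifier must match this pattern to compile.
C_IDENTIFIER = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def closure_name(node_name: str, closure_suffix: str = "_closure") -> str:
    # Mirrors the change above: always prefix with '_' so the result is a
    # valid C identifier even when the node name starts with a digit.
    return "_" + node_name + closure_suffix


for name in ["0_Conv", "layer1_Conv"]:  # hypothetical node names
    raw = name + "_closure"
    print(raw, "valid:", bool(C_IDENTIFIER.match(raw)), "->", closure_name(name))
# 0_Conv_closure is not a valid C identifier; _0_Conv_closure is.
```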
Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py
@@ -22,7 +22,8 @@ def __init__(self,
name: str = 'DeeployNetwork',
default_channels_first: bool = True,
deeployStateDir: str = "DeeployState",
inputOffsets: Dict[str, int] = {}):
inputOffsets: Dict[str, int] = {},
n_cores: int = 8):
Comment on lines +25 to +26
Contributor
🛠️ Refactor suggestion | 🟠 Major

Avoid mutable default for inputOffsets; initialize from None

Using {} as a default shares state across instances. Accept None and initialize it inside the constructor. Also simplify the empty check.

Apply:

-                 inputOffsets: Dict[str, int] = {},
-                 n_cores: int = 8):
+                 inputOffsets: Optional[Dict[str, int]] = None,
+                 n_cores: int = 8):
@@
-        if inputOffsets == {}:
-            for key in inputTypes.keys():
-                inputOffsets[key] = 0
+        if inputOffsets is None:
+            inputOffsets = {key: 0 for key in inputTypes.keys()}

As per Ruff B006.

Also applies to: 30-35

🧰 Tools
🪛 Ruff (0.13.3)

25-25: Do not use mutable data structures for argument defaults

Replace with None; initialize within function

(B006)

🤖 Prompt for AI Agents
In Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py around lines
25-26 and 30-35, the function/class signature uses a mutable default
inputOffsets: Dict[str, int] = {} which shares state across calls; change the
parameter default to inputOffsets: Optional[Dict[str,int]] = None and inside the
body set inputOffsets = {} if inputOffsets is None, then use that local dict;
also simplify any subsequent checks for emptiness to use simple truthiness (if
not inputOffsets) or explicit length checks as appropriate.
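As a standalone illustration of the B006 pitfall flagged above, the following sketch (hypothetical classes, not Deeploy's actual deployer) shows how the shared default leaks state between instances and how the None-based fix avoids it:

```python
from typing import Dict, Optional


class LeakyDeployer:

    def __init__(self, inputTypes: Dict[str, type], inputOffsets: Dict[str, int] = {}):
        # BUG: the same dict object is reused by every instance that relies
        # on the default argument, so offsets leak across instances.
        for key in inputTypes:
            inputOffsets.setdefault(key, 0)
        self.inputOffsets = inputOffsets


class FixedDeployer:

    def __init__(self, inputTypes: Dict[str, type], inputOffsets: Optional[Dict[str, int]] = None):
        # Fix suggested above: create a fresh dict per call.
        if inputOffsets is None:
            inputOffsets = {key: 0 for key in inputTypes}
        self.inputOffsets = inputOffsets


a, b = LeakyDeployer({"x": int}), LeakyDeployer({"y": int})
print(a.inputOffsets is b.inputOffsets)  # True: both share one dict ({'x': 0, 'y': 0})

c, d = FixedDeployer({"x": int}), FixedDeployer({"y": int})
print(c.inputOffsets is d.inputOffsets)  # False: each instance owns its own dict
```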

Collaborator
Why do you add the n_cores argument? We have the N_CORES define passed through the cmake script.

Contributor Author
The number of cores is needed to dynamically compute the size of the im2col buffer for the regular and DW Conv2Ds. This was the method I found to pass it on to the network context (PULPDeployer inherits from SignPropDeployer, this class, which in turn inherits from NetworkDeployer). Let me know if you think we should proceed differently with this.
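For context, a hedged sketch of why the core count enters this computation: each core fills its own im2col patch, so the transient scratch buffer grows with the number of cores. The function name and the exact formula are illustrative assumptions, not the expression used in the actual PULP templates.

```python
def im2col_buffer_bytes(kernel_h: int, kernel_w: int, in_channels: int,
                        bytes_per_element: int, n_cores: int) -> int:
    # Each core materializes one im2col patch at a time, so every core needs
    # its own scratch region; the total size scales linearly with n_cores.
    per_core = kernel_h * kernel_w * in_channels * bytes_per_element
    return n_cores * per_core


# e.g. a 3x3 fp32 convolution over 64 input channels parallelized over 8 cores:
print(im2col_buffer_bytes(3, 3, 64, 4, 8))  # 18432 bytes
```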

super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
default_channels_first, deeployStateDir)

@@ -31,6 +32,7 @@ def __init__(self,
inputOffsets[key] = 0

self.inputOffsets = inputOffsets
self.n_cores = n_cores

def _createIOBindings(self, ctxt, graph):
ctxt = super()._createIOBindings(ctxt, graph)
@@ -247,7 +247,7 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f
if node_op in ["RequantizedConv", "Conv"]:

# Non DW-Type:
if opNode.attrs['group'] == 1:
if opNode.attrs.get('group', 1) == 1:
weightNode = opNode.inputs[1]
weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight",
inPermute)
@@ -341,7 +341,7 @@ def _PULPDWNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_chan
opNode = matched_nodes[0]
node_op = opNode.op

if opNode.attrs['group'] == 1:
if opNode.attrs.get('group', 1) == 1:
return graph

if (("channels_first" in opNode.attrs and opNode.attrs["channels_first"] != default_channels_first)
@@ -362,30 +362,67 @@ def _PULPDWNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_chan
graph.nodes.append(outputTransposeNode)

if node_op == "RequantizedConv":

weightNode = opNode.inputs[1]
weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight",
inPermute)
opNode.inputs[1] = weightTransposeOutput
graph.nodes.append(weightTransposeNode)
else:
inputTransposeNode, inputTransposeOutput = _appendTransposeNode(inputNode, name + "_TransposeIn", inPermute)
opNode.inputs[0] = inputTransposeOutput
graph.nodes.append(inputTransposeNode)
Comment on lines +370 to +373
Collaborator
Can you clarify why we are transposing the input on non-RequantizedConvs? Which cases are those?

Collaborator
@lukamac lukamac Oct 14, 2025
I understood it from our private conversation: it's the floating-point implementation of the DW conv. As suggested somewhere else, I would separate it into a dedicated function NCHWtoNHWC_dw_fun for clarity, or just wait for my PR to land.

Contributor Author
I think we can wait for your PR.


opNode.attrs["channels_first"] = default_channels_first

return graph


# Requantized DW Conv
@contextagnostic
class PULPDWConvPass(ReplaceSequentialPatternPass):

def __init__(self, default_channels_first: bool = True):
# Define pattern graph
graph = gs.Graph()

_input = gs.Variable(name = 'input_1')
output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv')

graph.outputs.append(output)
graph.inputs.append(_input)

name = "_NCHW_TO_NHWC_CONV_PASS"
super().__init__(graph, partial(_PULPDWNCHWtoNHWC_fun, default_channels_first = default_channels_first), name)
# Define name
name = "_NCHW_TO_NHWC_DW_CONV_PASS"

# Initialize Pass
super().__init__(pattern = graph,
replacement_fn = partial(_PULPDWNCHWtoNHWC_fun,
default_channels_first = default_channels_first),
name = name)


# Float DW Conv
@contextagnostic
class PULPFPDWConvPass(ReplaceSequentialPatternPass):

def __init__(self, default_channels_first: bool = True):
# Define pattern graph
graph = gs.Graph()

_input = gs.Variable(name = 'input_1')
output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv')

graph.outputs.append(output)
graph.inputs.append(_input)

# Define name
name = "_NCHW_TO_NHWC_FP_DW_CONV_PASS"

# Initialize Pass
super().__init__(pattern = graph,
replacement_fn = partial(_PULPDWNCHWtoNHWC_fun,
Collaborator
I recommend writing another NCHWtoNHWC_dw function for the PULP conv kernels, and maybe even checking that it's an FP kernel, just to make it even clearer that it differs from the integer one. The transposition of the input is quite a big difference that, imo, deserves a separate function.

Contributor Author
As said in a comment above, if it's ok with you, I'll wait for your PR and make the changes afterwards.

default_channels_first = default_channels_first),
name = name)


def _PULPDenseNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True):
@@ -465,6 +502,7 @@ def __init__(self, default_channels_first: bool = True):
NCHWtoNHWCPadPass(default_channels_first),
NCHWtoNHWCMaxPoolPass(default_channels_first),
PULPDWConvPass(default_channels_first),
PULPFPDWConvPass(default_channels_first),
PULPNCHWtoNHWCDenseConvPass(default_channels_first),
PULPNCHWtoNHWCDenseRequantizedConvPass(default_channels_first),
]
83 changes: 59 additions & 24 deletions Deeploy/DeeployTypes.py
@@ -257,7 +257,7 @@ def __init__(self, name: str = '', shape = [1], alias_of: Optional[List[str]] =
self.is_input: bool = False
self.is_output: bool = False

self.alias_of: List[str] = alias_of if alias_of is not None else []
self.alias_of: List[str] = list(alias_of) if alias_of is not None else []

def _bufferRepresentation(self) -> Dict:
return {"type": self._instance, "name": self.name, "size": int(np.prod(self.shape))}
@@ -322,7 +322,11 @@ def __getstate__(self):

@classmethod
def fromNode(cls, node: gs.Node):
return (cls(name = node.name, shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape))
return (cls(
name = node.name,
shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape,
alias_of = [],
))

def add_aliases(self, aliases_to_add: List[str]):
"""
@@ -355,7 +359,7 @@ def get_aliases_of(self):
"""

if hasattr(self, "alias_of"):
return self.alias_of
return list(self.alias_of)
Collaborator
Why? Seems unnecessary, since we expect alias_of to be a list?

Contributor Author
This was the solution I found to the aliasing issue you handle in this PR. Apparently, "casting" a list to a list like this actually creates a new instance of the original list. If your PR gets merged first, I will fix this; otherwise maybe you can remove this in your PR, since it will no longer be needed.

Collaborator
Ahhh, now, after refactoring that portion of the code, I understand the need for it 😅 but I much prefer my solution to this one. Let's wait a little bit longer for my PRs to get merged.
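A tiny standalone example of the behaviour being discussed: calling list(...) on an existing list returns a new list object, so mutations made through the returned reference no longer reach the buffer's own alias_of list.

```python
alias_of = ["bufferA"]

shared = alias_of           # same object: changes are visible through both names
copied = list(alias_of)     # new list containing the same elements

shared.append("bufferB")
print(alias_of)             # ['bufferA', 'bufferB'] - mutated through the shared reference
print(copied)               # ['bufferA'] - the copy is unaffected
print(copied is alias_of)   # False
```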

else:
return list()

@@ -399,7 +403,7 @@ class TransientBuffer(VariableBuffer):

def __init__(self, name: str = '', size = 0):
self.name = name
self.size = size #: int: Total BYTE size of this TransientBuffer
self.size = size # int: Total BYTE size

# Do not override - Should be written in the parsing passes
self._users = []
@@ -446,7 +450,9 @@ class ConstantBuffer(VariableBuffer):
"""

def __init__(self, name: str = '', shape = [1], values = [0]):
# Pass a copy of alias_of to avoid shared references
super().__init__(name, shape)

values = np.asarray(values)
# intArray = values.astype(int)
# assert (np.abs(values - intArray)).max() < 0.001, "Constant value {name} is NOT an integer!"
@@ -481,7 +487,11 @@ def _bufferRepresentation(self) -> Dict:

@classmethod
def fromVariableBuffer(cls, buffer: VariableBuffer, values):
ret = cls(name = buffer.name, shape = buffer.shape, values = values)
ret = cls(
name = buffer.name,
shape = buffer.shape,
values = values,
)

return ret

@@ -572,7 +582,16 @@ def __init__(self,
transientBuffer: Type[TransientBuffer],
globalObjects = {},
localObjects = {},
name: str = 'DeeployNetwork'):
name: str = 'DeeployNetwork',
n_cores: int = 8):
self.globalObjects = OrderedDict()
self.localObjects = OrderedDict()
self.VariableBuffer = variableBuffer
self.ConstantBuffer = constantBuffer
self.StructBuffer = structBuffer
self.TransientBuffer = transientBuffer
self.name = name
self.n_cores = n_cores

self._maxDynamicSize = {} #: int: Maximum dynamic memory size occupied by live buffers at any point in time
self._dynamicSize = {} #: int: Current dynamic memory size occupied by live buffers
Expand Down Expand Up @@ -874,7 +886,7 @@ def is_buffer(self, value: Any) -> bool:
obj = self.lookup(value)
return isinstance(obj, VariableBuffer)

def hoistTransientBuffer(self, name: str, size: int) -> str:
def hoistTransientBuffer(self, name: str, size: Union[int, str]) -> str:
"""Registers a new TransientBuffer in the local context

Parameters
@@ -1186,7 +1198,11 @@ def parseOutputs(cls, ctxt: NetworkContext, node: gs.Node) -> NetworkContext:

for node, name in zip(outputNodes, outputNames):
if not ctxt.is_global(name):
nb = ctxt.VariableBuffer(name = name, shape = node.shape)
nb = ctxt.VariableBuffer(
name = name,
shape = node.shape,
alias_of = [],
)
ctxt.add(nb, 'local')
else:
nb = ctxt.lookup(name)
@@ -2487,7 +2503,8 @@ def __init__(self,
inputTypes: Dict[str, Type[Pointer]],
scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes),
name: str = 'DeeployNetwork',
deeployStateDir: str = "DeeployState"):
deeployStateDir: str = "DeeployState",
n_cores: int = 8):
"""Initializes a new NetworkContainer and its NetworkContext

Parameters
@@ -2505,6 +2522,8 @@ def __init__(self,
Prefix to use in deployment to uniquify tensor names
deeployStateDir : str
Path to a directory to dump intermediate outputs
n_cores : int
The number of cores on which the network will be run


"""
@@ -2523,7 +2542,8 @@ def __init__(self,
self.ctxt = NetworkContext(variableBuffer = self.Platform.VariableBuffer,
constantBuffer = self.Platform.ConstantBuffer,
structBuffer = self.Platform.StructBuffer,
transientBuffer = self.Platform.TransientBuffer)
transientBuffer = self.Platform.TransientBuffer,
n_cores = n_cores)

self.deeployStateDir = deeployStateDir

@@ -2683,10 +2703,13 @@ def parse(self, default_channels_first: bool = True) -> bool:

"""

self.ctxt = NetworkContext(variableBuffer = self.Platform.VariableBuffer,
constantBuffer = self.Platform.ConstantBuffer,
structBuffer = self.Platform.StructBuffer,
transientBuffer = self.Platform.TransientBuffer)
self.ctxt = NetworkContext(
variableBuffer = self.Platform.VariableBuffer,
constantBuffer = self.Platform.ConstantBuffer,
structBuffer = self.Platform.StructBuffer,
transientBuffer = self.Platform.TransientBuffer,
n_cores = self.ctxt.n_cores,
)

log.debug(" - Create IO Bindings")
self.ctxt = self._createIOBindings(self.ctxt, self.graph)
@@ -3232,15 +3255,18 @@ class NetworkDeployer(NetworkContainer):
"""Deeploy abstraction to contain an entire network and all necessary information to deploy it
"""

def __init__(self,
graph: gs.Graph,
deploymentPlatform: DeploymentPlatform,
inputTypes: Dict[str, Type[Pointer]],
loweringOptimizer: TopologyOptimizer,
scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes),
name: str = 'DeeployNetwork',
default_channels_first: bool = True,
deeployStateDir: str = "DeeployState"):
def __init__(
self,
graph: gs.Graph,
deploymentPlatform: DeploymentPlatform,
inputTypes: Dict[str, Type[Pointer]],
loweringOptimizer: TopologyOptimizer,
scheduler: Callable[[gs.Graph], Schedule] = lambda graph: list(graph.nodes),
name: str = 'DeeployNetwork',
default_channels_first: bool = True,
deeployStateDir: str = "DeeployState",
n_cores: int = 8,
):
"""Initialize a new NetworkDeployer

Parameters
@@ -3269,12 +3295,21 @@ def __init__(self,


"""
super().__init__(graph, deploymentPlatform, inputTypes, scheduler, name, deeployStateDir = deeployStateDir)
super().__init__(
graph = graph,
platform = deploymentPlatform,
inputTypes = inputTypes,
scheduler = scheduler,
name = name,
deeployStateDir = deeployStateDir,
n_cores = n_cores,
)

self.loweringOptimizer = loweringOptimizer
self.default_channels_first = default_channels_first

self.prepared = False
self.n_cores = n_cores

def __repr__(self):
return super().__repr__(
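To tie the n_cores plumbing in this file together, here is a hedged sketch of how a parser or template could consume ctxt.n_cores along with the relaxed hoistTransientBuffer signature (size: Union[int, str]) when hoisting a dynamically sized im2col scratch buffer. The helper name and the size formula are illustrative assumptions, not the actual PULP template code.

```python
def hoist_im2col_buffer(ctxt, node_name: str, kernel_h: int, kernel_w: int,
                        in_channels: int, bytes_per_element: int = 4) -> str:
    # Numeric size, derived from the core count carried by the NetworkContext.
    size = ctxt.n_cores * kernel_h * kernel_w * in_channels * bytes_per_element

    # Alternatively, since hoistTransientBuffer now also accepts a string,
    # the size could be emitted symbolically and resolved at compile time:
    # size = f"NUM_CORES * {kernel_h * kernel_w * in_channels * bytes_per_element}"

    return ctxt.hoistTransientBuffer(f"{node_name}_im2col_buffer", size)
```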