59 commits
9ec13f9
Add classifier training support
runwangdl Mar 17, 2025
f1a0491
Fix L3 DMA and Maxpool Bugs
runwangdl Mar 3, 2025
8bfdb13
correct DMA length of copy assertion
runwangdl Mar 18, 2025
031dc79
delete redundant shell scripts
runwangdl Mar 19, 2025
58e18da
Merge branch 'devel' into PULPCCTL3_16_16_64
runwangdl Mar 19, 2025
ac2d879
Update node with multioutput to single output
runwangdl Mar 19, 2025
6a7198b
add softmaxcrossentropygrad tiling
runwangdl Mar 19, 2025
360aef7
Add softmaxcrossentropylossgrad tiling
runwangdl Mar 20, 2025
bc48582
Merge branch 'PULPCCTL3_16_16_64' into GEMM_training_tiled
runwangdl Mar 20, 2025
b6542ba
Fix CI issue
runwangdl Mar 20, 2025
fe208d0
Fix CI bugs
runwangdl Mar 20, 2025
4a21359
update CI
runwangdl Mar 20, 2025
91f12f0
Add and pass test for CCT gemmtraining 1_16_16_8 to 128
runwangdl Mar 20, 2025
d1e1ebf
update CI with 8-128 dim CCT last gemm training test
runwangdl Mar 20, 2025
86a2e99
Add SGD support for PULP Open
runwangdl Mar 20, 2025
bdacd2f
Update CCT training test with sgd
runwangdl Mar 20, 2025
99035f0
Update Changelog
runwangdl Mar 23, 2025
62e87d3
Merge branch 'devel' into GEMM_training_tiled
runwangdl Mar 23, 2025
15ea3ec
Solved issues caused by merging conflicts
runwangdl Mar 23, 2025
a644fdf
Solved Review Comments
runwangdl Mar 28, 2025
643e160
Resolving conflicts
runwangdl Mar 28, 2025
80a9518
Reresolve the conflict
runwangdl Mar 28, 2025
501775d
Solving CI issues
runwangdl Mar 28, 2025
65a56b7
fix linting errors
runwangdl Mar 28, 2025
03c3f4a
gelu sigmoid approximation
runwangdl Mar 24, 2025
7e141fd
gelu parallel + unroll
runwangdl Mar 24, 2025
c3ee783
Float Matmul Parallel on M
runwangdl Mar 24, 2025
47d8c19
Softmax Parallel and Softmax Op Support
runwangdl Mar 24, 2025
ccba380
conv parallel without im2col
runwangdl Mar 25, 2025
fafcedf
PULP Layernorm Parallel
runwangdl Mar 25, 2025
147e68f
Fixed CI issues
runwangdl Mar 28, 2025
6e07dc9
fixing linting
runwangdl Mar 28, 2025
8b2f685
Merge branch 'devel' into devel_CCT_Optim
runwangdl Apr 8, 2025
9c0b8f6
Enlarge CI floatconv tiling L1 size for 8 core and delete CCT 128 tes…
runwangdl Apr 8, 2025
4c36de2
matmul 1*4 unrolling
runwangdl Apr 24, 2025
28ec2ca
Add computeOp support for CCT necessary kernels
runwangdl Apr 24, 2025
bf1f8ae
Add openlibm expf
runwangdl Apr 13, 2025
deac9ce
add relu, mul, maxpool ops num
runwangdl May 4, 2025
3b12187
Optimize parallel for multiple kernels
runwangdl May 4, 2025
49da947
Merge branch 'devel' into devel_CCT_Optim
runwangdl May 4, 2025
47961b9
Merge branch 'devel' into devel_CCT_Optim
runwangdl May 6, 2025
8907532
Change ConvTileConstraint to only tile on outchannel
runwangdl May 6, 2025
133f9ae
Fix error in gelu
runwangdl May 6, 2025
f25127d
Fix Linting Issues
runwangdl May 6, 2025
6f3f585
Merge branch 'devel' into devel_CCT_Optim
runwangdl May 8, 2025
4ffea9b
Change CI tests
runwangdl May 8, 2025
e819626
Add RV32IMF Picolibc support for Siracusa platform
runwangdl May 8, 2025
fa0cc37
Build Docker for new gvsoc for testing
runwangdl May 8, 2025
ac56ca2
Gvsoc Small test
runwangdl May 8, 2025
fd6c99d
Add Redmule Platform, Engine, Tiler, and Deployer
runwangdl May 8, 2025
2862f29
Add rv32imf.txt to build docker
runwangdl May 8, 2025
9ef9cc2
Update GVSOC hash
runwangdl May 9, 2025
10de9f6
matmul delicate constraints for Redmule
runwangdl May 9, 2025
efab54c
Merge branch 'devel_CCT_Optim' into redmule_platform
runwangdl May 9, 2025
37670e6
conv with redmule
runwangdl May 9, 2025
08b7e23
Add CCT 32 test
runwangdl May 9, 2025
e42b3d6
xtensor gvsoc docker build
runwangdl May 9, 2025
c6e4890
Change Redmule Branch Pulp LLVM abi
runwangdl May 15, 2025
d998fc3
GEMM with Redmule
runwangdl May 18, 2025
2 changes: 1 addition & 1 deletion .github/workflows/BuildDocker.yml
@@ -38,4 +38,4 @@ jobs:
file: Container/Dockerfile
push: true
# JUNGVI: If you operate from a fork and want to build a new docker make sure to replace 'pulp-platform' by your uname.
tags: ghcr.io/pulp-platform/deeploy:main
tags: ghcr.io/runwangdl/deeploy:redmule
50 changes: 39 additions & 11 deletions .github/workflows/CI.yml
Expand Up @@ -9,7 +9,7 @@ on:
- cron: "0 1 */6 * *"

env:
DOCKER_IMAGE: ghcr.io/pulp-platform/deeploy:main
DOCKER_IMAGE: ghcr.io/runwangdl/deeploy:redmule

jobs:

@@ -338,7 +338,7 @@ jobs:
},
{
"name": "testFloat2DConvolution",
"L1": [2000]
"L1": [8000]
},
{
"name": "testFloatLayerNorm",
@@ -420,7 +420,7 @@ jobs:
},
{
"name": "testFloat2DConvolution",
"L1": [4000]
"L1": [15000]
},
{
"name": "testFloatLayerNorm",
@@ -514,12 +514,8 @@ jobs:
L1: [64000]
- name: "CCT/CCT_1_16_16_64"
L1: [64000]
- name: "CCT/CCT_1_16_16_128"
L1: [64000]
- name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_64"
L1: [64000]
- name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_128"
L1: [64000]
num-cores:
- 8
default-memory-level:
@@ -559,12 +555,8 @@ jobs:
L1: [64000]
- name: "CCT/CCT_1_16_16_64"
L1: [64000]
- name: "CCT/CCT_1_16_16_128"
L1: [64000]
- name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_64"
L1: [64000]
- name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_128"
L1: [64000]
num-cores:
- 8
double-buffer:
@@ -748,6 +740,42 @@ jobs:
default-memory-level: ${{ matrix.default-memory-level }}
neureka-wmem: ${{ matrix.neureka-wmem }}

siracusa-redmule-kernels-tiled-singlebuffer-L2:
strategy:
fail-fast: false
matrix:
test-data:
- name: "testFloatMatmul"
L1: [8000]
num-cores:
- 8
uses: ./.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
needs: select-docker-image
with:
docker-image: ${{ needs.select-docker-image.outputs.image }}
test-name: ${{ matrix.test-data.name }}
num-cores: ${{ matrix.num-cores }}
L1: ${{ toJson(matrix.test-data.L1) }}

siracusa-redmule-kernels-tiled-doublebuffer-L2:
strategy:
fail-fast: false
matrix:
test-data:
- name: "testFloatMatmul"
L1: [8000]
num-cores:
- 8
double-buffer:
- true
uses: ./.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
needs: select-docker-image
with:
docker-image: ${{ needs.select-docker-image.outputs.image }}
test-name: ${{ matrix.test-data.name }}
num-cores: ${{ matrix.num-cores }}
L1: ${{ toJson(matrix.test-data.L1) }}
double-buffer: ${{ matrix.double-buffer }}

### Deeploy Extension and Internal Tests ###
deeploy-memory-allocation:
72 changes: 72 additions & 0 deletions .github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
@@ -0,0 +1,72 @@
name: TestRunnerTiledSiracusaWithRedmule

on:
workflow_call:
inputs:
docker-image:
required: true
type: string
test-name:
required: true
type: string
num-cores:
required: false
default: 8
type: number
L1:
required: false
default: "[64000]"
type: string
default-memory-level:
required: false
default: "L2"
type: string
double-buffer:
required: false
default: false
type: boolean
memory-allocation-strategy:
required: false
default: "MiniMalloc"
type: string
search-strategy:
required: false
default: "random-max"
type: string

jobs:

test-runner-siracusa-tiled:
strategy:
fail-fast: false
matrix:
L1: ${{ fromJSON(inputs.L1) }}
runs-on: ubuntu-22.04
container:
image: ${{ inputs.docker-image }}
steps:
- name: Checkout Repo
uses: actions/checkout@v4
with:
submodules: recursive
- name: Build Deeploy
run: pip install -e .
- name: Cache ccache
id: ccache-cache
uses: actions/cache@v4
with:
path: /app/.ccache
key: ${{ runner.os }}-ccache
- name: Run Test
uses: nick-fields/retry@v3
with:
timeout_minutes: 15
max_attempts: 3
retry_on: timeout
command: |
cd DeeployTest
mkdir -p /app/.ccache
export CCACHE_DIR=/app/.ccache
python testRunner_tiled_siracusa_w_redmule.py -t Tests/${{ inputs.test-name }} --cores=${{ inputs.num-cores }} --l1 ${{ matrix.L1 }} --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }}
shell: bash

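The new workflow ultimately runs testRunner_tiled_siracusa_w_redmule.py inside the Docker image. For local debugging, the same invocation can be scripted; a minimal sketch, assuming the Docker toolchain environment and the testFloatMatmul configuration from the CI matrix above (8 cores, 8000 B of L1):

import subprocess

# Mirrors the CI command above with the workflow's single-buffer defaults.
cmd = [
    "python", "testRunner_tiled_siracusa_w_redmule.py",
    "-t", "Tests/testFloatMatmul",
    "--cores=8",
    "--l1", "8000",
    "--defaultMemLevel=L2",
    "--memAllocStrategy=MiniMalloc",
    "--searchStrategy=random-max",
]
subprocess.run(cmd, cwd="DeeployTest", check=True)

Passing --doublebuffer in addition reproduces the double-buffer variant of the job.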
16 changes: 15 additions & 1 deletion CHANGELOG.md
@@ -282,4 +282,18 @@ Change main.c to use OUTPUTTYPE instead of float

### Changed
- The ISA for the Siracusa platform has been updated from rv32imc_zfinx_xpulpv2 to rv32imf_xpulpv2.
- All floating-point comparison tasks in deeploytest.c are now offloaded to Cluster 0 for execution.
- All floating-point comparison tasks in deeploytest.c are now offloaded to Cluster 0 for execution.

## Add RV32IMF Picolibc support for Siracusa platform

### Added
- Add RV32IMF Picolibc to the toolchain

## Parallelization and Optimization of CCT Inference and Training Kernels

### Added
- Parallel Matmul, Softmax, Gelu, Conv, Layernorm, Maxpool, Add
- Gelu with sigmoid approximation
- Im2col Conv
- Matmul with 1*7 unrolling, performance aligned with pulptrainlib
- Compute op support for multiple float kernels: Maxpool, Relu, Mul
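
The "Gelu with sigmoid approximation" entry refers to the standard approximation GELU(x) ~= x * sigmoid(1.702 * x). A minimal NumPy sketch of the reference math (the deployed kernel is the parallel C implementation added in this PR, not this code):

import numpy as np

# Reference math only: GELU(x) ~= x * sigmoid(1.702 * x) = x / (1 + exp(-1.702 * x)).
def gelu_sigmoid_approx(x: np.ndarray) -> np.ndarray:
    return x / (1.0 + np.exp(-1.702 * x))

This is also the operation sequence counted in the updated GELULayer.computeOps below: multiply by 1.702, negate, exponentiate, add 1, divide, and a final multiply by x.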
10 changes: 6 additions & 4 deletions CMakeLists.txt
@@ -15,8 +15,8 @@ if(TOOLCHAIN STREQUAL GCC)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()

set(platform MemPool CACHE STRING "Platform (MemPool, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)")
set_property(CACHE platform PROPERTY STRINGS MemPool QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch)
set(platform MemPool CACHE STRING "Platform (MemPool, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, Generic, Snitch)")
set_property(CACHE platform PROPERTY STRINGS MemPool QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open Generic Snitch)

if(platform STREQUAL MemPool)
message(STATUS "Building for platform 'MemPool'")
@@ -26,6 +26,8 @@ elseif(platform STREQUAL Siracusa)
message(STATUS "Building for platform 'Siracusa'")
elseif(platform STREQUAL Siracusa_w_neureka)
message(STATUS "Building for platform 'Siracusa_w_neureka'")
elseif(platform STREQUAL Siracusa_w_redmule)
message(STATUS "Building for platform 'Siracusa_w_redmule'")
elseif(platform STREQUAL PULPOpen)
message(STATUS "Building for platform 'PULP-Open'")
elseif(platform STREQUAL Generic)
@@ -148,7 +150,7 @@ if(platform STREQUAL QEMU-ARM)

endif()

if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen)
if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen)

if(TOOLCHAIN STREQUAL LLVM)
set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake)
@@ -158,7 +160,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor

include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake)

if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka)
if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake)
elseif(platform STREQUAL PULPOpen)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake)
4 changes: 3 additions & 1 deletion Container/Dockerfile
@@ -42,7 +42,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y git-lfs \
libsdl2-ttf-dev \
gcc-multilib \
wget \
clang-format
clang-format \
libxtensor-dev \
libxsimd-dev

# Install cmake 3.31.1
RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.sh && \
62 changes: 52 additions & 10 deletions Deeploy/Targets/Generic/Layers.py
@@ -69,15 +69,16 @@ def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
compAbs = self.mapper.parser.operatorRepresentation['size']
compAdd = self.mapper.parser.operatorRepresentation['size']
compSqr = self.mapper.parser.operatorRepresentation['size']
compMul = self.mapper.parser.operatorRepresentation['size']
compAdd = self.mapper.parser.operatorRepresentation['size']
compMul2 = self.mapper.parser.operatorRepresentation['size']
compAdd2 = self.mapper.parser.operatorRepresentation['size']
compDiv = self.mapper.parser.operatorRepresentation['size']
return compAbs + compAdd + compSqr + compMul + compAdd + compMul2 + compAdd2 + compDiv
size = self.mapper.parser.operatorRepresentation['size']
# RW: Sigmoid approximation
mul1 = size # Multiply by 1.702
neg = size # Negate the result
exp = size # Compute exponential
add = size # Add 1
div = size # Division for sigmoid
mul2 = size # Final multiplication by x

return mul1 + neg + exp + add + div + mul2


class iHardswishLayer(ONNXLayer):
@@ -120,12 +121,39 @@ class SoftmaxLayer(ONNXLayer):
def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):

size = self.mapper.parser.operatorRepresentation['size']
last_dim_length = self.mapper.parser.operatorRepresentation['lastDimLength']
batch_size = size // last_dim_length

max_ops = last_dim_length - 1
exp_ops = last_dim_length * 2
sum_ops = last_dim_length - 1
div_ops = last_dim_length
ops_per_batch = max_ops + exp_ops + sum_ops + div_ops
total_ops = ops_per_batch * batch_size

return total_ops


class SoftmaxGradLayer(ONNXLayer):

def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
input_size = self.mapper.parser.operatorRepresentation['size']

# SoftmaxGrad: dx = y * (dy - sum(dy * y)), counted as five elementwise passes
mul_ops = input_size
sum_ops = input_size
broadcast_mul_ops = input_size
sub_ops = input_size
final_mul_ops = input_size

return mul_ops + sum_ops + broadcast_mul_ops + sub_ops + final_mul_ops


class ITAMaxLayer(ONNXLayer):

@@ -252,7 +280,7 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese
N = inputShapes[1][-1]

if len(inputShapes) == 3:
inputShapes[2] = [M, N]
inputShapes[2] = outputShapes[0]

return (inputShapes, outputShapes)

@@ -317,6 +345,9 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese
inputShapes[0] = inputShapes[1]
return (inputShapes, outputShapes)

def computeOps(self):
return self.mapper.parser.operatorRepresentation['size']


class ConvLayer(ONNXLayer):

@@ -374,6 +405,14 @@ class MaxPoolLayer(ONNXLayer):
def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
kernel_shape = self.mapper.parser.operatorRepresentation['kernel_shape']
elements_per_window = int(np.prod(kernel_shape))
data_out_size = self.mapper.parser.operatorRepresentation['data_out_size']
comparisons_per_window = elements_per_window - 1
total_ops = data_out_size * comparisons_per_window
return total_ops


class ReduceMeanLayer(ONNXLayer):

Expand Down Expand Up @@ -403,6 +442,9 @@ class ReluLayer(ONNXLayer):
def __init__(self, maps: List[NodeMapper]):
super().__init__(maps)

def computeOps(self):
return self.mapper.parser.operatorRepresentation['size']


class LayerNormLayer(ONNXLayer):

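The new SoftmaxLayer count follows a numerically stable row-wise softmax (running max, exponentiation counted as two operations per element, sum, divide); a minimal NumPy sketch of that reference computation, assuming the input is viewed as (size // lastDimLength, lastDimLength):

import numpy as np

# Reference row-wise softmax matching the counted passes:
# per row of length D: (D - 1) max comparisons, 2 * D for the exponential stage,
# (D - 1) additions for the sum, and D divisions.
def softmax_rows(x: np.ndarray) -> np.ndarray:
    x_max = np.max(x, axis=-1, keepdims=True)
    e = np.exp(x - x_max)
    return e / np.sum(e, axis=-1, keepdims=True)

The MaxPool count is analogous: prod(kernel_shape) - 1 comparisons per output element, e.g. 8 comparisons per output for a 3x3 window.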
8 changes: 4 additions & 4 deletions Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py
@@ -1,12 +1,12 @@
# ----------------------------------------------------------------------
#
# File: iGELUTemplate.py
# File: FloatGELUTemplate.py
#
# Last edited: 13.12.2021
# Last edited: 28.03.2025
#
# Copyright (C) 2021, ETH Zurich and University of Bologna.
#
# Author: Moritz Scherer, ETH Zurich
# Author: Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
@@ -28,4 +28,4 @@
referenceTemplate = NodeTemplate("""
// GELU (Name: ${nodeName}, Op: ${nodeOp})
SINGLE_CORE GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size});
""")
""")