buddy-compiler · GuoningHuang · Jun 15, 2025 · Jun 21, 2025 · Jul 22, 2025 · Aug 1, 2025
diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile
@@ -16,6 +16,52 @@ MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
 MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
 MTRIPLE := x86_64-apple-darwin
 endif
+next-silu-run: ## Lower ./next-silu.mlir (TOSA -> Linalg -> bufferized affine loops -> LLVM dialect) and execute its main with the MLIR CPU runner
+	@${MLIR_OPT} ./next-silu.mlir \
+		-pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" | \
+	${MLIR_OPT} \
+		-arith-expand \
+		-eliminate-empty-tensors \
+		-empty-tensor-to-alloc-tensor \
+		-one-shot-bufferize="bufferize-function-boundaries" \
+		-convert-linalg-to-affine-loops \
+		-affine-loop-fusion \
+		-lower-affine \
+		-convert-vector-to-scf \
+		-expand-strided-metadata \
+		-convert-vector-to-llvm \
+		-memref-expand \
+		-arith-expand \
+		-convert-arith-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-scf-to-cf \
+		-convert-cf-to-llvm \
+		-convert-openmp-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-math-to-llvm \
+		-convert-math-to-libm  \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+next-silu-silu-run: ## Lower ./next-silu-silu.mlir (Linalg/affine/vector -> LLVM dialect) and execute its main with the MLIR CPU runner
+	@${MLIR_OPT} ./next-silu-silu.mlir \
+		-pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" | \
+	${MLIR_OPT} \
+		-convert-linalg-to-loops \
+		-lower-affine \
+		-convert-vector-to-scf \
+		-convert-scf-to-cf \
+		-convert-cf-to-llvm \
+		-convert-vector-to-llvm \
+		-finalize-memref-to-llvm \
+		-convert-math-to-llvm \
+		-convert-arith-to-llvm \
+		-convert-func-to-llvm \
+		-reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
 next-attention-lower:
 	@${MLIR_OPT} ./next-attention.mlir \

diff --git a/examples/BuddyNext/next-silu-silu.mlir b/examples/BuddyNext/next-silu-silu.mlir
@@ -0,0 +1,77 @@
+// RUN: buddy-opt %s \
+// RUN:     -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN:     -convert-linalg-to-loops \
+// RUN:     -lower-affine \
+// RUN:     -convert-vector-to-scf \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-cf-to-llvm \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts  \
+// RUN: | mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+#map = affine_map<(d0) -> (d0)>
+  func.func private @rtclock() -> f64
+  func.func private @printMemrefF32(%ptr: memref<*xf32>) attributes {llvm.emit_c_interface}
+  // Vectorized SiLU: output = x * (1 / (1 + exp(-x))), computed 8 f32 lanes at a time; prints the result, then the elapsed time.
+  func.func @kernel(%arg0: memref<1x40x8960xf32>) {
+    %t_start = call @rtclock() : () -> f64
+
+    %output = memref.alloc() : memref<1x40x8960xf32>
+    // Index constants, a vector splat of 1.0 used by the sigmoid, and the read padding value.
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %cst_1f = arith.constant 1.0 : f32
+    %vec_1f = vector.broadcast %cst_1f : f32 to vector<8xf32>
+    %cst_0f = arith.constant 0.0 : f32 // padding value for vector.transfer_read
+    // Loop upper bounds, read from the dimensions of %arg0.
+    %d0 = memref.dim %arg0, %c0 : memref<1x40x8960xf32>
+    %d1 = memref.dim %arg0, %c1 : memref<1x40x8960xf32>
+    %d2 = memref.dim %arg0, %c2 : memref<1x40x8960xf32>
+    // Dims 0 and 1 advance one index at a time; dim 2 advances 8 elements per iteration to match vector<8xf32>.
+    affine.for %i = #map(%c0) to #map(%d0) {
+      affine.for %j = #map(%c0) to #map(%d1) {
+        affine.for %k = #map(%c0) to #map(%d2) step 8 {
+          %x_vec = vector.transfer_read %arg0[%i, %j, %k], %cst_0f : memref<1x40x8960xf32>, vector<8xf32>
+          %neg_x_vec = arith.negf %x_vec : vector<8xf32>
+          %exp_neg_x_vec = math.exp %neg_x_vec : vector<8xf32>
+          %one_plus_exp_vec = arith.addf %vec_1f, %exp_neg_x_vec : vector<8xf32>
+          %sigmoid_x_vec = arith.divf %vec_1f, %one_plus_exp_vec : vector<8xf32>
+          %silu_vec = arith.mulf %x_vec, %sigmoid_x_vec : vector<8xf32>
+          vector.transfer_write %silu_vec, %output[%i, %j, %k] : vector<8xf32>, memref<1x40x8960xf32>
+        }
+      }
+    }
+    // The end timestamp is taken before printing, so the printing is not part of the measured interval.
+    %t_end = call @rtclock() : () -> f64
+    %unranked_result = memref.cast %output : memref<1x40x8960xf32> to memref<*xf32>
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 8960] strides = [358400, 8960, 1] data =
+    // CHECK-NEXT: [
+    // CHECK-SAME: [2.85772{{(, 2.85772)*}}],
+    call @printMemrefF32(%unranked_result) : (memref<*xf32>) -> ()
+    memref.dealloc %output : memref<1x40x8960xf32>
+
+    %time = arith.subf %t_end, %t_start : f64
+    vector.print %time : f64
+
+    return
+  }
+  // Entry point: fills a 1x40x8960 f32 buffer with 3.0, runs @kernel on it, then frees the buffer.
+  func.func @main() {
+    %input = memref.alloc() : memref<1x40x8960xf32>
+    %cst_3f = arith.constant 3.0 : f32
+    linalg.fill ins(%cst_3f : f32) outs(%input : memref<1x40x8960xf32>)
+    // SiLU(3.0) = 3 / (1 + exp(-3)) ~= 2.85772, the value the FileCheck patterns in @kernel expect.
+    call @kernel(%input) : (memref<1x40x8960xf32>) -> ()
+
+    memref.dealloc %input : memref<1x40x8960xf32>
+
+    return
+  }
diff --git a/examples/BuddyNext/next-silu.mlir b/examples/BuddyNext/next-silu.mlir
@@ -0,0 +1,64 @@
+// RUN: buddy-opt %s \
+// RUN:     -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN:     -arith-expand \
+// RUN:     -eliminate-empty-tensors \
+// RUN:     -empty-tensor-to-alloc-tensor \
+// RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
+// RUN:     -convert-linalg-to-affine-loops \
+// RUN:     -affine-loop-fusion \
+// RUN:     -lower-affine \
+// RUN:     -convert-vector-to-scf \
+// RUN:     -expand-strided-metadata \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -memref-expand \
+// RUN:     -arith-expand \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-cf-to-llvm \
+// RUN:     -convert-openmp-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-math-to-libm  \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts \
+// RUN: | mlir-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+  func.func private @rtclock() -> f64
+  // SiLU over a whole tensor using TOSA ops: x * sigmoid(x). Prints the result, then the elapsed time.
+  func.func @kernel(%arg0: tensor<1x40x8960xf32>) {
+    %t_start = call @rtclock() : () -> f64
+
+    %sigmoid_x = tosa.sigmoid %arg0 : (tensor<1x40x8960xf32>) -> tensor<1x40x8960xf32>
+
+    %silu_result = tosa.mul %arg0, %sigmoid_x {shift = 0 : i8} : (tensor<1x40x8960xf32>, tensor<1x40x8960xf32>) -> tensor<1x40x8960xf32>
+
+    %t_end = call @rtclock() : () -> f64
+    %time = arith.subf %t_end, %t_start : f64
+
+    %unranked_result = tensor.cast %silu_result : tensor<1x40x8960xf32> to tensor<*xf32>
+
+    // Every element of the result is identical because the input is a splat,
+    // so matching the first printed row is enough to verify correctness.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 8960] strides = [358400, 8960, 1] data =
+    // CHECK-NEXT: [
+    // CHECK-SAME: [2.85772{{(, 2.85772)*}}],
+
+    // Print the result.
+    call @printMemrefF32(%unranked_result) : (tensor<*xf32>) -> ()
+    // Print the elapsed time (difference of the two @rtclock readings).
+    vector.print %time : f64
+
+    return
+  }
+  // Entry point: passes a tensor filled with 3.0 to @kernel; SiLU(3.0) = 3 / (1 + exp(-3)) ~= 2.85772.
+  func.func @main() {
+    %input_tensor = arith.constant dense<3.0> : tensor<1x40x8960xf32>
+    call @kernel(%input_tensor) : (tensor<1x40x8960xf32>) -> ()
+
+    return
+  }
+  func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/midend/lib/CMakeLists.txt b/midend/lib/CMakeLists.txt
@@ -25,6 +25,7 @@ set(LinkedLibs
   BatchMatMulOptimization
   MatMulParallelVectorization
   TransposeOptimization
+  SiluOptimization
 )
 
 

diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt
@@ -14,3 +14,4 @@ add_subdirectory(LowerLinalgToGemmini)
 add_subdirectory(FuncBufferize)
 add_subdirectory(DepthwiseConvOptimization)
 add_subdirectory(MLIRGPU)
+add_subdirectory(SiluOptimization)
diff --git a/midend/lib/Conversion/SiluOptimization/CMakeLists.txt b/midend/lib/Conversion/SiluOptimization/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_mlir_library(SiluOptimization  # library target built from the single source file below
+  SiluOptimization.cpp
+  LINK_LIBS PUBLIC  # BuddyUtils is linked with PUBLIC visibility
+  BuddyUtils
+)