From 18d070786274ac6c530a922298f4237e703d5b34 Mon Sep 17 00:00:00 2001 From: guoninghuang Date: Sun, 15 Jun 2025 20:40:21 +0800 Subject: [PATCH 1/4] add silu optimization --- examples/BuddyNext/makefile | 47 ++++ examples/BuddyNext/next-silu-silu.mlir | 78 ++++++ examples/BuddyNext/next-silu.mlir | 53 ++++ midend/lib/CMakeLists.txt | 1 + midend/lib/Conversion/CMakeLists.txt | 2 + .../SiluOptimization/CMakeLists.txt | 5 + .../SiluOptimization/SiluOptimization.cpp | 249 ++++++++++++++++++ midend/lib/InitAll.cpp | 2 + 8 files changed, 437 insertions(+) create mode 100644 examples/BuddyNext/next-silu-silu.mlir create mode 100644 examples/BuddyNext/next-silu.mlir create mode 100644 midend/lib/Conversion/SiluOptimization/CMakeLists.txt create mode 100644 midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index 750bd93f85..c464d7d85a 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -16,6 +16,53 @@ MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib MTRIPLE := x86_64-apple-darwin endif +next-silu-run: + @${MLIR_OPT} ./next-silu.mlir \ + -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" | \ + ${MLIR_OPT} \ + -arith-expand \ + -eliminate-empty-tensors \ + -empty-tensor-to-alloc-tensor \ + -one-shot-bufferize \ + -convert-linalg-to-affine-loops \ + -affine-loop-fusion \ + -lower-affine \ + -func-bufferize \ + -arith-bufferize \ + -tensor-bufferize \ + -buffer-deallocation \ + -finalizing-bufferize \ + -convert-vector-to-scf \ + -expand-strided-metadata \ + -convert-vector-to-llvm \ + -memref-expand \ + -arith-expand \ + -convert-arith-to-llvm \ + -finalize-memref-to-llvm \ + -convert-scf-to-cf \ + -convert-openmp-to-llvm \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -convert-math-to-libm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} + +next-silu-silu-run: + @${MLIR_OPT} ./next-silu-silu.mlir \ + -convert-linalg-to-loops \ + -lower-affine \ + -convert-vector-to-scf \ + -convert-scf-to-cf \ + -convert-vector-to-llvm \ + -finalize-memref-to-llvm \ + -convert-math-to-llvm \ + -convert-arith-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts | \ + ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ + -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS} next-attention-lower: @${MLIR_OPT} ./next-attention.mlir \ diff --git a/examples/BuddyNext/next-silu-silu.mlir b/examples/BuddyNext/next-silu-silu.mlir new file mode 100644 index 0000000000..7d0ab1ce2f --- /dev/null +++ b/examples/BuddyNext/next-silu-silu.mlir @@ -0,0 +1,78 @@ +// next-silu-silu.mlir +// +// Manually optimized version of SiLU. +// This implementation avoids TOSA and uses lower-level dialects like +// scf, arith, math, and vector to have fine-grained control over the computation. +// The core idea is to vectorize the innermost loop. +// +#map = affine_map<(d0) -> (d0)> +module { + // Declare external utility functions for timing and printing. + func.func private @rtclock() -> f64 + func.func private @printMemrefF32(%ptr: memref<*xf32>) attributes {llvm.emit_c_interface} + + // The kernel function that performs the SiLU calculation. 
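+  // The kernel below evaluates, eight lanes at a time,
+  //   SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x))
+  // as negf -> exp -> addf -> divf -> mulf on vector<8xf32> values.
+  // The innermost extent (8960) is a multiple of the vector width, so the
+  // step-8 loop needs no scalar tail.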
+ func.func @kernel_silu(%arg0: memref<1x40x8960xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Allocate output buffer + %output = memref.alloc() : memref<1x40x8960xf32> + + // Constants + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %cst_1f = arith.constant 1.0 : f32 + %vec_1f = vector.broadcast %cst_1f : f32 to vector<8xf32> + %cst_0f = arith.constant 0.0 : f32 // for padding + + // Get dimensions. + %d0 = memref.dim %arg0, %c0 : memref<1x40x8960xf32> + %d1 = memref.dim %arg0, %c1 : memref<1x40x8960xf32> + %d2 = memref.dim %arg0, %c2 : memref<1x40x8960xf32> + + // Loop nest + affine.for %i = #map(%c0) to #map(%d0) { + affine.for %j = #map(%c0) to #map(%d1) { + affine.for %k = #map(%c0) to #map(%d2) step 8 { + // Vectorized SiLU computation + %x_vec = vector.transfer_read %arg0[%i, %j, %k], %cst_0f : memref<1x40x8960xf32>, vector<8xf32> + %neg_x_vec = arith.negf %x_vec : vector<8xf32> + %exp_neg_x_vec = math.exp %neg_x_vec : vector<8xf32> + %one_plus_exp_vec = arith.addf %vec_1f, %exp_neg_x_vec : vector<8xf32> + %sigmoid_x_vec = arith.divf %vec_1f, %one_plus_exp_vec : vector<8xf32> + %silu_vec = arith.mulf %x_vec, %sigmoid_x_vec : vector<8xf32> + vector.transfer_write %silu_vec, %output[%i, %j, %k] : vector<8xf32>, memref<1x40x8960xf32> + } + } + } + + %t_end = call @rtclock() : () -> f64 + %unranked_result = memref.cast %output : memref<1x40x8960xf32> to memref<*xf32> + call @printMemrefF32(%unranked_result) : (memref<*xf32>) -> () + memref.dealloc %output : memref<1x40x8960xf32> + + %time = arith.subf %t_end, %t_start : f64 + vector.print %time : f64 + + return + } + + // Main function to drive the test. + func.func @main() { + // 1. Allocate and initialize input. + %input = memref.alloc() : memref<1x40x8960xf32> + %cst_neg_1_23 = arith.constant 3.0 : f32 + linalg.fill ins(%cst_neg_1_23 : f32) outs(%input : memref<1x40x8960xf32>) + + // 2. Call the SiLU kernel. + call @kernel_silu(%input) : (memref<1x40x8960xf32>) -> () + + + // 4. Deallocate memrefs. + memref.dealloc %input : memref<1x40x8960xf32> + + return + } +} + diff --git a/examples/BuddyNext/next-silu.mlir b/examples/BuddyNext/next-silu.mlir new file mode 100644 index 0000000000..b63caa8e51 --- /dev/null +++ b/examples/BuddyNext/next-silu.mlir @@ -0,0 +1,53 @@ +// next-silu.mlir +// +// Implements the SiLU (Sigmoid Linear Unit) activation function. +// SiLU(x) = x * sigmoid(x) +// This is realized by composing a tosa.sigmoid and a tosa.mul operation. +// +module { + // Declare external utility functions for timing and printing. + func.func private @rtclock() -> f64 + func.func private @printMemrefF32(%ptr : tensor<*xf32>) + + // + // The kernel function that performs the SiLU calculation. + // + func.func @kernel_silu(%arg0: tensor<1x40x8960xf32>) { + %t_start = call @rtclock() : () -> f64 + + // Step 1: Calculate the sigmoid of the input. + %sigmoid_x = tosa.sigmoid %arg0 : (tensor<1x40x8960xf32>) -> tensor<1x40x8960xf32> + + // Step 2: Multiply the original input with its sigmoid. + // This is the SiLU operation. {shift=0} is standard for float multiplication. + %silu_result = tosa.mul %arg0, %sigmoid_x {shift = 0 : i8} : (tensor<1x40x8960xf32>, tensor<1x40x8960xf32>) -> tensor<1x40x8960xf32> + + %t_end = call @rtclock() : () -> f64 + %time = arith.subf %t_end, %t_start : f64 + + // Cast the result to an unranked tensor for the print function. 
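+    // (printMemrefF32 is declared on tensor<*xf32>, so the static shape must
+    // be erased here to match that signature.)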
+ %unranked_result = tensor.cast %silu_result : tensor<1x40x8960xf32> to tensor<*xf32> + + // Print the result tensor to verify correctness. + call @printMemrefF32(%unranked_result) : (tensor<*xf32>) -> () + + // Print the execution time. + vector.print %time : f64 + + return + } + + // + // Main function to drive the test. + // + func.func @main() { + // Initialize input with a constant value (e.g., 3.0). + // The SiLU of 3.0 is 3.0 * sigmoid(3.0) = 3.0 * 0.952574 = 2.857722 + %input_tensor = arith.constant dense<3.0> : tensor<1x40x8960xf32> + + // Call the SiLU kernel. + call @kernel_silu(%input_tensor) : (tensor<1x40x8960xf32>) -> () + + return + } +} diff --git a/midend/lib/CMakeLists.txt b/midend/lib/CMakeLists.txt index 26477e0e40..965e50012d 100644 --- a/midend/lib/CMakeLists.txt +++ b/midend/lib/CMakeLists.txt @@ -25,6 +25,7 @@ set(LinkedLibs BatchMatMulOptimization MatMulParallelVectorization TransposeOptimization + SiluOptimization ) diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt index 6ad92c6892..f78ad95b3c 100644 --- a/midend/lib/Conversion/CMakeLists.txt +++ b/midend/lib/Conversion/CMakeLists.txt @@ -14,3 +14,5 @@ add_subdirectory(LowerLinalgToGemmini) add_subdirectory(FuncBufferize) add_subdirectory(DepthwiseConvOptimization) add_subdirectory(MLIRGPU) +add_subdirectory(MLIRGPU) +add_subdirectory(SiluOptimization) diff --git a/midend/lib/Conversion/SiluOptimization/CMakeLists.txt b/midend/lib/Conversion/SiluOptimization/CMakeLists.txt new file mode 100644 index 0000000000..64696f80c0 --- /dev/null +++ b/midend/lib/Conversion/SiluOptimization/CMakeLists.txt @@ -0,0 +1,5 @@ +add_mlir_library(SiluOptimization + SiluOptimization.cpp + LINK_LIBS PUBLIC + BuddyUtils +) diff --git a/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp b/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp new file mode 100644 index 0000000000..62b428320a --- /dev/null +++ b/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp @@ -0,0 +1,249 @@ +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + + + +using namespace mlir; + +namespace { + +//===----------------------------------------------------------------------===// +// Rewrite Pattern +//===----------------------------------------------------------------------===// + +class SiLUVectorizePattern : public ConversionPattern { +public: + explicit SiLUVectorizePattern(MLIRContext *context, int64_t vectorSizeParam) + : ConversionPattern(linalg::GenericOp::getOperationName(), 1, context) { + vectorSize = vectorSizeParam; + } + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + + linalg::GenericOp sigmoidOp = cast(op); + + //--------------sigmoid OP-------- + // 1. Check input/output + if (sigmoidOp.getNumDpsInputs() != 1 || sigmoidOp.getNumDpsInits() != 1){ + llvm::errs() << "1\n"; + return failure();} + + // Check the body of the op for sigmoid computation. + // The IR should be: negf, exp, addf, divf, yield. 
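+    // For reference, the sigmoid body produced by the TOSA-to-linalg lowering
+    // looks like this (see tests/Conversion/silu-optimization.mlir):
+    //   ^bb0(%in: f32, %out: f32):
+    //     %0 = arith.negf %in : f32
+    //     %1 = math.exp %0 : f32
+    //     %2 = arith.addf %1, %cst : f32   // %cst = 1.0
+    //     %3 = arith.divf %cst, %2 : f32
+    //     linalg.yield %3 : f32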
+ Block &block = sigmoidOp.getRegion().front(); + if (block.getOperations().size() != 5) // negf, exp, addf, divf, yield + {llvm::errs() << "4\n"; + return failure();} + + Operation &negfOp = block.getOperations().front(); + Operation &yieldOp = block.getOperations().back(); + + // Check the type of the two operations. + if (!isa(negfOp) || !isa(yieldOp)) + {llvm::errs() << "5\n"; + return failure();} + + + //-----------Find the consumer mul operation.------------------------------ + // The result of the sigmoid op must be used by another linalg.generic op. + Value outputBuffer = sigmoidOp.getDpsInitOperand(0)->get(); + + // 遍历所有 uses,寻找满足条件的 consumer op + linalg::GenericOp mulOp = nullptr; + + for (auto &use : outputBuffer.getUses()) { + Operation *user = use.getOwner(); + + // 要求是 linalg.generic,且 %alloc 是 input operand(即 ins()) + auto linalgOp = dyn_cast(user); + if (!linalgOp) + continue; + + bool foundInInput = false; + for (OpOperand *input : linalgOp.getDpsInputOperands()) { + if (input->get() == outputBuffer) { + foundInInput = true; + break; + } + } + if (!foundInInput) + continue; + + // 检查其内部是否有 arith.mulf 操作 + for (auto &nestedOp : linalgOp.getRegion().front()) { + if (isa(nestedOp)) { + mulOp = linalgOp; + break; + } + } + + if (mulOp) + break; + } + + if (!mulOp) { + llvm::errs() << "Didn't find a consumer linalg.generic using sigmoid output with mulf.\n"; + return failure(); + } + + // Set the insertion point before the mulOp. This ensures that the new affine + // loop is inserted at a point that is dominated by the allocation of the + // output buffer. + // rewriter.setInsertionPoint(mulOp); + + // Now we have matched the silu pattern: sigmoid followed by a mul. + // The rewrite logic will be applied to the sigmoidOp, and the mulOp will be + // erased. + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(mulOp); + Location loc = sigmoidOp.getLoc(); + Value input = sigmoidOp.getDpsInputOperand(0)->get(); + // The final output buffer comes from the mulOp. + Value output = mulOp.getDpsInitOperand(0)->get(); + + auto inputMemRefType = input.getType().cast(); + Type elementType = inputMemRefType.getElementType(); + VectorType vectorType = VectorType::get({vectorSize}, elementType); + + // Define constants. + Value c0 = rewriter.create(loc, 0); + Value c1 = rewriter.create(loc, 1); + const int64_t unrollFactor = 2; + Value cUnrollVec = + rewriter.create(loc, vectorSize * unrollFactor); + Value cst1f = rewriter.create( + loc, rewriter.getF32FloatAttr(1.0)); + Value vec1f = rewriter.create(loc, vectorType, cst1f); + + // Get dimensions. + Value d0 = rewriter.create(loc, input, 0); + Value d1 = rewriter.create(loc, input, 1); + Value d2 = rewriter.create(loc, input, 2); + + // Create loop nest. 
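+    // The builder calls below materialize roughly:
+    //   for i, for j, for k step (vectorSize * unrollFactor):
+    //     prefetch input[i][j][k + vectorSize * unrollFactor]
+    //     unrollFactor vector bodies computing x * (1 / (1 + exp(-x)))
+    // No scalar epilogue is emitted, so the innermost extent is assumed to be
+    // a multiple of vectorSize * unrollFactor.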
+ scf::ForOp iLoop = rewriter.create(loc, c0, d0, c1); + rewriter.setInsertionPointToStart(iLoop.getBody()); + Value iv_i = iLoop.getInductionVar(); + + scf::ForOp jLoop = rewriter.create(loc, c0, d1, c1); + rewriter.setInsertionPointToStart(jLoop.getBody()); + Value iv_j = jLoop.getInductionVar(); + + scf::ForOp kLoop = rewriter.create(loc, c0, d2, cUnrollVec); + rewriter.setInsertionPointToStart(kLoop.getBody()); + Value iv_k = kLoop.getInductionVar(); + + // Prefetch + Value k_next = rewriter.create(loc, iv_k, cUnrollVec); + rewriter.create(loc, input, ValueRange{iv_i, iv_j, k_next}, + /*isWrite=*/false, /*localityHint=*/3, + /*isDataCache=*/true); + + // Unrolled loop body + for (int i = 0; i < unrollFactor; ++i) { + Value k_offset = + rewriter.create(loc, i * vectorSize); + Value k_i = rewriter.create(loc, iv_k, k_offset); + + // --- Process Vector --- + Value x_vec = rewriter.create( + loc, vectorType, input, ValueRange{iv_i, iv_j, k_i}); + Value neg_x_vec = rewriter.create(loc, x_vec); + Value exp_neg_x_vec = rewriter.create(loc, neg_x_vec); + Value one_plus_exp_vec = + rewriter.create(loc, vec1f, exp_neg_x_vec); + Value sigmoid_x_vec = + rewriter.create(loc, vec1f, one_plus_exp_vec); + Value silu_vec = rewriter.create(loc, x_vec, sigmoid_x_vec); + rewriter.create(loc, silu_vec, output, + ValueRange{iv_i, iv_j, k_i}); + } + + // Replace the original mulOp with the result from our new computation. + // The 'output' buffer now holds the final result. `replaceOp` will + // replace all uses of mulOp's results with `output` and then erase mulOp. + rewriter.eraseOp(mulOp); + rewriter.eraseOp(sigmoidOp); + + return success(); + } + +private: + int64_t vectorSize; +}; + +//===----------------------------------------------------------------------===// +// Pass +//===----------------------------------------------------------------------===// + +class SiluOptimizationPass + : public PassWrapper> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SiluOptimizationPass) + StringRef getArgument() const final { return "silu-optimization"; } + StringRef getDescription() const final { + return "Vectorize linalg.generic representing SiLU."; + } + SiluOptimizationPass() = default; + SiluOptimizationPass(const SiluOptimizationPass &) {} + explicit SiluOptimizationPass(int64_t vectorSizeParam) { + vectorSize = vectorSizeParam; + } + + void runOnOperation() override; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + Option vectorSize{*this, "vector-size", + llvm::cl::desc("Vector size for SiLU."), + llvm::cl::init(8)}; +}; + +void SiluOptimizationPass::runOnOperation() { + MLIRContext *context = &getContext(); + ModuleOp module = getOperation(); + + ConversionTarget target(*context); + target.addLegalDialect(); + target.addLegalOp(); + + // We will manually mark linalg.generic as illegal if it is part of a SiLU pattern. + // The pattern itself will handle the legality checks and replacements. + // Therefore, we don't need to addIllegalOp() here. 
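+  // With applyPartialConversion, ops whose legality is left unspecified are
+  // rewritten when a pattern matches and kept as-is otherwise, so any
+  // linalg.generic that is not part of a SiLU pair is simply left untouched.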
+ + RewritePatternSet patterns(context); + patterns.add(context, vectorSize); + + if (failed(applyPartialConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} +} // end anonymous namespace +namespace mlir { +namespace buddy { +void registerSiluOptimizationPass() { + PassRegistration(); +} +} // namespace buddy +} // namespace mlir diff --git a/midend/lib/InitAll.cpp b/midend/lib/InitAll.cpp index 7d8929e055..8c2ab1de56 100644 --- a/midend/lib/InitAll.cpp +++ b/midend/lib/InitAll.cpp @@ -45,6 +45,7 @@ void registerMatMulOptimizePass(); void registerMatMulParallelVectorizationPass(); void registerMatMulVectorizationPass(); void registerTransposeOptimizationPass(); +void registerSiluOptimizationPass(); } // namespace buddy } // namespace mlir @@ -74,4 +75,5 @@ void mlir::buddy::registerAllPasses() { mlir::buddy::registerMatMulParallelVectorizationPass(); mlir::buddy::registerMatMulVectorizationPass(); mlir::buddy::registerTransposeOptimizationPass(); + mlir::buddy::registerSiluOptimizationPass(); } From 5514650eea891264cfb55609ad264b1133e11191 Mon Sep 17 00:00:00 2001 From: guoninghuang Date: Sat, 21 Jun 2025 16:14:34 +0800 Subject: [PATCH 2/4] Add Silu optimization and test --- midend/lib/Conversion/CMakeLists.txt | 1 - .../SiluOptimization/SiluOptimization.cpp | 147 ++++++++++-------- tests/Conversion/silu-optimization.mlir | 56 +++++++ 3 files changed, 134 insertions(+), 70 deletions(-) create mode 100644 tests/Conversion/silu-optimization.mlir diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt index f78ad95b3c..48fc3d8aa5 100644 --- a/midend/lib/Conversion/CMakeLists.txt +++ b/midend/lib/Conversion/CMakeLists.txt @@ -14,5 +14,4 @@ add_subdirectory(LowerLinalgToGemmini) add_subdirectory(FuncBufferize) add_subdirectory(DepthwiseConvOptimization) add_subdirectory(MLIRGPU) -add_subdirectory(MLIRGPU) add_subdirectory(SiluOptimization) diff --git a/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp b/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp index 62b428320a..f036001649 100644 --- a/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp +++ b/midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp @@ -1,3 +1,24 @@ +//====- SiluOptimization.cpp - Silu Optimization Pass ---------------------===// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass that vectorizes the linalg.generic representing +// SiLU. 
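+// The pass matches a lowered tosa.sigmoid (a linalg.generic whose body is
+// negf / exp / addf / divf) followed by the multiplying linalg.generic and
+// replaces the pair with a single vectorized affine loop nest.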
+// +//===----------------------------------------------------------------------===// + #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -13,8 +34,6 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" - - using namespace mlir; namespace { @@ -35,40 +54,40 @@ class SiLUVectorizePattern : public ConversionPattern { ConversionPatternRewriter &rewriter) const override { linalg::GenericOp sigmoidOp = cast(op); - + //--------------sigmoid OP-------- - // 1. Check input/output - if (sigmoidOp.getNumDpsInputs() != 1 || sigmoidOp.getNumDpsInits() != 1){ - llvm::errs() << "1\n"; - return failure();} + // Check input/output + if (sigmoidOp.getNumDpsInputs() != 1 || sigmoidOp.getNumDpsInits() != 1) { + return failure(); + } // Check the body of the op for sigmoid computation. // The IR should be: negf, exp, addf, divf, yield. Block &block = sigmoidOp.getRegion().front(); - if (block.getOperations().size() != 5) // negf, exp, addf, divf, yield - {llvm::errs() << "4\n"; - return failure();} + if (block.getOperations().size() != 5) { // negf, exp, addf, divf, yield + return failure(); + } Operation &negfOp = block.getOperations().front(); Operation &yieldOp = block.getOperations().back(); // Check the type of the two operations. - if (!isa(negfOp) || !isa(yieldOp)) - {llvm::errs() << "5\n"; - return failure();} - + if (!isa(negfOp) || !isa(yieldOp)) { + return failure(); + } //-----------Find the consumer mul operation.------------------------------ // The result of the sigmoid op must be used by another linalg.generic op. Value outputBuffer = sigmoidOp.getDpsInitOperand(0)->get(); - // 遍历所有 uses,寻找满足条件的 consumer op + // Iterate over all uses to find a suitable consumer op. linalg::GenericOp mulOp = nullptr; for (auto &use : outputBuffer.getUses()) { Operation *user = use.getOwner(); - // 要求是 linalg.generic,且 %alloc 是 input operand(即 ins()) + // It must be a linalg.generic, and the buffer must be an input operand + // (i.e., ins()). auto linalgOp = dyn_cast(user); if (!linalgOp) continue; @@ -83,7 +102,7 @@ class SiLUVectorizePattern : public ConversionPattern { if (!foundInInput) continue; - // 检查其内部是否有 arith.mulf 操作 + // Check if it contains an arith.mulf operation inside. for (auto &nestedOp : linalgOp.getRegion().front()) { if (isa(nestedOp)) { mulOp = linalgOp; @@ -96,14 +115,14 @@ class SiLUVectorizePattern : public ConversionPattern { } if (!mulOp) { - llvm::errs() << "Didn't find a consumer linalg.generic using sigmoid output with mulf.\n"; + llvm::errs() << "Didn't find a consumer linalg.generic using sigmoid " + "output with mulf.\n"; return failure(); } - // Set the insertion point before the mulOp. This ensures that the new affine - // loop is inserted at a point that is dominated by the allocation of the - // output buffer. - // rewriter.setInsertionPoint(mulOp); + // Set the insertion point before the mulOp. This ensures that the new + // affine loop is inserted at a point that is dominated by the allocation of + // the output buffer. rewriter.setInsertionPoint(mulOp); // Now we have matched the silu pattern: sigmoid followed by a mul. // The rewrite logic will be applied to the sigmoidOp, and the mulOp will be @@ -122,13 +141,11 @@ class SiLUVectorizePattern : public ConversionPattern { // Define constants. 
Value c0 = rewriter.create(loc, 0); - Value c1 = rewriter.create(loc, 1); - const int64_t unrollFactor = 2; - Value cUnrollVec = - rewriter.create(loc, vectorSize * unrollFactor); - Value cst1f = rewriter.create( - loc, rewriter.getF32FloatAttr(1.0)); + Value cst1f = + rewriter.create(loc, rewriter.getF32FloatAttr(1.0)); Value vec1f = rewriter.create(loc, vectorType, cst1f); + Value cst0f = + rewriter.create(loc, rewriter.getF32FloatAttr(0.0f)); // Get dimensions. Value d0 = rewriter.create(loc, input, 0); @@ -136,43 +153,34 @@ class SiLUVectorizePattern : public ConversionPattern { Value d2 = rewriter.create(loc, input, 2); // Create loop nest. - scf::ForOp iLoop = rewriter.create(loc, c0, d0, c1); + AffineMap map = rewriter.getDimIdentityMap(); + affine::AffineForOp iLoop = rewriter.create( + loc, ValueRange{c0}, map, ValueRange{d0}, map); rewriter.setInsertionPointToStart(iLoop.getBody()); Value iv_i = iLoop.getInductionVar(); - scf::ForOp jLoop = rewriter.create(loc, c0, d1, c1); + affine::AffineForOp jLoop = rewriter.create( + loc, ValueRange{c0}, map, ValueRange{d1}, map); rewriter.setInsertionPointToStart(jLoop.getBody()); Value iv_j = jLoop.getInductionVar(); - scf::ForOp kLoop = rewriter.create(loc, c0, d2, cUnrollVec); + affine::AffineForOp kLoop = rewriter.create( + loc, ValueRange{c0}, map, ValueRange{d2}, map, vectorSize); rewriter.setInsertionPointToStart(kLoop.getBody()); Value iv_k = kLoop.getInductionVar(); - // Prefetch - Value k_next = rewriter.create(loc, iv_k, cUnrollVec); - rewriter.create(loc, input, ValueRange{iv_i, iv_j, k_next}, - /*isWrite=*/false, /*localityHint=*/3, - /*isDataCache=*/true); - - // Unrolled loop body - for (int i = 0; i < unrollFactor; ++i) { - Value k_offset = - rewriter.create(loc, i * vectorSize); - Value k_i = rewriter.create(loc, iv_k, k_offset); - - // --- Process Vector --- - Value x_vec = rewriter.create( - loc, vectorType, input, ValueRange{iv_i, iv_j, k_i}); - Value neg_x_vec = rewriter.create(loc, x_vec); - Value exp_neg_x_vec = rewriter.create(loc, neg_x_vec); - Value one_plus_exp_vec = - rewriter.create(loc, vec1f, exp_neg_x_vec); - Value sigmoid_x_vec = - rewriter.create(loc, vec1f, one_plus_exp_vec); - Value silu_vec = rewriter.create(loc, x_vec, sigmoid_x_vec); - rewriter.create(loc, silu_vec, output, - ValueRange{iv_i, iv_j, k_i}); - } + // --- Process Vector --- + Value x_vec = rewriter.create( + loc, vectorType, input, ValueRange{iv_i, iv_j, iv_k}, cst0f); + Value neg_x_vec = rewriter.create(loc, x_vec); + Value exp_neg_x_vec = rewriter.create(loc, neg_x_vec); + Value one_plus_exp_vec = + rewriter.create(loc, vec1f, exp_neg_x_vec); + Value sigmoid_x_vec = + rewriter.create(loc, vec1f, one_plus_exp_vec); + Value silu_vec = rewriter.create(loc, x_vec, sigmoid_x_vec); + rewriter.create(loc, silu_vec, output, + ValueRange{iv_i, iv_j, iv_k}); // Replace the original mulOp with the result from our new computation. // The 'output' buffer now holds the final result. 
`replaceOp` will @@ -214,8 +222,8 @@ class SiluOptimizationPass } Option vectorSize{*this, "vector-size", - llvm::cl::desc("Vector size for SiLU."), - llvm::cl::init(8)}; + llvm::cl::desc("Vector size for SiLU."), + llvm::cl::init(8)}; }; void SiluOptimizationPass::runOnOperation() { @@ -223,27 +231,28 @@ void SiluOptimizationPass::runOnOperation() { ModuleOp module = getOperation(); ConversionTarget target(*context); - target.addLegalDialect(); + target + .addLegalDialect(); target.addLegalOp(); - - // We will manually mark linalg.generic as illegal if it is part of a SiLU pattern. - // The pattern itself will handle the legality checks and replacements. - // Therefore, we don't need to addIllegalOp() here. - + + // We will manually mark linalg.generic as illegal if it is part of a SiLU + // pattern. The pattern itself will handle the legality checks and + // replacements. Therefore, we don't need to addIllegalOp() + // here. + RewritePatternSet patterns(context); patterns.add(context, vectorSize); if (failed(applyPartialConversion(module, target, std::move(patterns)))) signalPassFailure(); } -} // end anonymous namespace +} // end anonymous namespace namespace mlir { namespace buddy { void registerSiluOptimizationPass() { PassRegistration(); } } // namespace buddy -} // namespace mlir +} // namespace mlir \ No newline at end of file diff --git a/tests/Conversion/silu-optimization.mlir b/tests/Conversion/silu-optimization.mlir new file mode 100644 index 0000000000..a6cbc8c169 --- /dev/null +++ b/tests/Conversion/silu-optimization.mlir @@ -0,0 +1,56 @@ +// RUN: buddy-opt -silu-optimization="vector-size=8" %s | FileCheck %s + +// CHECK: #map = affine_map<(d0) -> (d0)> +// CHECK: module { +// CHECK: func.func @silu_tosa(%arg0: memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>) -> memref<1x40x8960xf32> { +// CHECK:%cst = arith.constant 1.000000e+00 : f32 +// CHECK: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32> +// CHECK: %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32> +// CHECK: %c0 = arith.constant 0 : index +// CHECK: %cst_1 = arith.constant 1.000000e+00 : f32 +// CHECK: %0 = vector.broadcast %cst_1 : f32 to vector<8xf32> +// CHECK: %cst_2 = arith.constant 0.000000e+00 : f32 +// CHECK: %c0_3 = arith.constant 0 : index +// CHECK: %dim = memref.dim %arg0, %c0_3 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>> +// CHECK: %c1 = arith.constant 1 : index +// CHECK: %dim_4 = memref.dim %arg0, %c1 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>> +// CHECK: %c2 = arith.constant 2 : index +// CHECK: %dim_5 = memref.dim %arg0, %c2 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>> +// CHECK: affine.for %arg1 = #map(%c0) to #map(%dim) { +// CHECK: affine.for %arg2 = #map(%c0) to #map(%dim_4) { +// CHECK: affine.for %arg3 = #map(%c0) to #map(%dim_5) step 8 { +// CHECK: %1 = vector.transfer_read %arg0[%arg1, %arg2, %arg3], %cst_2 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>, vector<8xf32> +// CHECK: %2 = arith.negf %1 : vector<8xf32> +// CHECK: %3 = math.exp %2 : vector<8xf32> +// CHECK: %4 = arith.addf %0, %3 : vector<8xf32> +// CHECK: %5 = arith.divf %0, %4 : vector<8xf32> +// CHECK: %6 = arith.mulf %1, %5 : vector<8xf32> +// CHECK: vector.transfer_write %6, %alloc_0[%arg1, %arg2, %arg3] : vector<8xf32>, memref<1x40x8960xf32> +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: return %alloc_0 : memref<1x40x8960xf32> +// CHECK: } +// CHECK: } + +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> 
(d0, d1, d2)> +func.func @silu_tosa(%arg0: memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>) -> memref<1x40x8960xf32> { + %cst = arith.constant 1.000000e+00 : f32 + %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32> + linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<1x40x8960xf32>) { + ^bb0(%in: f32, %out: f32): + %3 = arith.negf %in : f32 + %4 = math.exp %3 : f32 + %5 = arith.addf %4, %cst : f32 + %6 = arith.divf %cst, %5 : f32 + linalg.yield %6 : f32 + } + %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32> + linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0, %alloc : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>, memref<1x40x8960xf32>) outs(%alloc_0 : memref<1x40x8960xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %3 = arith.mulf %in, %in_1 : f32 + linalg.yield %3 : f32 + } + return %alloc_0 : memref<1x40x8960xf32> +} From 2c6a446d340e3bf1472d728c8872f1dabbb66206 Mon Sep 17 00:00:00 2001 From: guoninghuang Date: Tue, 22 Jul 2025 18:21:54 +0800 Subject: [PATCH 3/4] silu pass and test --- examples/BuddyNext/next-silu.mlir | 71 ++++++++++++++++++------------- tools/buddy-opt/CMakeLists.txt | 1 + tools/buddy-opt/buddy-opt.cpp | 2 + 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/examples/BuddyNext/next-silu.mlir b/examples/BuddyNext/next-silu.mlir index b63caa8e51..597b003cbc 100644 --- a/examples/BuddyNext/next-silu.mlir +++ b/examples/BuddyNext/next-silu.mlir @@ -1,53 +1,64 @@ -// next-silu.mlir -// -// Implements the SiLU (Sigmoid Linear Unit) activation function. -// SiLU(x) = x * sigmoid(x) -// This is realized by composing a tosa.sigmoid and a tosa.mul operation. -// -module { - // Declare external utility functions for timing and printing. +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -lower-affine \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-cf-to-llvm \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s func.func private @rtclock() -> f64 - func.func private @printMemrefF32(%ptr : tensor<*xf32>) - // - // The kernel function that performs the SiLU calculation. - // - func.func @kernel_silu(%arg0: tensor<1x40x8960xf32>) { + func.func @kenerl(%arg0: tensor<1x40x8960xf32>) { %t_start = call @rtclock() : () -> f64 - // Step 1: Calculate the sigmoid of the input. 
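+    // SiLU(x) = x * sigmoid(x); with the constant 3.0 input every element is
+    // 3.0 * sigmoid(3.0) = 2.857722, which the CHECK lines below verify.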
%sigmoid_x = tosa.sigmoid %arg0 : (tensor<1x40x8960xf32>) -> tensor<1x40x8960xf32> - // Step 2: Multiply the original input with its sigmoid. - // This is the SiLU operation. {shift=0} is standard for float multiplication. %silu_result = tosa.mul %arg0, %sigmoid_x {shift = 0 : i8} : (tensor<1x40x8960xf32>, tensor<1x40x8960xf32>) -> tensor<1x40x8960xf32> %t_end = call @rtclock() : () -> f64 %time = arith.subf %t_end, %t_start : f64 - // Cast the result to an unranked tensor for the print function. %unranked_result = tensor.cast %silu_result : tensor<1x40x8960xf32> to tensor<*xf32> - - // Print the result tensor to verify correctness. + + // All the elements of the MemRef are the same, + // only check the first line to verify the correctness. + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 8960] strides = [358400, 8960, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [2.85772{{(, 2.85772)*}}], + + // print results. call @printMemrefF32(%unranked_result) : (tensor<*xf32>) -> () - - // Print the execution time. + // print timings. vector.print %time : f64 return } - // - // Main function to drive the test. - // func.func @main() { - // Initialize input with a constant value (e.g., 3.0). - // The SiLU of 3.0 is 3.0 * sigmoid(3.0) = 3.0 * 0.952574 = 2.857722 %input_tensor = arith.constant dense<3.0> : tensor<1x40x8960xf32> - - // Call the SiLU kernel. - call @kernel_silu(%input_tensor) : (tensor<1x40x8960xf32>) -> () + call @kenerl(%input_tensor) : (tensor<1x40x8960xf32>) -> () return } -} + func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file diff --git a/tools/buddy-opt/CMakeLists.txt b/tools/buddy-opt/CMakeLists.txt index 46fcfb4a57..19a9ba9724 100644 --- a/tools/buddy-opt/CMakeLists.txt +++ b/tools/buddy-opt/CMakeLists.txt @@ -42,4 +42,5 @@ target_link_libraries(buddy-opt MLIRTransforms MLIRTransformUtils MatMulTransposeBVec + SiluOptimization ) diff --git a/tools/buddy-opt/buddy-opt.cpp b/tools/buddy-opt/buddy-opt.cpp index fb668a391a..a389cd8f22 100644 --- a/tools/buddy-opt/buddy-opt.cpp +++ b/tools/buddy-opt/buddy-opt.cpp @@ -82,6 +82,7 @@ void registerConvertMemcpyToGPUPass(); void registerLegalizeShmemOutliningPass(); void registerMatMulTransposeBVecPass(); void registerLegalizeShmemOutliningPass(); +void registerSiluOptimizationPass(); } // namespace buddy } // namespace mlir @@ -121,6 +122,7 @@ int main(int argc, char **argv) { mlir::buddy::registerDepthwiseConv2DNhwcHwcOptimizePass(); mlir::buddy::registerFuncBufferizeDynamicOffsetPass(); mlir::buddy::registerMatMulTransposeBVecPass(); + mlir::buddy::registerSiluOptimizationPass(); // Register gpu passes mlir::buddy::registerConvertMemcpyToGPUPass(); From d7442facb72630081022337823f77f63356f6180 Mon Sep 17 00:00:00 2001 From: GuoningHuang <2985186238@qq.com> Date: Fri, 1 Aug 2025 15:27:46 +0800 Subject: [PATCH 4/4] fix check failed caused by next-silu-silu.mlir --- examples/BuddyNext/makefile | 13 +++--- examples/BuddyNext/next-silu-silu.mlir | 59 +++++++++++++------------- examples/BuddyNext/next-silu.mlir | 2 +- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile index c464d7d85a..641f4c62e8 100644 --- a/examples/BuddyNext/makefile +++ b/examples/BuddyNext/makefile @@ -23,15 +23,10 @@ next-silu-run: -arith-expand \ -eliminate-empty-tensors \ -empty-tensor-to-alloc-tensor \ - -one-shot-bufferize \ + -one-shot-bufferize="bufferize-function-boundaries" \ -convert-linalg-to-affine-loops \ -affine-loop-fusion 
\ -lower-affine \ - -func-bufferize \ - -arith-bufferize \ - -tensor-bufferize \ - -buffer-deallocation \ - -finalizing-bufferize \ -convert-vector-to-scf \ -expand-strided-metadata \ -convert-vector-to-llvm \ @@ -40,10 +35,11 @@ next-silu-run: -convert-arith-to-llvm \ -finalize-memref-to-llvm \ -convert-scf-to-cf \ + -convert-cf-to-llvm \ -convert-openmp-to-llvm \ -convert-arith-to-llvm \ -convert-math-to-llvm \ - -convert-math-to-libm \ + -convert-math-to-libm \ -convert-func-to-llvm \ -reconcile-unrealized-casts | \ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \ @@ -51,10 +47,13 @@ next-silu-run: next-silu-silu-run: @${MLIR_OPT} ./next-silu-silu.mlir \ + -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" | \ + ${MLIR_OPT} \ -convert-linalg-to-loops \ -lower-affine \ -convert-vector-to-scf \ -convert-scf-to-cf \ + -convert-cf-to-llvm \ -convert-vector-to-llvm \ -finalize-memref-to-llvm \ -convert-math-to-llvm \ diff --git a/examples/BuddyNext/next-silu-silu.mlir b/examples/BuddyNext/next-silu-silu.mlir index 7d0ab1ce2f..d09e071d23 100644 --- a/examples/BuddyNext/next-silu-silu.mlir +++ b/examples/BuddyNext/next-silu-silu.mlir @@ -1,24 +1,30 @@ -// next-silu-silu.mlir -// -// Manually optimized version of SiLU. -// This implementation avoids TOSA and uses lower-level dialects like -// scf, arith, math, and vector to have fine-grained control over the computation. -// The core idea is to vectorize the innermost loop. -// +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-tensor,tosa-to-arith))" \ +// RUN: | buddy-opt \ +// RUN: -convert-linalg-to-loops \ +// RUN: -lower-affine \ +// RUN: -convert-vector-to-scf \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-cf-to-llvm \ +// RUN: -convert-vector-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s #map = affine_map<(d0) -> (d0)> -module { - // Declare external utility functions for timing and printing. func.func private @rtclock() -> f64 func.func private @printMemrefF32(%ptr: memref<*xf32>) attributes {llvm.emit_c_interface} - // The kernel function that performs the SiLU calculation. - func.func @kernel_silu(%arg0: memref<1x40x8960xf32>) { + func.func @kernel(%arg0: memref<1x40x8960xf32>) { %t_start = call @rtclock() : () -> f64 - - // Allocate output buffer + %output = memref.alloc() : memref<1x40x8960xf32> - - // Constants + %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index @@ -26,16 +32,13 @@ module { %vec_1f = vector.broadcast %cst_1f : f32 to vector<8xf32> %cst_0f = arith.constant 0.0 : f32 // for padding - // Get dimensions. 
%d0 = memref.dim %arg0, %c0 : memref<1x40x8960xf32> %d1 = memref.dim %arg0, %c1 : memref<1x40x8960xf32> %d2 = memref.dim %arg0, %c2 : memref<1x40x8960xf32> - // Loop nest affine.for %i = #map(%c0) to #map(%d0) { affine.for %j = #map(%c0) to #map(%d1) { affine.for %k = #map(%c0) to #map(%d2) step 8 { - // Vectorized SiLU computation %x_vec = vector.transfer_read %arg0[%i, %j, %k], %cst_0f : memref<1x40x8960xf32>, vector<8xf32> %neg_x_vec = arith.negf %x_vec : vector<8xf32> %exp_neg_x_vec = math.exp %neg_x_vec : vector<8xf32> @@ -49,30 +52,26 @@ module { %t_end = call @rtclock() : () -> f64 %unranked_result = memref.cast %output : memref<1x40x8960xf32> to memref<*xf32> + // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 8960] strides = [358400, 8960, 1] data = + // CHECK-NEXT: [ + // CHECK-SAME: [2.85772{{(, 2.85772)*}}], call @printMemrefF32(%unranked_result) : (memref<*xf32>) -> () memref.dealloc %output : memref<1x40x8960xf32> - + %time = arith.subf %t_end, %t_start : f64 vector.print %time : f64 - - return + + return } - // Main function to drive the test. func.func @main() { - // 1. Allocate and initialize input. %input = memref.alloc() : memref<1x40x8960xf32> %cst_neg_1_23 = arith.constant 3.0 : f32 linalg.fill ins(%cst_neg_1_23 : f32) outs(%input : memref<1x40x8960xf32>) - // 2. Call the SiLU kernel. - call @kernel_silu(%input) : (memref<1x40x8960xf32>) -> () - + call @kernel(%input) : (memref<1x40x8960xf32>) -> () - // 4. Deallocate memrefs. memref.dealloc %input : memref<1x40x8960xf32> - + return } -} - diff --git a/examples/BuddyNext/next-silu.mlir b/examples/BuddyNext/next-silu.mlir index 597b003cbc..de6a927ce4 100644 --- a/examples/BuddyNext/next-silu.mlir +++ b/examples/BuddyNext/next-silu.mlir @@ -61,4 +61,4 @@ return } - func.func private @printMemrefF32(%ptr : tensor<*xf32>) \ No newline at end of file + func.func private @printMemrefF32(%ptr : tensor<*xf32>)