Commit a42eded

Add Silu optimization and test
1 parent 366073f commit a42eded

File tree

3 files changed: +134 -70 lines changed


midend/lib/Conversion/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -16,5 +16,4 @@ add_subdirectory(LowerSche)
 add_subdirectory(FuncBufferize)
 add_subdirectory(DepthwiseConvOptimization)
 add_subdirectory(MLIRGPU)
-add_subdirectory(MLIRGPU)
 add_subdirectory(SiluOptimization)
midend/lib/Conversion/SiluOptimization/SiluOptimization.cpp

Lines changed: 78 additions & 69 deletions
@@ -1,3 +1,24 @@
+//====- SiluOptimization.cpp - Silu Optimization Pass ---------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass that vectorizes the linalg.generic representing
+// SiLU.
+//
+//===----------------------------------------------------------------------===//
+
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -13,8 +34,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
 
-
-
 using namespace mlir;
 
 namespace {
@@ -35,40 +54,40 @@ class SiLUVectorizePattern : public ConversionPattern {
                   ConversionPatternRewriter &rewriter) const override {
 
     linalg::GenericOp sigmoidOp = cast<linalg::GenericOp>(op);
-
+
     //--------------sigmoid OP--------
-    // 1. Check input/output
-    if (sigmoidOp.getNumDpsInputs() != 1 || sigmoidOp.getNumDpsInits() != 1){
-      llvm::errs() << "1\n";
-      return failure();}
+    // Check input/output
+    if (sigmoidOp.getNumDpsInputs() != 1 || sigmoidOp.getNumDpsInits() != 1) {
+      return failure();
+    }
 
     // Check the body of the op for sigmoid computation.
     // The IR should be: negf, exp, addf, divf, yield.
     Block &block = sigmoidOp.getRegion().front();
-    if (block.getOperations().size() != 5) // negf, exp, addf, divf, yield
-    {llvm::errs() << "4\n";
-    return failure();}
+    if (block.getOperations().size() != 5) { // negf, exp, addf, divf, yield
+      return failure();
+    }
 
     Operation &negfOp = block.getOperations().front();
     Operation &yieldOp = block.getOperations().back();
 
     // Check the type of the two operations.
-    if (!isa<arith::NegFOp>(negfOp) || !isa<linalg::YieldOp>(yieldOp))
-    {llvm::errs() << "5\n";
-    return failure();}
-
+    if (!isa<arith::NegFOp>(negfOp) || !isa<linalg::YieldOp>(yieldOp)) {
+      return failure();
+    }
 
     //-----------Find the consumer mul operation.------------------------------
     // The result of the sigmoid op must be used by another linalg.generic op.
    Value outputBuffer = sigmoidOp.getDpsInitOperand(0)->get();
 
-    // 遍历所有 uses,寻找满足条件的 consumer op
+    // Iterate over all uses to find a suitable consumer op.
     linalg::GenericOp mulOp = nullptr;
 
     for (auto &use : outputBuffer.getUses()) {
       Operation *user = use.getOwner();
 
-      // 要求是 linalg.generic,且 %alloc 是 input operand(即 ins())
+      // It must be a linalg.generic, and the buffer must be an input operand
+      // (i.e., ins()).
       auto linalgOp = dyn_cast<linalg::GenericOp>(user);
       if (!linalgOp)
         continue;
@@ -83,7 +102,7 @@ class SiLUVectorizePattern : public ConversionPattern {
       if (!foundInInput)
         continue;
 
-      // 检查其内部是否有 arith.mulf 操作
+      // Check if it contains an arith.mulf operation inside.
      for (auto &nestedOp : linalgOp.getRegion().front()) {
         if (isa<arith::MulFOp>(nestedOp)) {
           mulOp = linalgOp;
@@ -96,14 +115,14 @@ class SiLUVectorizePattern : public ConversionPattern {
     }
 
     if (!mulOp) {
-      llvm::errs() << "Didn't find a consumer linalg.generic using sigmoid output with mulf.\n";
+      llvm::errs() << "Didn't find a consumer linalg.generic using sigmoid "
+                      "output with mulf.\n";
       return failure();
     }
 
-    // Set the insertion point before the mulOp. This ensures that the new affine
-    // loop is inserted at a point that is dominated by the allocation of the
-    // output buffer.
-    // rewriter.setInsertionPoint(mulOp);
+    // Set the insertion point before the mulOp. This ensures that the new
+    // affine loop is inserted at a point that is dominated by the allocation of
+    // the output buffer. rewriter.setInsertionPoint(mulOp);
 
     // Now we have matched the silu pattern: sigmoid followed by a mul.
     // The rewrite logic will be applied to the sigmoidOp, and the mulOp will be
@@ -122,57 +141,46 @@ class SiLUVectorizePattern : public ConversionPattern {
 
     // Define constants.
     Value c0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-    Value c1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
-    const int64_t unrollFactor = 2;
-    Value cUnrollVec =
-        rewriter.create<arith::ConstantIndexOp>(loc, vectorSize * unrollFactor);
-    Value cst1f = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getF32FloatAttr(1.0));
+    Value cst1f =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(1.0));
     Value vec1f = rewriter.create<vector::BroadcastOp>(loc, vectorType, cst1f);
+    Value cst0f =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(0.0f));
 
     // Get dimensions.
     Value d0 = rewriter.create<memref::DimOp>(loc, input, 0);
     Value d1 = rewriter.create<memref::DimOp>(loc, input, 1);
     Value d2 = rewriter.create<memref::DimOp>(loc, input, 2);
 
     // Create loop nest.
-    scf::ForOp iLoop = rewriter.create<scf::ForOp>(loc, c0, d0, c1);
+    AffineMap map = rewriter.getDimIdentityMap();
+    affine::AffineForOp iLoop = rewriter.create<affine::AffineForOp>(
+        loc, ValueRange{c0}, map, ValueRange{d0}, map);
     rewriter.setInsertionPointToStart(iLoop.getBody());
     Value iv_i = iLoop.getInductionVar();
 
-    scf::ForOp jLoop = rewriter.create<scf::ForOp>(loc, c0, d1, c1);
+    affine::AffineForOp jLoop = rewriter.create<affine::AffineForOp>(
+        loc, ValueRange{c0}, map, ValueRange{d1}, map);
     rewriter.setInsertionPointToStart(jLoop.getBody());
     Value iv_j = jLoop.getInductionVar();
 
-    scf::ForOp kLoop = rewriter.create<scf::ForOp>(loc, c0, d2, cUnrollVec);
+    affine::AffineForOp kLoop = rewriter.create<affine::AffineForOp>(
+        loc, ValueRange{c0}, map, ValueRange{d2}, map, vectorSize);
     rewriter.setInsertionPointToStart(kLoop.getBody());
     Value iv_k = kLoop.getInductionVar();
 
-    // Prefetch
-    Value k_next = rewriter.create<arith::AddIOp>(loc, iv_k, cUnrollVec);
-    rewriter.create<memref::PrefetchOp>(loc, input, ValueRange{iv_i, iv_j, k_next},
-                                        /*isWrite=*/false, /*localityHint=*/3,
-                                        /*isDataCache=*/true);
-
-    // Unrolled loop body
-    for (int i = 0; i < unrollFactor; ++i) {
-      Value k_offset =
-          rewriter.create<arith::ConstantIndexOp>(loc, i * vectorSize);
-      Value k_i = rewriter.create<arith::AddIOp>(loc, iv_k, k_offset);
-
-      // --- Process Vector ---
-      Value x_vec = rewriter.create<vector::LoadOp>(
-          loc, vectorType, input, ValueRange{iv_i, iv_j, k_i});
-      Value neg_x_vec = rewriter.create<arith::NegFOp>(loc, x_vec);
-      Value exp_neg_x_vec = rewriter.create<math::ExpOp>(loc, neg_x_vec);
-      Value one_plus_exp_vec =
-          rewriter.create<arith::AddFOp>(loc, vec1f, exp_neg_x_vec);
-      Value sigmoid_x_vec =
-          rewriter.create<arith::DivFOp>(loc, vec1f, one_plus_exp_vec);
-      Value silu_vec = rewriter.create<arith::MulFOp>(loc, x_vec, sigmoid_x_vec);
-      rewriter.create<vector::StoreOp>(loc, silu_vec, output,
-                                       ValueRange{iv_i, iv_j, k_i});
-    }
+    // --- Process Vector ---
+    Value x_vec = rewriter.create<vector::TransferReadOp>(
+        loc, vectorType, input, ValueRange{iv_i, iv_j, iv_k}, cst0f);
+    Value neg_x_vec = rewriter.create<arith::NegFOp>(loc, x_vec);
+    Value exp_neg_x_vec = rewriter.create<math::ExpOp>(loc, neg_x_vec);
+    Value one_plus_exp_vec =
+        rewriter.create<arith::AddFOp>(loc, vec1f, exp_neg_x_vec);
+    Value sigmoid_x_vec =
+        rewriter.create<arith::DivFOp>(loc, vec1f, one_plus_exp_vec);
+    Value silu_vec = rewriter.create<arith::MulFOp>(loc, x_vec, sigmoid_x_vec);
+    rewriter.create<vector::TransferWriteOp>(loc, silu_vec, output,
+                                             ValueRange{iv_i, iv_j, iv_k});
 
     // Replace the original mulOp with the result from our new computation.
     // The 'output' buffer now holds the final result. `replaceOp` will
@@ -214,36 +222,37 @@ class SiluOptimizationPass
   }
 
   Option<int64_t> vectorSize{*this, "vector-size",
-                            llvm::cl::desc("Vector size for SiLU."),
-                            llvm::cl::init(8)};
+                             llvm::cl::desc("Vector size for SiLU."),
+                             llvm::cl::init(8)};
 };
 
 void SiluOptimizationPass::runOnOperation() {
   MLIRContext *context = &getContext();
   ModuleOp module = getOperation();
 
   ConversionTarget target(*context);
-  target.addLegalDialect<arith::ArithDialect, affine::AffineDialect,
-                         memref::MemRefDialect, vector::VectorDialect,
-                         func::FuncDialect, math::MathDialect,
-                         scf::SCFDialect>();
+  target
+      .addLegalDialect<arith::ArithDialect, affine::AffineDialect,
+                       memref::MemRefDialect, vector::VectorDialect,
+                       func::FuncDialect, math::MathDialect, scf::SCFDialect>();
   target.addLegalOp<ModuleOp, func::FuncOp>();
-
-  // We will manually mark linalg.generic as illegal if it is part of a SiLU pattern.
-  // The pattern itself will handle the legality checks and replacements.
-  // Therefore, we don't need to addIllegalOp<linalg::GenericOp>() here.
-
+
+  // We will manually mark linalg.generic as illegal if it is part of a SiLU
+  // pattern. The pattern itself will handle the legality checks and
+  // replacements. Therefore, we don't need to addIllegalOp<linalg::GenericOp>()
+  // here.
+
  RewritePatternSet patterns(context);
  patterns.add<SiLUVectorizePattern>(context, vectorSize);
 
  if (failed(applyPartialConversion(module, target, std::move(patterns))))
    signalPassFailure();
 }
-} // end anonymous namespace
+} // end anonymous namespace
 namespace mlir {
 namespace buddy {
 void registerSiluOptimizationPass() {
   PassRegistration<SiluOptimizationPass>();
 }
 } // namespace buddy
-} // namespace mlir
+} // namespace mlir
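For reference, the pattern matched above (negf, exp, addf, divf, linalg.yield, followed by a consumer linalg.generic containing arith.mulf) and the vector body emitted in its place both evaluate the SiLU activation; the identity the pass relies on is

    sigmoid(x) = 1 / (1 + exp(-x))
    SiLU(x)    = x * sigmoid(x) = x / (1 + exp(-x))

Inside the innermost affine loop, the transfer_read / negf / exp / addf / divf / mulf / transfer_write chain is the vectorSize-wide form of exactly this expression: vec1f supplies the broadcast constant 1.0, and cst0f is the padding value used by vector.transfer_read for out-of-bounds lanes.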
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+// RUN: buddy-opt -silu-optimization="vector-size=8" %s | FileCheck %s
+
+// CHECK: #map = affine_map<(d0) -> (d0)>
+// CHECK: module {
+// CHECK: func.func @silu_tosa(%arg0: memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>) -> memref<1x40x8960xf32> {
+// CHECK:%cst = arith.constant 1.000000e+00 : f32
+// CHECK: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32>
+// CHECK: %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32>
+// CHECK: %c0 = arith.constant 0 : index
+// CHECK: %cst_1 = arith.constant 1.000000e+00 : f32
+// CHECK: %0 = vector.broadcast %cst_1 : f32 to vector<8xf32>
+// CHECK: %cst_2 = arith.constant 0.000000e+00 : f32
+// CHECK: %c0_3 = arith.constant 0 : index
+// CHECK: %dim = memref.dim %arg0, %c0_3 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>
+// CHECK: %c1 = arith.constant 1 : index
+// CHECK: %dim_4 = memref.dim %arg0, %c1 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>
+// CHECK: %c2 = arith.constant 2 : index
+// CHECK: %dim_5 = memref.dim %arg0, %c2 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>
+// CHECK: affine.for %arg1 = #map(%c0) to #map(%dim) {
+// CHECK: affine.for %arg2 = #map(%c0) to #map(%dim_4) {
+// CHECK: affine.for %arg3 = #map(%c0) to #map(%dim_5) step 8 {
+// CHECK: %1 = vector.transfer_read %arg0[%arg1, %arg2, %arg3], %cst_2 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>, vector<8xf32>
+// CHECK: %2 = arith.negf %1 : vector<8xf32>
+// CHECK: %3 = math.exp %2 : vector<8xf32>
+// CHECK: %4 = arith.addf %0, %3 : vector<8xf32>
+// CHECK: %5 = arith.divf %0, %4 : vector<8xf32>
+// CHECK: %6 = arith.mulf %1, %5 : vector<8xf32>
+// CHECK: vector.transfer_write %6, %alloc_0[%arg1, %arg2, %arg3] : vector<8xf32>, memref<1x40x8960xf32>
+// CHECK: }
+// CHECK: }
+// CHECK: }
+// CHECK: return %alloc_0 : memref<1x40x8960xf32>
+// CHECK: }
+// CHECK: }
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @silu_tosa(%arg0: memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>) -> memref<1x40x8960xf32> {
+  %cst = arith.constant 1.000000e+00 : f32
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32>
+  linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<1x40x8960xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %3 = arith.negf %in : f32
+    %4 = math.exp %3 : f32
+    %5 = arith.addf %4, %cst : f32
+    %6 = arith.divf %cst, %5 : f32
+    linalg.yield %6 : f32
+  }
+  %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x40x8960xf32>
+  linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0, %alloc : memref<1x40x8960xf32, strided<[?, ?, ?], offset: ?>>, memref<1x40x8960xf32>) outs(%alloc_0 : memref<1x40x8960xf32>) {
+  ^bb0(%in: f32, %in_1: f32, %out: f32):
+    %3 = arith.mulf %in, %in_1 : f32
+    linalg.yield %3 : f32
+  }
+  return %alloc_0 : memref<1x40x8960xf32>
+}
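Under lit, the %s in the RUN line above expands to the path of this test file, so the check can also be driven by hand. A minimal sketch, with <silu-test>.mlir standing in for the test's path (the filename is not shown in this diff view):

    buddy-opt -silu-optimization="vector-size=8" <silu-test>.mlir | FileCheck <silu-test>.mlir

FileCheck then verifies that the pass rewrites the two linalg.generic ops into the affine loop nest of 8-wide vector ops listed in the CHECK lines.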
