
Commit 561bcc3

[frontend] Implement SiLU fusion with mul + sigmoid pattern
1 parent 5c8c88f commit 561bcc3
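
For context: SiLU (also called swish) is defined as silu(x) = x * sigmoid(x), and this commit teaches the frontend to recognize the explicit mul + sigmoid spelling of that identity and fuse it. A minimal PyTorch sketch of the equivalence the pattern matcher relies on (plain torch calls, not code from this commit):

import torch

x = torch.randn(4, 4)

# The explicit pattern this commit matches: mul(x, sigmoid(x)).
pattern_out = torch.mul(x, torch.sigmoid(x))

# The built-in op it is numerically equivalent to.
reference_out = torch.nn.functional.silu(x)

assert torch.allclose(pattern_out, reference_out)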

3 files changed: +104 −14 lines changed


frontend/Python/graph/transform/fuse_ops.py

Lines changed: 59 additions & 4 deletions
@@ -23,7 +23,8 @@
 from .. import DeviceType
 from torch.fx.immutable_collections import immutable_list
 
-classicfuse_register = {"transpose_matmul_fusion": TransposeMatmulFusedOp}
+classicfuse_register = {"transpose_matmul_fusion": TransposeMatmulFusedOp,
+                        "silu_fusion": SiluOp}
 
 # TODO: classify op type for op fusion
 # OP_TYPE_FUSABLE = [OpType.BroadcastType, OpType.ElementwiseType, OpType.ReshapeType]
@@ -52,10 +53,25 @@ def classic_fuse_check(graph: Graph):
                     1
                 ] == immutable_list([1, 0]):
                     pattern = target, parentop, "transpose_matmul_fusion"
+        elif isinstance(op, MulOp):
+            # Check for mul + sigmoid fusion pattern: mul(x, sigmoid(x))
+            parentop = [graph.node_table[str(i)] for i in op._parents]
+            for target in parentop:
+                if isinstance(target, SigmoidOp):
+                    # Check if the sigmoid input is also an input to the mul operation
+                    sigmoid_input = target._parents[0] if target._parents else None
+                    if sigmoid_input and sigmoid_input in op._parents:
+                        pattern = target, parentop, "silu_fusion"
+                        break
         if pattern:
-            transpose_matmul_fusion(
-                graph, op, pattern[0], pattern[1], pattern[2]
-            )
+            if pattern[2] == "transpose_matmul_fusion":
+                transpose_matmul_fusion(
+                    graph, op, pattern[0], pattern[1], pattern[2]
+                )
+            elif pattern[2] == "silu_fusion":
+                silu_fusion(
+                    graph, op, pattern[0], pattern[1], pattern[2]
+                )
 
 
 def transpose_matmul_fusion(
@@ -91,6 +107,44 @@ def transpose_matmul_fusion(
     graph.delete_node(target, targets_parent)
 
 
+def silu_fusion(
+    graph: Graph, node, target: Op, parents: List[Op], pattern: str
+):
+    """
+    Function to fuse mul and sigmoid operations into one operation.
+    Such as mul(x, sigmoid(x)) -> fused_mul_sigmoid(x)
+
+    Args:
+    - graph (Graph): The input graph to be simplified.
+    - node (Op): The mul operation to be fused.
+    - target (Op): The sigmoid operation to be fused.
+    - parents (List[Op]): The parents of the node to be fused.
+    - pattern (str): The pattern of the fusion.
+    Returns:
+    - None: Modifies the input graph in place.
+    """
+    fused_op = classicfuse_register.get(pattern)()
+    # mulop -> fusedmulopnode
+    fused_op.name = "fused" + node.name
+    graph.displace_node(node, fused_op)
+    fused_op.args.pop(fused_op.args.index(target.name))
+    fused_op._parents.pop(fused_op._parents.index(target.name))
+    fused_op.args.extend(target.args)
+
+    fused_op._parents.extend(target._parents)
+
+    fused_op.args[:] = list(set(fused_op.args))
+    fused_op._parents[:] = list(set(fused_op._parents))
+
+    targets_parent = [graph.node_table[i] for i in target._parents]
+    for i in targets_parent:
+        i.add_children(fused_op.name)
+    target._children.pop(target._children.index(fused_op.name))
+
+    if graph.check_delete_node(target):
+        graph.delete_node(target, targets_parent)
+
+
 def apply_classic_fusion(graph: Graph):
     """
     Function to fuse some typical operations into one operation and fuse
@@ -134,3 +188,4 @@ def simply_fuse(graph: Graph):
     graph.op_groups = {}
     graph.op_groups["subgraph0"] = new_op_group
     graph.group_map_device = {"subgraph0": device}
+
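
The detection added to classic_fuse_check above walks each MulOp's parents and looks for a SigmoidOp whose own input also feeds the mul. A self-contained toy sketch of the same matching idea (the Node class and node list below are illustrative stand-ins, not buddy's Graph API):

from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class Node:
    name: str
    op: str                       # e.g. "mul", "sigmoid", "input"
    parents: List[str] = field(default_factory=list)

def find_silu_patterns(nodes: List[Node]) -> List[Tuple[str, str]]:
    """Return (mul, sigmoid) name pairs where mul(x, sigmoid(x)) holds."""
    table = {n.name: n for n in nodes}
    matches = []
    for n in nodes:
        if n.op != "mul":
            continue
        for parent_name in n.parents:
            cand = table[parent_name]
            # The sigmoid's input must also be a direct input of the mul.
            if cand.op == "sigmoid" and cand.parents and cand.parents[0] in n.parents:
                matches.append((n.name, cand.name))
                break
    return matches

toy_graph = [
    Node("x", "input"),
    Node("sig", "sigmoid", ["x"]),
    Node("out", "mul", ["x", "sig"]),
]
print(find_silu_patterns(toy_graph))  # [('out', 'sig')]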

frontend/Python/ops/tosa.py

Lines changed: 22 additions & 0 deletions
@@ -64,6 +64,7 @@
     ArgMaxOp,
     ScaledDotProductFlashAttentionForCpuOp,
     MatmulOp,
+    SiluOp,
 )
 from .utils import *

@@ -1449,6 +1450,26 @@ def sigmoid_op(node: SigmoidOp, symbol_table):
     return op
 
 
+def silu_op(node: SiluOp, symbol_table):
+    """
+    Import the buddy SiluOp.
+    Implements SiLU fusion: x * sigmoid(x) using tosa.sigmoid and tosa.mul.
+    """
+    input_tensor = symbol_table.get((str(node.args[0]), 0))
+    if input_tensor is None:
+        return
+
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+
+    sigmoid_op = tosa.SigmoidOp(tensor_type, input_tensor)
+    mul_op = tosa.MulOp(tensor_type, input_tensor, sigmoid_op.result)
+
+    return mul_op
+
+
 def reciprocal_op(node: ReciprocalOp, symbol_table):
     """
     Import the buddy ReciprocalOp.
@@ -1859,6 +1880,7 @@ def scaled_dot_product_flash_attention_for_cpu_op(
     "ReluOp": relu_op,
     "IotaOp": iota_op,
     "SigmoidOp": sigmoid_op,
+    "SiLUOp": silu_op,
     "ReciprocalOp": reciprocal_op,
     "MeanOp": mean_op,
     "ClampMinOp": clamp_min_op,

tests/Python/test_silu.py

Lines changed: 23 additions & 10 deletions
@@ -7,29 +7,42 @@
 
 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import linalg
+from buddy.compiler.graph.transform import simply_fuse, apply_classic_fusion
 
+def silu_pattern(x):
+    sigmoid_x = torch.sigmoid(x)
+    return torch.mul(x, sigmoid_x)
 
 def foo(x):
-    return torch.nn.functional.silu(x)
+    return silu_pattern(x)
 
-
-in1 = torch.ones([13, 13], dtype=torch.float32)
+x = torch.ones([4, 4], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
     primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )
 
-graphs = dynamo_compiler.importer(foo, in1)
+graphs = dynamo_compiler.importer(foo, x)
 assert len(graphs) == 1
 graph = graphs[0]
+pattern_list = [apply_classic_fusion]
+graphs[0].fuse_ops(pattern_list)
+
 graph.lower_to_top_level_ir()
 print(graph._imported_module)
 
-# CHECK: module {
+# CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: return %{{.*}}
-# CHECK: }
-# CHECK: }
+# CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<4x4xf32>
+# CHECK: %[[RES:.*]] = linalg.generic {.*} ins(%arg0 : tensor<4x4xf32>) outs(%[[EMPTY]] : tensor<4x4xf32>) {
+# CHECK: ^bb0(%in: f32, %out: f32):
+# CHECK: %[[NEG:.*]] = arith.negf %in : f32
+# CHECK: %[[EXP:.*]] = math.exp %[[NEG]] : f32
+# CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32
+# CHECK: %[[ADD:.*]] = arith.addf %[[EXP]], %[[ONE]] : f32
+# CHECK: %[[DIV:.*]] = arith.divf %in, %[[ADD]] : f32
+# CHECK: linalg.yield %[[DIV]] : f32
+# CHECK: } -> tensor<4x4xf32>
+# CHECK: return %[[RES]] : tensor<4x4xf32>
+
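
The updated test constructs the mul + sigmoid pattern directly, applies apply_classic_fusion, and its CHECK lines pin the lowered linalg.generic body to negf / exp / add 1 / div, i.e. x / (1 + exp(-x)). A quick eager-mode sanity check of that arithmetic against torch's built-in silu (independent of the compiler pipeline):

import torch

x = torch.ones(4, 4, dtype=torch.float32)

# The arithmetic the CHECK lines describe: x / (1 + exp(-x)).
lowered_form = x / (1.0 + torch.exp(-x))

assert torch.allclose(lowered_form, torch.nn.functional.silu(x))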

0 commit comments