[rewriter | torchlib] respect ops order in torchscript graph (microsoft#2134)

titaiwangms · bmehta001 · commit 6024d7c83797 · 2025-04-11T17:31:50.000Z
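Reorder the final two Mul ops in the GELU implementations and in the GELU fusion pattern so that 0.5 is multiplied into the inner term first and the input is multiplied last, i.e. x * (0.5 * inner) instead of 0.5 * (x * inner). This helps us match the optimization pattern in https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/fusion_fastgelu.py. Ref: microsoft#2132 (comment)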
diff --git a/onnxscript/function_libs/torch_lib/ops/nn.py b/onnxscript/function_libs/torch_lib/ops/nn.py
@@ -487,8 +487,8 @@ def _aten_gelu_approximate_none(self: TReal) -> TReal:
     inner = op.Div(self, 1.4142135623730951)
     erf = op.Erf(inner)
     inner = op.Add(erf, 1)
-    inner = op.Mul(self, inner)
-    result = op.Mul(0.5, inner)
+    inner = op.Mul(0.5, inner)
+    result = op.Mul(self, inner)
     return result
 
 
@@ -505,8 +505,8 @@ def _aten_gelu_approximate_tanh(self: TReal) -> TReal:
     inner = op.Mul(op.Sqrt(two_over_pi), inner)
     inner = op.Tanh(inner)
     inner = op.Add(inner, 1)
-    inner = op.Mul(self, inner)
-    result = op.Mul(0.5, inner)
+    inner = op.Mul(0.5, inner)
+    result = op.Mul(self, inner)
     return result
 
 
diff --git a/onnxscript/rewriter/ort_fusions/gelu.py b/onnxscript/rewriter/ort_fusions/gelu.py
@@ -20,8 +20,8 @@ def pattern(self, op, x):
         t4 = op.Mul(_sqrt_two_over_pi, t3)
         t5 = op.Tanh(t4)
         t6 = op.Add(t5, 1)
-        t7 = op.Mul(x, t6)
-        result = op.Mul(0.5, t7)
+        t7 = op.Mul(0.5, t6)
+        result = op.Mul(x, t7)
         return result
 
     def rewrite(self, op, x):
diff --git a/onnxscript/rewriter/ort_fusions/gelu_test.py b/onnxscript/rewriter/ort_fusions/gelu_test.py
@@ -28,8 +28,8 @@ def gelu_model(x):
             t4 = op.Mul(_sqrt_two_over_pi, t3)
             t5 = op.Tanh(t4)
             t6 = op.Add(t5, 1)
-            t7 = op.Mul(x, t6)
-            result = op.Mul(0.5, t7)
+            t7 = op.Mul(0.5, t6)
+            result = op.Mul(x, t7)
             return result
 
         model_proto = gelu_model.to_model_proto(