Commit 078fb23

Qualcomm AI Engine Direct - GA Albert, Bert, Distilbert, Eurobert (#11546)
### Summary
This PR consists of 4 encoder-only models. The following stats are based on SM8750.

1. Albert (16a16w)
   - Accuracy: ~22% (NOTE: nn.Module accuracy is around 24%, so the similarity between QNN and nn.Module is around 92%)
   - Speed: 11ms/inf
   - Script: `python examples/qualcomm/oss_scripts/albert.py -b build-android -s $DEVICE -m SM8750 --dataset ../wikipedia-sentences/wikisent2.txt`
2. Bert (16a8w)
   - Accuracy: ~60%
   - Speed: 9ms/inf
   - Script: `python examples/qualcomm/oss_scripts/bert.py -b build-android -s $DEVICE -m SM8750 --dataset ../wikipedia-sentences/wikisent2.txt`
3. Distilbert (16a8w)
   - Accuracy: ~59%
   - Speed: 8ms/inf
   - Script: `python examples/qualcomm/oss_scripts/distilbert.py -b build-android -s $DEVICE -m SM8750 --dataset ../wikipedia-sentences/wikisent2.txt`
4. Eurobert (16a16w)
   - Accuracy: ~54%
   - Speed: 40ms/inf
   - Script: `python examples/qualcomm/oss_scripts/eurobert.py -b build-android -s $DEVICE -m SM8750 --dataset ../wikipedia-sentences/wikisent2.txt`

### Test plan
- E2E scripts under `test_qnn_delegate.py`
- Example: `python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleOssScript.test_{BERT_MODEL} --model SM8750 -s $DEVICE --build_folder build-android/ -r ./ -a ./test --sentence_dataset ../wikipedia-sentences/wikisent2.txt`
- Mainline CI

Author: @haowhsu-quic, @chunit-quic, @winskuo-quic
1 parent 0286927 commit 078fb23

File tree

14 files changed: +1007 −39 lines changed

.ci/scripts/test_model.sh

Lines changed: 27 additions & 1 deletion
@@ -188,6 +188,14 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
     pip install piq
+  elif [[ "${MODEL_NAME}" == "albert" ]]; then
+    EXPORT_SCRIPT=albert
+  elif [[ "${MODEL_NAME}" == "bert" ]]; then
+    EXPORT_SCRIPT=bert
+  elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
+    EXPORT_SCRIPT=distilbert
+  elif [[ "${MODEL_NAME}" == "eurobert" ]]; then
+    EXPORT_SCRIPT=eurobert
   else
     echo "Unsupported model $MODEL_NAME"
     exit 1
@@ -197,7 +205,25 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450

-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+  SCRIPT_FOLDER=""
+  case "${MODEL_NAME}" in
+    "dl3"|"mv3"|"mv2"|"ic4"|"ic3"|"vit"|"mb"|"w2l")
+      SCRIPT_FOLDER=scripts
+      ;;
+    "albert"|"bert"|"distilbert")
+      pip install evaluate
+      SCRIPT_FOLDER=oss_scripts
+      # Bert models running in 16bit will encounter op validation fail on some operations,
+      # which requires CHIPSET >= SM8550.
+      QNN_CHIPSET=SM8550
+      ;;
+    *)
+      echo "Unsupported model $MODEL_NAME"
+      exit 1
+      ;;
+  esac
+
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }

.github/workflows/trunk.yml

Lines changed: 26 additions & 0 deletions
@@ -480,6 +480,32 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"

+  test-qnn-optimum-model:
+    name: test-qnn-optimum-model
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        dtype: [fp32]
+        model: [albert, bert, distilbert] # eurobert requires transfomer >= 4.48.0, skip for now
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
+
   test-apple-model:
     name: test-apple-model
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main

backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@
 from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
 from .decompose_roll import DecomposeRoll
 from .decompose_silu import DecomposeSilu
+from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fixed_linear_keep_dim import FixedLinearKeepDim
 from .fold_qdq import FoldQDQ
@@ -56,6 +57,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
backends/qualcomm/_passes/decompose_wrap_with_autocast.py

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import _operator
+from typing import Dict, Tuple
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_nn_module_stack
+
+
+class DecomposeWrapWithAutocast(ExportPass):
+    """
+    Decompose the _higher_order_ops WrapWithAutocast
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _get_submod(
+        self, gm: torch.fx.GraphModule, node: torch.fx.Node
+    ) -> Tuple[torch.fx.GraphModule, str]:
+        for a in node.args:
+            if isinstance(a, torch.fx.Node) and "submod" in a.target:
+                return getattr(gm, a.target), a.target
+
+    def _replace_output(
+        self, wwac_node: torch.fx.Node, output_node: torch.fx.Node, remap: Dict
+    ):
+        for user in wwac_node.users.copy():
+            arg_idx = 0
+            is_user_getitem = False
+
+            if user.target == _operator.getitem:
+                arg_idx = user.args[1]
+                is_user_getitem = True
+
+            user.replace_input_with(
+                wwac_node,
+                remap[output_node.args[0][arg_idx]],
+            )
+
+            if is_user_getitem:
+                for user_user in user.users.copy():
+                    user_user.replace_input_with(user, user.args[0])
+
+    def _replace(self, gm: torch.fx.GraphModule) -> None:
+        graph = gm.graph
+        for node in graph.nodes:
+            if isinstance(node.target, torch._higher_order_ops.wrap.WrapWithAutocast):
+                submod, submod_name = self._get_submod(gm, node)
+                n_args = node.args
+                input_submod = n_args[4]
+                decomposed_module = submod
+                with graph.inserting_before(node):
+                    # remap is used to map original node values to new node values,
+                    # which ensures that reference to nodes are correctly updated in the new graph
+                    # remap = {"expand_1": node.args[5], "to_4": node.args[6]}
+                    remap = {n_args[i].name: n_args[i] for i in range(5, len(n_args))}
+
+                    for decomposed_node in decomposed_module.graph.nodes:
+                        copy_nn_module_stack(node, decomposed_node)
+                        # no need to copy existent 'output'
+                        if decomposed_node.op == "output":
+                            self._replace_output(node, decomposed_node, remap)
+                        # no need to copy existent placeholders
+                        elif decomposed_node.op == "placeholder":
+                            # replace node map from string to graph node
+                            remap[decomposed_node] = remap.pop(decomposed_node.name)
+                        else:
+                            remap[decomposed_node] = graph.node_copy(
+                                decomposed_node,
+                                arg_transform=lambda x, remap=remap: remap[x],
+                            )
+
+                graph.erase_node(node)
+
+        graph.erase_node(input_submod)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        self._replace(graph_module)
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
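For context, a minimal sketch of the graph shape this pass targets: on recent PyTorch, `torch.export` captures an `autocast` region as a `wrap_with_autocast` higher-order op whose body lives in a hidden submodule, and the pass inlines that body back into the parent graph. The module name, shapes, and `device_type` below are illustrative assumptions, not taken from this PR:

```python
import torch

from executorch.backends.qualcomm._passes import DecomposeWrapWithAutocast


class MixedPrecisionBlock(torch.nn.Module):
    def forward(self, x):
        # torch.export keeps this region as a wrap_with_autocast
        # higher-order op that calls into a submodule
        with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
            return torch.matmul(x, x)


ep = torch.export.export(MixedPrecisionBlock(), (torch.randn(4, 4),))
# Running the pass copies the submodule's nodes in front of the HOP call,
# rewires its users, then erases the wrap_with_autocast node itself.
result = DecomposeWrapWithAutocast()(ep.graph_module)
print(result.graph_module.graph)
```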

backends/qualcomm/_passes/qnn_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -24,6 +24,7 @@
     DecomposeLinalgVectorNorm,
     DecomposeRoll,
     DecomposeSilu,
+    DecomposeWrapWithAutocast,
     ExpandBroadcastTensorShape,
     FixedLinearKeepDim,
     FoldQDQ,
@@ -194,6 +195,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeSilu())
+        self.add_pass(DecomposeWrapWithAutocast())
         self.add_pass(DecomposeEinsum())
         self.add_pass(DecomposeExpM1())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
@@ -207,6 +209,7 @@ def transform_for_export_pipeline(self, exported_program: ExportedProgram):
         self.add_pass(DecomposeRoll())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        self.add_pass(DecomposeWrapWithAutocast())
         # this pass will rewrite state_dict, it needs to be accomplished before
         # to_edge_transform_and_lower
         self.add_pass(ConvertConv1dToConv2d(exported_program))

backends/qualcomm/_passes/remove_redundancy.py

Lines changed: 11 additions & 13 deletions
@@ -43,6 +43,8 @@ def _dim_order_op_condition(self, node):
         dim_order = node.kwargs.get("dim_order")
         # skip if there contains layout hint
         # e.g. (0, 2, 3, 1) != (0, 1, 2, 3)
+        if node.meta["val"].dtype != node.args[0].meta["val"].dtype:
+            return False
         return dim_order != list(range(len(dim_order)))

     def _to_copy_op_condition(self, node):
@@ -53,19 +55,15 @@ def _default_condition(self, ndoe):

     def _remove(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
         for n in graph_module.graph.nodes:
-            if n.target not in self.redundant_ops or not self.redundant_ops[n.target](
-                n
-            ):
-                continue
-
-            to_be_remove = n
-            # assert_tensor_metadata op has no user
-            if len(n.users.keys()) == 0:
-                n.args = ()
-            # normal case
-            for user_n in list(n.users.keys()):
-                user_n.replace_input_with(n, n.args[0])
-            graph_module.graph.erase_node(to_be_remove)
+            if n.target in self.redundant_ops and self.redundant_ops[n.target](n):
+                to_be_remove = n
+                # assert_tensor_metadata op has no user
+                if len(n.users.keys()) == 0:
+                    n.args = ()
+                # normal case
+                for user_n in list(n.users.keys()):
+                    user_n.replace_input_with(n, n.args[0])
+                graph_module.graph.erase_node(to_be_remove)

     def call(self, graph_module: torch.fx.GraphModule):
         self._remove(graph_module)
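The added dtype check closes a hole where a `_to_dim_order_copy` that both permutes layout and casts dtype could be stripped as redundant, silently dropping the cast. A minimal sketch of the updated predicate, using `SimpleNamespace` stand-ins for real fx nodes (hypothetical shapes and dtypes):

```python
from types import SimpleNamespace

import torch


def dim_order_op_condition(node):
    # mirrors the updated predicate: True means "redundant, remove the copy"
    dim_order = node.kwargs.get("dim_order")
    # a copy that also casts dtype carries real work and must be kept
    if node.meta["val"].dtype != node.args[0].meta["val"].dtype:
        return False
    return dim_order != list(range(len(dim_order)))


src = SimpleNamespace(meta={"val": torch.empty(1, 3, 2, 2, dtype=torch.float32)})
copy = SimpleNamespace(
    args=(src,),
    kwargs={"dim_order": [0, 2, 3, 1]},  # layout hint alone -> removable
    meta={"val": torch.empty(1, 2, 2, 3, dtype=torch.float16)},  # but dtype differs
)
print(dim_order_op_condition(copy))  # False -> the cast-carrying copy survives
```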

backends/qualcomm/_passes/replace_inf_values.py

Lines changed: 20 additions & 2 deletions
@@ -9,13 +9,13 @@

 class ReplaceInfValues(ExportPass):
     """
-    Due to limitation in Qnn, we need to change inf or -inf to arbitrary value in quantization.
+    Due to limitation in QNN, change inf or -inf to arbitrary value in quantization.
     """

     def __init__(self):
         super(ReplaceInfValues, self).__init__()

-    def call(self, graph_module: torch.fx.GraphModule):
+    def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
         for buf_name, tensor in graph_module.named_buffers():
             if tensor.is_floating_point():
                 # 255 here is mainly for attention_mask in Llama for reasonable quant scale
@@ -38,5 +38,23 @@ def call(self, graph_module: torch.fx.GraphModule):
                     arg_list[2] = -255
                 node.args = tuple(arg_list)

+            if node.target in [
+                torch.ops.aten.masked_fill.Tensor,
+                torch.ops.aten.masked_fill.Scalar,
+            ]:
+                assert (
+                    len(node.args) == 3
+                ), f"Expecting {node.name} to have 3 arguments."
+                val = node.args[2]
+                if node.args[2] > torch.finfo(torch.float16).max:
+                    val = 255
+                elif node.args[2] < torch.finfo(torch.float16).min:
+                    val = -255
+                node.args = (
+                    node.args[0],
+                    node.args[1],
+                    val,
+                )
+
         graph_module.recompile()
         return PassResult(graph_module, True)
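The new branch covers `masked_fill`, whose fill value is commonly `float("-inf")` in attention masking; an infinite value yields no usable quantization scale, so anything outside the float16 range is clamped to ±255. A minimal sketch of the pattern it rewrites, assuming export keeps `masked_fill` as an aten op (hypothetical module, not from this PR):

```python
import torch

from executorch.backends.qualcomm._passes.replace_inf_values import ReplaceInfValues


class MaskedScores(torch.nn.Module):
    def forward(self, scores, mask):
        # float("-inf") is outside the float16 range; after ReplaceInfValues
        # the fill value in the captured graph becomes -255
        return torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)


ep = torch.export.export(
    MaskedScores(),
    (torch.randn(1, 8, 8), torch.zeros(1, 8, 8, dtype=torch.bool)),
)
result = ReplaceInfValues()(ep.graph_module)  # rewrites the -inf fill to -255
```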

backends/qualcomm/quantizer/annotators.py

Lines changed: 1 addition & 6 deletions
@@ -462,7 +462,7 @@ def annotate_hardtanh(node: Node, quantization_config: QuantizationConfig) -> None
     annotate_single_in_single_out(node, quantization_config)


-@register_annotator([torch.ops.aten.mean.default])
+@register_annotator([torch.ops.aten.mean.default, torch.ops.aten.mean.dim])
 def annotate_mean(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

@@ -604,11 +604,6 @@ def annotate_select(node: Node, quantization_config: QuantizationConfig) -> None
     annotate_single_in_single_out(node, quantization_config)


-@register_annotator([torch.ops.aten.mean.dim])
-def annotate_mean_dim(node: Node, quantization_config: QuantizationConfig) -> None:
-    annotate_single_in_single_out(node, quantization_config)
-
-
 @register_annotator([torch.ops.aten.slice.Tensor])
 def annotate_slice(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 29 additions & 0 deletions
@@ -26,6 +26,35 @@
 )


+def annotate_eurobert(gm: torch.fx.GraphModule):
+    """
+    QNN does not support int32 -> signed 16bit quant
+    We need to first annotate this to_fp node as 8bit quant, so it will perform requantize
+    Final graph should look like: int32 -> convert -> cast -> matmul.args[1]
+
+    """
+    quantization_config_8a8w = get_8a8w_qnn_ptq_config()
+    for node in gm.graph.nodes:
+        # A little tricky here. This matmul node is wrapped inside a submodule after 1st torch.export.
+        # There are actually 2 'to' op that is redundant.
+        # It will look like: int64 -> to_fp -> to_fp -> matmul.args[1]
+        # Draw out the graph after the 1st export will help visualize the submodule.
+
+        if node.target == torch.ops.aten.matmul.default and node.args[1].args[0].args[
+            0
+        ].meta["val"].dtype in [torch.int64, torch.int32]:
+            to_node = node.args[1]
+            input_qspec_map = {}
+            assert isinstance(to_node, Node)
+            input_spec = quantization_config_8a8w.input_activation
+            input_qspec_map[to_node] = input_spec
+            to_node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                output_qspec=quantization_config_8a8w.output_activation,
+                _annotated=True,
+            )
+
+
 def annotate_mimi_decoder(gm: torch.fx.GraphModule):
     """
     The 1st transpose conv in mimi decoder is really sensitive to scale/offset in 16a8w, which causes execution failure.
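Custom annotations like `annotate_eurobert` are registered on the quantizer before capture. A minimal sketch, assuming the `QnnQuantizer.add_custom_quant_annotations` hook used by the Qualcomm example scripts:

```python
from executorch.backends.qualcomm.quantizer.custom_annotation import annotate_eurobert
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

quantizer = QnnQuantizer()
# register the EuroBert-specific annotation alongside the built-in annotators,
# so the int32 'to' feeding matmul picks up an 8-bit spec and gets requantized
quantizer.add_custom_quant_annotations((annotate_eurobert,))
```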
