From 94d970bc69ad5717bf19fc579d30e2ff8d33e58a Mon Sep 17 00:00:00 2001
From: shewu <shewu@qti.qualcomm.com>
Date: Tue, 7 Apr 2026 15:32:52 +0800
Subject: [PATCH 1/3] Qualcomm AI Engine Direct - Add claude skill for qualcomm
 Summary: - Add essential functionality in qualcomm skill, such as build,
 test, and add new op   - /qualcomm build with /path/to/qairt/2.42.0.251225/  
 - /qualcomm test test_qnn_backend_channel_shuffle in fp and quantized - Use
 qualcomm skill to develop new op "ChannelShuffle" and add test case

---
 .claude/skills/qualcomm/SKILL.md              |  98 +++++
 .claude/skills/qualcomm/lowering_export.md    | 140 +++++++
 .claude/skills/qualcomm/model_enablement.md   | 107 ++++++
 .claude/skills/qualcomm/new_op_development.md | 358 ++++++++++++++++++
 backends/qualcomm/builders/README.md          |   6 +-
 backends/qualcomm/builders/__init__.py        |   2 +
 .../qualcomm/builders/op_channel_shuffle.py   |  73 ++++
 backends/qualcomm/builders/qnn_constants.py   |   7 +
 backends/qualcomm/partition/utils.py          |   1 +
 .../quantizer/annotators/htp_rules.py         |  11 +
 backends/qualcomm/tests/models.py             |   9 +
 backends/qualcomm/tests/test_qnn_delegate.py  |  11 +
 12 files changed, 820 insertions(+), 3 deletions(-)
 create mode 100644 .claude/skills/qualcomm/SKILL.md
 create mode 100644 .claude/skills/qualcomm/lowering_export.md
 create mode 100644 .claude/skills/qualcomm/model_enablement.md
 create mode 100644 .claude/skills/qualcomm/new_op_development.md
 create mode 100644 backends/qualcomm/builders/op_channel_shuffle.py

diff --git a/.claude/skills/qualcomm/SKILL.md b/.claude/skills/qualcomm/SKILL.md
new file mode 100644
index 00000000000..bb0c5f017e7
--- /dev/null
+++ b/.claude/skills/qualcomm/SKILL.md
@@ -0,0 +1,98 @@
+---
+name: qualcomm
+description: Build, test, or develop the QNN (Qualcomm AI Engine Direct) backend. Use when working on backends/qualcomm/, building QNN (use backends/qualcomm/scripts/build.sh), adding new ops or passes, running QNN delegate
+  tests, or exporting models for Qualcomm HTP/GPU targets.
+---
+
+# QNN (Qualcomm AI Engine Direct) Backend
+
+## Advanced Topics
+
+When the user's request falls into one of these areas, read the corresponding file before proceeding:
+
+| Topic | File | When to read |
+|---|---|---|
+| Export / lowering / quantization options / pass pipelines | `lowering_export.md` | User asks about exporting, lowering, quantization config, QuantDtype, QuantRecipe, pass pipelines |
+| New op development | `new_op_development.md` | User asks to add/implement a new op or op builder |
+| Model enablement | `model_enablement.md` | User asks to enable a new model end-to-end |
+| Profiling & debugging | `profiling.md` | User asks about profiling, optrace, QHAS, QAIRT Visualizer *(file TBD)* |
+
+## Building
+
+Use `backends/qualcomm/scripts/build.sh`. Linux only (macOS not supported).
+
+**Environment variables:**
+- `QNN_SDK_ROOT` — path to QNN SDK (auto-downloaded if not set)
+- `ANDROID_NDK_ROOT` — path to Android NDK (auto-downloaded if not set)
+
+**Build targets:**
+
+| Target | Default | Build dir |
+|---|---|---|
+| x86_64 (Python interface + host tools) | enabled | `build-x86/` |
+| Android arm64-v8a (device runner) | enabled | `build-android/` |
+| Hexagon DSP (direct mode) | disabled | `build-hexagon/` |
+| OE Linux embedded | disabled | `build-oe-linux/` |
+
+**Common build commands:**
+
+```bash
+# Full build (x86_64 + Android)
+./backends/qualcomm/scripts/build.sh
+
+# x86_64 only (faster, for Python interface development)
+./backends/qualcomm/scripts/build.sh --skip_linux_android
+
+# Android only (skip x86_64)
+./backends/qualcomm/scripts/build.sh --skip_x86_64
+
+# Incremental build (skip clean)
+./backends/qualcomm/scripts/build.sh --no_clean
+
+# Enable Hexagon DSP direct mode (requires HEXAGON_SDK_ROOT, HEXAGON_TOOLS_ROOT, DSP_VERSION)
+./backends/qualcomm/scripts/build.sh --enable_hexagon
+
+# OE Linux embedded target (requires TOOLCHAIN_ROOT_HOST, TOOLCHAIN_ROOT_TARGET)
+./backends/qualcomm/scripts/build.sh --enable_linux_embedded
+
+# Release build
+./backends/qualcomm/scripts/build.sh --release
+
+# Control parallelism
+./backends/qualcomm/scripts/build.sh --job_number 8
+```
+
+**After x86_64 build**, the Python interface `.so` files are copied to `backends/qualcomm/python/` automatically.
+
+## Testing
+
+```bash
+QNN_SDK_ROOT=/path/to/qnn_sdk \
+ANDROID_NDK_ROOT=/path/to/android_ndk \
+LD_LIBRARY_PATH=/path/to/executorch/build-x86/lib:/path/to/qnn_sdk/lib/x86_64-linux-clang \
+PYTHONPATH=$(dirname $EXECUTORCH_ROOT) \
+python backends/qualcomm/tests/test_qnn_delegate.py \
+    TestQNNFloatingPointOperator.test_qnn_backend_abs \
+    -H $HOST -s $DEVICE_SERIAL -m SM8850 -b build-android -a /path/to/artifacts
+```
+
+> **Note (build from source):** Set `PYTHONPATH` to the parent directory of the executorch repo root. Required because `executorch.examples.qualcomm` lives in the source tree and is not installed into site-packages.
+
+Required flags: `-m` (SoC model), `-b` (Android build dir). Optional: `-s` (device serial), `-H` (host), `-a` (artifact dir), `-c` (compile only), `-x` (run on x86_64).
+
+**Test classes:**
+
+| Class | Description |
+|---|---|
+| `TestQNNFloatingPointOperator` | FP16 operator tests |
+| `TestQNNQuantizedOperator` | Quantized operator tests |
+| `TestQNNFloatingPointModel` | FP16 model-level tests |
+| `TestQNNQuantizedModel` | Quantized model-level tests |
+| `TestQNNFloatingPointUtils` | FP16 utility tests |
+| `TestQNNQuantizedUtils` | Quantized utility tests |
+| `TestExampleLLMScript` | LLM script tests |
+| `TestExampleMultimodalityScript` | Multimodality script tests |
+| `TestExampleOssScript` | OSS model script tests |
+| `TestExampleQaihubScript` | QAI Hub script tests |
+| `TestExampleScript` | General example script tests |
+| `TestUtilsScript` | Utility script tests |
diff --git a/.claude/skills/qualcomm/lowering_export.md b/.claude/skills/qualcomm/lowering_export.md
new file mode 100644
index 00000000000..4e9d04e7665
--- /dev/null
+++ b/.claude/skills/qualcomm/lowering_export.md
@@ -0,0 +1,140 @@
+# QNN Lowering / Export
+
+## Common Setup
+
+```python
+from executorch.backends.qualcomm.serialization.qc_schema import QnnExecuTorchBackendType
+from executorch.backends.qualcomm.utils.utils import (
+    generate_htp_compiler_spec,
+    generate_qnn_executorch_compiler_spec,
+    get_soc_to_chipset_map,
+    to_edge_transform_and_lower_to_qnn,
+)
+
+soc_model = get_soc_to_chipset_map()["SM8650"]  # adjust SoC as needed
+```
+
+---
+
+## FP16 Export
+
+```python
+backend_options = generate_htp_compiler_spec(use_fp16=True)
+compiler_specs = generate_qnn_executorch_compiler_spec(
+    soc_model=soc_model,
+    backend_options=backend_options,
+)
+edge_prog_mgr = to_edge_transform_and_lower_to_qnn(model, example_inputs, compiler_specs)
+et_program = edge_prog_mgr.to_executorch()
+```
+
+---
+
+## Quantized (PTQ) Export
+
+```python
+import torch
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
+
+# 1. Export to ATen IR
+m = torch.export.export(model.eval(), example_inputs, strict=True).module()
+
+# 2. Prepare for quantization
+quantizer = QnnQuantizer(
+    backend=QnnExecuTorchBackendType.kHtpBackend,
+    soc_model=soc_model,
+)
+m = prepare_pt2e(m, quantizer)
+
+# 3. Calibrate
+m(*example_inputs)
+
+# 4. Convert
+m = convert_pt2e(m)
+
+# 5. Lower to QNN
+backend_options = generate_htp_compiler_spec(use_fp16=False)
+compiler_specs = generate_qnn_executorch_compiler_spec(
+    soc_model=soc_model,
+    backend_options=backend_options,
+)
+edge_prog_mgr = to_edge_transform_and_lower_to_qnn(m, example_inputs, compiler_specs)
+et_program = edge_prog_mgr.to_executorch()
+```
+
+---
+
+## Quantized (QAT) Export
+
+Same as PTQ but use `prepare_qat_pt2e` and run a training loop instead of calibration:
+
+```python
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_qat_pt2e
+
+m = prepare_qat_pt2e(m, quantizer)
+# training loop
+m(*example_inputs)
+m = convert_pt2e(m)
+# ... same lowering steps as PTQ
+```
+
+---
+
+## Quantization Options
+
+| QuantDtype | Activation | Weight |
+|---|---|---|
+| `use_16a16w` | int16 | int16 |
+| `use_16a8w` | int16 | int8 |
+| `use_16a4w` | int16 | int4 |
+| `use_8a8w` | int8 | int8 |
+| `use_8a4w` | int8 | int4 |
+
+**Fine-grained control with QuantRecipe:**
+
+```python
+from executorch.backends.qualcomm.quantizer.quant_recipe import QuantRecipe, QuantGranularity
+
+recipe = QuantRecipe(quant_dtype=QuantDtype.use_8a8w, is_qat=False)
+recipe.add_node_target(targets={torch.ops.aten.linear.default}, quant_dtype=QuantDtype.use_16a8w)
+recipe.add_regex(regex={r"layers\.[0-3]\.attention"}, quant_dtype=QuantDtype.use_16a4w)
+```
+
+---
+
+## Pass Pipelines (QnnPassManager)
+
+| Pipeline | When Called | Key Passes |
+|---|---|---|
+| `transform_for_annotation_pipeline` | Before `prepare_pt2e` (called internally by `QnnQuantizer`) | RemoveRedundancy, Decompose*, Recompose*, ReplaceInfValues |
+| `transform_for_export_pipeline` | After `torch.export` | Decompose*, CanonicalizeConv, LiftConstantScalarOperands |
+| `get_to_edge_transform_passes` | Before `to_edge` | AnnotateQuantAttrs, FoldQDQ, LayoutTransform, TagQuantIO, **ResolveDebugHandle (must be last)** |
+| `transform_for_preprocess_pipeline` | Inside `QnnBackend.preprocess` | FoldQDQ(force_fold=True), InsertRequantize, InsertIOQDQ, LayoutTransform(insert_permute=True), FuseConsecutiveCast |
+
+---
+
+## Skipping Ops / Partial Delegation
+
+```python
+from executorch.backends.qualcomm.utils.utils import skip_annotation
+
+# Skip specific node targets from being delegated
+skip_annotation(model, skipped_ops={torch.ops.aten.add.Tensor})
+```
+
+---
+
+## Dumping Context Binary
+
+```python
+from executorch.backends.qualcomm.utils.utils import dump_context_from_pte
+
+dump_context_from_pte("model.pte", output_dir="./context_bins/")
+```
+
+---
+
+## SoC Reference
+
+See `_soc_info_table` in `backends/qualcomm/serialization/qc_schema.py`.
diff --git a/.claude/skills/qualcomm/model_enablement.md b/.claude/skills/qualcomm/model_enablement.md
new file mode 100644
index 00000000000..c3ce3eae45c
--- /dev/null
+++ b/.claude/skills/qualcomm/model_enablement.md
@@ -0,0 +1,107 @@
+# Model Enablement
+
+Checklist for enabling a new model end-to-end on the QNN backend.
+
+---
+
+## 1. Identify Unsupported Ops
+
+Export the model and check which ops fall back to CPU:
+
+```python
+from executorch.backends.qualcomm.utils.utils import capture_program
+
+prog = capture_program(model, example_inputs)
+for node in prog.exported_program.graph.nodes:
+    if node.op == "call_function":
+        print(node.target.__name__)
+```
+
+Or run the full lowering and inspect the partition result — nodes outside the delegate are CPU fallbacks.
+
+For each unsupported op, follow `new_op_development.md`.
+
+---
+
+## 2. Add Export Script
+
+Place the script under `examples/qualcomm/scripts/<model_name>.py`. Use `build_executorch_binary` as the standard entry point:
+
+```python
+from executorch.examples.qualcomm.utils import build_executorch_binary
+
+build_executorch_binary(
+    model=model,
+    inputs=example_inputs,
+    soc_model=args.model,
+    file_name=f"{args.artifact}/{pte_filename}",
+    dataset=calibration_data,       # None for FP16
+    quant_dtype=QuantDtype.use_8a8w, # omit for FP16
+    online_prepare=args.online_prepare,
+)
+```
+
+For models requiring custom runners, add under `examples/qualcomm/oss_scripts/`.
+
+---
+
+## 3. Verify Delegation
+
+After lowering, confirm the graph is fully delegated:
+
+```python
+from executorch.backends.qualcomm.utils.utils import draw_graph
+
+draw_graph("model_graph", prog.exported_program.graph)
+```
+
+Expected: all compute nodes inside a single `torch.ops.higher_order.executorch_call_delegate` node. Any remaining `call_function` nodes are CPU fallbacks — investigate and fix.
+
+---
+
+## 4. Add Model-Level Tests
+
+In `tests/test_qnn_delegate.py`, add to `TestQNNFloatingPointModel` and/or `TestQNNQuantizedModel`:
+
+```python
+def test_qnn_backend_my_model(self):
+    # setup model and inputs
+    module = MyModel()
+    sample_input = (torch.randn(1, 3, 224, 224),)
+    # lower and test
+    self.lower_module_and_test_output(module, sample_input)
+```
+
+For script-based tests (with artifact dependencies), add to `TestExampleScript` or `TestExampleOssScript`.
+
+---
+
+## 5. Accuracy Validation
+
+Run on device and compare outputs against CPU reference:
+
+```python
+import torch
+
+cpu_output = model(*example_inputs)
+qnn_output = ...  # placeholder: load the output tensor(s) produced by on-device execution
+
+torch.testing.assert_close(qnn_output, cpu_output, rtol=1e-2, atol=1e-2)
+```
+
+Typical tolerances:
+- FP16: `rtol=1e-2, atol=1e-2`
+- INT8 quantized: `rtol=1e-1, atol=1e-1` (accuracy depends on calibration quality)
+
+---
+
+## 6. Common Issues
+
+| Symptom | Likely Cause | Fix |
+|---|---|---|
+| Op falls back to CPU | Missing builder or annotation | Add op builder + quantizer annotation |
+| Shape mismatch after layout transform | NHWC/NCHW confusion | Check `LayoutTransform` pass, verify `get_tensor` axis order |
+| Quantization accuracy degraded | Poor calibration data | Use representative dataset; try per-channel quantization |
+| `KeyError` in `node_visitors` | Builder not registered | Check `builders/__init__.py` import |
+| Context binary compile failure | QNN op spec mismatch | Verify IO order and parameter names against `QnnOpDef.h` |
+| `online_prepare` vs offline mismatch | Context binary format | Use `--online_prepare` for QAIRT Visualizer; offline for deployment |
diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md
new file mode 100644
index 00000000000..f6ca1f47101
--- /dev/null
+++ b/.claude/skills/qualcomm/new_op_development.md
@@ -0,0 +1,358 @@
+# New Op Development
+
+Full reference: `backends/qualcomm/builders/README.md` (op builder) and `backends/qualcomm/quantizer/README.md` (quantizer annotation).
+
+## Overview
+
+Adding a new op involves three main pieces of work (broken down into Steps 1–8 below):
+1. Implement the op builder (`builders/op_*.py`)
+2. Register quantizer annotation (`quantizer/annotators/`)
+3. Add unit tests (`tests/`)
+
+**Important**: If the torch op requires **multiple QNN ops** to implement (e.g., no direct QNN equivalent), use a **decompose pass** instead of creating multiple ops in a single builder. Skip Steps 3–6 and follow the **Decompose Pass Approach** section at the bottom of this file.
+
+---
+
+## Step 1: Identify the Unsupported Op
+
+Run the model through the QNN backend. A missing op surfaces as:
+
+```
+KeyError: 'aten.native_layer_norm.default'
+```
+
+To trace back to the source PyTorch layer:
+
+```python
+from executorch.backends.qualcomm.utils.utils import capture_program
+
+prog = capture_program(MyModel(), example_inputs)
+for node in prog.exported_program.graph.nodes:
+    if node.op == "call_function" and node.target.__name__ == 'aten.native_layer_norm.default':
+        print(node.meta["source_fn_stack"])
+```
+
+---
+
+## Step 2: Check Operator Spec
+
+- **QNN side**: [Operator Definitions](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-10/MasterOpDef.html) — check IO order, required vs optional tensors, parameter names and shapes
+- **PyTorch side**: [ATen Operator Definitions](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native) — map PyTorch args to QNN IO/params
+- **Fallback search**: [Supported Ops table](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-10/SupportedOps.html)
+- **Header reference**: `$QNN_SDK_ROOT/include/QNN/QnnOpDef.h` — authoritative string literals
+
+---
+
+## Step 3: Add Op Constant
+
+In `builders/qnn_constants.py`, add a dataclass (alphabetical order):
+
+```python
+@dataclass(init=False, frozen=True)
+class OpLayerNorm:
+    op_name: str = "LayerNorm"
+    param_epsilon = "epsilon"
+    param_axes = "axes"
+```
+
+String values must exactly match `QnnOpDef.h`.
+
+---
+
+## Step 4: Implement the Builder
+
+Create `builders/op_layer_norm.py`:
+
+```python
+import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpLayerNorm, QNN_OP_PACKAGE_NAME_QTI_AISW
+from .utils import get_parameter
+
+@register_node_visitor
+class LayerNormVisitor(NodeVisitor):
+    target = ["aten.native_layer_norm.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(self, node, nodes_to_wrappers):
+        # 1. Input activation
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node, node, input_tensor,
+            PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        # 2. Weight (gamma) and bias (beta) — STATIC tensors
+        weight_node = self.get_node(node.args[2])
+        weight_tensor = get_parameter(weight_node, self.edge_program)
+        weight_tensor_wrapper = self.define_tensor(
+            weight_node, node, weight_tensor,
+            PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+        )
+
+        bias_node = self.get_node(node.args[3])
+        bias_tensor = get_parameter(bias_node, self.edge_program)
+        bias_tensor_wrapper = self.define_tensor(
+            bias_node, node, bias_tensor,
+            PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
+            nodes_to_wrappers,
+        )
+
+        # 3. Parameters
+        normalized_shapes = node.args[1]
+        if len(normalized_shapes) != 1:
+            print("QNN only supports normalized output with rank 1")
+            return
+        axes = [len(input_tensor.shape) - 1]
+        axes_shape = [len(axes)]
+        epsilon = node.args[4]
+
+        # 4. Output
+        output_tensor = self.get_tensor(node, node, 0)
+        output_tensor_wrapper = self.define_tensor(
+            node, node, output_tensor,
+            PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        # 5. Build op
+        layer_norm_op = PyQnnManager.PyQnnOpWrapper(
+            node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, OpLayerNorm.op_name,
+        )
+        layer_norm_op.AddInputTensors(
+            [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper]
+        )
+        layer_norm_op.AddOutputTensors([output_tensor_wrapper])
+        layer_norm_op.AddScalarParam(
+            OpLayerNorm.param_epsilon,
+            PyQnnManager.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
+            {QCOM_DATA: np.float32(epsilon)},
+        )
+        layer_norm_op.AddTensorParam(
+            OpLayerNorm.param_axes,
+            PyQnnManager.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            len(axes_shape), axes_shape,
+            np.array(axes, dtype=np.uint32),
+            True,
+        )
+        return layer_norm_op
+```
+
+Key notes:
+- `target` must be a list (multiple targets can share one visitor)
+- Use `QNN_TENSOR_TYPE_NATIVE` for activations, `QNN_TENSOR_TYPE_STATIC` for weights/biases
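+- `define_tensor` handles `APP_READ`/`APP_WRITE` detection internally — for activations (graph inputs/outputs included), always pass `NATIVE` rather than choosing `APP_READ`/`APP_WRITE` yourself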
+- `wrapper_idx` needed when node output is a tuple (e.g. split ops)
+- Return `None` to signal validation failure → op falls back to CPU
+
+---
+
+## Step 5: Register the Builder
+
+In `builders/__init__.py` (alphabetical order):
+
+```python
+from . import (
+    ...
+    op_layer_norm,
+    ...
+)
+__all__ = [..., op_layer_norm, ...]
+```
+
+---
+
+## Step 6: Add Quantizer Annotation
+
+In `quantizer/annotators/{backend}_rules.py`:
+
+```python
+@register_annotator(
+    [torch.ops.aten.native_layer_norm.default],
+    QnnConstants.OpLayerNorm.op_name,
+)
+class LayerNormAnnotator(GeneralOpDef):
+    @staticmethod
+    def annotate(node, quantization_config):
+        annotate_single_in_single_out(node, quantization_config)
+```
+
+- Use `qnn_op=None` for skip ops (e.g. `operator.getitem`)
+- `annotate_single_in_single_out` covers most cases; implement custom logic for multi-input ops
+
+Full annotation tutorial: `backends/qualcomm/quantizer/README.md`
+
+### Choosing the right annotate function
+
+The QNN backend validates quantization constraints via `backend_opinfo` (QNN SDK ≥ 2.41). If validation fails with:
+
+```
+ValueError: Validation failed for node <name> with target aten.<op>.default
+```
+
+Check the warning log above it — it will say which constraint failed. The most common case is `is_math_invariant=True`, which means the op does not change values (only rearranges data), so input and output **must share the same quantization parameters**.
+
+| Op type | annotate function | Example ops |
+|---------|-------------------|-------------|
+| General (input → output with new scale) | `annotate_single_in_single_out` | LayerNorm, Conv2d |
+| Pass-through (rearranges data only) | `annotate_in_out_obs_sharing_op` + fallback | Reshape, ChannelShuffle, PixelShuffle |
+| Multi-input | `annotate_binary` | Add, Mul |
+
+For **pass-through ops** (reshape, shuffle, permute — ops where `is_math_invariant=True`), override `annotate` like this:
+
+```python
+@register_annotator(
+    [torch.ops.aten.channel_shuffle.default], QnnConstants.OpChannelShuffle.op_name
+)
+class ChannelShuffle(GeneralOpDef):
+    @staticmethod
+    def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
+        annotate_in_out_obs_sharing_op(node, quantization_config)
+        if not _is_annotated([node]):
+            annotate_single_in_share_out(node, quantization_config)
+```
+
+`annotate_in_out_obs_sharing_op` shares the input's observer with the output (satisfies `is_math_invariant`). The fallback `annotate_single_in_share_out` handles the case where the input node is not yet annotated.
+
+---
+
+## Step 7: Add Unit Tests
+
+In `tests/models.py` (alphabetical order):
+
+```python
+class LayerNorm(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layer_norm = torch.nn.LayerNorm([768], eps=1e-6)
+
+    def forward(self, x):
+        return self.layer_norm(x)
+```
+
+In `tests/test_qnn_delegate.py`, add to both `TestQNNFloatingPointOperator` and `TestQNNQuantizedOperator` (alphabetical order):
+
+```python
+def test_qnn_backend_layer_norm(self):
+    module = LayerNorm()
+    sample_input = (torch.randn(196, 768),)
+    module = self.get_qdq_module(module, sample_input)  # quantized only
+    self.lower_module_and_test_output(module, sample_input)
+```
+
+Expected result: 1 delegated node, only placeholders/output nodes remain outside the delegate.
+
+---
+
+## Step 8: Prevent Decomposition (if needed)
+
+Some torch ops are in ExecuTorch's default decomposition table and will be broken into primitives **before** the QNN partitioner sees them. If QNN has a native op for it, you must explicitly skip decomposition.
+
+**Check first** using a helper script (`check_op_decomposed.py` is a developer-local script that is not shipped in this repo — point the command at your own copy):
+
+```bash
+PYTHONPATH=$(dirname $EXECUTORCH_ROOT) \
+python /path/to/check_op_decomposed.py \
+    aten.channel_shuffle.default
+```
+
+Output:
+```
+aten.channel_shuffle.default: True  (in ExecuTorch decomp table)
+```
+
+If `True`, add the op to `get_skip_decomp_table()` in `partition/utils.py` (alphabetical order):
+
+```python
+def get_skip_decomp_table() -> List[torch._ops.OperatorBase]:
+    do_not_decompose = [
+        torch.ops.aten.adaptive_avg_pool2d.default,
+        torch.ops.aten.channel_shuffle.default,   # ← add here
+        torch.ops.aten.col2im.default,
+        ...
+    ]
+```
+
+**Verification**: After adding, re-run the tests. The partitioner log should show:
+
+```
+[QNN Partitioner Op Support]: aten.channel_shuffle.default | True
+```
+
+If the op was decomposed (not in skip table), the partitioner would never see `aten.channel_shuffle.default` and the test would still pass but via decomposed primitives — not the native QNN op.
+
+---
+
+## Decompose Pass Approach (for ops without direct QNN equivalent)
+
+When a torch op has **no direct QNN equivalent** and requires multiple QNN ops to implement, use a **decompose pass** to rewrite the graph into primitive ops that QNN already supports. This is preferred over creating multiple ops in a single builder.
+
+**Reference**: `backends/qualcomm/_passes/decompose_linalg_vector_norm.py`
+
+### Pattern
+
+```python
+# 1. Define a torch.nn.Module that implements the op using supported primitives
+class MyOpDecomposed(torch.nn.Module):
+    def __init__(self, param):
+        super().__init__()
+        self.param = param
+
+    def forward(self, x):
+        # Use only ops that QNN supports
+        return torch.some_supported_op(x, self.param)
+
+
+# 2. Create the ExportPass
+class DecomposeMyOp(ExportPass):
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in list(graph.nodes):
+            if node.target == torch.ops.aten.my_op.default:
+                param = node.args[1]  # extract params from node
+                model = MyOpDecomposed(param)
+                ep = torch.export.export(model, (node.args[0].meta["val"],), strict=True)
+                decomposed_module = ep.run_decompositions().graph_module
+
+                with graph.inserting_before(node):
+                    remap = {"x": node.args[0]}
+                    merge_decomposed_graph(
+                        remap=remap,
+                        target_node=node,
+                        target_graph=graph,
+                        decomposed_graph_module=decomposed_module,
+                    )
+                    graph.erase_node(node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
+```
+
+### Registration
+
+1. Add to `_passes/__init__.py` (alphabetical order):
+   ```python
+   from .decompose_my_op import DecomposeMyOp
+   ```
+
+2. Add to `_passes/qnn_pass_manager.py` imports and both pipeline methods:
+   - `transform_for_annotation_pipeline` (before quantizer)
+   - `transform_for_export_pipeline` (before `to_edge`)
+
+3. Remove the op from `to_be_implemented_operator` in `partition/common_defs.py`
+
+### Notes
+- The decomposed module must only use ops that QNN already supports
+- `ep.run_decompositions()` lowers the exported helper module to core ATen ops (not edge IR), so only primitive ops are merged into the target graph
+- `remap` maps placeholder names in the decomposed graph to actual nodes in the target graph
+- No separate quantizer annotation needed — the decomposed ops already have their own annotations
diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md
index 01e8503dc26..b672c851ee2 100644
--- a/backends/qualcomm/builders/README.md
+++ b/backends/qualcomm/builders/README.md
@@ -369,14 +369,14 @@ Please help update following table if you are contributing new operators:
 + &#128683; = Deprecated, supported with other QNN Ops
 
 
-| Operators | HTP - 94/116 Enabled |
+| Operators | HTP - 98/119 Enabled |
 |-----------|---------|
 | Argmax | &check; |
 | Argmin | &check; |
 | BatchNorm | &check; |
 | BatchToSpace | &cross; |
 | Cast | &check; |
-| ChannelShuffle | &cross; |
+| ChannelShuffle | &check; |
 | Concat | &check; |
 | Conv2d | &check; |
 | Conv3d | &check; |
@@ -430,7 +430,7 @@ Please help update following table if you are contributing new operators:
 | ExtractPatches | &cross; |
 | FullyConnected | &check; |
 | Gather | &check; |
-| GatherElements | &cross; |
+| GatherElements | &check; |
 | GatherNd | &check; |
 | Gelu | &check; |
 | GetSparseIndices | &cross; |
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
index 14f53840dd7..a897dfa53bd 100644
--- a/backends/qualcomm/builders/__init__.py
+++ b/backends/qualcomm/builders/__init__.py
@@ -25,6 +25,7 @@
     op_bmm,
     op_cat,
     op_ceil,
+    op_channel_shuffle,
     op_clamp,
     op_conv,
     op_copy,
@@ -137,6 +138,7 @@
     op_bmm,
     op_cat,
     op_ceil,
+    op_channel_shuffle,
     op_clamp,
     op_conv,
     op_copy,
diff --git a/backends/qualcomm/builders/op_channel_shuffle.py b/backends/qualcomm/builders/op_channel_shuffle.py
new file mode 100644
index 00000000000..a51d9e6a1a2
--- /dev/null
+++ b/backends/qualcomm/builders/op_channel_shuffle.py
@@ -0,0 +1,73 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
+
+import numpy as np
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_DATA
+
+from .node_visitor import NodeVisitor
+from .node_visitor_manager import register_node_visitor
+from .qnn_constants import OpChannelShuffle, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class ChannelShuffleVisitor(NodeVisitor):
+    target = ["aten.channel_shuffle.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnManager.TensorWrapper],
+    ) -> PyQnnManager.PyQnnOpWrapper:
+        input_node = self.get_node(node.args[0])
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnManager.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        num_groups = cast(int, node.args[1])
+        # QNN ChannelShuffle operates on the channel dimension (axis=1 for NCHW)
+        axis = 1
+
+        channel_shuffle_op = PyQnnManager.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpChannelShuffle.op_name,
+        )
+        channel_shuffle_op.AddInputTensors([input_tensor_wrapper])
+        channel_shuffle_op.AddOutputTensors([output_tensor_wrapper])
+        channel_shuffle_op.AddScalarParam(
+            OpChannelShuffle.param_num_groups,
+            PyQnnManager.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {QCOM_DATA: np.uint32(num_groups)},
+        )
+        channel_shuffle_op.AddScalarParam(
+            OpChannelShuffle.param_axis,
+            PyQnnManager.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {QCOM_DATA: np.uint32(axis)},
+        )
+
+        return channel_shuffle_op
diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py
index 58037459bc8..d7ec30fddc0 100644
--- a/backends/qualcomm/builders/qnn_constants.py
+++ b/backends/qualcomm/builders/qnn_constants.py
@@ -38,6 +38,13 @@ class OpCast:
     op_name: str = "Cast"
 
 
+@dataclass(init=False, frozen=True)
+class OpChannelShuffle:
+    op_name: str = "ChannelShuffle"
+    param_num_groups: str = "num_groups"
+    param_axis: str = "axis"
+
+
 @dataclass(init=False, frozen=True)
 class OpConcat:
     op_name: str = "Concat"
diff --git a/backends/qualcomm/partition/utils.py b/backends/qualcomm/partition/utils.py
index 7c45845f516..bcfe41c2bbe 100644
--- a/backends/qualcomm/partition/utils.py
+++ b/backends/qualcomm/partition/utils.py
@@ -50,6 +50,7 @@ def filter_fn(node: torch.fx.Node) -> bool:
 def get_skip_decomp_table() -> List[torch._ops.OperatorBase]:
     do_not_decompose = [
         torch.ops.aten.adaptive_avg_pool2d.default,
+        torch.ops.aten.channel_shuffle.default,
         torch.ops.aten.col2im.default,
         torch.ops.aten.elu.default,
         torch.ops.aten.floor_divide.default,
diff --git a/backends/qualcomm/quantizer/annotators/htp_rules.py b/backends/qualcomm/quantizer/annotators/htp_rules.py
index 9cf16cb079e..ce01fceca80 100644
--- a/backends/qualcomm/quantizer/annotators/htp_rules.py
+++ b/backends/qualcomm/quantizer/annotators/htp_rules.py
@@ -281,6 +281,17 @@ class Ceil(GeneralOpDef):
     pass
 
 
+@register_annotator(
+    [torch.ops.aten.channel_shuffle.default], QnnConstants.OpChannelShuffle.op_name
+)
+class ChannelShuffle(GeneralOpDef):  # quantization annotator for channel_shuffle
+    @staticmethod
+    def annotate(node: Node, quantization_config: QuantizationConfig) -> None:
+        annotate_in_out_obs_sharing_op(node, quantization_config)  # tried first
+        if not _is_annotated([node]):  # still unannotated: use the fallback rule
+            annotate_single_in_share_out(node, quantization_config)
+
+
 @register_annotator(
     [
         torch.ops.aten.split_with_sizes.default,
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
index e0b23c5b0cc..3df9c437447 100644
--- a/backends/qualcomm/tests/models.py
+++ b/backends/qualcomm/tests/models.py
@@ -400,6 +400,15 @@ def forward(self, x):
         return torch.ceil(x)
 
 
+class ChannelShuffle(torch.nn.Module):  # thin wrapper for the delegate tests
+    def __init__(self, groups):
+        super().__init__()
+        self.channel_shuffle = torch.nn.ChannelShuffle(groups)
+
+    def forward(self, x):
+        return self.channel_shuffle(x)  # delegates straight to the torch module
+
+
 class Chunk(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index a00e421a80a..d44204ec495 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -378,6 +378,11 @@ def test_qnn_backend_cdist(self):
         )
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_channel_shuffle(self):
+        module = ChannelShuffle(2)  # noqa: F405
+        sample_input = (torch.randn(1, 4, 3, 3),)  # 4 channels, 2 groups
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_chunk_single(self):
         module = Chunk()  # noqa: F405
         sample_input = (torch.randn(1, 1, 4, 3),)
@@ -2675,6 +2680,12 @@ def test_qnn_backend_cdist(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_channel_shuffle(self):
+        module = ChannelShuffle(2)  # noqa: F405
+        sample_input = (torch.randn(1, 4, 3, 3),)  # 4 channels, 2 groups
+        module = self.get_qdq_module(module, sample_input)  # swap in the QDQ module
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_chunk_single(self):
         module = Chunk()  # noqa: F405
         sample_input = (torch.randn(1, 1, 4, 3),)

From 701a4593e4cccb934a8d456b0fcc4428bec2c213 Mon Sep 17 00:00:00 2001
From: shewu <shewu@qti.qualcomm.com>
Date: Tue, 7 Apr 2026 16:45:00 +0800
Subject: [PATCH 2/3] Updated CLAUDE.md

---
 CLAUDE.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index 8cb29af5d4d..9f75100415a 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -7,6 +7,7 @@
 - `/profile` - Profile execution
 - `/cortex-m` - Build, test, or develop the Cortex-M backend
 - `/binary-size` - Analyze and reduce binary size
+- `/qualcomm` - Build, test, or develop the QNN (Qualcomm AI Engine Direct) backend
 
 Reference docs in `.claude/`: backends, runtime-api, quantization, llm-export, faq, tokenizers
 

From bc46b3498c1577ba55236a7ef3de4919a5ce4118 Mon Sep 17 00:00:00 2001
From: shewu <shewu@qti.qualcomm.com>
Date: Mon, 13 Apr 2026 18:05:57 +0800
Subject: [PATCH 3/3] Correct Skill descriptions

---
 .claude/skills/qualcomm/lowering_export.md    | 11 ++++++-----
 .claude/skills/qualcomm/new_op_development.md | 15 +++++++++------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/.claude/skills/qualcomm/lowering_export.md b/.claude/skills/qualcomm/lowering_export.md
index 4e9d04e7665..aaa9691e855 100644
--- a/.claude/skills/qualcomm/lowering_export.md
+++ b/.claude/skills/qualcomm/lowering_export.md
@@ -85,11 +85,12 @@ m = convert_pt2e(m)
 
 | QuantDtype | Activation | Weight |
 |---|---|---|
-| `use_16a16w` | fp16 | fp16 |
-| `use_16a8w` | fp16 | int8 |
-| `use_16a4w` | fp16 | int4 |
-| `use_8a8w` | int8 | int8 |
-| `use_8a4w` | int8 | int4 |
+| `use_16a16w` | uint16 | int16 |
+| `use_16a8w` | uint16 | int8 |
+| `use_16a4w` | uint16 | int4 |
+| `use_16a4w_block` | uint16 | int4 (block-wise) |
+| `use_8a8w` | uint8 | int8 |
+| `use_8a4w` | uint8 | int4 |
 
 **Fine-grained control with QuantRecipe:**
 
diff --git a/.claude/skills/qualcomm/new_op_development.md b/.claude/skills/qualcomm/new_op_development.md
index f6ca1f47101..dc639655257 100644
--- a/.claude/skills/qualcomm/new_op_development.md
+++ b/.claude/skills/qualcomm/new_op_development.md
@@ -257,17 +257,20 @@ Expected result: 1 delegated node, only placeholders/output nodes remain outside
 
 Some torch ops are in ExecuTorch's default decomposition table and will be broken into primitives **before** the QNN partitioner sees them. If QNN has a native op for it, you must explicitly skip decomposition.
 
-**Check first** using the helper script:
+**Check first** with a quick Python snippet (run from the executorch root with the `executorch` conda env active):
 
-```bash
-PYTHONPATH=/local/mnt/workspace/shewu \
-python /local/mnt/workspace/shewu/executorch_artifacts/hutton_scripts/check_op_decomposed.py \
-    aten.channel_shuffle.default
+```python
+import torch
+from executorch.exir.tracer import _default_decomposition_table
+
+decomp_table = _default_decomposition_table()
+op = torch.ops.aten.channel_shuffle.default
+print(op in decomp_table)  # True → will be decomposed
 ```
 
 Output:
 ```
-aten.channel_shuffle.default: True  (in ExecuTorch decomp table)
+True
 ```
 
 If `True`, add the op to `get_skip_decomp_table()` in `partition/utils.py` (alphabetical order):