From 4a7c1852070a515926447c1216f34693a626d831 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Tue, 26 May 2026 14:12:29 -0700
Subject: [PATCH 1/2] Cortex-M backend: lower aten.conv1d to existing
 quantized_conv2d kernels

PyTorch Conv1d produces 3D NCW tensors at the edge dialect level, while
CMSIS-NN's `arm_convolve_wrapper_s8` only accepts 4D NHWC. Rather than
add a dedicated conv1d C++ kernel that would duplicate the quantized
conv2d machinery, this lowers conv1d to the existing
cortex_m.quantized_conv2d / quantized_depthwise_conv2d ops by wrapping
the runtime input in `aten.unsqueeze_copy` plus
`dim_order_ops._clone_dim_order(dim_order=[0,2,3,1])` to produce a
logical (N, C, 1, W) tensor with NHWC byte layout, AoT-permuting the
weight to 4D OHWI (or IHWO for depthwise) with H=1, and unwrapping the
output with `_clone_dim_order([0,1,2,3])` + `squeeze_copy`. The `dim_order`
attribute is what gets `is_channels_last_tensor` to pass at runtime; the
physical reorder inside the clone op is what makes CMSIS-NN's raw
int8_t* arithmetic read the right bytes.

New: `CortexMConv1DCheck` accepts rank-3 patterns with the same
activation tails (relu / hardtanh / hardsigmoid / clamp) the conv2d
quantizer pattern already supports. `ConvertToCortexMPass._lower_conv1d`
performs the full graph rewrite end-to-end -- it inserts the wrap
chain, creates the conv2d node, initialises its scratch buffer through
the existing `required_cmsis_nn_buffer_sizes` lookup, and replaces uses
of the original convolution node itself, so `call()`'s standard
single-node-replacement tail doesn't need to know about conv1d.

For the three target models:

* Wav2Letter (Conv1d + ReLU + log_softmax stack): the conv stack lowers
  completely. Each of the 12 Conv1d + ReLU pairs fuses into its own
  quantized_conv2d, and only log_softmax stays in aten. Test
  expectations populated.
* Silero VAD's six Conv1d ops (1 STFT + 4 encoder + 1 final) all
  lower: 5 as regular conv2d, 1 as depthwise (the STFT layer has
  in_channels == groups == 1). Four of the model's five ReLUs fuse
  into their preceding encoder Conv1d; the remaining unfused ReLU is
  the post-LSTM `F.relu(h)`, which has no Conv1d producer to fuse into.
* YOLO11 is 2D and unaffected.

Op-level unit tests cover regular, depthwise, kernel=1 pointwise,
larger kernels with padding, stride > 1, bias, an STFT-shaped
in=1 depthwise case, and a Conv1d+ReLU fusion case.

Follow-ups (intentionally deferred): factor the shared
qparams / multiplier / weight-placeholder code between conv1d and
conv2d helpers; collapse the 9 conv1d pattern entries (bare conv1d plus
8 activation tails) via a helper; add an explicit reject for
`groups > 1 && !depthwise` in both the conv1d and existing conv2d paths.

Co-authored-by: Claude <noreply@anthropic.com>
---
 .../passes/convert_to_cortex_m_pass.py        | 168 ++++++++++++++++++
 .../cortex_m/quantizer/pattern_checkers.py    |  29 +++
 .../cortex_m/quantizer/quantizer_support.py   |  22 +++
 backends/cortex_m/test/build_test_runner.sh   |   1 +
 .../cortex_m/test/models/test_silero_vad.py   |  26 ++-
 .../cortex_m/test/models/test_wav2letter.py   |  25 ++-
 backends/cortex_m/test/ops/test_conv.py       | 122 ++++++++++++-
 7 files changed, 375 insertions(+), 18 deletions(-)

diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index e61ddaf63bc..aa0d13d9a48 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -304,6 +304,165 @@ def _get_convolution_replacement(self, node):
             )
             return exir_ops.edge.cortex_m.quantized_conv2d.default, new_args
 
+    def _lower_conv1d(self, node, graph_module):
+        """Lower a quantized 3D aten.convolution.default (conv1d) end-to-end.
+
+        Wraps the runtime input in `aten.unsqueeze_copy +
+        dim_order_ops._clone_dim_order` to a 4D NHWC tensor with H=1,
+        AoT-reshapes the weight to 4D OHWI / IHWO with H=1, calls the existing
+        cortex_m.quantized_conv2d kernel, then clones + squeezes back to 3D
+        NCW. Replaces uses of `node` with the terminal squeeze and erases
+        `node`. The caller does not need to do any further graph mutation.
+        """
+        (
+            x,
+            weight,
+            bias,
+            stride,
+            padding,
+            dilation,
+            _transposed,
+            _output_padding,
+            groups,
+        ) = node.args
+
+        stride_2d = [1, stride[0]]
+        padding_2d = [0, padding[0]]
+        dilation_2d = [1, dilation[0]]
+
+        input_scale = node.meta["input_qparams"][0].scale
+        input_zero_point = node.meta["input_qparams"][0].zp
+        weight_scales = node.meta["input_qparams"][1].scale
+        if not isinstance(weight_scales, list):
+            fake_weight_tensor = get_first_fake_tensor(weight)
+            weight_scales = [weight_scales] * fake_weight_tensor.shape[0]
+
+        output_qparams = node.meta["output_qparams"][0]
+        output_scale = output_qparams.scale
+        output_zero_point = output_qparams.zp
+        output_qmin = output_qparams.qmin
+        output_qmax = output_qparams.qmax
+
+        quantized_multipliers = []
+        quantized_shifts = []
+        for weight_scale in weight_scales:
+            quantized_multiplier, quantized_shift = quantize_multiplier_aot(
+                input_scale * weight_scale / output_scale
+            )
+            quantized_multipliers.append(quantized_multiplier)
+            quantized_shifts.append(quantized_shift)
+
+        param_weight_tensor = get_param_tensor(self.exported_program, weight)
+        if param_weight_tensor is None:
+            raise RuntimeError(
+                f"Expected conv1d weight parameter tensor for node {node.name}."
+            )
+
+        # Conv1d weight shape: (out_channels, in_channels/groups, K).
+        in_channels = param_weight_tensor.shape[1] * groups
+        out_channels = param_weight_tensor.shape[0]
+        is_depthwise = (in_channels == groups) and (out_channels % in_channels == 0)
+        batch_size = self._get_batch_size_from_conv(node)
+        use_depthwise_conv = is_depthwise and (batch_size == 1)
+
+        # Lift the weight to the 4D layout the existing quantized_conv2d kernels
+        # already consume: unsqueeze a singleton H=1, then permute by the same
+        # axes the 4D path uses (OHWI for regular, IHWO for depthwise).
+        param_weight_4d = param_weight_tensor.unsqueeze(2)
+        if use_depthwise_conv:
+            weight_permuted = param_weight_4d.permute(1, 2, 3, 0).contiguous()
+        else:
+            weight_permuted = param_weight_4d.permute(0, 2, 3, 1).contiguous()
+
+        with node.graph.inserting_after(weight):
+            weight_nhwc = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_weight_nhwc",
+                InputKind.PARAMETER,
+                weight_permuted,
+            )
+            quantized_multiplier_tensor = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_quantized_multiplier",
+                InputKind.PARAMETER,
+                torch.tensor(quantized_multipliers, dtype=torch.int32),
+            )
+            quantized_shift_tensor = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                node.name + "_quantized_shift",
+                InputKind.PARAMETER,
+                torch.tensor(quantized_shifts, dtype=torch.int32),
+            )
+
+        # Build the input chain (NCW -> 4D NHWC), the conv, and the output chain
+        # (4D NHWC -> NCW), all inserted in graph order before the original conv.
+        with node.graph.inserting_before(node):
+            x_4d_nchw = node.graph.create_node(
+                "call_function",
+                target=exir_ops.edge.aten.unsqueeze_copy.default,
+                args=(x, 2),
+            )
+            x_4d_nhwc = node.graph.create_node(
+                "call_function",
+                target=exir_ops.edge.dim_order_ops._clone_dim_order.default,
+                args=(x_4d_nchw,),
+                kwargs={"dim_order": [0, 2, 3, 1]},
+            )
+            scratch = self._create_uninitialized_alloc_node()
+
+            # `is_depthwise` already required `out_channels % in_channels == 0`,
+            # so depth_multiplier is exact; pass it as the extra positional that
+            # the depthwise kernel takes between dilation and input_zero_point.
+            if use_depthwise_conv:
+                conv_op = exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default
+                depth_multiplier_args = (out_channels // in_channels,)
+            else:
+                conv_op = exir_ops.edge.cortex_m.quantized_conv2d.default
+                depth_multiplier_args = ()
+
+            conv_args = (
+                x_4d_nhwc,
+                weight_nhwc,
+                bias,
+                stride_2d,
+                padding_2d,
+                dilation_2d,
+                *depth_multiplier_args,
+                -input_zero_point,
+                output_zero_point,
+                quantized_multiplier_tensor,
+                quantized_shift_tensor,
+                output_qmin,
+                output_qmax,
+                scratch,
+            )
+
+            conv_node = node.graph.create_node(
+                "call_function",
+                target=conv_op,
+                args=conv_args,
+                kwargs={},
+            )
+            self._initialize_alloc_node_size(conv_node)
+
+            out_4d_nchw = node.graph.create_node(
+                "call_function",
+                target=exir_ops.edge.dim_order_ops._clone_dim_order.default,
+                args=(conv_node,),
+                kwargs={"dim_order": [0, 1, 2, 3]},
+            )
+            out_3d = node.graph.create_node(
+                "call_function",
+                target=exir_ops.edge.aten.squeeze_copy.dims,
+                args=(out_4d_nchw, [2]),
+            )
+
+        node.replace_all_uses_with(out_3d)
+        graph_module.graph.erase_node(node)
+
     def _initialize_alloc_node_size(self, node: torch.fx.Node) -> None:
         """For nodes with a registered buffer size function for node.target, set the buffer sizes
         of the last n args, which should be exir.memory.alloc nodes. For nodes without a
@@ -500,6 +659,15 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                 case exir_ops.edge.aten.convolution.default:
                     # Check if it's transposed convolution (arg index 6)
                     transposed = node.args[6] if len(node.args) > 6 else False
+                    # stride length is 1 for conv1d, 2 for conv2d. Conv1d is
+                    # lowered to the existing quantized_conv2d kernel with H=1
+                    # by inserting unsqueeze + dim-order clone around the call;
+                    # the helper handles its own replace_all_uses + erase.
+                    is_conv1d = len(node.args[3]) == 1
+                    if is_conv1d and not transposed:
+                        self._lower_conv1d(node, graph_module)
+                        modified = True
+                        continue
                     if transposed:
                         op, args = self._get_transpose_conv2d_replacement(node)
                     else:
diff --git a/backends/cortex_m/quantizer/pattern_checkers.py b/backends/cortex_m/quantizer/pattern_checkers.py
index 860d8345607..283f9d55bdf 100644
--- a/backends/cortex_m/quantizer/pattern_checkers.py
+++ b/backends/cortex_m/quantizer/pattern_checkers.py
@@ -87,6 +87,35 @@ def check_quantization_config(
         return is_int8 and is_ch_axis_0
 
 
+class CortexMConv1DCheck(PatternCheck):
+    """Accepts aten.conv1d.default with rank-3 NCW inputs.
+
+    The conv1d is lowered to cortex_m.quantized_conv2d via AoT weight
+    reshape (O, I, K) -> (O, 1, K, I) and graph-level input unsqueeze +
+    channels_last conversion in ConvertToCortexMPass.
+    """
+
+    @classmethod
+    def check_pattern(cls, pattern):
+        for node in pattern:
+            tensor = get_first_fake_tensor(node)
+            if tensor is None or tensor.ndim != 3:
+                return False
+        return True
+
+    @classmethod
+    def check_quantization_config(
+        cls, pattern: list[Node], quantization_config: QuantizationConfig
+    ) -> bool:
+        is_int8 = cls.is_int8_activations(quantization_config)
+        conv_node = pattern[0] if pattern else None
+        weight_qspec = quantization_config.get_weight_qspec(conv_node)
+        if not isinstance(weight_qspec, QuantizationSpec):
+            return False
+        is_ch_axis_0 = weight_qspec.ch_axis == 0 or weight_qspec.ch_axis is None
+        return is_int8 and is_ch_axis_0
+
+
 class CortexMLinearCheck(PatternCheck):
     @classmethod
     def check_quantization_config(
diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py
index 3dfbb67638a..332fb3ae15a 100644
--- a/backends/cortex_m/quantizer/quantizer_support.py
+++ b/backends/cortex_m/quantizer/quantizer_support.py
@@ -8,6 +8,7 @@
     CortexMAddMulCheck,
     CortexMAvgPool2DCheck,
     CortexMBmmCheck,
+    CortexMConv1DCheck,
     CortexMConv2DCheck,
     CortexMConvTranspose2DCheck,
     CortexMLinearCheck,
@@ -76,6 +77,27 @@
     ): CortexMConv2DCheck,
     (torch.ops.aten.conv2d.default, torch.ops.aten.clamp.default): CortexMConv2DCheck,
     (torch.ops.aten.conv2d.default, torch.ops.aten.clamp_.default): CortexMConv2DCheck,
+    (torch.ops.aten.conv1d.default,): CortexMConv1DCheck,
+    (torch.ops.aten.conv1d.default, torch.ops.aten.relu.default): CortexMConv1DCheck,
+    (torch.ops.aten.conv1d.default, torch.ops.aten.relu_.default): CortexMConv1DCheck,
+    (
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.hardtanh.default,
+    ): CortexMConv1DCheck,
+    (
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.hardtanh_.default,
+    ): CortexMConv1DCheck,
+    (
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.hardsigmoid.default,
+    ): CortexMConv1DCheck,
+    (
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.hardsigmoid_.default,
+    ): CortexMConv1DCheck,
+    (torch.ops.aten.conv1d.default, torch.ops.aten.clamp.default): CortexMConv1DCheck,
+    (torch.ops.aten.conv1d.default, torch.ops.aten.clamp_.default): CortexMConv1DCheck,
 }
 
 CONV_TRANSPOSE_OP_PATTERNS = {
diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh
index a67c5a907a4..05f58e73b10 100755
--- a/backends/cortex_m/test/build_test_runner.sh
+++ b/backends/cortex_m/test/build_test_runner.sh
@@ -45,6 +45,7 @@ aten::cat.out,\
 aten::full.out,\
 aten::ge.Tensor_out,\
 aten::unsqueeze_copy.out,\
+aten::squeeze_copy.dims_out,\
 aten::select_copy.int_out,\
 aten::amax.out"
 
diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py
index 27b958627bb..0f5a5a7d8f2 100644
--- a/backends/cortex_m/test/models/test_silero_vad.py
+++ b/backends/cortex_m/test/models/test_silero_vad.py
@@ -36,34 +36,40 @@
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
     "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
     "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
-    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 12,
-    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 11,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 24,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 22,
 }
 ops_after_transforms: dict[str, int] = {
     "executorch_exir_dialects_edge__ops_aten_abs_default": 2,
     "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_arange_start_step": 1,
     "executorch_exir_dialects_edge__ops_aten_cat_default": 1,
-    "executorch_exir_dialects_edge__ops_aten_convolution_default": 6,
     "executorch_exir_dialects_edge__ops_aten_index_Tensor": 1,
     "executorch_exir_dialects_edge__ops_aten_linear_default": 2,
     "executorch_exir_dialects_edge__ops_aten_mean_dim": 1,
     "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 3,
     "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2,
-    "executorch_exir_dialects_edge__ops_aten_relu_default": 5,
+    # 4 of the 5 ReLUs fuse into their preceding encoder Conv1d. The surviving
+    # one is the post-LSTM `F.relu(h)` -- it has no Conv1d producer to fuse
+    # into and stays in aten until standalone activation lowering lands.
+    "executorch_exir_dialects_edge__ops_aten_relu_default": 1,
     "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2,
     "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4,
     "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1,
     "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1,
-    "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 2,
+    "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 8,
     "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
-    "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
+    "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 8,
     "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
-    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6,
-    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 9,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 8,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 5,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_depthwise_conv2d_default": 1,
+    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 12,
 }
 
 
@@ -84,7 +90,9 @@
 
 @parametrize("test_case", test_cases)
 def test_dialect_silero_vad_16k(test_case):
-    """This model currently does largely not lower to accelerated kernels due to missing LSTM and conv1d support, this test is to track development progress."""
+    """Conv1d layers now lower to cortex_m.quantized_conv2d via reshape;
+    sigmoid / tanh / linear (LSTM gates) still stay in aten until those
+    activations get their own quantized ops. This test tracks progress."""
     inputs = test_case.get_example_inputs()
     tester = CortexMTester(test_case.model, inputs)
     tester.test_dialect(
diff --git a/backends/cortex_m/test/models/test_wav2letter.py b/backends/cortex_m/test/models/test_wav2letter.py
index ddc5354293c..7f70bfc544d 100644
--- a/backends/cortex_m/test/models/test_wav2letter.py
+++ b/backends/cortex_m/test/models/test_wav2letter.py
@@ -8,8 +8,25 @@
 from executorch.examples.models.wav2letter.model import Wav2LetterModel
 
 
-ops_before_transforms: dict[str, int] = {}
-ops_after_transforms: dict[str, int] = {}
+ops_before_transforms: dict[str, int] = {
+    "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1,
+    "executorch_exir_dialects_edge__ops_aten_convolution_default": 12,
+    "executorch_exir_dialects_edge__ops_aten_relu_default": 12,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 24,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 14,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 14,
+}
+# Every Conv1d + ReLU pair fuses into a single cortex_m.quantized_conv2d call;
+# only the final log_softmax stays in aten until a quantized log_softmax lands.
+ops_after_transforms: dict[str, int] = {
+    "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1,
+    "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 12,
+    "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 12,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 12,
+    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 24,
+}
 
 model = Wav2LetterModel()
 pt_model = model.get_eager_model()
@@ -24,7 +41,9 @@
 
 @parametrize("test_case", test_cases)
 def test_dialect_wav2letter(test_case):
-    """This model currently does largely not lower to accelerated kernels due to missing conv1d support, this test is to track development progress."""
+    """Wav2Letter is a pure Conv1d+ReLU stack with a log_softmax tail; the
+    Conv1d-via-Conv2d-reshape lowering now collapses every layer into a single
+    cortex_m.quantized_conv2d. Only log_softmax stays unfused."""
     inputs = test_case.get_example_inputs()
     tester = CortexMTester(test_case.model, inputs)
     tester.test_dialect(
diff --git a/backends/cortex_m/test/ops/test_conv.py b/backends/cortex_m/test/ops/test_conv.py
index 5750ccf3bdb..baf3b44e44b 100644
--- a/backends/cortex_m/test/ops/test_conv.py
+++ b/backends/cortex_m/test/ops/test_conv.py
@@ -14,8 +14,89 @@
 
 
 class CortexMConv1D(torch.nn.Module):
-    ops_before_transforms: dict[str, int] = {}
-    ops_after_transforms: dict[str, int] = {}
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.conv = torch.nn.Conv1d(*args, **kwargs, bias=False)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class CortexMConv1DBias(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 2,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.conv = torch.nn.Conv1d(*args, **kwargs, bias=True)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class CortexMConv1DReLU(torch.nn.Module):
+    """Conv1d + ReLU. The quantizer's `(aten.conv1d, aten.relu)` pattern
+    fuses the activation into the conv's output clamp, so the ReLU should
+    disappear and a single quantized_conv2d remains."""
+
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
+        "executorch_exir_dialects_edge__ops_aten_relu_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1,
+    }
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_aten_relu_default": 0,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.conv = torch.nn.Conv1d(*args, **kwargs, bias=False)
+
+    def forward(self, x):
+        return torch.nn.functional.relu(self.conv(x))
+
+
+class CortexMDepthwiseConv1D(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_convolution_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_depthwise_conv2d_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
 
     def __init__(self, *args, **kwargs):
         super().__init__()
@@ -204,8 +285,39 @@ def forward(self, x):
         example_inputs=(ramp_tensor(0, 10, (1, 5, 8, 8)),),
     ),
     "conv1d": McuTestCase(
-        model=CortexMConv1D(1, 1, 1),
-        example_inputs=(ramp_tensor(0, 10, (1, 3, 2)),),
+        model=CortexMConv1D(2, 4, 1),
+        example_inputs=(ramp_tensor(0, 10, (1, 2, 8)),),
+    ),
+    "conv1d_kernel3": McuTestCase(
+        model=CortexMConv1D(4, 8, 3, padding=1),
+        example_inputs=(ramp_tensor(-1, 1, (1, 4, 16)),),
+    ),
+    "conv1d_stride": McuTestCase(
+        model=CortexMConv1D(4, 8, 3, stride=2, padding=1),
+        example_inputs=(ramp_tensor(0, 10, (1, 4, 16)),),
+    ),
+    "conv1d_bias": McuTestCase(
+        model=CortexMConv1DBias(3, 6, 3, padding=1),
+        example_inputs=(ramp_tensor(-5, 5, (1, 3, 12)),),
+    ),
+    "conv1d_pointwise": McuTestCase(
+        model=CortexMConv1D(8, 4, 1),
+        example_inputs=(ramp_tensor(0, 10, (1, 8, 12)),),
+    ),
+    "conv1d_large_kernel_stride": McuTestCase(
+        # Mirrors Silero VAD's learned STFT: Conv1d(1, 258, k=256, s=128).
+        # in_channels == groups == 1 means the backend lowers via the depthwise
+        # path; use a smaller variant that's quick to test.
+        model=CortexMDepthwiseConv1D(1, 16, 16, stride=8),
+        example_inputs=(ramp_tensor(-1, 1, (1, 1, 64)),),
+    ),
+    "conv1d_relu": McuTestCase(
+        model=CortexMConv1DReLU(4, 8, 3, padding=1),
+        example_inputs=(ramp_tensor(-5, 5, (1, 4, 16)),),
+    ),
+    "depthwise_conv1d": McuTestCase(
+        model=CortexMDepthwiseConv1D(4, 4, 3, padding=1, groups=4),
+        example_inputs=(ramp_tensor(0, 10, (1, 4, 16)),),
     ),
     "conv3d": McuTestCase(
         model=CortexMConv3D(1, 1, 1),
@@ -314,7 +426,6 @@ def forward(self, x):
 
 xfails_dialect: dict[str, xfail_type] = {
     "conv2d_dilation": "NotImplementedError: 'slow_conv_dilated<>' not implemented for 'Int'",
-    "conv1d": "Currently not supported.",
     "conv2d_nchw": "Currently not supported.",
 }
 
@@ -330,7 +441,6 @@ def test_dialect_conv2d(test_case):
 
 
 xfails_implementation: dict[str, xfail_type] = {
-    "conv1d": "Currently not supported.",
     "conv3d": "Currently not supported.",
 }
 

From fabc4c6c979796803bab27e9bc966a90bdd4ad83 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Wed, 27 May 2026 10:39:04 -0700
Subject: [PATCH 2/2] Cortex-M backend: fold inter-conv1d view-op chains

Between two consecutive conv1d layers the lowering pass currently emits a
4-op no-op chain: `_clone_dim_order(to NCHW) -> squeeze_copy -> unsqueeze_copy
-> _clone_dim_order(to NHWC)`. The two clones are inverse byte reorders and
the squeeze/unsqueeze on the same dim cancel logically, so the data flows
through unchanged -- but every op still materialises an activation-sized
buffer. On Wav2Letter (12 chained Conv1ds) that's ~3 MB of redundant
memory traffic per inference, ~2-6% of runtime budget at MCU bandwidths.

The cleanup composes two passes:

* Switch the conv1d lowering to use `aten.view_copy` (with the explicit
  shape arg) rather than `aten.unsqueeze_copy` / `aten.squeeze_copy.dims`.
  This lets `backends/transforms/fuse_view_copy.FuseViewCopyTransform`
  walk the view_copy <-> view_copy chain (treating the intermediate
  `_clone_dim_order` as a unary elementwise op it can pass through) and
  rewrite both view_copies to the same final shape; the noop ones then
  drop in `remove_noop_view_copy`.
* Add a small `FoldInverseDimOrderClonePass` that runs after the view-copy
  fuse and removes any remaining pair of adjacent `_clone_dim_order` ops
  whose composed dim_order returns to the input's original dim_order.

Net result on Wav2Letter: 60 graph nodes between the convs collapse to
zero -- the post-pass IR is a direct `quantized_conv2d -> quantized_conv2d`
flow with the wrap/unwrap only at the model boundaries (2 view_copy + 2
_clone_dim_order total, not 24 + 24). Silero VAD sees similar simplification
for its encoder block; its remaining wraps are at the STFT/magnitude-spectrum
and post-LSTM boundaries where the next op isn't a conv1d.

`_lower_conv1d` reads the input shape from `x.args[1]` when the input is a
view_copy from a previous conv1d's lowering (its meta["val"] hasn't been
repopulated yet at that point in the pass; the explicit shape arg is the
source of truth).

`aten::view_copy.out` is added to the test runner's selected_ops_list so
the on-device tests can resolve the view_copy kernel.

Co-authored-by: Claude <noreply@anthropic.com>
---
 .../passes/convert_to_cortex_m_pass.py        |  34 +++++-
 .../cortex_m/passes/cortex_m_pass_manager.py  |  11 ++
 .../fold_inverse_dim_order_clone_pass.py      |  66 +++++++++++
 backends/cortex_m/test/build_test_runner.sh   |   1 +
 .../misc/test_fold_inverse_dim_order_clone.py | 111 ++++++++++++++++++
 .../cortex_m/test/models/test_silero_vad.py   |  15 ++-
 .../cortex_m/test/models/test_wav2letter.py   |   9 +-
 7 files changed, 236 insertions(+), 11 deletions(-)
 create mode 100644 backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py
 create mode 100644 backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py

diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index aa0d13d9a48..50d0dcfb429 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -399,11 +399,37 @@ def _lower_conv1d(self, node, graph_module):
 
         # Build the input chain (NCW -> 4D NHWC), the conv, and the output chain
         # (4D NHWC -> NCW), all inserted in graph order before the original conv.
+        # Use view_copy (not unsqueeze_copy / squeeze_copy) so that
+        # backends/transforms/fuse_view_copy.FuseViewCopyTransform can collapse
+        # the view_copy <-> view_copy chain that forms between consecutive
+        # conv1d layers (e.g. Wav2Letter, Silero VAD encoder).
+        #
+        # For the input shape, prefer the explicit shape arg on x when x is a
+        # view_copy we just inserted for an earlier conv1d (its meta["val"]
+        # hasn't been repopulated yet at this point in the pass). Restrict to
+        # 3D targets so an unrelated view_copy producing a different rank
+        # can't silently feed a malformed shape into the conv reshape.
+        in_3d_shape = None
+        if (
+            isinstance(x, torch.fx.Node)
+            and x.target == exir_ops.edge.aten.view_copy.default
+            and len(x.args[1]) == 3
+        ):
+            in_3d_shape = list(x.args[1])
+        if in_3d_shape is None:
+            in_3d_shape = list(get_first_fake_tensor(x).shape)
+        assert (
+            len(in_3d_shape) == 3
+        ), f"_lower_conv1d expects a 3D input, got shape {in_3d_shape}"
+        x_4d_shape = [in_3d_shape[0], in_3d_shape[1], 1, in_3d_shape[2]]
+        out_3d_shape = list(node.meta["val"].shape)
+        out_4d_shape = [out_3d_shape[0], out_3d_shape[1], 1, out_3d_shape[2]]
+
         with node.graph.inserting_before(node):
             x_4d_nchw = node.graph.create_node(
                 "call_function",
-                target=exir_ops.edge.aten.unsqueeze_copy.default,
-                args=(x, 2),
+                target=exir_ops.edge.aten.view_copy.default,
+                args=(x, x_4d_shape),
             )
             x_4d_nhwc = node.graph.create_node(
                 "call_function",
@@ -456,8 +482,8 @@ def _lower_conv1d(self, node, graph_module):
             )
             out_3d = node.graph.create_node(
                 "call_function",
-                target=exir_ops.edge.aten.squeeze_copy.dims,
-                args=(out_4d_nchw, [2]),
+                target=exir_ops.edge.aten.view_copy.default,
+                args=(out_4d_nchw, out_3d_shape),
             )
 
         node.replace_all_uses_with(out_3d)
diff --git a/backends/cortex_m/passes/cortex_m_pass_manager.py b/backends/cortex_m/passes/cortex_m_pass_manager.py
index f0326ec76c4..36f253b853e 100644
--- a/backends/cortex_m/passes/cortex_m_pass_manager.py
+++ b/backends/cortex_m/passes/cortex_m_pass_manager.py
@@ -13,6 +13,7 @@
     ScalarsToAttributePass,
 )
 from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
+from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
 from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass
 from executorch.backends.transforms.replace_scalar_with_tensor import (
     ReplaceScalarWithTensorArgPass,
@@ -27,6 +28,7 @@
 from .convert_to_cortex_m_pass import ConvertToCortexMPass
 from .decompose_hardswish_pass import DecomposeHardswishPass
 from .decompose_mean_pass import DecomposeMeanPass
+from .fold_inverse_dim_order_clone_pass import FoldInverseDimOrderClonePass
 from .quantized_clamp_activation_pass import QuantizedClampActivationPass
 from .quantized_op_fusion_pass import QuantizedOpFusionPass
 from .replace_quant_nodes_pass import ReplaceQuantNodesPass
@@ -46,6 +48,15 @@ class CortexMPassManager(PassManager):
         DecomposeHardswishPass,
         QuantizedOpFusionPass,
         ConvertToCortexMPass,
+        # Conv1d lowering inserts view_copy + _clone_dim_order wrappers around
+        # each conv2d call. Between consecutive conv1d layers these chain
+        # together to form an identity. FuseViewCopyTransform collapses the
+        # view_copy <-> view_copy chain (treating the dim_order clones as
+        # unary elementwise ops it walks through); FoldInverseDimOrderClonePass
+        # then removes the surviving _clone_dim_order pair whose composed
+        # dim_order is the identity.
+        FuseViewCopyTransform,
+        FoldInverseDimOrderClonePass,
     ]
 
     pass_list_transform_for_annotation: list[PassClass] = [
diff --git a/backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py b/backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py
new file mode 100644
index 00000000000..f5da64d9141
--- /dev/null
+++ b/backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py
@@ -0,0 +1,66 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.fx
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+from torch.fx.passes.infra.pass_manager import PassResult
+
+
+class FoldInverseDimOrderClonePass(ExportPass):
+    """Fold adjacent `_clone_dim_order` pairs whose net effect is identity.
+
+    The conv1d lowering inserts a `_clone_dim_order(dim_order=[0,2,3,1])`
+    before each conv and a `_clone_dim_order(dim_order=[0,1,2,3])` after. When
+    `FuseViewCopyTransform` collapses the intermediate view_copy chain between
+    two consecutive conv1d lowerings, the surviving graph is
+
+        ... -> _clone_dim_order(to NCHW) -> _clone_dim_order(to NHWC) -> ...
+
+    where the second clone's dim_order equals the dim_order the tensor had
+    before the first clone -- two byte reorders that cancel. When the first
+    clone has no other users, this pass replaces uses of the second clone
+    with the first clone's input and lets dead code elimination remove both.
+    """
+
+    _CLONE = exir_ops.edge.dim_order_ops._clone_dim_order.default
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        modified = False
+
+        for node in list(graph_module.graph.nodes):
+            if node.op != "call_function" or node.target != self._CLONE:
+                continue
+
+            second_clone = node
+            first_clone = second_clone.args[0]
+            if (
+                not isinstance(first_clone, torch.fx.Node)
+                or first_clone.op != "call_function"
+                or first_clone.target != self._CLONE
+                or len(first_clone.users) != 1
+            ):
+                continue
+
+            original_input = first_clone.args[0]
+            if not isinstance(original_input, torch.fx.Node):
+                continue
+
+            # Net effect is identity iff the second clone's target dim_order
+            # equals the input tensor's dim_order before the first clone.
+            original_dim_order = tuple(original_input.meta["val"].dim_order())
+            second_dim_order = tuple(second_clone.kwargs.get("dim_order", ()))
+            if original_dim_order != second_dim_order:
+                continue
+
+            second_clone.replace_all_uses_with(original_input)
+            modified = True
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+
+        return PassResult(graph_module, modified)
diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh
index 05f58e73b10..425f81bb6c4 100755
--- a/backends/cortex_m/test/build_test_runner.sh
+++ b/backends/cortex_m/test/build_test_runner.sh
@@ -46,6 +46,7 @@ aten::full.out,\
 aten::ge.Tensor_out,\
 aten::unsqueeze_copy.out,\
 aten::squeeze_copy.dims_out,\
+aten::view_copy.out,\
 aten::select_copy.int_out,\
 aten::amax.out"
 
diff --git a/backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py b/backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py
new file mode 100644
index 00000000000..20a7cc3cb3a
--- /dev/null
+++ b/backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py
@@ -0,0 +1,111 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.fx
+from executorch.backends.cortex_m.passes.fold_inverse_dim_order_clone_pass import (
+    FoldInverseDimOrderClonePass,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+_CLONE = exir_ops.edge.dim_order_ops._clone_dim_order.default
+
+
+def _count(graph_module: torch.fx.GraphModule, target) -> int:
+    return sum(
+        1
+        for n in graph_module.graph.nodes
+        if n.op == "call_function" and n.target == target
+    )
+
+
+def _make_clone_pair_graph(
+    first_dim_order: tuple[int, ...],
+    second_dim_order: tuple[int, ...],
+) -> torch.fx.GraphModule:
+    """Hand-build a graph: placeholder -> clone(first) -> clone(second) -> output.
+    The input tensor is contiguous (dim_order = (0, 1, 2, 3)).
+    """
+    shape = (1, 4, 1, 8)
+    input_tensor = torch.empty(shape)
+
+    graph = torch.fx.Graph()
+    x = graph.placeholder("x")
+    x.meta["val"] = input_tensor
+
+    first = graph.create_node(
+        "call_function",
+        target=_CLONE,
+        args=(x,),
+        kwargs={"dtype": torch.float32, "dim_order": list(first_dim_order)},
+    )
+    first.meta["val"] = input_tensor
+
+    second = graph.create_node(
+        "call_function",
+        target=_CLONE,
+        args=(first,),
+        kwargs={"dtype": torch.float32, "dim_order": list(second_dim_order)},
+    )
+    second.meta["val"] = input_tensor
+
+    graph.output(second)
+
+    return torch.fx.GraphModule(torch.nn.Module(), graph)
+
+
+def test_fold_removes_inverse_pair():
+    # Input is contiguous (dim_order 0,1,2,3); first clone reorders to NHWC,
+    # second clone reorders back to the original -> net identity.
+    gm = _make_clone_pair_graph(
+        first_dim_order=(0, 2, 3, 1),
+        second_dim_order=(0, 1, 2, 3),
+    )
+    assert _count(gm, _CLONE) == 2
+
+    result = FoldInverseDimOrderClonePass()(gm)
+    assert result.modified
+    assert _count(result.graph_module, _CLONE) == 0
+
+
+def test_fold_preserves_non_identity_pair():
+    # Second clone's target is not the input's original dim_order, so the
+    # composition isn't identity -- fold must not fire.
+    gm = _make_clone_pair_graph(
+        first_dim_order=(0, 2, 3, 1),
+        second_dim_order=(0, 3, 1, 2),
+    )
+    assert _count(gm, _CLONE) == 2
+
+    result = FoldInverseDimOrderClonePass()(gm)
+    assert not result.modified
+    assert _count(result.graph_module, _CLONE) == 2
+
+
+def test_fold_respects_fanout():
+    # The pass only folds when the first clone has exactly one user; with an
+    # extra consumer of the first clone it must leave both clones in place.
+    gm = _make_clone_pair_graph(
+        first_dim_order=(0, 2, 3, 1),
+        second_dim_order=(0, 1, 2, 3),
+    )
+    first_clone = next(
+        n for n in gm.graph.nodes if n.op == "call_function" and n.target == _CLONE
+    )
+    output_node = next(n for n in gm.graph.nodes if n.op == "output")
+    with gm.graph.inserting_before(output_node):
+        extra_user = gm.graph.create_node(
+            "call_function",
+            target=torch.ops.aten.relu.default,
+            args=(first_clone,),
+        )
+        extra_user.meta["val"] = first_clone.meta["val"]
+    output_node.args = ((output_node.args[0], extra_user),)
+    gm.recompile()
+
+    result = FoldInverseDimOrderClonePass()(gm)
+    assert not result.modified
+    assert _count(result.graph_module, _CLONE) == 2
diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py
index 0f5a5a7d8f2..05456428cb0 100644
--- a/backends/cortex_m/test/models/test_silero_vad.py
+++ b/backends/cortex_m/test/models/test_silero_vad.py
@@ -59,17 +59,24 @@
     "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1,
     "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1,
-    "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 8,
+    "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 2,
     "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
-    "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 8,
-    "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
+    "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
+    # The conv1d lowering inserts view_copy wraps around each conv2d; the
+    # encoder's chained Conv1ds get their inter-layer view_copy <-> view_copy
+    # plus inverse _clone_dim_order pairs folded out. View_copy nodes here
+    # are: original model view (1) + boundary wraps that survive folding (6).
+    "executorch_exir_dialects_edge__ops_aten_view_copy_default": 7,
     "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 9,
     "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 8,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 5,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_depthwise_conv2d_default": 1,
-    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 12,
+    # Six clone_dim_order survive: boundary wraps for the conv1ds whose
+    # neighbours aren't another conv1d (the STFT/magnitude-spectrum interface
+    # and the final-conv boundary).
+    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 6,
 }
 
 
diff --git a/backends/cortex_m/test/models/test_wav2letter.py b/backends/cortex_m/test/models/test_wav2letter.py
index 7f70bfc544d..37a162ff78e 100644
--- a/backends/cortex_m/test/models/test_wav2letter.py
+++ b/backends/cortex_m/test/models/test_wav2letter.py
@@ -18,14 +18,17 @@
 }
 # Every Conv1d + ReLU pair fuses into a single cortex_m.quantized_conv2d call;
 # only the final log_softmax stays in aten until a quantized log_softmax lands.
+# Between consecutive Conv1ds the view_copy + _clone_dim_order wrap/unwrap
+# chain collapses to a direct conv2d -> conv2d hand-off (via FuseViewCopyTransform
+# + FoldInverseDimOrderClonePass). Only the model-boundary view_copy and
+# _clone_dim_order nodes survive (one of each at the model input and output).
 ops_after_transforms: dict[str, int] = {
     "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1,
-    "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 12,
-    "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 12,
+    "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
     "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 2,
     "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 12,
-    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 24,
+    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 2,
 }
 
 model = Wav2LetterModel()