From 4a7c1852070a515926447c1216f34693a626d831 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 26 May 2026 14:12:29 -0700 Subject: [PATCH 1/2] Cortex-M backend: lower aten.conv1d to existing quantized_conv2d kernels PyTorch Conv1d produces 3D NCW tensors at the edge dialect level, while CMSIS-NN's `arm_convolve_wrapper_s8` only accepts 4D NHWC. Rather than add a dedicated conv1d C++ kernel that would duplicate the quantized conv2d machinery, this lowers conv1d to the existing cortex_m.quantized_conv2d / quantized_depthwise_conv2d ops by wrapping the runtime input in `aten.unsqueeze_copy` plus `dim_order_ops._clone_dim_order(dim_order=[0,2,3,1])` to produce a logical (N, C, 1, W) tensor with NHWC byte layout, AoT-permuting the weight to 4D OHWI (or IHWO for depthwise) with H=1, and unwrapping the output with `_clone_dim_order([0,1,2,3])` + `squeeze_copy`. The `dim_order` attribute is what gets `is_channels_last_tensor` to pass at runtime; the physical reorder inside the clone op is what makes CMSIS-NN's raw int8_t* arithmetic read the right bytes. New: `CortexMConv1DCheck` accepts rank-3 patterns with the same activation tails (relu / hardtanh / hardsigmoid / clamp) the conv2d quantizer pattern already supports. `ConvertToCortexMPass._lower_conv1d` performs the full graph rewrite end-to-end -- it inserts the wrap chain, creates the conv2d node, initialises its scratch buffer through the existing `required_cmsis_nn_buffer_sizes` lookup, and replaces uses of the original convolution node itself, so `call()`'s standard single-node-replacement tail doesn't need to know about conv1d. For the three target models: * Wav2Letter (Conv1d + ReLU + log_softmax stack) collapses entirely; every Conv1d + ReLU pair fuses into a single quantized_conv2d, only log_softmax stays in aten. Test expectations populated. * Silero VAD's six Conv1d ops (1 STFT + 4 encoder + 1 final) all lower: 5 as regular conv2d, 1 as depthwise (the STFT layer has in_channels == groups == 1). 
Four of the five encoder ReLUs fuse; the remaining unfused ReLU is the post-LSTM `F.relu(h)`, with no Conv1d producer to fuse into. * YOLO11 is 2D and unaffected. Op-level unit tests cover regular, depthwise, kernel=1 pointwise, larger kernels with padding, stride > 1, bias, an STFT-shaped in=1 depthwise case, and a Conv1d+ReLU fusion case. Follow-ups (intentionally deferred): factor the shared qparams / multiplier / weight-placeholder code between conv1d and conv2d helpers; collapse the 9 activation-tail pattern entries via a helper; add an explicit reject for `groups > 1 && !depthwise` in both the conv1d and existing conv2d paths. Co-authored-by: Claude --- .../passes/convert_to_cortex_m_pass.py | 168 ++++++++++++++++++ .../cortex_m/quantizer/pattern_checkers.py | 29 +++ .../cortex_m/quantizer/quantizer_support.py | 22 +++ backends/cortex_m/test/build_test_runner.sh | 1 + .../cortex_m/test/models/test_silero_vad.py | 26 ++- .../cortex_m/test/models/test_wav2letter.py | 25 ++- backends/cortex_m/test/ops/test_conv.py | 122 ++++++++++++- 7 files changed, 375 insertions(+), 18 deletions(-) diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index e61ddaf63bc..aa0d13d9a48 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -304,6 +304,165 @@ def _get_convolution_replacement(self, node): ) return exir_ops.edge.cortex_m.quantized_conv2d.default, new_args + def _lower_conv1d(self, node, graph_module): + """Lower a quantized 3D aten.convolution.default (conv1d) end-to-end. + + Wraps the runtime input in `aten.unsqueeze_copy + + dim_order_ops._clone_dim_order` to a 4D NHWC tensor with H=1, + AoT-reshapes the weight to 4D OHWI / IHWO with H=1, calls the existing + cortex_m.quantized_conv2d kernel, then clones + squeezes back to 3D + NCW. Replaces uses of `node` with the terminal squeeze and erases + `node`. 
The caller does not need to do any further graph mutation. + """ + ( + x, + weight, + bias, + stride, + padding, + dilation, + _transposed, + _output_padding, + groups, + ) = node.args + + stride_2d = [1, stride[0]] + padding_2d = [0, padding[0]] + dilation_2d = [1, dilation[0]] + + input_scale = node.meta["input_qparams"][0].scale + input_zero_point = node.meta["input_qparams"][0].zp + weight_scales = node.meta["input_qparams"][1].scale + if not isinstance(weight_scales, list): + fake_weight_tensor = get_first_fake_tensor(weight) + weight_scales = [weight_scales] * fake_weight_tensor.shape[0] + + output_qparams = node.meta["output_qparams"][0] + output_scale = output_qparams.scale + output_zero_point = output_qparams.zp + output_qmin = output_qparams.qmin + output_qmax = output_qparams.qmax + + quantized_multipliers = [] + quantized_shifts = [] + for weight_scale in weight_scales: + quantized_multiplier, quantized_shift = quantize_multiplier_aot( + input_scale * weight_scale / output_scale + ) + quantized_multipliers.append(quantized_multiplier) + quantized_shifts.append(quantized_shift) + + param_weight_tensor = get_param_tensor(self.exported_program, weight) + if param_weight_tensor is None: + raise RuntimeError( + f"Expected conv1d weight parameter tensor for node {node.name}." + ) + + # Conv1d weight shape: (out_channels, in_channels/groups, K). + in_channels = param_weight_tensor.shape[1] * groups + out_channels = param_weight_tensor.shape[0] + is_depthwise = (in_channels == groups) and (out_channels % in_channels == 0) + batch_size = self._get_batch_size_from_conv(node) + use_depthwise_conv = is_depthwise and (batch_size == 1) + + # Lift the weight to the 4D layout the existing quantized_conv2d kernels + # already consume: unsqueeze a singleton H=1, then permute by the same + # axes the 4D path uses (OHWI for regular, IHWO for depthwise). 
+ param_weight_4d = param_weight_tensor.unsqueeze(2) + if use_depthwise_conv: + weight_permuted = param_weight_4d.permute(1, 2, 3, 0).contiguous() + else: + weight_permuted = param_weight_4d.permute(0, 2, 3, 1).contiguous() + + with node.graph.inserting_after(weight): + weight_nhwc = create_constant_placeholder( + self.exported_program, + node.graph, + node.name + "_weight_nhwc", + InputKind.PARAMETER, + weight_permuted, + ) + quantized_multiplier_tensor = create_constant_placeholder( + self.exported_program, + node.graph, + node.name + "_quantized_multiplier", + InputKind.PARAMETER, + torch.tensor(quantized_multipliers, dtype=torch.int32), + ) + quantized_shift_tensor = create_constant_placeholder( + self.exported_program, + node.graph, + node.name + "_quantized_shift", + InputKind.PARAMETER, + torch.tensor(quantized_shifts, dtype=torch.int32), + ) + + # Build the input chain (NCW -> 4D NHWC), the conv, and the output chain + # (4D NHWC -> NCW), all inserted in graph order before the original conv. + with node.graph.inserting_before(node): + x_4d_nchw = node.graph.create_node( + "call_function", + target=exir_ops.edge.aten.unsqueeze_copy.default, + args=(x, 2), + ) + x_4d_nhwc = node.graph.create_node( + "call_function", + target=exir_ops.edge.dim_order_ops._clone_dim_order.default, + args=(x_4d_nchw,), + kwargs={"dim_order": [0, 2, 3, 1]}, + ) + scratch = self._create_uninitialized_alloc_node() + + # `is_depthwise` already required `out_channels % in_channels == 0`, + # so depth_multiplier is exact; pass it as the extra positional that + # the depthwise kernel takes between dilation and input_zero_point. 
+ if use_depthwise_conv: + conv_op = exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default + depth_multiplier_args = (out_channels // in_channels,) + else: + conv_op = exir_ops.edge.cortex_m.quantized_conv2d.default + depth_multiplier_args = () + + conv_args = ( + x_4d_nhwc, + weight_nhwc, + bias, + stride_2d, + padding_2d, + dilation_2d, + *depth_multiplier_args, + -input_zero_point, + output_zero_point, + quantized_multiplier_tensor, + quantized_shift_tensor, + output_qmin, + output_qmax, + scratch, + ) + + conv_node = node.graph.create_node( + "call_function", + target=conv_op, + args=conv_args, + kwargs={}, + ) + self._initialize_alloc_node_size(conv_node) + + out_4d_nchw = node.graph.create_node( + "call_function", + target=exir_ops.edge.dim_order_ops._clone_dim_order.default, + args=(conv_node,), + kwargs={"dim_order": [0, 1, 2, 3]}, + ) + out_3d = node.graph.create_node( + "call_function", + target=exir_ops.edge.aten.squeeze_copy.dims, + args=(out_4d_nchw, [2]), + ) + + node.replace_all_uses_with(out_3d) + graph_module.graph.erase_node(node) + def _initialize_alloc_node_size(self, node: torch.fx.Node) -> None: """For nodes with a registered buffer size function for node.target, set the buffer sizes of the last n args, which should be exir.memory.alloc nodes. For nodes without a @@ -500,6 +659,15 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: case exir_ops.edge.aten.convolution.default: # Check if it's transposed convolution (arg index 6) transposed = node.args[6] if len(node.args) > 6 else False + # stride length is 1 for conv1d, 2 for conv2d. Conv1d is + # lowered to the existing quantized_conv2d kernel with H=1 + # by inserting unsqueeze + dim-order clone around the call; + # the helper handles its own replace_all_uses + erase. 
+ is_conv1d = len(node.args[3]) == 1 + if is_conv1d and not transposed: + self._lower_conv1d(node, graph_module) + modified = True + continue if transposed: op, args = self._get_transpose_conv2d_replacement(node) else: diff --git a/backends/cortex_m/quantizer/pattern_checkers.py b/backends/cortex_m/quantizer/pattern_checkers.py index 860d8345607..283f9d55bdf 100644 --- a/backends/cortex_m/quantizer/pattern_checkers.py +++ b/backends/cortex_m/quantizer/pattern_checkers.py @@ -87,6 +87,35 @@ def check_quantization_config( return is_int8 and is_ch_axis_0 +class CortexMConv1DCheck(PatternCheck): + """Accepts aten.conv1d.default with rank-3 NCW inputs. + + The conv1d is lowered to cortex_m.quantized_conv2d via AoT weight + reshape (O, I, K) -> (O, 1, K, I) and graph-level input unsqueeze + + channels_last conversion in ConvertToCortexMPass. + """ + + @classmethod + def check_pattern(cls, pattern): + for node in pattern: + tensor = get_first_fake_tensor(node) + if tensor is None or tensor.ndim != 3: + return False + return True + + @classmethod + def check_quantization_config( + cls, pattern: list[Node], quantization_config: QuantizationConfig + ) -> bool: + is_int8 = cls.is_int8_activations(quantization_config) + conv_node = pattern[0] if pattern else None + weight_qspec = quantization_config.get_weight_qspec(conv_node) + if not isinstance(weight_qspec, QuantizationSpec): + return False + is_ch_axis_0 = weight_qspec.ch_axis == 0 or weight_qspec.ch_axis is None + return is_int8 and is_ch_axis_0 + + class CortexMLinearCheck(PatternCheck): @classmethod def check_quantization_config( diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py index 3dfbb67638a..332fb3ae15a 100644 --- a/backends/cortex_m/quantizer/quantizer_support.py +++ b/backends/cortex_m/quantizer/quantizer_support.py @@ -8,6 +8,7 @@ CortexMAddMulCheck, CortexMAvgPool2DCheck, CortexMBmmCheck, + CortexMConv1DCheck, CortexMConv2DCheck, 
CortexMConvTranspose2DCheck, CortexMLinearCheck, @@ -76,6 +77,27 @@ ): CortexMConv2DCheck, (torch.ops.aten.conv2d.default, torch.ops.aten.clamp.default): CortexMConv2DCheck, (torch.ops.aten.conv2d.default, torch.ops.aten.clamp_.default): CortexMConv2DCheck, + (torch.ops.aten.conv1d.default,): CortexMConv1DCheck, + (torch.ops.aten.conv1d.default, torch.ops.aten.relu.default): CortexMConv1DCheck, + (torch.ops.aten.conv1d.default, torch.ops.aten.relu_.default): CortexMConv1DCheck, + ( + torch.ops.aten.conv1d.default, + torch.ops.aten.hardtanh.default, + ): CortexMConv1DCheck, + ( + torch.ops.aten.conv1d.default, + torch.ops.aten.hardtanh_.default, + ): CortexMConv1DCheck, + ( + torch.ops.aten.conv1d.default, + torch.ops.aten.hardsigmoid.default, + ): CortexMConv1DCheck, + ( + torch.ops.aten.conv1d.default, + torch.ops.aten.hardsigmoid_.default, + ): CortexMConv1DCheck, + (torch.ops.aten.conv1d.default, torch.ops.aten.clamp.default): CortexMConv1DCheck, + (torch.ops.aten.conv1d.default, torch.ops.aten.clamp_.default): CortexMConv1DCheck, } CONV_TRANSPOSE_OP_PATTERNS = { diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh index a67c5a907a4..05f58e73b10 100755 --- a/backends/cortex_m/test/build_test_runner.sh +++ b/backends/cortex_m/test/build_test_runner.sh @@ -45,6 +45,7 @@ aten::cat.out,\ aten::full.out,\ aten::ge.Tensor_out,\ aten::unsqueeze_copy.out,\ +aten::squeeze_copy.dims_out,\ aten::select_copy.int_out,\ aten::amax.out" diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py index 27b958627bb..0f5a5a7d8f2 100644 --- a/backends/cortex_m/test/models/test_silero_vad.py +++ b/backends/cortex_m/test/models/test_silero_vad.py @@ -36,34 +36,40 @@ "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, - 
"executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 12, - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 11, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 24, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 22, } ops_after_transforms: dict[str, int] = { "executorch_exir_dialects_edge__ops_aten_abs_default": 2, "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_arange_start_step": 1, "executorch_exir_dialects_edge__ops_aten_cat_default": 1, - "executorch_exir_dialects_edge__ops_aten_convolution_default": 6, "executorch_exir_dialects_edge__ops_aten_index_Tensor": 1, "executorch_exir_dialects_edge__ops_aten_linear_default": 2, "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 3, "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2, - "executorch_exir_dialects_edge__ops_aten_relu_default": 5, + # 4 of the 5 ReLUs fuse into their preceding encoder Conv1d. The surviving + # one is the post-LSTM `F.relu(h)` -- it has no Conv1d producer to fuse + # into and stays in aten until standalone activation lowering lands. 
+ "executorch_exir_dialects_edge__ops_aten_relu_default": 1, "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2, "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4, "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1, - "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 2, + "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 8, "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, - "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, + "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 8, "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, - "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6, - "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 9, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 8, "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 5, + "executorch_exir_dialects_edge__ops_cortex_m_quantized_depthwise_conv2d_default": 1, + "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 12, } @@ -84,7 +90,9 @@ @parametrize("test_case", test_cases) def test_dialect_silero_vad_16k(test_case): - """This model currently does largely not lower to accelerated kernels due to missing LSTM and conv1d support, this test is to track development progress.""" + """Conv1d layers now lower to cortex_m.quantized_conv2d via reshape; + sigmoid / tanh / linear (LSTM gates) still stay in aten until those + activations get their own quantized ops. 
This test tracks progress.""" inputs = test_case.get_example_inputs() tester = CortexMTester(test_case.model, inputs) tester.test_dialect( diff --git a/backends/cortex_m/test/models/test_wav2letter.py b/backends/cortex_m/test/models/test_wav2letter.py index ddc5354293c..7f70bfc544d 100644 --- a/backends/cortex_m/test/models/test_wav2letter.py +++ b/backends/cortex_m/test/models/test_wav2letter.py @@ -8,8 +8,25 @@ from executorch.examples.models.wav2letter.model import Wav2LetterModel -ops_before_transforms: dict[str, int] = {} -ops_after_transforms: dict[str, int] = {} +ops_before_transforms: dict[str, int] = { + "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1, + "executorch_exir_dialects_edge__ops_aten_convolution_default": 12, + "executorch_exir_dialects_edge__ops_aten_relu_default": 12, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 24, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 14, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 14, +} +# Every Conv1d + ReLU pair fuses into a single cortex_m.quantized_conv2d call; +# only the final log_softmax stays in aten until a quantized log_softmax lands. 
+ops_after_transforms: dict[str, int] = { + "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1, + "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 12, + "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 12, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 12, + "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 24, +} model = Wav2LetterModel() pt_model = model.get_eager_model() @@ -24,7 +41,9 @@ @parametrize("test_case", test_cases) def test_dialect_wav2letter(test_case): - """This model currently does largely not lower to accelerated kernels due to missing conv1d support, this test is to track development progress.""" + """Wav2Letter is a pure Conv1d+ReLU stack with a log_softmax tail; the + Conv1d-via-Conv2d-reshape lowering now collapses every layer into a single + cortex_m.quantized_conv2d. 
Only log_softmax stays unfused.""" inputs = test_case.get_example_inputs() tester = CortexMTester(test_case.model, inputs) tester.test_dialect( diff --git a/backends/cortex_m/test/ops/test_conv.py b/backends/cortex_m/test/ops/test_conv.py index 5750ccf3bdb..baf3b44e44b 100644 --- a/backends/cortex_m/test/ops/test_conv.py +++ b/backends/cortex_m/test/ops/test_conv.py @@ -14,8 +14,89 @@ class CortexMConv1D(torch.nn.Module): - ops_before_transforms: dict[str, int] = {} - ops_after_transforms: dict[str, int] = {} + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_convolution_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def __init__(self, *args, **kwargs): + super().__init__() + self.conv = torch.nn.Conv1d(*args, **kwargs, bias=False) + + def forward(self, x): + return self.conv(x) + + +class CortexMConv1DBias(torch.nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_convolution_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 2, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + 
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def __init__(self, *args, **kwargs): + super().__init__() + self.conv = torch.nn.Conv1d(*args, **kwargs, bias=True) + + def forward(self, x): + return self.conv(x) + + +class CortexMConv1DReLU(torch.nn.Module): + """Conv1d + ReLU. The quantizer's `(aten.conv1d, aten.relu)` pattern + fuses the activation into the conv's output clamp, so the ReLU should + disappear and a single quantized_conv2d remains.""" + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_convolution_default": 1, + "executorch_exir_dialects_edge__ops_aten_relu_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1, + } + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_aten_relu_default": 0, + } + + def __init__(self, *args, **kwargs): + super().__init__() + self.conv = torch.nn.Conv1d(*args, **kwargs, bias=False) + + def forward(self, x): + return torch.nn.functional.relu(self.conv(x)) + + +class CortexMDepthwiseConv1D(torch.nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_convolution_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_default": 1, + } + + ops_after_transforms = { + 
"executorch_exir_dialects_edge__ops_cortex_m_quantized_depthwise_conv2d_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } def __init__(self, *args, **kwargs): super().__init__() @@ -204,8 +285,39 @@ def forward(self, x): example_inputs=(ramp_tensor(0, 10, (1, 5, 8, 8)),), ), "conv1d": McuTestCase( - model=CortexMConv1D(1, 1, 1), - example_inputs=(ramp_tensor(0, 10, (1, 3, 2)),), + model=CortexMConv1D(2, 4, 1), + example_inputs=(ramp_tensor(0, 10, (1, 2, 8)),), + ), + "conv1d_kernel3": McuTestCase( + model=CortexMConv1D(4, 8, 3, padding=1), + example_inputs=(ramp_tensor(-1, 1, (1, 4, 16)),), + ), + "conv1d_stride": McuTestCase( + model=CortexMConv1D(4, 8, 3, stride=2, padding=1), + example_inputs=(ramp_tensor(0, 10, (1, 4, 16)),), + ), + "conv1d_bias": McuTestCase( + model=CortexMConv1DBias(3, 6, 3, padding=1), + example_inputs=(ramp_tensor(-5, 5, (1, 3, 12)),), + ), + "conv1d_pointwise": McuTestCase( + model=CortexMConv1D(8, 4, 1), + example_inputs=(ramp_tensor(0, 10, (1, 8, 12)),), + ), + "conv1d_large_kernel_stride": McuTestCase( + # Mirrors Silero VAD's learned STFT: Conv1d(1, 258, k=256, s=128). + # in_channels == groups == 1 means the backend lowers via the depthwise + # path; use a smaller variant that's quick to test. 
+ model=CortexMDepthwiseConv1D(1, 16, 16, stride=8), + example_inputs=(ramp_tensor(-1, 1, (1, 1, 64)),), + ), + "conv1d_relu": McuTestCase( + model=CortexMConv1DReLU(4, 8, 3, padding=1), + example_inputs=(ramp_tensor(-5, 5, (1, 4, 16)),), + ), + "depthwise_conv1d": McuTestCase( + model=CortexMDepthwiseConv1D(4, 4, 3, padding=1, groups=4), + example_inputs=(ramp_tensor(0, 10, (1, 4, 16)),), ), "conv3d": McuTestCase( model=CortexMConv3D(1, 1, 1), @@ -314,7 +426,6 @@ def forward(self, x): xfails_dialect: dict[str, xfail_type] = { "conv2d_dilation": "NotImplementedError: 'slow_conv_dilated<>' not implemented for 'Int'", - "conv1d": "Currently not supported.", "conv2d_nchw": "Currently not supported.", } @@ -330,7 +441,6 @@ def test_dialect_conv2d(test_case): xfails_implementation: dict[str, xfail_type] = { - "conv1d": "Currently not supported.", "conv3d": "Currently not supported.", } From fabc4c6c979796803bab27e9bc966a90bdd4ad83 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Wed, 27 May 2026 10:39:04 -0700 Subject: [PATCH 2/2] Cortex-M backend: fold inter-conv1d view-op chains Between two consecutive conv1d layers the lowering pass currently emits a 4-op no-op chain: `_clone_dim_order(to NCHW) -> squeeze_copy -> unsqueeze_copy -> _clone_dim_order(to NHWC)`. The two clones are inverse byte reorders and the squeeze/unsqueeze on the same dim cancel logically, so the data flows through unchanged -- but every op still materialises an activation-sized buffer. On Wav2Letter (12 chained Conv1ds) that's ~3 MB of redundant memory traffic per inference, ~2-6% of runtime budget at MCU bandwidths. The cleanup composes two passes: * Switch the conv1d lowering to use `aten.view_copy` (with the explicit shape arg) rather than `aten.unsqueeze_copy` / `aten.squeeze_copy.dims`. 
This lets `backends/transforms/fuse_view_copy.FuseViewCopyTransform` walk the view_copy <-> view_copy chain (treating the intermediate `_clone_dim_order` as a unary elementwise op it can pass through) and rewrite both view_copies to the same final shape; the resulting no-op view_copies are then removed by `remove_noop_view_copy`. * Add a small `FoldInverseDimOrderClonePass` that runs after the view-copy fuse and removes any remaining pair of adjacent `_clone_dim_order` ops whose composed dim_order returns to the input's original dim_order. Net result on Wav2Letter: 60 graph nodes between the convs collapse to zero -- the post-pass IR is a direct `quantized_conv2d -> quantized_conv2d` flow with the wrap/unwrap only at the model boundaries (2 view_copy + 2 _clone_dim_order total, not 24 + 24). Silero VAD sees a similar simplification for its encoder block; its remaining wraps are at the STFT/magnitude-spectrum and post-LSTM boundaries where the next op isn't a conv1d. `_lower_conv1d` reads the input shape from `x.args[1]` when the input is a view_copy from a previous conv1d's lowering (its meta["val"] hasn't been repopulated yet at that point in the pass; the explicit shape arg is the source of truth). `aten::view_copy.out` is added to the test runner's selected_ops_list so the on-device tests can resolve the view_copy kernel. 
Co-authored-by: Claude --- .../passes/convert_to_cortex_m_pass.py | 34 +++++- .../cortex_m/passes/cortex_m_pass_manager.py | 11 ++ .../fold_inverse_dim_order_clone_pass.py | 66 +++++++++++ backends/cortex_m/test/build_test_runner.sh | 1 + .../misc/test_fold_inverse_dim_order_clone.py | 111 ++++++++++++++++++ .../cortex_m/test/models/test_silero_vad.py | 15 ++- .../cortex_m/test/models/test_wav2letter.py | 9 +- 7 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py create mode 100644 backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index aa0d13d9a48..50d0dcfb429 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -399,11 +399,37 @@ def _lower_conv1d(self, node, graph_module): # Build the input chain (NCW -> 4D NHWC), the conv, and the output chain # (4D NHWC -> NCW), all inserted in graph order before the original conv. + # Use view_copy (not unsqueeze_copy / squeeze_copy) so that + # backends/transforms/fuse_view_copy.FuseViewCopyTransform can collapse + # the view_copy <-> view_copy chain that forms between consecutive + # conv1d layers (e.g. Wav2Letter, Silero VAD encoder). + # + # For the input shape, prefer the explicit shape arg on x when x is a + # view_copy we just inserted for an earlier conv1d (its meta["val"] + # hasn't been repopulated yet at this point in the pass). Restrict to + # 3D targets so an unrelated view_copy producing a different rank + # can't silently feed a malformed shape into the conv reshape. 
+ in_3d_shape = None + if ( + isinstance(x, torch.fx.Node) + and x.target == exir_ops.edge.aten.view_copy.default + and len(x.args[1]) == 3 + ): + in_3d_shape = list(x.args[1]) + if in_3d_shape is None: + in_3d_shape = list(get_first_fake_tensor(x).shape) + assert ( + len(in_3d_shape) == 3 + ), f"_lower_conv1d expects a 3D input, got shape {in_3d_shape}" + x_4d_shape = [in_3d_shape[0], in_3d_shape[1], 1, in_3d_shape[2]] + out_3d_shape = list(node.meta["val"].shape) + out_4d_shape = [out_3d_shape[0], out_3d_shape[1], 1, out_3d_shape[2]] + with node.graph.inserting_before(node): x_4d_nchw = node.graph.create_node( "call_function", - target=exir_ops.edge.aten.unsqueeze_copy.default, - args=(x, 2), + target=exir_ops.edge.aten.view_copy.default, + args=(x, x_4d_shape), ) x_4d_nhwc = node.graph.create_node( "call_function", @@ -456,8 +482,8 @@ def _lower_conv1d(self, node, graph_module): ) out_3d = node.graph.create_node( "call_function", - target=exir_ops.edge.aten.squeeze_copy.dims, - args=(out_4d_nchw, [2]), + target=exir_ops.edge.aten.view_copy.default, + args=(out_4d_nchw, out_3d_shape), ) node.replace_all_uses_with(out_3d) diff --git a/backends/cortex_m/passes/cortex_m_pass_manager.py b/backends/cortex_m/passes/cortex_m_pass_manager.py index f0326ec76c4..36f253b853e 100644 --- a/backends/cortex_m/passes/cortex_m_pass_manager.py +++ b/backends/cortex_m/passes/cortex_m_pass_manager.py @@ -13,6 +13,7 @@ ScalarsToAttributePass, ) from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig +from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass from executorch.backends.transforms.replace_scalar_with_tensor import ( ReplaceScalarWithTensorArgPass, @@ -27,6 +28,7 @@ from .convert_to_cortex_m_pass import ConvertToCortexMPass from .decompose_hardswish_pass import DecomposeHardswishPass from .decompose_mean_pass import DecomposeMeanPass +from 
.fold_inverse_dim_order_clone_pass import FoldInverseDimOrderClonePass from .quantized_clamp_activation_pass import QuantizedClampActivationPass from .quantized_op_fusion_pass import QuantizedOpFusionPass from .replace_quant_nodes_pass import ReplaceQuantNodesPass @@ -46,6 +48,15 @@ class CortexMPassManager(PassManager): DecomposeHardswishPass, QuantizedOpFusionPass, ConvertToCortexMPass, + # Conv1d lowering inserts view_copy + _clone_dim_order wrappers around + # each conv2d call. Between consecutive conv1d layers these chain + # together to form an identity. FuseViewCopyTransform collapses the + # view_copy <-> view_copy chain (treating the dim_order clones as + # unary elementwise ops it walks through); FoldInverseDimOrderClonePass + # then removes the surviving _clone_dim_order pair whose composed + # dim_order is the identity. + FuseViewCopyTransform, + FoldInverseDimOrderClonePass, ] pass_list_transform_for_annotation: list[PassClass] = [ diff --git a/backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py b/backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py new file mode 100644 index 00000000000..f5da64d9141 --- /dev/null +++ b/backends/cortex_m/passes/fold_inverse_dim_order_clone_pass.py @@ -0,0 +1,66 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.fx +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass +from torch.fx.passes.infra.pass_manager import PassResult + + +class FoldInverseDimOrderClonePass(ExportPass): + """Fold adjacent `_clone_dim_order` pairs whose net effect is identity. + + The conv1d lowering inserts a `_clone_dim_order(dim_order=[0,2,3,1])` + before each conv and a `_clone_dim_order(dim_order=[0,1,2,3])` after. 
When + `FuseViewCopyTransform` collapses the intermediate view_copy chain between + two consecutive conv1d lowerings, the surviving graph is + + ... -> _clone_dim_order(to NCHW) -> _clone_dim_order(to NHWC) -> ... + + where the second clone's dim_order is the inverse of the first applied to + the same shape -- two byte reorders that cancel. This pass detects that + exact pattern and replaces uses of the second clone with the first + clone's input, then lets dead code elimination remove both. + """ + + _CLONE = exir_ops.edge.dim_order_ops._clone_dim_order.default + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + + for node in list(graph_module.graph.nodes): + if node.op != "call_function" or node.target != self._CLONE: + continue + + second_clone = node + first_clone = second_clone.args[0] + if ( + not isinstance(first_clone, torch.fx.Node) + or first_clone.op != "call_function" + or first_clone.target != self._CLONE + or len(first_clone.users) != 1 + ): + continue + + original_input = first_clone.args[0] + if not isinstance(original_input, torch.fx.Node): + continue + + # Net effect is identity iff the second clone's target dim_order + # equals the input tensor's dim_order before the first clone. 
+ original_dim_order = tuple(original_input.meta["val"].dim_order()) + second_dim_order = tuple(second_clone.kwargs.get("dim_order", ())) + if original_dim_order != second_dim_order: + continue + + second_clone.replace_all_uses_with(original_input) + modified = True + + if modified: + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + + return PassResult(graph_module, modified) diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh index 05f58e73b10..425f81bb6c4 100755 --- a/backends/cortex_m/test/build_test_runner.sh +++ b/backends/cortex_m/test/build_test_runner.sh @@ -46,6 +46,7 @@ aten::full.out,\ aten::ge.Tensor_out,\ aten::unsqueeze_copy.out,\ aten::squeeze_copy.dims_out,\ +aten::view_copy.out,\ aten::select_copy.int_out,\ aten::amax.out" diff --git a/backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py b/backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py new file mode 100644 index 00000000000..20a7cc3cb3a --- /dev/null +++ b/backends/cortex_m/test/misc/test_fold_inverse_dim_order_clone.py @@ -0,0 +1,111 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.fx +from executorch.backends.cortex_m.passes.fold_inverse_dim_order_clone_pass import ( + FoldInverseDimOrderClonePass, +) +from executorch.exir.dialects._ops import ops as exir_ops + + +_CLONE = exir_ops.edge.dim_order_ops._clone_dim_order.default + + +def _count(graph_module: torch.fx.GraphModule, target) -> int: + return sum( + 1 + for n in graph_module.graph.nodes + if n.op == "call_function" and n.target == target + ) + + +def _make_clone_pair_graph( + first_dim_order: tuple[int, ...], + second_dim_order: tuple[int, ...], +) -> torch.fx.GraphModule: + """Hand-build a graph: placeholder -> clone(first) -> clone(second) -> output. 
+ The input tensor is contiguous (dim_order = (0, 1, 2, 3)). + """ + shape = (1, 4, 1, 8) + input_tensor = torch.empty(shape) + + graph = torch.fx.Graph() + x = graph.placeholder("x") + x.meta["val"] = input_tensor + + first = graph.create_node( + "call_function", + target=_CLONE, + args=(x,), + kwargs={"dtype": torch.float32, "dim_order": list(first_dim_order)}, + ) + first.meta["val"] = input_tensor + + second = graph.create_node( + "call_function", + target=_CLONE, + args=(first,), + kwargs={"dtype": torch.float32, "dim_order": list(second_dim_order)}, + ) + second.meta["val"] = input_tensor + + graph.output(second) + + return torch.fx.GraphModule(torch.nn.Module(), graph) + + +def test_fold_removes_inverse_pair(): + # Input is contiguous (dim_order 0,1,2,3); first clone reorders to NHWC, + # second clone reorders back to the original -> net identity. + gm = _make_clone_pair_graph( + first_dim_order=(0, 2, 3, 1), + second_dim_order=(0, 1, 2, 3), + ) + assert _count(gm, _CLONE) == 2 + + result = FoldInverseDimOrderClonePass()(gm) + assert result.modified + assert _count(result.graph_module, _CLONE) == 0 + + +def test_fold_preserves_non_identity_pair(): + # Second clone's target is not the input's original dim_order, so the + # composition isn't identity -- fold must not fire. + gm = _make_clone_pair_graph( + first_dim_order=(0, 2, 3, 1), + second_dim_order=(0, 3, 1, 2), + ) + assert _count(gm, _CLONE) == 2 + + result = FoldInverseDimOrderClonePass()(gm) + assert not result.modified + assert _count(result.graph_module, _CLONE) == 2 + + +def test_fold_respects_fanout(): + # If the first clone has another consumer, folding would orphan that + # consumer's view of the reordered data. The pass must refuse. 
+ gm = _make_clone_pair_graph( + first_dim_order=(0, 2, 3, 1), + second_dim_order=(0, 1, 2, 3), + ) + first_clone = next( + n for n in gm.graph.nodes if n.op == "call_function" and n.target == _CLONE + ) + output_node = next(n for n in gm.graph.nodes if n.op == "output") + with gm.graph.inserting_before(output_node): + extra_user = gm.graph.create_node( + "call_function", + target=torch.ops.aten.relu.default, + args=(first_clone,), + ) + extra_user.meta["val"] = first_clone.meta["val"] + output_node.args = ((output_node.args[0], extra_user),) + gm.recompile() + + result = FoldInverseDimOrderClonePass()(gm) + assert not result.modified + assert _count(result.graph_module, _CLONE) == 2 diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py index 0f5a5a7d8f2..05456428cb0 100644 --- a/backends/cortex_m/test/models/test_silero_vad.py +++ b/backends/cortex_m/test/models/test_silero_vad.py @@ -59,17 +59,24 @@ "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1, - "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 8, + "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 2, "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, - "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 8, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, + # The conv1d lowering inserts view_copy wraps around each conv2d; the + # encoder's chained Conv1ds get their inter-layer view_copy <-> view_copy + # plus inverse _clone_dim_order pairs folded out. View_copy nodes here + # are: original model view (1) + boundary wraps that survive folding (6). 
+ "executorch_exir_dialects_edge__ops_aten_view_copy_default": 7, "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 9, "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 8, "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 5, "executorch_exir_dialects_edge__ops_cortex_m_quantized_depthwise_conv2d_default": 1, - "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 12, + # Six clone_dim_order survive: boundary wraps for the conv1ds whose + # neighbours aren't another conv1d (the STFT/magnitude-spectrum interface + # and the final-conv boundary). + "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 6, } diff --git a/backends/cortex_m/test/models/test_wav2letter.py b/backends/cortex_m/test/models/test_wav2letter.py index 7f70bfc544d..37a162ff78e 100644 --- a/backends/cortex_m/test/models/test_wav2letter.py +++ b/backends/cortex_m/test/models/test_wav2letter.py @@ -18,14 +18,17 @@ } # Every Conv1d + ReLU pair fuses into a single cortex_m.quantized_conv2d call; # only the final log_softmax stays in aten until a quantized log_softmax lands. +# Between consecutive Conv1ds the view_copy + _clone_dim_order wrap/unwrap +# chain collapses to a direct conv2d -> conv2d hand-off (via FuseViewCopyTransform +# + FoldInverseDimOrderClonePass). Only the model-boundary view_copy and +# clone_dim_order pairs survive: one pair at the model input and one pair at the output.
ops_after_transforms: dict[str, int] = { "executorch_exir_dialects_edge__ops_aten__log_softmax_default": 1, - "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 12, - "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 12, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 2, "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, "executorch_exir_dialects_edge__ops_cortex_m_quantized_conv2d_default": 12, - "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 24, + "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default": 2, } model = Wav2LetterModel()