executorch-grid-sample-export/arm_backend_monkey_patch.py at main · Arm-Examples/executorch-grid-sample-export · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
# SPDX-FileCopyrightText: Copyright 2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# This source code is licensed under the BSD-style license found in the
# LicenseRef-BSD-ExecuTorch.txt file in the top-level directory.
# SPDX-FileCopyrightText: <text>Copyright 2025-2026 Arm Limited and/or
# its affiliates <open-source-office@arm.com></text>
# SPDX-License-Identifier: Apache-2.0 AND LicenseRef-BSD-ExecuTorch

import torch


def apply_arm_backend_monkey_patch() -> None:
    # Arm's FoldAndAnnotateQParamsPass normally folds terminal quantize ops into
    # their producer and keeps only qparam metadata. That works for the usual
    # int8 image path, but for our grid-position branch we need the explicit
    # terminal int16 quantize to survive so DecomposeQuantNodesPass can lower it
    # into real TOSA ops and VGF can emit an int16 graph output.
    from executorch.backends.arm._passes import fold_qdq_with_annotated_qparams_pass

    fold_pass = fold_qdq_with_annotated_qparams_pass.FoldAndAnnotateQParamsPass
    if getattr(fold_pass, "_grid_output_q_patch_applied", False):
        return

    original_is_foldable = fold_pass.is_foldable

    def _is_graph_output_quantize(q_node) -> bool:
        return any(user.op == "output" for user in q_node.users)

    def _feeds_grid_sampler_through_dq(q_node) -> bool:
        # Match the explicit grid-position quantize that feeds the custom
        # grid-sampler backend through the usual q -> dq -> grid_sampler edge.
        for dq_user in q_node.users:
            if (
                dq_user.op != "call_function"
                or dq_user.target not in fold_qdq_with_annotated_qparams_pass.DQ_OPS
            ):
                continue
            for consumer in dq_user.users:
                if consumer.op != "call_function":
                    continue
                target_name = str(consumer.target)
                if (
                    "grid_sampler.default" in target_name
                    or "grid_sampler_2d.default" in target_name
                ):
                    return True
        return False

    def _should_preserve_grid_output_quantize(node) -> bool:
        # Preserve only the terminal int16 quantize nodes used for grid-sampler
        # position tensors. Everything else should keep the stock Arm folding
        # behavior so we minimize divergence from the upstream backend.
        for user in node.users:
            if (
                user.op != "call_function"
                or user.target not in fold_qdq_with_annotated_qparams_pass.Q_OPS
            ):
                continue
            if len(user.args) <= 5 or user.args[5] != torch.int16:
                continue
            if _is_graph_output_quantize(user) or _feeds_grid_sampler_through_dq(user):
                return True
        return False

    def patched_is_foldable(node) -> bool:
        if not original_is_foldable(node):
            return False
        return not _should_preserve_grid_output_quantize(node)

    fold_pass.is_foldable = staticmethod(patched_is_foldable)
    fold_pass._grid_output_q_patch_applied = True

    # Arm's generic quantization annotator marks constant/factory producers like
    # aten.full.default as quantized outputs. PT2E prepare then processes those
    # nodes directly and asserts because exported factory ops still carry kwargs
    # such as dtype/device/pin_memory. XNNPACK avoids this by leaving the
    # factory node unannotated and letting annotated consumers insert observers
    # at the boundary instead.
    #
    # Upstream issue:
    # https://github.com/pytorch/executorch/issues/18322
    from executorch.backends.arm.quantizer import quantization_annotator

    if getattr(
        quantization_annotator, "_skip_factory_output_annotation_patch_applied", False
    ):
        return

    original_get_quant_properties = quantization_annotator.get_quant_properties
    skipped_factory_targets = {
        torch.ops.aten.full.default,
        torch.ops.aten.full,
        torch.ops.aten.zeros.default,
        torch.ops.aten.ones.default,
        torch.ops.aten.fill_.Scalar,
        torch.ops.aten.scalar_tensor.default,
    }

    def patched_get_quant_properties(node, gm, quantization_config):
        if node.target in skipped_factory_targets:
            return None
        return original_get_quant_properties(node, gm, quantization_config)

    quantization_annotator.get_quant_properties = patched_get_quant_properties
    quantization_annotator._skip_factory_output_annotation_patch_applied = True

    # Arm's RewriteConvPass inserts a TOSA rescale using qparams read from the
    # rewritten TOSA conv node itself. For non-fuseable conv -> clamp branches,
    # FoldAndAnnotateQParamsPass can leave the output qparams on the clamp
    # instead of the conv. The rewritten TOSA conv then appears to have no
    # output qparams and lowering fails. Use the original conv as the qparam
    # source and fall back to its immediate clamp user when needed.
    #
    # Upstream issue:
    # https://github.com/pytorch/executorch/issues/18491
    from executorch.backends.arm._passes import rewrite_conv_pass
    from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
        get_input_qparams,
        get_output_qparams,
    )
    from executorch.exir.dialects._ops import ops as exir_ops

    rewrite_pass = rewrite_conv_pass.RewriteConvPass
    if getattr(rewrite_pass, "_conv_clamp_output_q_patch_applied", False):
        return

    original_insert_output_rescale = rewrite_pass.insert_output_rescale

    def patched_insert_output_rescale(self, graph_module, node, source_node=None):
        qparam_source = source_node if source_node is not None else node
        input_qparams = get_input_qparams(qparam_source)
        try:
            output_qparams = get_output_qparams(qparam_source)[0]
        except ValueError:
            users = list(qparam_source.users)
            if (
                len(users) == 1
                and users[0].target == exir_ops.edge.aten.clamp.default
                and "output_qparams" in users[0].meta
                and len(users[0].meta["output_qparams"]) > 0
            ):
                output_qparams = get_output_qparams(users[0])[0]
            else:
                raise

        if source_node is None:
            return original_insert_output_rescale(self, graph_module, node)

        weight_qparams = input_qparams[1]
        input_qparams = input_qparams[0]
        is_per_channel = weight_qparams.per_channel
        if is_per_channel:
            weight_scale = weight_qparams.get_scale_per_channel()
        else:
            weight_scale = [weight_qparams.get_scale_per_tensor()]
        input_scale = input_qparams.get_scale_per_tensor()
        post_conv2d_scale = [
            (inp * w) / out
            for inp, w, out in zip(
                rewrite_conv_pass.itertools.cycle([input_scale]),
                weight_scale,
                rewrite_conv_pass.itertools.cycle(
                    [output_qparams.get_scale_per_tensor()]
                ),
            )
        ]
        with graph_module.graph.inserting_after(node):
            rescale_node = rewrite_conv_pass.create_node(
                graph=graph_module.graph,
                op_target=exir_ops.backend.tosa.RESCALE.default,
                args=(
                    node,
                    output_qparams.dtype,
                    post_conv2d_scale,
                    0,
                    output_qparams.get_zp_per_tensor(),
                ),
                from_node=node,
            )
        return rescale_node

    def patched_call(self, graph_module):
        modified = False
        for node in graph_module.graph.nodes:
            if (
                node.op != "call_function"
                or node.target != exir_ops.edge.aten.convolution.default
            ):
                continue

            modified = True

            (
                x,
                weight,
                bias,
                stride,
                pad,
                dilation,
                transposed,
                output_padding,
                group,
            ) = node.args

            input_fake_tensor = rewrite_conv_pass.get_first_fake_tensor(x)
            weight_fake_tensor = rewrite_conv_pass.get_first_fake_tensor(weight)
            input_shape = input_fake_tensor.shape
            weight_shape = weight_fake_tensor.shape
            spatial_rank = len(input_shape) - 2
            stride_list = rewrite_conv_pass.expand_around_channel(stride, spatial_rank)
            dilation_list = rewrite_conv_pass.expand_around_channel(
                dilation, spatial_rank
            )
            pad_list = rewrite_conv_pass.expand_around_channel(pad, spatial_rank)

            stride = tuple(stride_list)

            has_bias = bias is not None
            if not has_bias:
                bias = self._add_bias(graph_module, node, weight)

            conv_args: tuple[object, ...]
            if transposed:
                if spatial_rank != 2:
                    raise RuntimeError(
                        "Only 2D transpose convolutions are supported in the Arm backend."
                    )
                if group != 1:
                    raise RuntimeError(
                        "Grouped transpose convolutions are not supported in the Arm backend."
                    )
                if any(d != 1 for d in dilation_list):
                    raise RuntimeError(
                        "Transpose convolutions with dilation are not supported in the Arm backend."
                    )
                output_padding_list = rewrite_conv_pass.expand_around_channel(
                    output_padding, spatial_rank
                )
                out_pad = [
                    -pad_list[0],
                    -pad_list[0] + output_padding_list[0],
                    -pad_list[1],
                    -pad_list[1] + output_padding_list[1],
                ]
                target_op = exir_ops.backend.tosa.TRANSPOSE_CONV2D.default
                conv_args = (
                    x,
                    weight,
                    bias,
                    out_pad,
                    stride,
                )
            else:
                pad_attr: list[int] = []
                for value in pad_list:
                    pad_attr.extend([value, value])

                for axis_index in range(spatial_rank):
                    pad_index = axis_index * 2 + 1
                    pad_attr[pad_index] = self._adjust_pad_if_needed(
                        input_shape[axis_index + 2],
                        weight_shape[axis_index + 2],
                        stride_list[axis_index],
                        pad_attr[pad_index],
                        dilation_list[axis_index],
                    )

                dilation = tuple(dilation_list)
                pad = pad_attr

                if self._is_conv3d(len(input_shape), group):
                    target_op = exir_ops.backend.tosa.CONV3D.default
                elif self._is_depthwise_conv2d(node):
                    target_op = exir_ops.backend.tosa.DEPTHWISE_CONV2D.default
                    if all(user.target != target_op for user in weight.users):
                        self._reshape_weights(weight, input_fake_tensor.shape[1])
                    weight_fake_tensor = rewrite_conv_pass.get_first_fake_tensor(weight)
                else:
                    target_op = exir_ops.backend.tosa.CONV2D.default

                conv_args = (
                    x,
                    weight,
                    bias,
                    stride,
                    pad,
                    dilation,
                )

            with graph_module.graph.inserting_after(node):
                tosa_op = rewrite_conv_pass.create_node(
                    graph=graph_module.graph,
                    op_target=target_op,
                    args=conv_args,
                    from_node=node,
                    inherit_qparams=True,
                )
            bias_fake_tensor = rewrite_conv_pass.get_first_fake_tensor(bias) if bias else None
            tosa_node_fake_tensor = target_op(
                input_fake_tensor,
                weight_fake_tensor,
                bias_fake_tensor,
                *conv_args[3:],
            )

            if (
                tosa_node_fake_tensor.dtype == torch.int32
                and input_fake_tensor.dtype == torch.int8
            ):
                output_rescale = self.insert_output_rescale(
                    graph_module, tosa_op, source_node=node
                )
                node.replace_all_uses_with(output_rescale)
            elif (
                tosa_node_fake_tensor.dtype == torch.int32
                and input_fake_tensor.dtype == torch.int16
            ):
                has_bias = len(node.meta["input_qparams"]) > 2
                if not has_bias:
                    output_rescale = self.insert_output_rescale(
                        graph_module, tosa_op, source_node=node
                    )
                    node.replace_all_uses_with(output_rescale)
                else:
                    node.replace_all_uses_with(tosa_op)
                tosa_op.meta[rewrite_conv_pass.TosaSpecialDtype.meta_key()] = (
                    rewrite_conv_pass.TosaSpecialDtype.INT48
                )
            else:
                node.replace_all_uses_with(tosa_op)

            graph_module.graph.erase_node(node)

        if modified:
            graph_module.recompile()
            graph_module = rewrite_conv_pass.ArmPass.call(
                self, graph_module
            ).graph_module
        return rewrite_conv_pass.PassResult(graph_module, modified)

    rewrite_pass.insert_output_rescale = patched_insert_output_rescale
    rewrite_pass.call = patched_call
    rewrite_pass._conv_clamp_output_q_patch_applied = True

    # ToTosaMemoryFormatPass rewrites graph outputs back to the original memory
    # format by calling node.replace_input_with(...) on the FX output node. If
    # the output tuple contains the same FX node multiple times, such as after
    # FuseEqualPlaceholdersPass merges equal constant placeholders, that helper
    # rewrites every matching slot at once. The result is that distinct logical
    # outputs collapse onto the same transpose node and lose per-slot identity.
    # Patch output-node rewrites to update only the first matching tuple slot so
    # duplicate logical outputs each keep their own transpose.
    #
    # Upstream issue:
    # https://github.com/pytorch/executorch/issues/18320
    from executorch.backends.arm._passes import to_tosa_memory_format_pass

    tosa_memory_format_module = to_tosa_memory_format_pass
    tosa_memory_format = tosa_memory_format_module.ToTosaMemoryFormatPass
    if getattr(
        tosa_memory_format, "_duplicate_output_transpose_patch_applied", False
    ):
        return

    original_insert_input_transpose = tosa_memory_format.insert_input_transpose

    def _replace_first_output_slot(
        output_node, original_input_node, replacement_node
    ) -> None:
        outputs = output_node.args[0]
        if not isinstance(outputs, (list, tuple)):
            raise TypeError(
                f"Expected output node args to be a list or tuple, got {type(outputs)}"
            )

        rewritten_outputs = list(outputs)
        for output_index, existing_output in enumerate(rewritten_outputs):
            if existing_output is original_input_node:
                rewritten_outputs[output_index] = replacement_node
                break
        else:
            raise RuntimeError(
                "Could not find the original output node while rewriting the output tuple."
            )

        replacement = (
            rewritten_outputs if isinstance(outputs, list) else tuple(rewritten_outputs)
        )
        output_node.args = (replacement,)

    def patched_insert_input_transpose(node, input_node, graph_module):
        if node.op != "output":
            return original_insert_input_transpose(node, input_node, graph_module)

        if (
            input_node.op == "call_function"
            and input_node.target
            == tosa_memory_format_module.exir_ops.backend.tosa.TRANSPOSE.default
        ):
            pre_permute_node = input_node.all_input_nodes[0]
            _replace_first_output_slot(node, input_node, pre_permute_node)
            return

        rank = len(tosa_memory_format_module.get_first_fake_tensor(input_node).size())
        spatial_rank = input_node.meta["tosa_spatial_rank"]
        mem_format = tosa_memory_format._channels_last_inverse_order(
            rank, spatial_rank
        )
        assert sorted(mem_format) == list(
            range(rank)
        ), f"bad perm {mem_format} for rank {rank} in insert_input_transpose"

        with graph_module.graph.inserting_before(node):
            permute_node = tosa_memory_format_module.create_node(
                graph_module.graph,
                tosa_memory_format_module.exir_ops.backend.tosa.TRANSPOSE.default,
                args=(
                    input_node,
                    list(mem_format),
                ),
                from_node=node,
            )
            permute_node.meta["tosa_dim_order"] = tuple(
                range(len(input_node.meta["val"].size()))
            )
            permute_node.meta["tosa_spatial_rank"] = spatial_rank
            _replace_first_output_slot(node, input_node, permute_node)

    tosa_memory_format.insert_input_transpose = staticmethod(
        patched_insert_input_transpose
    )
    tosa_memory_format._duplicate_output_transpose_patch_applied = True