diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py
index f21bae7b4a8..92daf359c18 100644
--- a/backends/qualcomm/builders/node_visitor.py
+++ b/backends/qualcomm/builders/node_visitor.py
@@ -417,12 +417,11 @@ def get_tensor_name(
         elif is_graph_output(node):
             tensor_name = f"output_{tensor_name}"
 
-        # Save this for intermediate debugger
-        # Needs idx since node like topk has 2 outputs
-        if QCOM_TENSOR_NAME in node.meta:
-            node.meta[QCOM_TENSOR_NAME][wrapper_idx] = tensor_name
-        else:
-            node.meta[QCOM_TENSOR_NAME] = {wrapper_idx: tensor_name}
+        # Only record qcom_tensor_name when tensor dump is enabled.
+        # Only do this in qnn_preprocess since that's the final naming. enable_tensor_dump is set to True only in qnn_preprocess, not during op validation.
+        if self.enable_tensor_dump:
+            node.meta.setdefault(QCOM_TENSOR_NAME, {})[wrapper_idx] = tensor_name
+
         return tensor_name
 
     def define_custom_tensor_wrapper(
diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md
index fb8f9a1c662..bda3937bc53 100644
--- a/backends/qualcomm/debugger/README.md
+++ b/backends/qualcomm/debugger/README.md
@@ -50,7 +50,7 @@ Generate optrace and QHAS files using QNN tools under $QNN_SDK_ROOT. After finis
 adb = SimpleADB(
     qnn_config=qnn_config,
     pte_path=f"{args.artifact}/{pte_filename}.pte",
-    workspace=f"/data/local/tmp/executorch/{pte_filename},
+    workspace=f"/data/local/tmp/executorch/{pte_filename}",
 )
 binaries_trace = generate_optrace(
     args, adb, f"{args.artifact}/{pte_filename}.pte", example_input
@@ -121,24 +121,24 @@ flowchart TB;
     debug --> output["Output Results"]
 ```
 
-## Instructions
-
-### 1. Setup
+## Prerequisites
 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
 2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend.
 
-### 2. Enable Flag
+## Instructions
 
-When executing the script, please add the flag `--dump_intermediate_outputs`. This tells QNN to dump all intermediate tensors during execution.
+### 1. Initialize debugger and build binary
+
+Create a `QNNIntermediateDebugger` with a sample input and pass it to `build_executorch_binary`. The `--dump_intermediate_outputs` flag tells QNN to dump all intermediate tensors during execution.
 
-### 3. Add debugger to the example script
-Initialize a `QNNIntermediateDebugger`. Please pass initialized `QNNIntermediateDebugger` and the `args.dump_intermediate_outputs` to `build_executorch_binary` method as well.
-#### Example:
 ```python
 from executorch.backends.qualcomm.export_utils import build_executorch_binary
-from executorch.backends.qualcomm.debugger.qnn_intermediate_debugger import QNNIntermediateDebugger
+from executorch.backends.qualcomm.debugger.qnn_intermediate_debugger import (
+    OutputFormat,
+    QNNIntermediateDebugger,
+)
 
-qnn_intermediate_debugger = QNNIntermediateDebugger()
+qnn_intermediate_debugger = QNNIntermediateDebugger(sample_input=inputs[0])
 build_executorch_binary(
     model=MyModel(),
     qnn_config=qnn_config,
@@ -148,27 +148,61 @@ build_executorch_binary(
 )
 ```
 
-### 4. Set data num to 1
-It is perfectly fine for users to pass the desired amount of datasets to `build_executorch_binary`, which helps achieve better quantization results. However, after `build_executorch_binary` is called, we need to ensure that we only perform one inference during execution. Please ensure that CPU and QNN is using the same input during execution; otherwise, the debugging results might not be accurate.
+After `build_executorch_binary()`, the debugger holds:
+- `edge_ep` — edge `ExportedProgram` for CPU golden inference.
+- `etrecord_file_path` — path to the generated ET record.
+
+### 2. Execute on device
+
+Ensure `dump_intermediate_outputs` is enabled in your `QnnConfig` (or pass `--dump_intermediate_outputs` via CLI). Only run **one inference** for debugging — multiple executions are not supported.
 
-### 5: Pull and process the results.
-After QNN execution with the runner, if the previous steps are done correctly, we should be able to get two files: `etdump.etdp` and `debug_output.bin`.
-The following example pulls the files back and calls a callback function to process the results. In this callback function, we create the `Inspector`. Then we perform CPU inference to get CPU intermediate results. Now, we have both QNN and CPU intermediate results, we can start generating results to compare the accuracy. Taking the following example, we should be able to get `debug_graph.svg` as an output in the current directory.
-#### Example:
 ```python
-from executorch.backends.qualcomm.debugger.qnn_intermediate_debugger import  OutputFormat
+from executorch.examples.qualcomm.utils import SimpleADB
+
+adb = SimpleADB(
+    qnn_config=qnn_config,
+    pte_path=f"{args.artifact}/{pte_filename}.pte",
+    workspace=f"/data/local/tmp/executorch/{pte_filename}",
+)
+adb.push(inputs=inputs)
+adb.execute()
+```
+
+### 3. Pull results and compare
+
+After execution, pull `etdump.etdp` and `debug_output.bin` from the device. Use `setup_inspector()` to create the `Inspector`, then create comparators and generate results.
+
+Before comparing per-layer outputs, it is highly recommended to verify that the edge program's final output aligns with the original `nn.Module`. The debugger uses the edge program as the CPU golden reference, so if the edge graph itself has diverged (e.g., due to weight quantization or pass transformations), per-layer comparisons against it may be misleading.
+
+```python
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_sample import (
+    QcomCosineSimilarityComparator, QcomMSEComparator,
+)
+
 def validate_intermediate_tensor():
-    inspector = Inspector(
+    qnn_intermediate_debugger.setup_inspector(
         etdump_path=f"{args.artifact}/etdump.etdp",
         debug_buffer_path=f"{args.artifact}/debug_output.bin",
     )
-    qnn_intermediate_debugger.intermediate_output_module(*(inputs[0]))
+
+    # Verify edge program output aligns with the original nn.Module.
+    # This ensures the edge graph is a reliable golden reference.
+    edge_result = qnn_intermediate_debugger.edge_ep.module()(*(inputs[0]))
+    with torch.no_grad():
+        source_result = source_model(*(inputs[0]))
+        score = torch.nn.functional.cosine_similarity(
+            edge_result.flatten(), source_result.flatten(), dim=0
+        ).item()
+        print("Cosine similarity between nn.Module and edge CPU:", score)
+
+    cos_comparator = qnn_intermediate_debugger.create_comparator(
+        QcomCosineSimilarityComparator, threshold=0.9
+    )
     qnn_intermediate_debugger.generate_results(
-        title="debug_graph",
-        path=".",
-        output_format=OutputFormat.SVG_GRAPHS,
-        inspector=inspector,
-        evaluator=CosineSimilarityEvaluator(0.9),
+        title="debug_cos_similarity",
+        path=args.artifact,
+        output_format=OutputFormat.SVG_GRAPH,
+        comparator=cos_comparator,
     )
 
 adb.pull_debug_output(
@@ -176,53 +210,59 @@ adb.pull_debug_output(
 )
 ```
 
-#### Additional Options
-The above example sets output formats as SVG and evaluation metrics using Cosine Similarity. Based on different needs, users can choose other output formats as shown in the `OutputFormat` class under [qnn_intermediate_debugger](./qnn_intermediate_debugger.py)
+## Comparators
+
+Create comparators via the `create_comparator()` factory, which automatically injects the `edge_ep`. A couple of sample comparators are provided in [qcom_numerical_comparator_sample.py](./qcom_numerical_comparator_sample.py):
+
 ```python
-class OutputFormat(IntEnum):
-    SVG_GRAPHS = 0
-    CSV_FILES = 1
-    DUMP_RAW = 2
+cos = qnn_intermediate_debugger.create_comparator(QcomCosineSimilarityComparator, threshold=0.9)
+mse = qnn_intermediate_debugger.create_comparator(QcomMSEComparator, threshold=0.1)
 ```
 
-For evaluation metrics, if users would like to implement their own metrics, we have provided the option to implement [MetricEvaluatorBase](./metrics_evaluator.py). The following shows how to define custom metrics.
+### Custom comparators
+
+Users can also define their own comparator by deriving a class from [QcomNumericalComparatorBase](./qcom_numerical_comparator_base.py). The derived class must implement `metric_name()`, `is_valid_score()`, and `element_compare()`. The base class handles QNN-specific preprocessing (dequantization, layout conversion) internally — `preprocessing` cannot be overridden.
 ```python
-class RootMeanSquaredErrorEvaluator(MetricEvaluatorBase):
-    def __init__(self, threshold=0.02):
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_base import (
+    QcomNumericalComparatorBase,
+)
+
+class MyComparator(QcomNumericalComparatorBase):
+    def __init__(self, edge_ep, threshold=0.5):
+        super().__init__(edge_ep)
         self.threshold = threshold
 
     def metric_name(self) -> str:
-        return "Root Mean Squared Error"
-
-    def evaluate(
-        self, qnn_output: torch.Tensor, cpu_output: torch.Tensor
-    ) -> Tuple[Any, bool]:
-        mse = F.mse_loss(qnn_output, cpu_output)
-        rmse = torch.sqrt(mse)
-        valid = rmse < self.threshold
-        return rmse, valid
-
-qnn_intermediate_debugger.generate_results(
-    title="my_metric",
-    path=".",
-    output_format=OutputFormat.SVG_GRAPHS,
-    inspector=inspector,
-    evaluator=RootMeanSquaredErrorEvaluator(),
-)
+        return "my_metric"
+
+    def is_valid_score(self, score: float) -> bool:
+        return score >= self.threshold
+
+    def element_compare(self, a, b) -> float:
+        # your comparison logic here
+        ...
 ```
 
-### Example Script
-We have provided an inception_v3 demo script to help users better understand how to apply the debugger to their scripts. Please refer to [qnn_intermediate_debugger_demo.py](../../../examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py) for the example script.
+## Output formats
+
+| Format | Enum | Output |
+|--------|------|--------|
+| SVG graph | `OutputFormat.SVG_GRAPH` | Color-coded computation graph (green=pass, red=fail) |
+| CSV file | `OutputFormat.CSV_FILE` | Per-node tabular results |
+
+## Example Script
+
+An Inception_V3 demo script is provided at [qnn_intermediate_debugger_demo.py](../../../examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py).
 
-Before running the example script, please ensure that dataset is downloaded. Example dataset can be retrieved [here](https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000).
+Before running, ensure the dataset is downloaded. An example dataset can be retrieved [here](https://www.kaggle.com/datasets/ifigotin/imagenetmini-1000).
 
-To execute the model:
 ```bash
-python examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py -b build-android -m ${SOC_MODEL} --device ${SERIAL_NUM} --dataset ${PATH_TO_DATASET} --dump_intermediate_outputs
+python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build-android -s $DEVICE_SERIAL -m $SOC_MODEL -d path/to/imagenet/val --dump_intermediate_outputs
 ```
 
-### Limitation
-1. The current debugger only supports performing one execution. Multiple executions may cause unknown behavior and are not recommended.
-2. Please ignore this if you are using `qnn_executor_runner`. If you have decided to write your own runner, please follow the [tutorial](https://pytorch.org/executorch/stable/etdump.html) on how to implement etdump into your own runner.
-3. The current debugger does not support graph with partitions. (WIP)
-4. The current debugger does not support LLM models. (WIP)
+## Limitations
+1. Only one execution is supported per debug session — multiple executions may cause unknown behavior.
+2. If you write your own runner (instead of using `qnn_executor_runner`), follow the [tutorial](https://pytorch.org/executorch/stable/etdump.html) on how to implement etdump in it.
+3. Does not support graphs with partitions (partial delegation).
+4. Does not support LLM models.
+5. Does not support graphs with multiple methods.
diff --git a/backends/qualcomm/debugger/format_outputs.py b/backends/qualcomm/debugger/format_outputs.py
index 7388eef8223..d0dd165b186 100644
--- a/backends/qualcomm/debugger/format_outputs.py
+++ b/backends/qualcomm/debugger/format_outputs.py
@@ -5,21 +5,30 @@
 # LICENSE file in the root directory of this source tree.
 
 import csv
+import logging
 import os
+import subprocess
 from typing import Any
 
+import executorch.exir as exir
+import pandas
 import pydot
 import torch
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_base import (
+    QcomNumericalComparatorBase,
+)
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_QUANT_ATTRS,
     QCOM_SCALE,
     QCOM_SCALES,
-    QCOM_TENSOR_NAME,
     QCOM_ZERO_POINT,
     QCOM_ZERO_POINTS,
 )
+from executorch.exir.debug_handle_utils import DEBUG_HANDLE_KEY
 
-from .metrics_evaluator import MetricEvaluatorBase
+FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+logging.getLogger().setLevel(logging.INFO)
 
 
 # Copied from site-packages/torch/fx/passes/graph_drawer.py
@@ -39,63 +48,37 @@ def typename(target: Any) -> str:
     return ret.replace("{", r"\{").replace("}", r"\}")
 
 
-def retrieve_node_info(evaluator, node, node_tensor_map):
-
-    node_info = {}
-    node_info["name"] = node.name
-    node_info["op_code"] = node.op
-    node_info["target"] = typename(node.target)
-    node_info["num_users"] = len(node.users)
-
-    if "val" in node.meta:
-        if isinstance(node.meta["val"], torch.Tensor):
-            node_info["pytorch_layout"] = node.meta["val"].shape
-        elif isinstance(node.meta["val"], (list, tuple)):
-            shape_list = []
-            for i in range(len(node.meta["val"])):
-                shape_list.append(node.meta["val"][i].shape)
-            node_info["pytorch_layout"] = shape_list
-
+def get_scale_zero_point(node: torch.fx.node.Node):
+    scale_zero_point = {"scale(s)": None, "zero_point(s)": None}
     if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS):
-        node_info["scale(s)"] = (
+        scale_zero_point["scale(s)"] = (
             quant_attrs.get(QCOM_SCALES)
             if QCOM_SCALES in quant_attrs
             else quant_attrs.get(QCOM_SCALE)
         )
-        node_info["zero_point(s)"] = (
+        scale_zero_point["zero_point(s)"] = (
             quant_attrs.get(QCOM_ZERO_POINTS)
             if QCOM_ZERO_POINTS in quant_attrs
             else quant_attrs.get(QCOM_ZERO_POINT)
         )
+    return scale_zero_point
 
-    if node.name in node_tensor_map:
-        qnn_output, cpu_output, meta = node_tensor_map[node.name]
-        node_info[QCOM_TENSOR_NAME] = meta.get(QCOM_TENSOR_NAME)
-        node_info[evaluator.metric_name()], node_info["is_valid_score"] = (
-            evaluator.evaluate(qnn_output, cpu_output)
-        )
 
-        # The values in meta are directly retrieved from the node during the forward hook, which means the values should be the same for meta and node.meta.
-        # Storing these data during the forward hook helps us compare QNN tensors with CPU tensors without traversing the graph.
-        # We only check "scale" and not "scales" since the forward hook only stores the node's output, which should always be per tensor.
-        if QCOM_QUANT_ATTRS in node.meta:
-            assert (
-                node_info["scale(s)"] == node.meta[QCOM_QUANT_ATTRS][QCOM_SCALE]
-            ), "node meta scale should be same as scale retrieve during forward hook"
-            assert (
-                node_info["zero_point(s)"]
-                == node.meta[QCOM_QUANT_ATTRS][QCOM_ZERO_POINT]
-            ), "node meta zero_point should be same as zero_point retrieve during forward hook"
+def get_pytorch_layout_info(node: torch.fx.node.Node):
+    val = node.meta.get("val")
+    if val is None:
+        return None
+    if isinstance(val, torch.Tensor):
+        return val.shape
+    return [v.shape for v in val if isinstance(v, torch.Tensor)]
 
-    return node_info
 
-
-def export_svg(
+def export_svg(  # noqa: C901
     title: str,
     path: str,
-    evaluator: MetricEvaluatorBase,
-    edge_module: torch.fx.GraphModule,
-    node_tensor_map: dict,
+    edge_ep: exir.ExirExportedProgram,
+    numeric_results: pandas.core.frame.DataFrame,
+    comparator: QcomNumericalComparatorBase,
 ):
     def get_node_style(is_valid_score: bool):
         template = {
@@ -117,37 +100,46 @@ def get_node_style(is_valid_score: bool):
     node_map = {}
 
     # Create node
-    for node in edge_module.graph.nodes:
+    for node in edge_ep.graph_module.graph.nodes:
         # These are just nodes before fold_quant and still there
         if len(node.users) == 0 and node.op == "placeholder":
             continue
-        node_info = retrieve_node_info(
-            evaluator=evaluator, node=node, node_tensor_map=node_tensor_map
-        )
+
+        pytorch_layout = get_pytorch_layout_info(node)
+        scale_zero_point = get_scale_zero_point(node)
+        scale = scale_zero_point["scale(s)"]
+        zero_point = scale_zero_point["zero_point(s)"]
 
         node_label = "{"
-        node_label += f"name=%{node_info.get('name')}" + r"\n"
-        node_label += f"|op_code={node_info.get('op_code')}" + r"\n"
-        node_label += f"|qnn_tensor_name={node_info.get('qnn_tensor_name')}" + r"\n"
-        node_label += f"|target={node_info.get('target')}" + r"\n"
-        node_label += f"|num_users={node_info.get('num_users')}" + r"\n"
-        node_label += f"|pytorch_layout={node_info.get('pytorch_layout')}" + r"\n"
-        node_label += f"|scale(s)={node_info.get('scale(s)')}" + r"\n"
-        node_label += f"|zero_point(s)={node_info.get('zero_point(s)')}" + r"\n"
-        node_label += (
-            f"|{evaluator.metric_name()}={node_info.get(evaluator.metric_name())}"
-            + r"\n"
-        )
-        node_label += f"|is_valid_score={node_info.get('is_valid_score')}" + r"\n"
+        node_label += f"name=%{node.name}" + r"\n"
+        node_label += f"|op_code={node.op}" + r"\n"
+        node_label += f"|target={typename(node.target)}" + r"\n"
+        node_label += f"|num_users={len(node.users)}" + r"\n"
+        node_label += f"|pytorch_layout={pytorch_layout}" + r"\n"
+        node_label += f"|scale(s)={scale}" + r"\n"
+        node_label += f"|zero_point(s)={zero_point}" + r"\n"
+
+        is_valid_score = None
+        if debug_handle := node.meta.get(DEBUG_HANDLE_KEY, None):
+            node_label += f"|debug_handle={debug_handle}" + r"\n"
+            debug_handle = (debug_handle,)
+            if debug_handle in numeric_results.index:
+                score = numeric_results.loc[[debug_handle], "gap"].iat[0][0]
+                assert isinstance(
+                    score, float
+                ), f"Expecting QcomNumericalComparatorBase element_compare to return float, but get {type(score)}."
+                node_label += f"|{comparator.metric_name()}={score:.3f}" + r"\n"
+                is_valid_score = comparator.is_valid_score(score)
+        node_label += f"|is_valid_score={is_valid_score}" + r"\n"
         node_label += "}"
 
-        template = get_node_style(node_info.get("is_valid_score"))
+        template = get_node_style(is_valid_score)
         pydot_node = pydot.Node(node.name, label=node_label, **template)
         node_map[node.name] = pydot_node
         pydot_graph.add_node(pydot_node)
 
     # Create edge
-    for node in edge_module.graph.nodes:
+    for node in edge_ep.graph_module.graph.nodes:
         if len(node.users) == 0 and node.op == "placeholder":
             continue
         cur_pydot_node = node_map[node.name]
@@ -157,28 +149,68 @@ def get_node_style(is_valid_score: bool):
             pydot_graph.add_edge(
                 pydot.Edge(cur_pydot_node, user_pydot_node, dir="forward")
             )
+    dot_file_path = os.path.join(path, f"{title}.dot")
+    pydot_graph.write_raw(dot_file_path)
+    logging.info(f"Intermediate debugger dot graph saved at: {dot_file_path}")
 
     svg_file_path = os.path.join(path, f"{title}.svg")
-    pydot_graph.write_svg(svg_file_path)
-    print(f"Intermediate debugger graph saved at: {svg_file_path}")
+    try:
+        subprocess.run(
+            ["dot", "-Tsvg", dot_file_path, "-o", svg_file_path],
+            timeout=5,
+            check=True,
+        )
+        logging.info(f"Intermediate debugger SVG graph saved at: {svg_file_path}.")
+    except subprocess.TimeoutExpired:
+        logging.warning(
+            f"SVG generation timed out after 5s, skipping. "
+            f"Only saving the dot file: {dot_file_path}."
+        )
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        logging.warning(f"SVG generation failed ({e}), skipping.")
 
 
 def export_csv(
     title: str,
     path: str,
-    evaluator: MetricEvaluatorBase,
-    edge_module: torch.fx.GraphModule,
-    node_tensor_map: dict,
+    edge_ep: exir.ExirExportedProgram,
+    numeric_results: pandas.core.frame.DataFrame,
+    comparator: QcomNumericalComparatorBase,
 ):
     node_info_list = []
-    for node in edge_module.graph.nodes:
+    for node in edge_ep.graph_module.graph.nodes:
         # These are just nodes before fold_quant and still there
         if len(node.users) == 0 and node.op == "placeholder":
             continue
-        node_info = retrieve_node_info(
-            evaluator=evaluator, node=node, node_tensor_map=node_tensor_map
+
+        pytorch_layout = get_pytorch_layout_info(node)
+        scale_zero_point = get_scale_zero_point(node)
+        scale = scale_zero_point["scale(s)"]
+        zero_point = scale_zero_point["zero_point(s)"]
+        score = None
+        is_valid_score = None
+        if debug_handle := node.meta.get(DEBUG_HANDLE_KEY, None):
+            if (debug_handle,) in numeric_results.index:
+                score = numeric_results.loc[[(debug_handle,)], "gap"].iat[0][0]
+                assert isinstance(
+                    score, float
+                ), f"Expecting QcomNumericalComparatorBase element_compare to return float, but get {type(score)}."
+                is_valid_score = comparator.is_valid_score(score)
+
+        node_info_list.append(
+            {
+                "name": node.name,
+                "op_code": node.op,
+                "target": typename(node.target),
+                "num_users": len(node.users),
+                "pytorch_layout": pytorch_layout,
+                "scale(s)": scale,
+                "zero_point(s)": zero_point,
+                "debug_handle": debug_handle,
+                comparator.metric_name(): score,
+                "is_valid_score": is_valid_score,
+            }
         )
-        node_info_list.append(node_info)
 
     # Writing to a CSV file
     csv_file_path = os.path.join(path, f"{title}.csv")
@@ -186,13 +218,13 @@ def export_csv(
         fieldnames = [
             "name",
             "op_code",
-            "qnn_tensor_name",
             "target",
             "num_users",
             "pytorch_layout",
             "scale(s)",
             "zero_point(s)",
-            f"{evaluator.metric_name()}",
+            "debug_handle",
+            comparator.metric_name(),
             "is_valid_score",
         ]
         writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
@@ -201,23 +233,3 @@ def export_csv(
         writer.writerows(node_info_list)
 
     print(f"Intermediate debugger csv saved at: {csv_file_path}")
-
-
-def export_raw(
-    path: str,
-    edge_module: torch.fx.GraphModule,
-    node_tensor_map: dict,
-):
-    for node in edge_module.graph.nodes:
-        # These are just unused nodes before fold_quant and still there
-        if len(node.users) == 0 and node.op == "placeholder":
-            continue
-        if paired_event := node_tensor_map.get(node.name):
-            qnn_output, cpu_output, meta = paired_event
-            qnn_tensor_name = meta[QCOM_TENSOR_NAME]
-            qnn_output_path = os.path.join(path, qnn_tensor_name + "_qnn.raw")
-            cpu_output_path = os.path.join(path, qnn_tensor_name + "_cpu.raw")
-            qnn_output.numpy().tofile(qnn_output_path)
-            cpu_output.numpy().tofile(cpu_output_path)
-
-    print(f"Intermediate debugger raw files saved at: {path}")
diff --git a/backends/qualcomm/debugger/metrics_evaluator.py b/backends/qualcomm/debugger/metrics_evaluator.py
deleted file mode 100644
index 55c8b92b034..00000000000
--- a/backends/qualcomm/debugger/metrics_evaluator.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright (c) Qualcomm Innovation Center, Inc.
-# All rights reserved
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-from abc import ABC, abstractmethod
-from typing import Any, Tuple
-
-import torch
-
-
-class MetricEvaluatorBase(ABC):
-    @abstractmethod
-    def metric_name(self) -> str:
-        """
-        A name for this metric evaluation
-
-        Returns:
-            str: name of the metric evaluation
-        """
-        ...
-
-    @abstractmethod
-    def evaluate(
-        self, qnn_output: torch.Tensor, cpu_output: torch.Tensor, **kwargs
-    ) -> Tuple[Any, bool]:
-        """
-        This abstract method should accept both QNN and CPU outputs for a single layer.
-        Define your own logic to compare the results.
-
-        Args:
-            qnn_output (torch.Tensor): QNN intermediate output
-            cpu_output (torch.Tensor): CPU intermediate output
-
-        Returns:
-            Tuple[Any, bool]: Return 2 elements:
-                1) Score or anything that you would like to be printed under metrics category for svg graph or csv file.
-                2) A boolean that indicates whether the evaluation result is acceptable or not.
-        """
-        ...
-
-
-class AtolEvaluator(MetricEvaluatorBase):
-    def __init__(self, threshold=1e-1):
-        self.threshold = threshold
-
-    def metric_name(self) -> str:
-        return "Atol Similarity"
-
-    def evaluate(
-        self, qnn_output: torch.Tensor, cpu_output: torch.Tensor
-    ) -> Tuple[Any, bool]:
-        avg_atol = torch.mean(torch.abs(qnn_output - cpu_output))
-        valid = avg_atol < self.threshold
-        formatted_score = f"{avg_atol:.3f}"
-        return formatted_score, valid
-
-
-class CosineSimilarityEvaluator(MetricEvaluatorBase):
-    def __init__(self, threshold=0.9):
-        self.threshold = threshold
-
-    def metric_name(self) -> str:
-        return "Cosine Similarity"
-
-    def evaluate(
-        self, qnn_output: torch.Tensor, cpu_output: torch.Tensor
-    ) -> Tuple[Any, bool]:
-        score = torch.nn.functional.cosine_similarity(
-            qnn_output.flatten(), cpu_output.flatten(), dim=0
-        ).item()
-        valid = score > self.threshold
-        formatted_score = f"{score:.3f}"
-        return formatted_score, valid
-
-
-class MeanSquaredErrorEvaluator(MetricEvaluatorBase):
-    def __init__(self, threshold=0.01):
-        self.threshold = threshold
-
-    def metric_name(self) -> str:
-        return "Mean Squared Error"
-
-    def evaluate(
-        self, qnn_output: torch.Tensor, cpu_output: torch.Tensor
-    ) -> Tuple[Any, bool]:
-        mse = torch.mean((qnn_output - cpu_output) ** 2)
-        valid = mse < self.threshold
-        return mse, valid
diff --git a/backends/qualcomm/debugger/qcom_numerical_comparator_base.py b/backends/qualcomm/debugger/qcom_numerical_comparator_base.py
new file mode 100644
index 00000000000..0068d28bfac
--- /dev/null
+++ b/backends/qualcomm/debugger/qcom_numerical_comparator_base.py
@@ -0,0 +1,194 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import operator
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import Dict, final, Optional, Tuple
+
+import executorch.exir as exir
+import torch
+from executorch.backends.qualcomm._passes.layout_transform import LayoutTransform
+from executorch.backends.qualcomm.utils.constants import (
+    QCOM_AXIS_ORDER,
+    QCOM_QUANT_ATTRS,
+    QCOM_SCALE,
+    QCOM_ZERO_POINT,
+)
+from executorch.devtools.inspector.numerical_comparator import (
+    IntermediateOutputMapping,
+    NumericalComparatorBase,
+)
+from executorch.exir.sym_util import eval_shape
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class NodeMetaInfo:
+    node_name: str
+    scale: Optional[float] = None
+    zero_point: Optional[int] = None
+    axis_order: Optional[Tuple[int, ...]] = None
+
+
+class QcomNumericalComparatorBase(NumericalComparatorBase):
+    """Base class for Qualcomm numerical comparators.
+
+    This class locks down the `preprocessing` method to handle QNN-specific
+    tensor transformations (dequantization, layout conversion) internally.
+    Subclasses of this base need to implement `metric_name`, `is_valid_score`, and `element_compare`.
+
+    Attempting to override `preprocessing` in a subclass will raise TypeError
+    at class definition time.
+    """
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        if "preprocessing" in cls.__dict__:
+            raise TypeError(
+                f"{cls.__name__} cannot override 'preprocessing'. "
+                "Qualcomm handles preprocessing (dequantization, layout conversion) internally."
+            )
+
+    def __init__(self, edge_ep: exir.ExportedProgram) -> None:
+        super().__init__()
+        self.edge_ep = edge_ep
+
+    @abstractmethod
+    def metric_name(self) -> str:
+        """
+        A name for this metric evaluation.
+
+        Returns:
+            str: name of the metric evaluation.
+        """
+        ...
+
+    @abstractmethod
+    def is_valid_score(self, score: float) -> bool:
+        """
+        Determine whether a comparison score is within an acceptable range.
+
+        Args:
+            score: the comparison score to validate.
+
+        Returns:
+            bool: True if the score is acceptable, False otherwise.
+        """
+        ...
+
+    @final
+    def preprocessing(  # noqa: C901
+        self, mapping: IntermediateOutputMapping
+    ) -> IntermediateOutputMapping:
+        """Dequantize and re-layout QNN outputs in `mapping` so they line up with their golden outputs."""
+        def _preprocess_tensor(
+            qnn_tensor: torch.Tensor, meta: NodeMetaInfo, golden_tensor: torch.Tensor
+        ) -> torch.Tensor:
+            if meta.scale is not None:
+                # Dequantize: (q - zero_point) * scale, computed in float32.
+                qnn_tensor = (
+                    qnn_tensor.to(torch.float32)
+                    .sub(meta.zero_point)
+                    .mul(meta.scale)
+                    .contiguous()
+                )
+            if meta.axis_order:
+                # Permute from the QNN layout back to the PyTorch layout.
+                axis_order = LayoutTransform.get_axis_order(
+                    eval_shape(qnn_tensor.shape), reverse=True
+                )
+                qnn_tensor = qnn_tensor.permute(axis_order)
+
+            assert (
+                golden_tensor.shape == qnn_tensor.shape
+            ), f"{meta.node_name}'s golden and QNN tensor has different shape. Golden Tensor Shape: {golden_tensor.shape}. QNN Tensor Shape: {qnn_tensor.shape}."
+
+            return qnn_tensor
+
+        def _build_debug_handle_to_meta() -> (
+            Dict[Tuple[int, ...], Dict[int, NodeMetaInfo]]
+        ):
+            debug_handle_to_meta: Dict[Tuple[int, ...], Dict[int, NodeMetaInfo]] = {}
+            for node in self.edge_ep.graph_module.graph.nodes:
+                if node.op != "call_function":
+                    continue
+
+                if (debug_handle := node.meta.get("debug_handle")) is None:
+                    continue
+                else:
+                    debug_handle = (debug_handle,)  # 1-tuple, matching the dict's tuple-typed keys
+
+                quant_attrs = node.meta.get(QCOM_QUANT_ATTRS, {})
+                node_meta_info = NodeMetaInfo(
+                    node_name=node.name,
+                    scale=quant_attrs.get(QCOM_SCALE, None),
+                    zero_point=quant_attrs.get(QCOM_ZERO_POINT, None),
+                    axis_order=node.meta.get(QCOM_AXIS_ORDER, None),
+                )
+
+                if any(user.target == operator.getitem for user in node.users):
+                    # Multi-output producer (all users must be getitem); the getitem nodes are recorded instead.
+                    assert all(
+                        user.target == operator.getitem for user in node.users
+                    ), "[QNN Delegate Debugger]: Expect all users to be getitem node"
+                    continue
+
+                # getitem nodes of one multi-output op share a debug handle; key each by its output index.
+                if node.target == operator.getitem:
+                    output_idx = node.args[1]
+                    debug_handle_to_meta.setdefault(debug_handle, {})[
+                        output_idx
+                    ] = node_meta_info
+                else:
+                    assert (
+                        debug_handle not in debug_handle_to_meta
+                    ), f"[QNN Delegate Debugger]: Duplicate handle_id {debug_handle} found when visiting {node.name}."
+                    debug_handle_to_meta[debug_handle] = {0: node_meta_info}
+
+            return debug_handle_to_meta
+
+        debug_handle_to_meta = _build_debug_handle_to_meta()
+        processed_mapping: IntermediateOutputMapping = {}
+        for (golden_handle, golden_output), (qnn_handle, qnn_output) in mapping.items():
+            assert (
+                golden_handle == qnn_handle
+            ), f"Expecting the handle to match, aot handle: {golden_handle}, qnn_handle: {qnn_handle}."
+            if node_meta_dict := debug_handle_to_meta.get(qnn_handle, None):
+                if isinstance(qnn_output, tuple):
+                    assert len(qnn_output) <= len(
+                        node_meta_dict
+                    ), f"node_meta has {len(node_meta_dict)} entries but qnn_output has {len(qnn_output)} elements."
+                    if len(node_meta_dict) != len(qnn_output):
+                        logging.warning(
+                            f"Number of QNN output {len(qnn_output)} mismatched with number of output for edge module {len(node_meta_dict)}. This is possibly due to multi-outputs and QNN does not use all outputs. Please verify the following meta from edge module and ensure this is desired: {node_meta_dict}."
+                        )
+
+                    processed = []
+                    for idx, q_tensor in enumerate(qnn_output):
+                        processed.append(
+                            _preprocess_tensor(
+                                qnn_tensor=q_tensor,
+                                meta=node_meta_dict[idx],  # NOTE(review): assumes keys run 0..len-1 — confirm
+                                golden_tensor=golden_output[idx],
+                            )
+                        )
+
+                    qnn_output = tuple(processed)
+                else:
+                    assert (
+                        len(node_meta_dict) == 1 and 0 in node_meta_dict
+                    ), f"Single output expected node_meta_dict with key 0, got keys {list(node_meta_dict.keys())}"
+                    qnn_output = _preprocess_tensor(
+                        qnn_tensor=qnn_output,
+                        meta=node_meta_dict[0],
+                        golden_tensor=golden_output,
+                    )
+            # Entries with no metadata for their handle keep their QNN output unchanged.
+            processed_mapping[(golden_handle, golden_output)] = (qnn_handle, qnn_output)
+        return processed_mapping
diff --git a/backends/qualcomm/debugger/qcom_numerical_comparator_sample.py b/backends/qualcomm/debugger/qcom_numerical_comparator_sample.py
new file mode 100644
index 00000000000..43783a64420
--- /dev/null
+++ b/backends/qualcomm/debugger/qcom_numerical_comparator_sample.py
@@ -0,0 +1,57 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any
+
+import executorch.exir as exir
+import torch
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_base import (
+    QcomNumericalComparatorBase,
+)
+
+
+"""
+This file provides some examples on how to implement a QcomNumericalComparator
+"""
+
+
+class QcomMSEComparator(QcomNumericalComparatorBase):
+    """Comparator scoring Qualcomm intermediate outputs by mean squared error (lower is better)."""
+
+    def __init__(self, edge_ep: exir.ExportedProgram, threshold: float = 1e-3) -> None:
+        super().__init__(edge_ep)
+        self.threshold = threshold  # largest MSE that is_valid_score still accepts
+
+    def metric_name(self) -> str:
+        return "mse"
+
+    def is_valid_score(self, score: float) -> bool:
+        return score <= self.threshold
+
+    def element_compare(self, a: Any, b: Any) -> float:
+        if not (isinstance(a, torch.Tensor) and isinstance(b, torch.Tensor)):
+            return float((a - b) ** 2)  # non-tensor operands: plain squared difference
+        return (a.float() - b.float()).square().mean().item()
+
+
+class QcomCosineSimilarityComparator(QcomNumericalComparatorBase):
+    """Comparator scoring Qualcomm intermediate outputs by cosine similarity (higher is better)."""
+
+    def __init__(self, edge_ep: exir.ExportedProgram, threshold: float = 0.95) -> None:
+        super().__init__(edge_ep)
+        self.threshold = threshold  # smallest similarity that is_valid_score still accepts
+
+    def metric_name(self) -> str:
+        return "cosine_similarity"
+
+    def is_valid_score(self, score: float) -> bool:
+        return score >= self.threshold
+
+    def element_compare(self, a: Any, b: Any) -> float:
+        # as_tensor also accepts non-tensor operands, which QcomMSEComparator tolerates too.
+        lhs = torch.as_tensor(a, dtype=torch.float32).flatten()
+        rhs = torch.as_tensor(b, dtype=torch.float32).flatten()
+        return torch.nn.functional.cosine_similarity(lhs, rhs, dim=0).item()
diff --git a/backends/qualcomm/debugger/qnn_intermediate_debugger.py b/backends/qualcomm/debugger/qnn_intermediate_debugger.py
index 904dd4f6ccb..7c7609cd7f5 100644
--- a/backends/qualcomm/debugger/qnn_intermediate_debugger.py
+++ b/backends/qualcomm/debugger/qnn_intermediate_debugger.py
@@ -4,136 +4,94 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import copy
-import operator
 import os
 import warnings
 from enum import IntEnum
 
-import torch
-
-from executorch.backends.qualcomm._passes.layout_transform import LayoutTransform
-from executorch.backends.qualcomm.utils.constants import (
-    QCOM_AXIS_ORDER,
-    QCOM_QUANT_ATTRS,
-    QCOM_SCALE,
-    QCOM_TENSOR_NAME,
-    QCOM_ZERO_POINT,
+import executorch.exir as exir
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_base import (
+    QcomNumericalComparatorBase,
 )
 from executorch.devtools import Inspector
-from executorch.exir.sym_util import eval_shape
 
-from .format_outputs import export_csv, export_raw, export_svg
-from .metrics_evaluator import MetricEvaluatorBase
+from .format_outputs import export_csv, export_svg
 
 
 class OutputFormat(IntEnum):
-    SVG_GRAPHS = 0
-    CSV_FILES = 1
-    DUMP_RAW = 2
-
-
-class IntermediateModule(torch.nn.Module):
-    """
-    This class serves as an intermediate point and is inserted right after the call_function node.
-    It also saves some metadata such as scale, offset, etc.
-    Since we just want to check the intermediate output, we will directly return the value during the forward call.
-    """
-
-    def __init__(
-        self,
-        module_name: str,
-        qnn_tensor_name: str,
-        node_name: str,
-        scale: float,
-        zero_point: int,
-        revert_order: bool = None,
-    ):
-        super().__init__()
-        self.module_name = module_name
-        self.qnn_tensor_name = qnn_tensor_name
-        self.node_name = node_name
-        self.scale = scale
-        self.zero_point = zero_point
-        self.revert_order = revert_order
-
-    def forward(self, x):
-        return x
+    SVG_GRAPH = 0
+    CSV_FILE = 1
 
 
 class QNNIntermediateDebugger:
     """This is a debugger tool capable of retrieving intermediate results for CPU edge EP.
-    We can further compare these with QNN's intermediate output to identify any QNN accuracy issues.
+    We can further compare these with QNN's intermediate output to identify any accuracy issues.
     """
 
-    def __init__(self):
-        self.intermediate_outputs = {}
-
-    def set_edge_module(self, edge_module: torch.fx.graph_module.GraphModule):
-        self.orig_edge = copy.deepcopy(edge_module)
-        self.intermediate_output_module = self._insert_intermediate_module(
-            copy.deepcopy(edge_module)
+    def __init__(self, sample_input):
+        self.sample_input = sample_input
+        self.edge_ep = None  # filled in later by set_edge_ep()
+        self.etrecord_file_path = None  # filled in later by set_etrecord_file_path()
+        self.inspector = None  # created by setup_inspector()
+        # For now the only supported reference graph is edge_after_transform/forward.
+        self.reference_graph_name = "edge_after_transform/forward"
+
+    def set_edge_ep(self, edge_ep: exir.ExportedProgram):
+        self.edge_ep = edge_ep  # handed to comparators (create_comparator) and the SVG/CSV exporters
+
+    def set_etrecord_file_path(self, etrecord_file_path: str):
+        self.etrecord_file_path = etrecord_file_path  # read by setup_inspector() as the Inspector's etrecord
+
+    def setup_inspector(self, etdump_path: str, debug_buffer_path: str):
+        self.inspector = Inspector(  # stored for generate_results()
+            etdump_path=etdump_path,
+            debug_buffer_path=debug_buffer_path,
+            etrecord=self.etrecord_file_path,  # None unless set_etrecord_file_path() ran first
+            reference_graph_name=self.reference_graph_name,
         )
 
+    def create_comparator(
+        self, comparator_cls: type[QcomNumericalComparatorBase], **kwargs
+    ) -> QcomNumericalComparatorBase:
+        """Instantiate `comparator_cls`, injecting this debugger's edge_ep automatically.
+
+        Example: debugger.create_comparator(QcomMSEComparator, threshold=1e-4)
+        """
+        edge_ep = self.edge_ep
+        assert edge_ep is not None, "edge_ep must be set before creating a comparator."
+        return comparator_cls(edge_ep=edge_ep, **kwargs)
+
     def generate_results(
         self,
         title: str,
         path: str,
         output_format: OutputFormat,
-        inspector: Inspector,
-        evaluator: MetricEvaluatorBase = None,
-        keep_qnn_layout: bool = False,
+        comparator: QcomNumericalComparatorBase,
     ):
         assert isinstance(
             output_format, OutputFormat
         ), "output_format passed in is not an instance of OutputFormat"
         os.makedirs(path, exist_ok=True)
-        if keep_qnn_layout:
-            warnings.warn(
-                "[QNN Delegate Debugger]: keep_qnn_layout is not recommended for general use case. "
-                "QNN and CPU has different dtype(FP V.S. Quantized) and data formats(NCHW V.S. NHWC) in a lot of cases.",
-                stacklevel=1,
-            )
 
-        # Due to users can switch between keep_qnn_layout between generate_results, rematch this every time.
-        # Make this a class variable if repeat matching is taking too long and handle keep_qnn_layout.
-        node_tensor_map = self._match_tensors(
-            inspector=inspector,
-            keep_qnn_layout=keep_qnn_layout,
+        numeric_results = self.inspector.calculate_numeric_gap(
+            distance=comparator, reference_graph=self.reference_graph_name
         )
+        numeric_results = numeric_results.set_index("runtime_debug_handle")
 
-        if output_format == OutputFormat.SVG_GRAPHS:
-            assert evaluator is not None, "Please provide an evaluator."
+        if output_format == OutputFormat.SVG_GRAPH:
             export_svg(
                 title=title,
                 path=path,
-                evaluator=evaluator,
-                edge_module=self.orig_edge,
-                node_tensor_map=node_tensor_map,
+                edge_ep=self.edge_ep,
+                numeric_results=numeric_results,
+                comparator=comparator,
             )
-        elif output_format == OutputFormat.CSV_FILES:
-            assert evaluator is not None, "Please provide an evaluator."
+        elif output_format == OutputFormat.CSV_FILE:
             export_csv(
                 title=title,
                 path=path,
-                evaluator=evaluator,
-                edge_module=self.orig_edge,
-                node_tensor_map=node_tensor_map,
-            )
-        elif output_format == OutputFormat.DUMP_RAW:
-            warnings.warn(
-                f"[QNN Delegate Debugger]: Param 'title' will be ignored, all raw files will be stored under: {path}",
-                stacklevel=1,
-            )
-            if evaluator:
-                warnings.warn(
-                    "[QNN Delegate Debugger]: Param 'evaluator' will be ignored as DUMP_RAW will only dump tensors to raw files but won't perform comparison.",
-                    stacklevel=1,
-                )
-            export_raw(
-                path=path,
-                edge_module=self.intermediate_output_module,
-                node_tensor_map=node_tensor_map,
+                edge_ep=self.edge_ep,
+                numeric_results=numeric_results,
+                comparator=comparator,
             )
         else:
             warnings.warn(
@@ -141,188 +99,3 @@ def generate_results(
                 stacklevel=1,
             )
             return
-
-    def _insert_intermediate_module(  # noqa: C901
-        self, edge_module: torch.fx.graph_module.GraphModule
-    ):
-        """
-        This feature is for intermediate tensor dump on the host CPU.
-        After we get an edge GraphModule, we insert submodule between each call_function node,
-        and we register forward hooks to store the intermediate results.
-        We have to use the edge GraphModule because this is the graph closest to what QNN is executing
-        while still being a valid graph to ExecuTorch.
-
-        Args:
-            edge_module (exir.ExirExportedProgram): A deep copy of edge ir graph module.
-               We need to deep copy so we don't mess up the original edge_ep.
-        Returns:
-            exir.ExirExportedProgram: A deep copy of edge graph_module with intermediate modules inserted.
-        """
-
-        def hook_fn(module, input, output):
-            meta = {}
-            meta[QCOM_TENSOR_NAME] = module.qnn_tensor_name
-            meta["node_name"] = module.node_name
-            meta[QCOM_SCALE] = module.scale
-            meta[QCOM_ZERO_POINT] = module.zero_point
-            meta["revert_order"] = module.revert_order
-            meta["output"] = output  # CPU output
-
-            assert (
-                module.qnn_tensor_name not in self.intermediate_outputs
-            ), f"{module.qnn_tensor_name} checked already, check if this is a potential error"
-            self.intermediate_outputs[module.qnn_tensor_name] = meta
-
-        graph = edge_module.graph
-        module_count = 0
-        for node in graph.nodes:
-            if node.op == "call_function":
-                module_name = f"intermediate_module_{module_count}"
-                module_count += 1
-                with graph.inserting_after(node):
-                    scale = None
-                    zero_point = None
-                    if QCOM_QUANT_ATTRS in node.meta:
-                        scale = node.meta[QCOM_QUANT_ATTRS][QCOM_SCALE]
-                        zero_point = node.meta[QCOM_QUANT_ATTRS][QCOM_ZERO_POINT]
-
-                    revert_order = QCOM_AXIS_ORDER in node.meta
-
-                    if node.target == operator.getitem:
-                        index = node.args[1]
-                        # Ex: topk -> intermediate_module -> get_item
-                        src_node = node.args[0].args[0]
-                        qnn_tensor_name = src_node.meta[QCOM_TENSOR_NAME][index]
-                    elif any(user.target == operator.getitem for user in node.users):
-                        # For cases like topK, qnn_tensor_name is stored in get_item instead of source_node itself.
-                        assert all(
-                            user.target == operator.getitem for user in node.users
-                        ), "Expect all users to be get_item node"
-                        qnn_tensor_name = node.name
-                    elif QCOM_TENSOR_NAME in node.meta:
-                        assert (
-                            len(node.meta[QCOM_TENSOR_NAME]) == 1
-                        ), "Expecting a single qnn_tensor name but get more than 1."
-                        qnn_tensor_name = node.meta[QCOM_TENSOR_NAME][0]
-                    else:
-                        # Unused
-                        qnn_tensor_name = node.name
-
-                    obs = IntermediateModule(
-                        module_name=module_name,
-                        qnn_tensor_name=qnn_tensor_name,
-                        node_name=node.name,
-                        scale=scale,
-                        zero_point=zero_point,
-                        revert_order=revert_order,
-                    )
-                    setattr(
-                        edge_module,
-                        module_name,
-                        obs,
-                    )
-                    new_obs = graph.create_node("call_module", module_name, (node,), {})
-                orig_users = list(node.users.keys())
-                for user_node in orig_users:
-                    if user_node is new_obs:
-                        continue
-                    user_node.replace_input_with(node, new_obs)
-
-        # Register hooks for all intermediate layers
-        for (
-            _,
-            layer,
-        ) in edge_module.named_modules():
-            if isinstance(layer, IntermediateModule):
-                layer.register_forward_hook(hook_fn)
-
-        graph.eliminate_dead_code()
-        edge_module.recompile()
-
-        return edge_module
-
-    def _process_qnn_output(self, qnn_output: torch.tensor, meta: dict) -> torch.tensor:
-        """
-        QNN intermediate results are all quantized.
-        We need to dequantize them to match CPU float values.
-        Additionally, we need to revert the layout format for layout-sensitive nodes.
-
-        Args:
-            qnn_output (torch.tensor): QNN intermediate output from inspector event
-            meta (dict): The meta for this tensor/node that is stored during insert_intermediate_module().
-
-        Returns:
-            torch.tensor: Processed tensor that should have same dtype and shape as CPU tensors.
-        """
-        qnn_output = qnn_output.to(torch.float32)
-        if meta[QCOM_SCALE] is not None:
-            scale = meta[QCOM_SCALE]
-            zero_point = meta[QCOM_ZERO_POINT]
-            qnn_output = (
-                qnn_output.sub(zero_point).mul(scale).to(torch.float32).contiguous()
-            )
-        if meta["revert_order"]:
-            axis_order = LayoutTransform.get_axis_order(
-                eval_shape(qnn_output.shape), reverse=True
-            )
-            qnn_output = qnn_output.permute(axis_order)
-        return qnn_output
-
-    def _match_tensors(self, inspector: Inspector, keep_qnn_layout: bool = False):
-        """
-        Map QNN tensors back to CPU tensors.
-        Create a map using the node name as the key and (preprocessed/postprocessed QNN tensor, CPU tensor, meta) as the value.
-        We need meta because it holds values such as scale, offset, layout sensitivity, etc.
-
-        Args:
-            inspector (Inspector): Inspector that parse QNN runtime intermediate outputs
-            keep_qnn_layout (bool): If true, store QNN outputs in NHWC format. Not recommended for general users.
-
-        Returns:
-            A dict storing {node_name : tuple(qnn_output, cpu_output, meta_info)}
-            Meta_info is the info stored during forward hook_fn.
-        """
-
-        # node_tensor_map {key: tuple(qnn_output, cpu_output, meta_info)}
-        node_tensor_map = {}
-        # OPs that only exists in QNN but not CPU Golden
-        unmatched_qnn_tensors = []
-        # E.g.: DELEGATE_CALL (This is the model input data), 'Method::execute'
-        ignored_events = []
-        # Collected with forward hook
-        intermediate_outputs = self.intermediate_outputs
-        for event_block in inspector.event_blocks:
-            if event_block.name == "Execute":
-                for event in event_block.events:
-                    # If user enables profiling and dump intermediate outputs the same time, we need to skip the profiling event
-                    if event.perf_data is not None and event.is_delegated_op:
-                        continue
-                    if meta := intermediate_outputs.get(event.name):
-                        node_name = meta["node_name"]
-                        cpu_output = meta["output"]
-                        qnn_output = (
-                            event.debug_data[0]
-                            if keep_qnn_layout
-                            else self._process_qnn_output(event.debug_data[0], meta)
-                        )
-                        node_tensor_map[node_name] = (
-                            qnn_output,
-                            cpu_output,
-                            meta,
-                        )
-
-                    else:
-                        (
-                            unmatched_qnn_tensors.append(event.name)
-                            if event.is_delegated_op
-                            else ignored_events.append(event.name)
-                        )
-
-        warnings.warn(
-            f"The following events are ignored: {ignored_events}", stacklevel=1
-        )
-        warnings.warn(
-            f"The following QNN OPs are missing CPU reference. OPs added during qnn_preprocess will not have CPU reference. Please ensure the operations below are created during qnn_preprocess. {unmatched_qnn_tensors}",
-            stacklevel=1,
-        )
-        return node_tensor_map
diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py
index 2c7ab2abd02..c07b9848d14 100644
--- a/backends/qualcomm/export_utils.py
+++ b/backends/qualcomm/export_utils.py
@@ -53,7 +53,6 @@
     is_qnn_sdk_version_less_than,
     to_edge_transform_and_lower_to_qnn,
 )
-from executorch.exir.backend.utils import get_delegates
 from executorch.exir.capture._config import ExecutorchBackendConfig
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 from torchao.quantization.pt2e import MovingAverageMinMaxObserver
@@ -575,6 +574,7 @@ def build_executorch_binary(
             dep_table=passes_dependency,
             skip_node_id_set=qnn_config.skip_delegate_node_ids,
             skip_node_op_set=qnn_config.skip_delegate_node_ops,
+            generate_etrecord=qnn_intermediate_debugger is not None,
         )
     else:
         edge_prog_mgr = to_edge_transform_and_lower_to_qnn(
@@ -585,21 +585,9 @@ def build_executorch_binary(
             passes_job=passes_job,
             skip_node_id_set=qnn_config.skip_delegate_node_ids,
             skip_node_op_set=qnn_config.skip_delegate_node_ops,
+            generate_etrecord=qnn_intermediate_debugger is not None,
         )
 
-    if qnn_intermediate_debugger:
-        lowered_module_nodes = get_delegates(edge_prog_mgr.exported_program().graph)
-        assert (
-            len(lowered_module_nodes) == 1
-        ), "Graph with partitions are currently unsupported."
-
-        lowered_module_node = lowered_module_nodes[0]
-        lower_module = getattr(
-            edge_prog_mgr.exported_program().graph_module, lowered_module_node.name
-        )
-        edge_module = lower_module.original_module.module()
-        qnn_intermediate_debugger.set_edge_module(edge_module=edge_module)
-
     allocate_io = not (qnn_config.shared_buffer or qnn_config.direct_build_folder)
     executorch_config = ExecutorchBackendConfig(
         # For shared buffer, user must pass the memory address
@@ -617,6 +605,16 @@ def build_executorch_binary(
     with open(pte_name, "wb") as file:
         exec_prog_mgr.write_to_file(file)
 
+    if qnn_intermediate_debugger:
+        etrecord = exec_prog_mgr.get_etrecord()
+        etrecord.update_representative_inputs(qnn_intermediate_debugger.sample_input)
+        edge_ep = etrecord.graph_map[qnn_intermediate_debugger.reference_graph_name]
+        # Use this edge_ep since edge_ep after etrecord serialize/deserialize will lose quant_attrs info.
+        qnn_intermediate_debugger.set_edge_ep(edge_ep=edge_ep)
+        etrecord_file_path = f"{os.path.dirname(pte_name)}/debug.etrecord"
+        qnn_intermediate_debugger.set_etrecord_file_path(etrecord_file_path)
+        etrecord.save(etrecord_file_path)
+
     if qnn_config.compile_only:
         sys.exit(0)
 
diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py
index f423288640c..fcd3b7cd6b5 100644
--- a/backends/qualcomm/qnn_preprocess.py
+++ b/backends/qualcomm/qnn_preprocess.py
@@ -19,7 +19,10 @@
 from executorch.backends.qualcomm.serialization.qc_schema_serialize import (
     flatbuffer_to_option,
 )
-from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER
+from executorch.backends.qualcomm.utils.constants import (
+    QCOM_AXIS_ORDER,
+    QCOM_TENSOR_NAME,
+)
 from executorch.backends.qualcomm.utils.qnn_manager_lifecycle import (
     get_current_qnn_manager,
 )
@@ -181,10 +184,15 @@ def preprocess_multimethod(  # noqa: C901
                 )
                 if qnn_manager.IsTensorDump():
                     for node in programs[i].graph.nodes:
-                        if handle_id := node.meta.get(DEBUG_HANDLE_KEY):
+                        # Require both a debug handle and a runtime tensor name. Multi-output nodes are not supported yet, so only record single-output nodes, i.e. those where len(node.meta[QCOM_TENSOR_NAME]) == 1.
+                        if (
+                            (handle_id := node.meta.get(DEBUG_HANDLE_KEY))
+                            and QCOM_TENSOR_NAME in node.meta
+                            and len(node.meta[QCOM_TENSOR_NAME]) == 1
+                        ):
                             debug_handle_builder.insert_delegate_mapping_entry(
                                 handles=handle_id,
-                                identifier=node.name,
+                                identifier=node.meta[QCOM_TENSOR_NAME][0],
                             )
                 if isinstance(py_op_wrappers, bytes):
                     ctx_binary_list.append(py_op_wrappers)
@@ -195,7 +203,6 @@ def preprocess_multimethod(  # noqa: C901
                             for py_op_wrapper in py_op_wrappers
                         ]
                     )
-
             if len(py_op_wrapper_list) == len(edge_programs.values()):
                 qnn_context_binary = qnn_manager.Compile(
                     graph_names, py_op_wrapper_list
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 48f07da06e9..8276528fed7 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -5086,7 +5086,6 @@ def output_callback(log_msg):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_topk(self):
-        TestQNN.dump_intermediate_outputs = True
         backend_options = generate_htp_compiler_spec(use_fp16=True)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -5099,12 +5098,10 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self):
             module,
             sample_input,
             expected_partitions=1,
-            expected_intermediate_events=7,
-            expected_compared_events=5,
+            expected_compared_events=3,
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
-        TestQNN.dump_intermediate_outputs = True
         backend_options = generate_htp_compiler_spec(use_fp16=True)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -5117,8 +5114,7 @@ def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
             module,
             sample_input,
             expected_partitions=1,
-            expected_intermediate_events=20,
-            expected_compared_events=16,
+            expected_compared_events=14,
         )
 
     def test_qnn_backend_skip_node_id(self):
@@ -5720,7 +5716,6 @@ def output_callback(log_msg):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
-        TestQNN.dump_intermediate_outputs = True
         backend_options = generate_htp_compiler_spec(use_fp16=False)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -5734,13 +5729,11 @@ def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
             module,
             sample_input,
             expected_partitions=1,
-            expected_intermediate_events=21,
             expected_compared_events=14,
         )
 
     def test_qnn_backend_dump_intermediate_outputs_topk(self):
         torch.manual_seed(8)
-        TestQNN.dump_intermediate_outputs = True
         backend_options = generate_htp_compiler_spec(use_fp16=False)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -5754,8 +5747,7 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self):
             module,
             sample_input,
             expected_partitions=1,
-            expected_intermediate_events=9,
-            expected_compared_events=5,
+            expected_compared_events=3,
         )
 
     def test_qnn_backend_dynamic_shape(self):
@@ -9008,9 +9000,9 @@ def test_intermediate_debugger(self):
             else:
                 svg_path = msg["svg_path"]
                 csv_path = msg["csv_path"]
-                min_accepted = 235
-                max_accepted = 241
-                # Having a +- 3 tolerance, expecting 238 events
+                min_accepted = 231
+                max_accepted = 237
+                # Having a +- 3 tolerance, expecting 234 events
                 assert os.path.exists(svg_path), f"Unable to find SVG file: {svg_path}"
                 assert os.path.exists(csv_path), f"Unable to find CSV file: {csv_path}"
 
@@ -9132,7 +9124,6 @@ def setup_environment():
     TestQNN.oss_repo = args.oss_repo
     TestQNN.shared_buffer = args.shared_buffer
     TestQNN.enable_x86_64 = args.enable_x86_64
-    TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
     TestQNN.compile_only = args.compile_only
     TestQNN.pre_gen_pte = args.pre_gen_pte
     TestQNN.llama_artifacts = args.llama_artifacts
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 481c2b71696..2f55b4bdfec 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -15,6 +15,10 @@
 import torchao
 from executorch import exir
 from executorch.backends.qualcomm.builders.node_visitor import dq_ops
+
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_sample import (
+    QcomCosineSimilarityComparator,
+)
 from executorch.backends.qualcomm.debugger.qnn_intermediate_debugger import (
     QNNIntermediateDebugger,
 )
@@ -45,7 +49,6 @@
 from executorch.examples.qualcomm.utils import make_output_dir
 
 from executorch.exir.backend.compile_spec_schema import CompileSpec
-from executorch.exir.backend.utils import get_delegates
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
@@ -183,7 +186,6 @@ class TestQNN(unittest.TestCase):
     compile_only: bool = False
     pre_gen_pte: str = ""
     llama_artifacts: str = ""
-    dump_intermediate_outputs: bool = False
     inference_speed: float = 0.0
     inference_speed_output_path = "outputs/inference_speed.txt"
     static_llm_eval_method = ""
@@ -318,7 +320,6 @@ def verify_output(  # noqa: C901
         executorch_prog: ExecutorchProgram | ExecutorchProgramManager,
         etrecord_path: str = "etrecord.bin",
         expected_profile_events: int = -1,
-        expected_intermediate_events: int = -1,
         method_index: int = 0,
         input_encodings: Tuple = (),
         output_encodings: Tuple = (),
@@ -383,29 +384,27 @@ def validate_profile():
                 )
 
             def validate_intermediate_tensor():
-                inspector = Inspector(
-                    etdump_path=etdump_path, debug_buffer_path=debug_output_path
+                qnn_intermediate_debugger.setup_inspector(
+                    etdump_path=etdump_path,
+                    debug_buffer_path=debug_output_path,
                 )
-                node_tensor_map = qnn_intermediate_debugger._match_tensors(
-                    inspector=inspector, keep_qnn_layout=False
+                cos_comparator = qnn_intermediate_debugger.create_comparator(
+                    QcomCosineSimilarityComparator
                 )
-                self.assertEqual(
-                    len(node_tensor_map),
-                    expected_compared_events,
-                    msg=f"Unexpected number of compared events, expecting {expected_compared_events}, but has {len(node_tensor_map)} events.",
-                )
-                # Compare accuracy for each layer
-                for _, value in node_tensor_map.items():
-                    self._assert_outputs_equal(
-                        value[0].to(torch.float32), value[1].to(torch.float32)
+                numeric_results = (
+                    qnn_intermediate_debugger.inspector.calculate_numeric_gap(
+                        distance=cos_comparator,
+                        reference_graph=qnn_intermediate_debugger.reference_graph_name,
                     )
-                for event_block in inspector.event_blocks:
-                    if event_block.name == "Execute":
-                        self.assertEqual(
-                            len(event_block.events),
-                            expected_intermediate_events,
-                            msg=f"Unexpected number of intermediate events, expecting {expected_intermediate_events}, but has {len(event_block.events)} events.",
-                        )
+                )
+                numeric_results = numeric_results.set_index("runtime_debug_handle")
+                assert (
+                    len(numeric_results) == expected_compared_events
+                ), f"Unexpected number of compared events, expecting {expected_compared_events}, but has {len(numeric_results)} events."
+                for _, row in numeric_results.iterrows():
+                    assert cos_comparator.is_valid_score(
+                        row.gap[0]
+                    ), f"Node {row.aot_ops} is failing {cos_comparator.metric_name()} test, {row.gap[0]} is lower than {cos_comparator.threshold}."
 
             processed_inputs = list(sample_inputs)
             for i, enc in enumerate(input_encodings):
@@ -445,7 +444,7 @@ def validate_intermediate_tensor():
                     "--method_index",
                     str(method_index),
                 ]
-                if self.dump_intermediate_outputs:
+                if expected_compared_events != -1:
                     cmd.append("--dump_intermediate_outputs")
                 cmd += extra_cmds.split()
 
@@ -496,7 +495,7 @@ def validate_intermediate_tensor():
                 if expected_profile_events != -1:
                     validate_profile()
 
-                if expected_intermediate_events != -1:
+                if expected_compared_events != -1:
                     validate_intermediate_tensor()
 
                 if save_inference_speed:
@@ -513,7 +512,7 @@ def validate_intermediate_tensor():
                     device=self.device,
                     host=self.host,
                     soc_model=self.soc_model,
-                    dump_intermediate_outputs=expected_intermediate_events != -1,
+                    dump_intermediate_outputs=expected_compared_events != -1,
                     direct_build_folder=self.direct_build_folder,
                 )
 
@@ -552,7 +551,7 @@ def validate_intermediate_tensor():
                 if expected_profile_events != -1:
                     adb.pull_etdump(etdump_path, callback=validate_profile)
 
-                if expected_intermediate_events != -1:
+                if expected_compared_events != -1:
                     adb.pull_debug_output(
                         etdump_path,
                         debug_output_path,
@@ -570,7 +569,6 @@ def lower_module_and_test_output(
         sample_inputs: Tuple[torch.Tensor],
         expected_partitions: int = 1,
         expected_profile_events: int = -1,
-        expected_intermediate_events: int = -1,
         expected_compared_events: int = -1,
         assert_output_equal: bool = True,
         passes_job: Optional[OrderedDict] = None,
@@ -592,27 +590,9 @@ def lower_module_and_test_output(
             skip_node_op_set=skip_node_op_set,
             skip_mutable_buffer=skip_mutable_buffer,
             generate_etrecord=self.profile_level != 0
-            or expected_intermediate_events != -1,
+            or expected_compared_events != -1,
         )
 
-        qnn_intermediate_debugger = None
-        if expected_intermediate_events != -1:
-            lowered_module_nodes = get_delegates(
-                delegated_program.exported_program().graph
-            )
-            assert len(lowered_module_nodes) == 1, "Length not correct"
-
-            lowered_module_node = lowered_module_nodes[0]
-            lower_module = getattr(
-                delegated_program.exported_program().graph_module,
-                lowered_module_node.name,
-            )
-            edge_module = lower_module.original_module.module()
-
-            qnn_intermediate_debugger = QNNIntermediateDebugger()
-            qnn_intermediate_debugger.set_edge_module(edge_module=edge_module)
-            qnn_intermediate_debugger.intermediate_output_module(*sample_inputs)
-
         # Don't allocate if shared_buffer enabled or using direct_mode
         allocate_io = not (self.shared_buffer or self.direct_build_folder)
         exec_prog = delegated_program.to_executorch(
@@ -643,11 +623,24 @@ def lower_module_and_test_output(
         etrecord_path = "etrecord.bin"
         if self.profile_level:
             exec_prog.get_etrecord().save(etrecord_path)
+
+        qnn_intermediate_debugger = None
+        if expected_compared_events != -1:
+            etrecord = exec_prog.get_etrecord()
+            qnn_intermediate_debugger = QNNIntermediateDebugger(sample_inputs)
+            qnn_intermediate_debugger.set_etrecord_file_path(etrecord_path)
+            edge_ep = etrecord.graph_map[qnn_intermediate_debugger.reference_graph_name]
+            qnn_intermediate_debugger.set_edge_ep(edge_ep=edge_ep)
+            etrecord.update_representative_inputs(
+                qnn_intermediate_debugger.sample_input
+            )
+            etrecord.save(etrecord_path)
+
         # Check numerics
         if (
             assert_output_equal
             or expected_profile_events != -1
-            or expected_intermediate_events != -1
+            or expected_compared_events != -1
         ):
             self.verify_output(
                 module=module,
@@ -655,7 +648,6 @@ def lower_module_and_test_output(
                 executorch_prog=exec_prog,
                 etrecord_path=etrecord_path,
                 expected_profile_events=expected_profile_events,
-                expected_intermediate_events=expected_intermediate_events,
                 extra_cmds=extra_cmds,
                 output_callback=output_callback,
                 save_inference_speed=save_inference_speed,
diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py
index b79fed256c1..3a07e01a09b 100644
--- a/backends/qualcomm/utils/constants.py
+++ b/backends/qualcomm/utils/constants.py
@@ -41,6 +41,7 @@
 QCOM_ZERO_POINTS = "zero_points"
 QCOM_PASS_ACTIVATE_KEY = "activate"
 QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY = "args_kwargs_defaults"
+QCOM_TENSOR_NAME = "qnn_tensor_name"
 
 # constants in backends/qualcomm/tests
 QCOM_ANNOTATION = "annotation"
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index da81a5df09f..f38950314a7 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -400,6 +400,21 @@ def ensure_graph_specific_dict(value, graph_names):
             return value
         return {graph_name: value for graph_name in graph_names}
 
+    # When the intermediate debugger is enabled, ensure only 1 method is lowered.
+    # This restriction is caused by conflicting handle_ids among graphs.
+    # It could be lifted by generating random debug_ids (e.g., uuid).
+    for compiler_spec in (
+        compiler_specs.values()
+        if isinstance(compiler_specs, Dict)
+        else [compiler_specs]
+    ):
+        option = generate_qnn_executorch_option(compiler_spec)
+        obj_options = flatbuffer_to_option(option)
+        if obj_options.dump_intermediate_outputs and isinstance(module, Dict):
+            assert (
+                len(module) == 1
+            ), "Intermediate Tensor Dump does not support multi-methods."
+
     if not isinstance(module, dict):
         module = {"forward": module}
 
diff --git a/devtools/inspector/_inspector.py b/devtools/inspector/_inspector.py
index e9fbc4778f5..b61f72ae797 100644
--- a/devtools/inspector/_inspector.py
+++ b/devtools/inspector/_inspector.py
@@ -1025,6 +1025,7 @@ def __init__(
             Callable[[Union[int, str], Union[int, float]], Union[int, float]]
         ] = None,
         enable_module_hierarchy: bool = False,
+        reference_graph_name: str = EDGE_DIALECT_GRAPH_KEY,
     ) -> None:
         r"""
         Initialize an `Inspector` instance with the underlying `EventBlock`\ s populated with data from the provided ETDump path or binary,
@@ -1040,6 +1041,7 @@ def __init__(
             delegate_metadata_parser: Optional function to parse delegate metadata from an Profiling Event. Expected signature of the function is (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]].
             delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of target_time_scale/source_time_scale.
             enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False.
+            reference_graph_name: Key of the graph in `op_graph_dict` used as the reference when consuming the ETRecord. Defaults to EDGE_DIALECT_GRAPH_KEY.
 
         Returns:
             None
@@ -1104,9 +1106,9 @@ def __init__(
         # Key str is method name; value is list of ProgramOutputs because of list of test cases
         self._reference_outputs: Dict[str, List[ProgramOutput]] = {}
         self._enable_module_hierarchy = enable_module_hierarchy
-        self._consume_etrecord()
+        self._consume_etrecord(reference_graph_name)
 
-    def _consume_etrecord(self) -> None:
+    def _consume_etrecord(self, reference_graph_name) -> None:
         """
         If an ETRecord is provided, connect it to the EventBlocks and populate the Event metadata.
 
@@ -1147,7 +1149,7 @@ def _consume_etrecord(self) -> None:
             enable_module_hierarchy=self._enable_module_hierarchy,
         )
         debug_handle_to_op_node_map = create_debug_handle_to_op_node_mapping(
-            self.op_graph_dict[EDGE_DIALECT_GRAPH_KEY],
+            self.op_graph_dict[reference_graph_name],
         )
         for event_block in self.event_blocks:
             for event in event_block.events:
diff --git a/devtools/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py
index 556987e4bbf..5bece480f76 100644
--- a/devtools/inspector/_inspector_utils.py
+++ b/devtools/inspector/_inspector_utils.py
@@ -65,7 +65,7 @@
 ]
 EXCLUDED_EVENTS_WHEN_PRINTING = {"OPERATOR_CALL"}
 
-EXCLUDED_EVENTS_FOR_INTERMEDIATE_OUTPUT = {"OPERATOR_CALL"}
+EXCLUDED_EVENTS_FOR_INTERMEDIATE_OUTPUT = {"OPERATOR_CALL", "DELEGATE_CALL"}
 
 
 class TimeScale(Enum):
diff --git a/devtools/inspector/numerical_comparator/numerical_comparator_base.py b/devtools/inspector/numerical_comparator/numerical_comparator_base.py
index c4f8a90f78f..c9d3629f3fd 100644
--- a/devtools/inspector/numerical_comparator/numerical_comparator_base.py
+++ b/devtools/inspector/numerical_comparator/numerical_comparator_base.py
@@ -246,6 +246,7 @@ def compare(
                 continue
             rows.append(
                 {
+                    "aot_debug_handle": aot_debug_handle,
                     "aot_ops": find_op_names(
                         aot_debug_handle, aot_debug_handle_to_op_names
                     ),
@@ -253,6 +254,7 @@ def compare(
                     "runtime_ops": find_op_names(
                         runtime_debug_handle, runtime_debug_handle_to_op_names
                     ),
+                    "runtime_debug_handle": runtime_debug_handle,
                     "runtime_intermediate_output": runtime_intermediate_output,
                     "gap": self._compare_intermediate_outputs(
                         aot_intermediate_output, runtime_intermediate_output
diff --git a/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py b/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py
index 07dfc9c9558..89d627379e1 100644
--- a/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py
+++ b/examples/qualcomm/util_scripts/qnn_intermediate_debugger_demo.py
@@ -7,15 +7,14 @@
 import json
 import os
 from multiprocessing.connection import Client
-from typing import Any, Tuple
 
 import numpy as np
 
 import torch
-import torch.nn.functional as F
-from executorch.backends.qualcomm.debugger.metrics_evaluator import (
-    CosineSimilarityEvaluator,
-    MetricEvaluatorBase,
+
+from executorch.backends.qualcomm.debugger.qcom_numerical_comparator_sample import (
+    QcomCosineSimilarityComparator,
+    QcomMSEComparator,
 )
 from executorch.backends.qualcomm.debugger.qnn_intermediate_debugger import (
     OutputFormat,
@@ -29,7 +28,6 @@
     SimpleADB,
 )
 from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
-from executorch.devtools import Inspector
 from executorch.examples.models.inception_v3.model import InceptionV3Model
 from executorch.examples.qualcomm.utils import (
     get_imagenet_dataset,
@@ -62,12 +60,13 @@ def main(args):
         data_size=data_num,
         image_shape=(256, 256),
         crop_size=224,
+        shuffle=False,
     )
     pte_filename = "ic3_qnn_debug"
     instance = InceptionV3Model()
     source_model = instance.get_eager_model().eval()
     # Init our QNNIntermediateDebugger and pass it in to build_executorch_binary().
-    qnn_intermediate_debugger = QNNIntermediateDebugger()
+    qnn_intermediate_debugger = QNNIntermediateDebugger(sample_input=inputs[0])
     build_executorch_binary(
         model=source_model,
         qnn_config=qnn_config,
@@ -98,64 +97,52 @@ def main(args):
     output_data_folder = f"{args.artifact}/outputs"
     make_output_dir(output_data_folder)
 
-    class RootMeanSquaredErrorEvaluator(MetricEvaluatorBase):
-        def __init__(self, threshold=0.02):
-            self.threshold = threshold
-
-        def metric_name(self) -> str:
-            return "Root Mean Squared Error"
-
-        def evaluate(
-            self, qnn_output: torch.Tensor, cpu_output: torch.Tensor
-        ) -> Tuple[Any, bool]:
-            mse = F.mse_loss(qnn_output, cpu_output)
-            rmse = torch.sqrt(mse)
-            valid = rmse < self.threshold
-            return rmse, valid
-
     # We will pull the debug output and provide them to the Inspector class.
     # We can then provide our own metrics and output type to generate the intermediate debugging results.
     def validate_intermediate_tensor():
-        inspector = Inspector(
+        qnn_intermediate_debugger.setup_inspector(
             etdump_path=f"{args.artifact}/etdump.etdp",
             debug_buffer_path=f"{args.artifact}/debug_output.bin",
         )
 
-        edge_result = qnn_intermediate_debugger.intermediate_output_module(
-            *(inputs[0])
-        )[0]
+        edge_result = qnn_intermediate_debugger.edge_ep.module()(
+            *(qnn_intermediate_debugger.sample_input)
+        )
 
-        # Optional: Ensures that edge module accuracy aligns with nn.Module
+        # Highly recommended: ensure that the edge module's accuracy aligns with nn.Module
         with torch.no_grad():
-            source_result = source_model(*(inputs[0]))
+            source_result = source_model(*(qnn_intermediate_debugger.sample_input))
             score = torch.nn.functional.cosine_similarity(
                 edge_result.flatten(), source_result.flatten(), dim=0
             ).item()
             print("Cosine Similarity Score between nn.Module and Edge CPU is: ", score)
-
         # Users can generate multiple comparison metrics in a single execution.
-        # Below, we generate 3 metrics.
+
+        cos_comparator = qnn_intermediate_debugger.create_comparator(
+            QcomCosineSimilarityComparator, threshold=0.9
+        )
         qnn_intermediate_debugger.generate_results(
             title="ic3_cos_similarity_debugging_graph",
             path=args.artifact,
-            output_format=OutputFormat.SVG_GRAPHS,
-            inspector=inspector,
-            evaluator=CosineSimilarityEvaluator(0.9),
+            output_format=OutputFormat.SVG_GRAPH,
+            comparator=cos_comparator,
         )
+
         qnn_intermediate_debugger.generate_results(
-            title="ic3_cos_similarity_csv",
+            title="ic3_cos_similarity_debugging_graph",
             path=args.artifact,
-            output_format=OutputFormat.CSV_FILES,
-            inspector=inspector,
-            evaluator=CosineSimilarityEvaluator(0.9),
+            output_format=OutputFormat.CSV_FILE,
+            comparator=cos_comparator,
+        )
+
+        mse_comparator = qnn_intermediate_debugger.create_comparator(
+            QcomMSEComparator, threshold=0.1
         )
-        # Using self defined metrics to print svg graphs
         qnn_intermediate_debugger.generate_results(
-            title="ic3_rmse_debugging_graph",
+            title="ic3_mse_debugging_graph",
             path=args.artifact,
-            output_format=OutputFormat.SVG_GRAPHS,
-            inspector=inspector,
-            evaluator=RootMeanSquaredErrorEvaluator(0.9),
+            output_format=OutputFormat.SVG_GRAPH,
+            comparator=mse_comparator,
         )
 
     adb.pull_debug_output(
@@ -180,8 +167,8 @@ def validate_intermediate_tensor():
             conn.send(
                 json.dumps(
                     {
-                        "svg_path": f"{args.artifact}/ic3_rmse_debugging_graph.svg",
-                        "csv_path": f"{args.artifact}/ic3_cos_similarity_csv.csv",
+                        "svg_path": f"{args.artifact}/ic3_mse_debugging_graph.svg",
+                        "csv_path": f"{args.artifact}/ic3_cos_similarity_debugging_graph.csv",
                     }
                 )
             )