Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOGS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Change Logs
0.9.3
+++++

* :pr:`422`: add remove_inputs to InputObserver
* :pr:`421`: fix a few patches for MoE

0.9.2
+++++

Expand Down
2 changes: 1 addition & 1 deletion _doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def linkcode_resolve(domain, info):

if int(os.environ.get("UNITTEST_GOING", "0")):
sphinx_gallery_conf["ignore_pattern"] = (
".*((tiny_llm)|(dort)|(draft_mode)|(hub_codellama.py)|(whisper)|(optimind)).*"
".*((tiny_llm)|(dort)|(draft_mode)|(hub_codellama.py)|(whisper)|(optimind)|(export_with_modelbuilder)).*"
)
elif pv.Version(torch.__version__) < pv.Version("2.8"):
sphinx_gallery_conf["ignore_pattern"] = ".*((_oe_)|(dort)|(draft_mode)).*"
Expand Down
128 changes: 128 additions & 0 deletions _doc/examples/plot_export_with_modelbuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""
.. _l-plot-export-model-builder:

Export with ModelBuilder
========================

"""

import sys
import os
import pandas
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from onnx_diagnostic import doc
from onnx_diagnostic.investigate.input_observer import InputObserver
from onnx_diagnostic.helpers.rt_helper import onnx_generate
from onnx_diagnostic.torch_export_patches import (
register_additional_serialization_functions,
torch_export_patches,
)
from onnx_diagnostic.export.api import to_onnx


def generate_text(
    prompt,
    model,
    tokenizer,
    max_length=50,
    temperature=0.01,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    device="cpu",
):
    """Run ``model.generate`` on *prompt* and return the decoded text.

    The prompt is tokenized, moved to *device*, generated with the given
    sampling parameters, and the first sequence is decoded back to a string.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    token_ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    sequences = model.generate(
        input_ids=token_ids,
        attention_mask=mask,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
    )

    # Only the first generated sequence is returned.
    return tokenizer.decode(sequences[0], skip_special_tokens=True)


# %%
# filename for the model
# The model id may be overridden on the command line; default is a tiny LLM.
MODEL_NAME = sys.argv[1] if len(sys.argv) > 1 else "arnir0/Tiny-LLM"
cache_dir = "dump_modelbuilder"
os.makedirs(cache_dir, exist_ok=True)
# '/' is not valid in a file name, replace it to build the ONNX file name.
name = MODEL_NAME.replace("/", "_")
filename = os.path.join(cache_dir, f"plot_export_with_modelbuilder_{name}.onnx")


# %%
# Creating the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if os.path.exists(filename):
    # The ONNX file already exists: only the configuration is needed later.
    config = AutoConfig.from_pretrained(MODEL_NAME)
else:
    print(f"-- creating... on {device} into {filename!r}")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=torch.bfloat16)
    model = model.to(device)
    config = model.config


# %%
# Capturing inputs/outputs to infer dynamic shapes and arguments
print("-- capturing...")
prompt = "Continue: it rains, what should I do?"
if not os.path.exists(filename):
    # The observer records every call to the model's forward method so that
    # dynamic shapes and example arguments can be inferred afterwards.
    observer = InputObserver()
    with register_additional_serialization_functions(patch_transformers=True):
        with observer(model):
            generate_text(prompt, model, tokenizer, device=device)


# %%
# Exporting.
if not os.path.exists(filename):
    print("-- exporting...")
    # These inputs are not expected by the ModelBuilder exporter.
    observer.remove_inputs(["cache_position", "logits_to_keep", "position_ids"])
    dyn_shapes = observer.infer_dynamic_shapes(set_batch_dimension_for=True)
    export_kwargs = observer.infer_arguments()

    with torch_export_patches(patch_transformers=True):
        to_onnx(
            model,
            filename=filename,
            kwargs=export_kwargs,
            dynamic_shapes=dyn_shapes,
            exporter="modelbuilder",
        )

    # Compare the exported model against the recorded calls.
    discrepancies = observer.check_discrepancies(filename, progress_bar=True)
    print(pandas.DataFrame(discrepancies))

# %%
# ONNX Prompt
# +++++++++++
print("-- ONNX prompts...")
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

onnx_tokens = onnx_generate(
    filename,
    input_ids=input_ids,
    attention_mask=attention_mask,
    eos_token_id=config.eos_token_id,
    max_new_tokens=50,
)
onnx_generated_text = tokenizer.decode(onnx_tokens, skip_special_tokens=True)

print("-----------------")
# ``tokenizer.decode`` returns a string; joining it with "\n" would print
# one character per line, so print it directly.
print(onnx_generated_text)
print("-----------------")

# %%
# Render the ONNX graph, only for small models (< 16 KiB on disk).
if os.stat(filename).st_size < 2**14:
    doc.save_fig(
        doc.plot_dot(filename),
        f"plot_export_with_modelbuilder_{name}.png",
        dpi=400,
    )
2 changes: 1 addition & 1 deletion _doc/technical/plot_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
model_id = "microsoft/phi-1_5"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
config = get_pretrained_config(model_id)
task = task = task_from_id(model_id)
Expand Down
106 changes: 106 additions & 0 deletions _unittests/ut_investigate/test_input_observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1196,6 +1196,112 @@ def forward(self, a, *args, **kwargs):
)
torch.export.export(model, args, kwargs=kwargs, dynamic_shapes=ds)

def test_remove_inputs_kwargs(self):
    """remove_inputs drops one keyword input from the recorded calls."""

    class Model(torch.nn.Module):
        def forward(self, x, y, z=None):
            total = x + y
            return total if z is None else total + z

    calls = [
        dict(x=torch.randn((5, 6)), y=torch.randn((1, 6)), z=torch.randn((5, 6))),
        dict(x=torch.randn((7, 7)), y=torch.randn((1, 7)), z=torch.randn((7, 7))),
        dict(x=torch.randn((7, 8)), y=torch.randn((1, 8)), z=torch.randn((7, 8))),
    ]

    model = Model()
    observer = InputObserver()
    with observer(model):
        for kwargs in calls:
            model(**kwargs)
    self.assertEqual(len(observer.info), 3)

    dyn = torch.export.Dim.DYNAMIC
    before = observer.infer_dynamic_shapes()
    for key in ("z", "x", "y"):
        self.assertIn(key, before)

    # Drop the optional input and check it disappears everywhere.
    observer.remove_inputs(["z"])

    after = observer.infer_dynamic_shapes()
    self.assertNotIn("z", after)
    self.assertIn("x", after)
    self.assertIn("y", after)
    # y keeps a static batch dimension (always 1 in the recorded calls).
    self.assertEqual(dict(x={0: dyn, 1: dyn}, y={1: dyn}), after)

    arguments = observer.infer_arguments()
    self.assertIsInstance(arguments, dict)
    self.assertNotIn("z", arguments)
    self.assertIn("x", arguments)
    self.assertIn("y", arguments)

def test_remove_inputs_multiple_kwargs(self):
    """remove_inputs drops several keyword inputs in a single call."""

    class Model(torch.nn.Module):
        def forward(self, x, y, z=None, w=None):
            total = x + y
            if z is not None:
                total = total + z
            if w is not None:
                total = total + w
            return total

    # Three calls with varying shapes so both dimensions of x become dynamic.
    calls = []
    for rows, cols in ((5, 6), (6, 7), (7, 8)):
        calls.append(
            dict(
                x=torch.randn((rows, cols)),
                y=torch.randn((1, cols)),
                z=torch.randn((rows, cols)),
                w=torch.randn((1, cols)),
            )
        )

    model = Model()
    observer = InputObserver()
    with observer(model):
        for kwargs in calls:
            model(**kwargs)
    self.assertEqual(len(observer.info), 3)

    dyn = torch.export.Dim.DYNAMIC
    before = observer.infer_dynamic_shapes()
    self.assertIn("z", before)
    self.assertIn("w", before)

    # Drop both optional inputs at once.
    observer.remove_inputs(["z", "w"])

    after = observer.infer_dynamic_shapes()
    self.assertNotIn("z", after)
    self.assertNotIn("w", after)
    self.assertIn("x", after)
    self.assertIn("y", after)
    self.assertEqual(dict(x={0: dyn, 1: dyn}, y={1: dyn}), after)

    arguments = observer.infer_arguments()
    self.assertIsInstance(arguments, dict)
    self.assertNotIn("z", arguments)
    self.assertNotIn("w", arguments)
    self.assertIn("x", arguments)
    self.assertIn("y", arguments)


# Allow running this test file directly with ``python``.
if __name__ == "__main__":
    unittest.main(verbosity=2)
10 changes: 5 additions & 5 deletions _unittests/ut_tasks/try_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def test_text_generation_phi4_moe(self):
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map="cuda",
torch_dtype="auto",
dtype="auto",
trust_remote_code=True,
# if you do not use Ampere or later GPUs, change attention to "eager"
# _attn_implementation='flash_attention_2',
Expand Down Expand Up @@ -352,7 +352,7 @@ def test_imagetext2text_generation_idefics(self):
mid = "HuggingFaceM4/tiny-random-idefics"
processor = AutoProcessor.from_pretrained(mid)
model = IdeficsForVisionText2Text.from_pretrained(
mid, torch_dtype=torch.bfloat16, device_map="auto"
mid, dtype=torch.bfloat16, device_map="auto"
)

prompt = [
Expand Down Expand Up @@ -699,7 +699,7 @@ def test_falcon_mamba_dev(self):
"text-generation",
model=model,
tokenizer=tokenizer,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
trust_remote_code=True,
device_map="auto",
)
Expand Down Expand Up @@ -736,7 +736,7 @@ def test_falcon_mamba_7b(self):
"text-generation",
model=model,
tokenizer=tokenizer,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
trust_remote_code=True,
device_map="auto",
)
Expand Down Expand Up @@ -802,7 +802,7 @@ def test_text_to_image(self):
from diffusers import StableDiffusionPipeline

model_id = "diffusers/tiny-torch-full-checker" # "stabilityai/stable-diffusion-2"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(
pipe = StableDiffusionPipeline.from_pretrained(model_id, dtype=torch.float16).to(
"cuda"
)

Expand Down
3 changes: 3 additions & 0 deletions _unittests/ut_xrun_doc/test_documentation_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def add_test_methods(cls):

# transformers

if not reason and name in {"plot_export_with_modelbuilder.py"}:
reason = "downloading"

if (
not reason
and name in {"plot_export_tiny_llm.py", "plot_export_tiny_llm_patched.py"}
Expand Down
2 changes: 1 addition & 1 deletion onnx_diagnostic/ci_models/export_phi4_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,7 @@ def main(
model_id,
config=config,
trust_remote_code=True,
torch_dtype=torch_dtype,
dtype=torch_dtype,
device_map=device,
attn_implementation="sdpa",
).eval()
Expand Down
7 changes: 6 additions & 1 deletion onnx_diagnostic/export/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import time
from collections.abc import Mapping, Iterable
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
import onnx
import torch
from .dynamic_shapes import ModelInputs
from .onnx_plug import EagerDirectReplacementWithOnnx
Expand Down Expand Up @@ -312,10 +313,14 @@ def to_onnx(
mod,
precision=str(first_float[0].dtype).split(".")[-1],
execution_provider="cuda" if first.is_cuda else "cpu",
cache_dir=os.path.dirname(filename),
cache_dir=os.path.dirname(filename) or ".",
**(exporter_kwargs or {}),
)
save_model_builder(onx, os.path.dirname(filename))
temp_filename = os.path.join(os.path.dirname(filename), "model.onnx")
# renaming
onx = onnx.load(temp_filename, load_external_data=True)
onnx.save(onx, filename, save_as_external_data=True)
Comment on lines +316 to +323
Copy link

Copilot AI Feb 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the modelbuilder exporter, save_model_builder(onx, os.path.dirname(filename)) passes an empty string when filename has no directory, which makes save_model_builder return an in-memory proto without writing a file. The subsequent onnx.load(temp_filename) will then fail. Use a normalized output dir (e.g., out_dir = os.path.dirname(filename) or ".") consistently for cache_dir, save_model_builder, and temp file paths.

Copilot uses AI. Check for mistakes.
return onx

raise ValueError(f"Unknown exporter={exporter!r}")
Expand Down
Loading
Loading