Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
1b1a090
implement remote gguf inspection on hugging face
pipe1os Jun 27, 2026
71ef3a3
split print_model_info test to comply with codacy method size limit
pipe1os Jun 27, 2026
d0c5474
fix codacy issues: add read limit, honor gpu_util, modularize hf pars…
pipe1os Jun 27, 2026
bebe2c1
refactor: split concurrent shards fetching to lower cyclomatic comple…
pipe1os Jun 27, 2026
6555e0e
fix codacy issues: compute GGUF group variant overhead dynamically
pipe1os Jun 27, 2026
357ee16
docs: document remote gguf inspection options in README.md
pipe1os Jun 27, 2026
5a8e6e2
fix: strip trailing slashes from model paths at entrypoint
pipe1os Jun 27, 2026
7dc8576
fix: handle reverse tensor shape ordering for gguf shape guessing
pipe1os Jun 27, 2026
0ef126b
fix: treat paths starting with local prefix as local files to prevent…
pipe1os Jun 27, 2026
b0b9744
fix: handle concurrent remote shard download failures gracefully
pipe1os Jun 27, 2026
9ba40d0
fix: resolve safetensors shard index prefix splitting
pipe1os Jun 27, 2026
738e291
merge: integrate branch 004-fix-safetensors-shard-prefix
pipe1os Jun 27, 2026
1b1c58f
merge: integrate branch 005-graceful-shard-downloads
pipe1os Jun 27, 2026
56bbf66
merge: integrate branch 006-fix-gguf-shape-guessing
pipe1os Jun 27, 2026
0cfe2e5
merge: integrate branch 007-refine-remote-detection
pipe1os Jun 27, 2026
2deb25a
merge: integrate branch 008-fix-comparison-trailing-slash
pipe1os Jun 27, 2026
32bf12b
fix: address codacy review feedback on disk size, regex, path parsing…
pipe1os Jun 27, 2026
65ab0c4
merge: sync with main and apply updates
pipe1os Jun 27, 2026
9649242
merge: sync with main
pipe1os Jun 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/modelinfo/architecture.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None)

metadata = tensors.get("__metadata__", {})
gen_arch = metadata.get("general.architecture")
is_gguf = "general.architecture" in metadata or any(k.startswith("general.") for k in metadata.keys())

# 1. Attempt explicit GGUF metadata
if gen_arch:
Expand Down Expand Up @@ -68,14 +69,14 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None)
found_k_proj = True
shape = meta.get("shape", [])
if len(shape) >= 2:
kv_dim = shape[0]
kv_dim = shape[-1] if is_gguf else shape[0]

if "qkv_proj.weight" in name or "c_attn.weight" in name:
found_fused = True
if not found_k_proj:
shape = meta.get("shape", [])
if len(shape) >= 2:
kv_dim = shape[0] // 3
kv_dim = (shape[-1] if is_gguf else shape[0]) // 3

num_layers = len(layers_set)
if found_fused and not found_k_proj and kv_dim > 0:
Expand Down
23 changes: 19 additions & 4 deletions src/modelinfo/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,15 @@ def analyze_model(

is_remote = False
if not os.path.exists(file_path):
if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
is_remote = True
# ponytail: prevent routing explicit local paths or typos to HF
is_local_path = (
file_path.startswith((".", "/", "~"))
or os.path.isabs(file_path)
)
if not is_local_path:
# Treat as remote only if it contains a slash and does not end with a model extension
if "/" in file_path and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
is_remote = True

if is_remote:
from modelinfo.parsers.huggingface import fetch_huggingface_repo
Expand Down Expand Up @@ -212,8 +219,12 @@ def analyze_model(
num_layers = footprint["num_layers"]
arch_name = identify_architecture_name(tensors, num_layers, config)

if os.path.exists(file_path):
disk_size = os.path.getsize(file_path)
if not is_remote:
metadata = tensors.get("__metadata__", {})
if metadata.get("is_sharded") and "disk_size" in metadata:
disk_size = metadata["disk_size"]
elif os.path.exists(file_path):
disk_size = os.path.getsize(file_path)

tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])

Expand All @@ -240,6 +251,10 @@ def analyze_model(
def main(argv: Sequence[str] | None = None) -> int:
args = parse_args(argv)

# Strip trailing slashes from paths/repos to prevent empty basenames and routing issues
if args.file:
args.file = [path.rstrip("/\\") for path in args.file if path]

gpu_name_display = None
gpu_vram_gb = None
gpu_count = 1
Expand Down
26 changes: 17 additions & 9 deletions src/modelinfo/parsers/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,19 +212,27 @@ def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]]
return tensors


def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Dict[str, Any]:
def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Tuple[Dict[str, Any], int]:
def fetch_shard(shard: str):
return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
try:
header = _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
return shard, header, None
except Exception as e:
return shard, {}, e

tensors = {}
missing_shards = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
for future in concurrent.futures.as_completed(future_to_shard):
shard, shard_header = future.result()
for k, v in shard_header.items():
if k != "__metadata__":
tensors[k] = v
return tensors
shard, shard_header, error = future.result()
if error is not None:
missing_shards += 1
else:
for k, v in shard_header.items():
if k != "__metadata__":
tensors[k] = v
return tensors, missing_shards


def _fetch_remote_safetensors_sharded(
Expand Down Expand Up @@ -253,9 +261,9 @@ def _fetch_remote_safetensors_sharded(
"total_size": total_size
}
else:
tensors = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout)
tensors, missing_shards = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout)
tensors["__metadata__"] = {
"missing_shards": 0,
"missing_shards": missing_shards,
"total_shards": len(unique_shards),
"is_sharded": True
}
Expand Down
14 changes: 12 additions & 2 deletions src/modelinfo/parsers/safetensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,13 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
if path.endswith(".index.json"):
is_index = True
elif "-of-" in base_name and path.endswith(".safetensors"):
prefix = base_name.split("-")[0]
import re
match = re.match(r"^(.*?)-\d+-of-\d+\.safetensors$", base_name)
if match:
prefix = match.group(1)
else:
# Fallback to splitting in case of non-standard shard formatting
prefix = base_name.split("-")[0]
potential_index = os.path.join(dir_path, f"{prefix}.safetensors.index.json")
if os.path.exists(potential_index):
index_path = potential_index
Expand All @@ -49,9 +55,12 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
tensors = {}
missing_shards = 0
total_shards = len(unique_shards)
total_size = 0

for shard in unique_shards:
shard_path = os.path.join(dir_path, shard)
if os.path.exists(shard_path):
total_size += os.path.getsize(shard_path)
try:
shard_header = _read_single_header(shard_path)
for k, v in shard_header.items():
Expand All @@ -63,7 +72,8 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
tensors["__metadata__"] = {
"missing_shards": missing_shards,
"total_shards": total_shards,
"is_sharded": True
"is_sharded": True,
"disk_size": total_size
}

return tensors
43 changes: 43 additions & 0 deletions tests/test_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,46 @@ def test_vllm_capacity_simulation():
bytes_per_token = 40960
expected_capacity = math.floor(metrics["paged_kv_pool"] / bytes_per_token)
assert metrics["max_serving_capacity"] == expected_capacity


def test_gguf_shape_guessing_fallback():
    """Shape guessing must read kv_dim from shape[-1] for GGUF tensors.

    The metadata carries only ``general.architecture`` and no explicit KV
    keys, so the values have to come from the k_proj tensor shapes.
    """
    from modelinfo.architecture import extract_architecture

    tensors = {"__metadata__": {"general.architecture": "llama"}}
    for layer_index in (0, 1):
        # Build a fresh entry per layer so no shape list is shared.
        tensors[f"model.layers.{layer_index}.self_attn.k_proj.weight"] = {
            "shape": [4096, 1024],
            "dtype": "F16",
        }

    num_layers, kv_dim, is_estimate = extract_architecture(tensors)

    assert num_layers == 2
    assert kv_dim == 1024
    assert is_estimate is False

def test_gguf_shape_guessing_fallback_fused():
    """Fused QKV shape guessing must use ``shape[-1] // 3`` for GGUF tensors."""
    from modelinfo.architecture import extract_architecture

    gguf_metadata = {"general.architecture": "gpt2"}
    fused_qkv = {"shape": [4096, 3072], "dtype": "F16"}
    tensors = {
        "__metadata__": gguf_metadata,
        "model.layers.0.self_attn.qkv_proj.weight": fused_qkv,
    }

    num_layers, kv_dim, is_estimate = extract_architecture(tensors)

    # 3072 // 3 == 1024; the test expects the fused path to be flagged as an estimate.
    assert num_layers == 1
    assert kv_dim == 1024
    assert is_estimate is True
89 changes: 76 additions & 13 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,19 +187,7 @@ def fake_exists(path):
return False

def fake_fetch(repo_id, *, fetch_tensors, timeout):
tensors = {
"__metadata__": {
"general.architecture": "llama",
"llama.block_count": 32,
"llama.attention.head_count_kv": 8,
"llama.attention.key_length": 128,
"gguf_variants": [
{"filename": "model-q4.gguf", "size": 1000000000},
{"filename": "model-q8.gguf", "size": 2000000000}
],
"repo_id": "org/model-gguf"
}
}
tensors, _ = _get_mock_gguf_group_data()
return tensors, None, "GGUF_group", 0.0

monkeypatch.setattr(cli.os.path, "exists", fake_exists)
Expand Down Expand Up @@ -304,4 +292,79 @@ def test_print_model_info_gguf_group_with_gpu(capsys):
assert "model-q8.gguf" in out
assert "Fits" in out

def test_analyze_model_local_path_routing(monkeypatch):
    """Local-looking paths must raise instead of being routed to Hugging Face.

    Paths with a local prefix (``./``, ``../``, ``/``, ``~/``) that do not
    exist are expected to fail locally, while ``org/name`` style identifiers
    are expected to reach ``fetch_huggingface_repo``.
    """
    from modelinfo.parsers import huggingface

    requested_repos = []

    def record_fetch(repo_id, *, fetch_tensors, timeout):
        # Stand-in for the real fetch: only records which repo was requested.
        requested_repos.append(repo_id)
        return {}, None, "SafeTensors", 0.0

    monkeypatch.setattr(huggingface, "fetch_huggingface_repo", record_fetch)

    # Inputs that should NOT hit Hugging Face.
    expected_errors = (FileNotFoundError, ValueError, OSError)
    for local_path in ("./missing.gguf", "../missing.safetensors", "/missing.bin", "~/missing.pt"):
        with pytest.raises(expected_errors):
            cli.analyze_model(local_path, context_override=128)

    assert not requested_repos, f"Hugging Face fetch was triggered for local paths: {requested_repos}"

    # Inputs that SHOULD hit Hugging Face.
    remote_paths = ["meta-llama/Llama-2-7b-hf", "org/model"]
    for repo in remote_paths:
        try:
            cli.analyze_model(repo, context_override=128)
        except Exception:
            # Later calculation may fail on the empty dict returned by the
            # stand-in; only the fact that the fetch was triggered matters.
            pass

    assert requested_repos == remote_paths


def test_cli_strips_trailing_slashes_from_model_paths(monkeypatch):
    """``cli.main`` must hand paths to ``analyze_model`` without trailing slashes."""
    seen_paths = []

    def fake_analyze_model(file_path, *args, **kwargs):
        # Record the path exactly as main() passed it, then return a canned result.
        seen_paths.append(file_path)
        footprint = {
            "total_params": 100,
            "base_memory_bytes": 200,
            "kv_cache_bytes": 100,
            "overhead_bytes": 50,
            "total_memory_bytes": 350,
            "num_layers": 1,
        }
        return {
            "format_name": "GGUF",
            "arch_name": "Llama",
            "tensor_count": 10,
            "footprint": footprint,
            "disk_size": 200,
            "context_length": 128,
            "is_default_context": True,
            "tensors": {},
            "max_context": 512,
            "is_lazy": False,
            "gpu_count": 1,
            "topology": "pcie4",
            "strategy": "tp",
            "is_vllm": False,
            "gpu_vram_gb": 0.0,
            "gpu_util": 0.9,
        }

    def silent_compare(models, max_vram, gpu_name):
        return None

    def silent_print(*args, **kwargs):
        return None

    monkeypatch.setattr(cli, "analyze_model", fake_analyze_model)
    monkeypatch.setattr(cli, "print_compare_info", silent_compare)
    monkeypatch.setattr(cli, "print_model_info", silent_print)

    # Single model path with a trailing slash.
    cli.main(["meta-llama/Llama-2-7b-hf/"])
    assert seen_paths == ["meta-llama/Llama-2-7b-hf"]

    seen_paths.clear()

    # Several model paths with trailing slashes (side-by-side comparison).
    cli.main(["meta-llama/Llama-2-7b-hf/", "mistralai/Mistral-7B-v0.1/"])
    assert seen_paths == ["meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1"]
81 changes: 81 additions & 0 deletions tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,4 +201,85 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0):
huggingface.fetch_huggingface_repo("org/nonexistent-model")
assert "Could not find repository on Hugging Face" in str(exc_info.value)

def test_safetensors_sharded_with_hyphens(tmp_path):
    """The shard's index file must be found when the model name itself has hyphens."""
    import struct
    import json

    shard_name = "mock-llama-3-8b-00001-of-00002.safetensors"
    index_file = tmp_path / "mock-llama-3-8b.safetensors.index.json"
    shard_file = tmp_path / shard_name

    # The index maps the single tensor to the first shard only.
    weight_map = {"model.embed_tokens.weight": shard_name}
    index_file.write_text(json.dumps({"weight_map": weight_map}), encoding="utf-8")

    # Layout written here: 8-byte little-endian header length, then the JSON header.
    header_json = json.dumps(
        {
            "model.embed_tokens.weight": {
                "dtype": "BF16",
                "shape": [32000, 4096],
                "data_offsets": [0, 262144000],
            }
        }
    ).encode("utf-8")
    shard_file.write_bytes(struct.pack("<Q", len(header_json)) + header_json)

    tensors = parse_safetensors_header(str(shard_file))
    metadata = tensors.get("__metadata__", {})

    assert metadata.get("is_sharded") is True
    assert metadata.get("total_shards") == 1
    assert metadata.get("missing_shards") == 0
    assert "model.embed_tokens.weight" in tensors
    assert tensors["model.embed_tokens.weight"]["dtype"] == "BF16"


def test_remote_shard_download_failure(monkeypatch):
    """A failing shard download must be reported as missing, not abort the fetch."""
    import json
    import struct
    import urllib.error
    from modelinfo.parsers import huggingface

    first_shard = "model-00001-of-00002.safetensors"
    second_shard = "model-00002-of-00002.safetensors"

    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
        # Checks run in the same order as the URL kinds are requested:
        # repo listing, shard index, then the individual shards.
        if "/api/models/" in url:
            siblings = [
                {"rfilename": "model.safetensors.index.json"},
                {"rfilename": "model-00001-of-00002.safetensors"},
                {"rfilename": "model-00002-of-00002.safetensors"},
            ]
            return json.dumps({"siblings": siblings}).encode("utf-8")
        if "model.safetensors.index.json" in url:
            index = {
                "metadata": {"total_size": 2000000000},
                "weight_map": {
                    "layer1.weight": "model-00001-of-00002.safetensors",
                    "layer2.weight": "model-00002-of-00002.safetensors",
                },
            }
            return json.dumps(index).encode("utf-8")
        if first_shard in url:
            header_json = json.dumps(
                {"layer1.weight": {"dtype": "BF16", "shape": [1024, 1024]}}
            ).encode("utf-8")
            return struct.pack("<Q", len(header_json)) + header_json
        if second_shard in url:
            # Simulate the second shard being unavailable upstream.
            raise urllib.error.HTTPError(url, 502, "Bad Gateway", {}, None)
        raise ValueError(f"Unexpected url: {url}")

    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)

    tensors, config, format_name, disk_size = huggingface.fetch_huggingface_repo(
        "org/sharded-safetensors-model", fetch_tensors=True
    )
    metadata = tensors["__metadata__"]

    assert format_name == "SafeTensors"
    assert disk_size == 2000000000.0
    assert metadata["missing_shards"] == 1
    assert metadata["total_shards"] == 2
    assert metadata["is_sharded"] is True
    assert "layer1.weight" in tensors
    assert "layer2.weight" not in tensors
Loading