From 1b1a0905b04ff21829907f95d52b540bdecfded4 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:16:03 -0400 Subject: [PATCH 01/12] implement remote gguf inspection on hugging face --- src/modelinfo/cli.py | 13 +- src/modelinfo/parsers/gguf.py | 81 ++++++------ src/modelinfo/parsers/huggingface.py | 179 +++++++++++++++++++++++---- src/modelinfo/ui.py | 55 ++++++++ tests/test_cli.py | 122 ++++++++++++++++++ tests/test_parsers.py | 87 +++++++++++++ 6 files changed, 473 insertions(+), 64 deletions(-) diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py index c89b6d7..cb4be02 100644 --- a/src/modelinfo/cli.py +++ b/src/modelinfo/cli.py @@ -149,7 +149,12 @@ def analyze_model( file_path_lower = file_path.lower() - if not os.path.exists(file_path) and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): + is_remote = False + if not os.path.exists(file_path): + if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): + is_remote = True + + if is_remote: from modelinfo.parsers.huggingface import fetch_huggingface_repo tensors, config, format_name, disk_size = fetch_huggingface_repo( file_path, fetch_tensors=fetch_tensors, timeout=timeout @@ -180,7 +185,7 @@ def analyze_model( max_context = None if config: max_context = config.get("max_position_embeddings") - elif format_name == "GGUF": + elif format_name in ("GGUF", "GGUF_group"): metadata = tensors.get("__metadata__", {}) gen_arch = metadata.get("general.architecture") if gen_arch: @@ -207,8 +212,8 @@ def analyze_model( num_layers = footprint["num_layers"] arch_name = identify_architecture_name(tensors, num_layers, config) - if format_name != "SafeTensors" or os.path.exists(file_path): - disk_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0.0 + if os.path.exists(file_path): + disk_size = os.path.getsize(file_path) tensor_count = len([k for k in tensors.keys() if k != "__metadata__"]) diff --git a/src/modelinfo/parsers/gguf.py b/src/modelinfo/parsers/gguf.py index 5f3d210..3af2fb4 100644 --- a/src/modelinfo/parsers/gguf.py +++ b/src/modelinfo/parsers/gguf.py @@ -44,46 +44,53 @@ def _read_gguf_value(f: Any, val_type: int) -> Any: raise ValueError(f"Unknown GGUF value type: {val_type}") -def parse_gguf_header(path: str) -> Dict[str, Any]: +def parse_gguf_header(path_or_file: str | Any) -> Dict[str, Any]: """Parses a GGUF file header and extracts tensor information.""" + if isinstance(path_or_file, str): + with open(path_or_file, "rb") as f: + return _parse_gguf_header_from_stream(f) + else: + return _parse_gguf_header_from_stream(path_or_file) + + +def _parse_gguf_header_from_stream(f: Any) -> Dict[str, Any]: tensors: Dict[str, Any] = {} + magic = f.read(4) + if magic != b"GGUF": + raise ValueError("Invalid GGUF file: Magic bytes missing.") + + version = struct.unpack(" float: + req = urllib.request.Request(f"{_get_hf_endpoint()}/{repo_id}/resolve/main/{filename}", method="HEAD") + token = _get_hf_token() + if token: + req.add_header("Authorization", f"Bearer {token}") + try: + with urllib.request.urlopen(req, timeout=timeout) as response: + return float(response.headers.get("Content-Length", 0)) + except Exception: + return 0.0 + + +class RemoteFileStream: + def __init__(self, url: str, chunk_size: int = 1024*1024, timeout: float = 10.0): + self.url = url + self.chunk_size = chunk_size + self.timeout = timeout + self.buffer = b"" + self.position = 0 + + def read(self, size: int = -1) -> bytes: + if size == -1: + raise NotImplementedError("Unlimited remote read is not supported.") + + end_pos = self.position + size + while end_pos > len(self.buffer): + start_bytes = len(self.buffer) + end_bytes = start_bytes + self.chunk_size - 1 + + headers = {"Range": f"bytes={start_bytes}-{end_bytes}"} + try: + chunk = _make_request( + self.url, + headers=headers, + limit=self.chunk_size, + timeout=self.timeout + ) + if not chunk: + break + self.buffer += chunk + except urllib.error.HTTPError as e: + if e.code == 416: + break + raise + except Exception: + raise + + result = self.buffer[self.position:self.position+size] + self.position += len(result) + return result + + def seek(self, offset: int, whence: int = 0) -> int: + if whence == 0: + self.position = offset + elif whence == 1: + self.position += offset + else: + raise NotImplementedError("Seek from end is not supported.") + return self.position + + def tell(self) -> int: + return self.position + + def close(self) -> None: + pass + + def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: float = 10.0) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]: """ Fetches the metadata directly from the Hugging Face Hub over the network. Returns: (tensors, config, format_name, disk_size) """ - api_url = f"{_get_hf_endpoint()}/api/models/{repo_id}" + target_filename = None + parts = repo_id.split("/") + if len(parts) >= 3 and parts[-1].lower().endswith(".gguf"): + real_repo_id = "/".join(parts[:2]) + target_filename = "/".join(parts[2:]) + else: + real_repo_id = repo_id + + api_url = f"{_get_hf_endpoint()}/api/models/{real_repo_id}" try: api_data = json.loads(_make_request(api_url, timeout=timeout).decode("utf-8")) except urllib.error.HTTPError as e: if e.code == 401: - raise PermissionError(f"Gated/Private Model (401 Unauthorized). Set the HF_TOKEN environment variable to access {repo_id}") + raise PermissionError(f"Gated/Private Model (401 Unauthorized). Set the HF_TOKEN environment variable to access {real_repo_id}") if e.code == 404: - raise FileNotFoundError(f"Could not find repository on Hugging Face (404 Not Found): {repo_id}") + raise FileNotFoundError(f"Could not find repository on Hugging Face (404 Not Found): {real_repo_id}") raise siblings = api_data.get("siblings", []) @@ -121,15 +196,38 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f config = None if "config.json" in filenames: - config_url = f"{_get_hf_endpoint()}/{repo_id}/resolve/main/config.json" + config_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/config.json" config = json.loads(_make_request(config_url, timeout=timeout).decode("utf-8")) + + # Find GGUF siblings + gguf_files = [] + for s in siblings: + fname = s.get("rfilename", "") + if fname.lower().endswith(".gguf"): + gguf_files.append({ + "filename": fname, + "size": s.get("size") + }) + + if target_filename: + target_sibling = next((g for g in gguf_files if g["filename"] == target_filename), None) + if not target_sibling: + raise FileNotFoundError(f"Could not find file '{target_filename}' in Hugging Face repository '{real_repo_id}'.") - tensors = {} - total_size = 0.0 - + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{target_filename}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + + size = target_sibling["size"] if target_sibling["size"] is not None else 0.0 + if size == 0.0: + size = _get_remote_file_size_fallback(real_repo_id, target_filename, timeout) + + return tensors, config, "GGUF", float(size) + + # Fallback to SafeTensors checks if no specific file is target if "model.safetensors.index.json" in filenames: - # Sharded SafeTensors - index_url = f"{_get_hf_endpoint()}/{repo_id}/resolve/main/model.safetensors.index.json" + index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json" index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8")) weight_map = index_data.get("weight_map", {}) @@ -137,8 +235,8 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f total_size = index_data.get("metadata", {}).get("total_size", 0.0) + tensors = {} if config and not fetch_tensors and total_size > 0: - # Lazy Fetch Paradigm for tensor_name in weight_map.keys(): tensors[tensor_name] = {"shape": [], "dtype": "BF16"} @@ -151,7 +249,7 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f } else: def fetch_shard(shard: str): - return shard, _fetch_safetensors_header(repo_id, shard, timeout=timeout) + return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} @@ -166,13 +264,11 @@ def fetch_shard(shard: str): "total_shards": len(unique_shards), "is_sharded": True } - format_name = "SafeTensors" + return tensors, config, "SafeTensors", float(total_size) elif "model.safetensors" in filenames: - # Single SafeTensors - - # Determine total size first - req = urllib.request.Request(f"{_get_hf_endpoint()}/{repo_id}/resolve/main/model.safetensors", method="HEAD") + total_size = 0.0 + req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD") token = _get_hf_token() if token: req.add_header("Authorization", f"Bearer {token}") @@ -182,12 +278,49 @@ def fetch_shard(shard: str): except Exception: pass - header = _fetch_safetensors_header(repo_id, "model.safetensors", timeout=timeout) - tensors = header + header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout) + return header, config, "SafeTensors", float(total_size) + + elif gguf_files: + if len(gguf_files) == 1: + single_file = gguf_files[0]["filename"] + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{single_file}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + size = gguf_files[0]["size"] if gguf_files[0]["size"] is not None else 0.0 + if size == 0.0: + size = _get_remote_file_size_fallback(real_repo_id, single_file, timeout) + return tensors, config, "GGUF", float(size) + else: + valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0] + if valid_sizes: + header_target = min(valid_sizes, key=lambda x: x["size"]) + else: + header_target = gguf_files[0] - format_name = "SafeTensors" - + header_file = header_target["filename"] + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + + variants = [] + for g in gguf_files: + v_size = g["size"] + if v_size is None or v_size == 0: + v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout) + variants.append({ + "filename": g["filename"], + "size": float(v_size) + }) + + tensors["__metadata__"] = tensors.get("__metadata__", {}) + tensors["__metadata__"]["gguf_variants"] = variants + tensors["__metadata__"]["repo_id"] = real_repo_id + + return tensors, config, "GGUF_group", 0.0 + else: - raise ValueError(f"Repository {repo_id} does not contain SafeTensors weights.") - - return tensors, config, format_name, float(total_size) + raise ValueError(f"Repository {real_repo_id} does not contain SafeTensors or GGUF weights.") + diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py index f69ce28..44e127f 100644 --- a/src/modelinfo/ui.py +++ b/src/modelinfo/ui.py @@ -56,6 +56,61 @@ def print_model_info( gpu_vram_gb: float = 0.0, gpu_util: float = 0.9 ) -> None: + if format_name == "GGUF_group": + metadata = tensors.get("__metadata__", {}) + variants = metadata.get("gguf_variants", []) + repo_id = metadata.get("repo_id", "") + + console.print(f"[bold]Repository:[/bold] {repo_id}") + console.print("[bold]Format:[/bold] GGUF (Multiple Quantizations)") + console.print(f"[bold]Architecture:[/bold] {arch_name}") + if max_context: + console.print(f"[bold]Context Limit:[/bold] {max_context:,} tokens") + console.print() + + table = Table(box=None, show_header=True, header_style="bold", pad_edge=False, padding=(0, 2)) + table.add_column("Quantization File") + table.add_column("File Size", justify="right") + table.add_column("KV Cache", justify="right") + table.add_column("Total VRAM", justify="right") + + show_fits = gpu_name is not None + if show_fits: + table.add_column("Fits", justify="left") + + kv_cache_bytes = footprint["kv_cache_bytes"] + overhead_bytes = footprint.get("overhead_bytes", 600 * 1024 * 1024) + + sorted_variants = sorted(variants, key=lambda x: x["size"], reverse=True) + for var in sorted_variants: + filename = var["filename"] + size_bytes = var["size"] + total_vram_bytes = size_bytes + kv_cache_bytes + overhead_bytes + + file_size_str = format_bytes(size_bytes) + kv_cache_str = format_bytes(kv_cache_bytes) + + vram_color = get_vram_color(total_vram_bytes, max_vram_gb) + total_vram_str = f"[{vram_color}]~{format_bytes(total_vram_bytes)}[/{vram_color}]" + + row_data = [filename, file_size_str, kv_cache_str, total_vram_str] + if show_fits: + utilization = total_vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0 + if utilization <= 0.90: + fit_text = "[green]✓ Yes[/green]" + elif utilization <= 0.99: + fit_text = "[yellow]⚠ Warning[/yellow]" + else: + fit_text = "[red]✗ No[/red]" + row_data.append(fit_text) + + table.add_row(*row_data) + + console.print(table) + console.print() + console.print(f"[dim]Tip: To view details for a specific quantization, run: modelinfo {repo_id}/{sorted_variants[0]['filename']}[/dim]") + return + summary = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2)) summary.add_column("Property", style="bold") summary.add_column("Value") diff --git a/tests/test_cli.py b/tests/test_cli.py index a5792d3..fd17262 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -177,3 +177,125 @@ def fake_calculate_footprint(tensors, *, context_length, batch_size, **kwargs): "fetch_tensors": True, "timeout": 22.5, } + + +def test_analyze_model_gguf_group(monkeypatch): + """Test that analyze_model correctly handles and propagates GGUF groups.""" + from modelinfo.parsers import huggingface + + def fake_exists(path): + return False + + def fake_fetch(repo_id, *, fetch_tensors, timeout): + tensors = { + "__metadata__": { + "general.architecture": "llama", + "llama.block_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.key_length": 128, + "gguf_variants": [ + {"filename": "model-q4.gguf", "size": 1000000000}, + {"filename": "model-q8.gguf", "size": 2000000000} + ], + "repo_id": "org/model-gguf" + } + } + return tensors, None, "GGUF_group", 0.0 + + monkeypatch.setattr(cli.os.path, "exists", fake_exists) + monkeypatch.setattr(huggingface, "fetch_huggingface_repo", fake_fetch) + + def fake_calculate_footprint(*args, **kwargs): + return { + "total_params": 1000000, + "base_memory_bytes": 2000000.0, + "kv_cache_bytes": 1000000.0, + "overhead_bytes": 600000.0, + "total_memory_bytes": 3600000.0, + "num_layers": 32, + "kv_dim": 1024, + "primary_dtype": "Q4_0", + "kv_is_estimate": False, + "penalty_percentage": 0.0, + "vllm_metrics": {} + } + monkeypatch.setattr(cli, "calculate_footprint", fake_calculate_footprint) + + info = cli.analyze_model("org/model-gguf", context_override=128) + + assert info["format_name"] == "GGUF_group" + assert info["tensors"]["__metadata__"]["repo_id"] == "org/model-gguf" + assert len(info["tensors"]["__metadata__"]["gguf_variants"]) == 2 + + +def test_print_model_info_gguf_group(capsys): + """Test print_model_info renders a comparison table for GGUF groups.""" + from modelinfo.ui import print_model_info + + tensors = { + "__metadata__": { + "general.architecture": "llama", + "llama.block_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.key_length": 128, + "gguf_variants": [ + {"filename": "model-q4.gguf", "size": 1000000000}, + {"filename": "model-q8.gguf", "size": 2000000000} + ], + "repo_id": "org/model-gguf" + } + } + + footprint = { + "total_params": 8000000000, + "base_memory_bytes": 4000000000.0, + "kv_cache_bytes": 1000000000.0, + "overhead_bytes": 600000000.0, + "total_memory_bytes": 5600000000.0, + "num_layers": 32, + "kv_dim": 1024, + "primary_dtype": "Q4_0", + "kv_is_estimate": False, + "penalty_percentage": 0.0, + "vllm_metrics": {} + } + + print_model_info( + format_name="GGUF_group", + arch_name="Llama (32 layers)", + tensor_count=0, + footprint=footprint, + disk_size=0.0, + context_length=8192, + is_default_context=True, + tensors=tensors, + max_context=32768, + max_vram_gb=8.0, + gpu_name=None + ) + + out, err = capsys.readouterr() + assert "model-q4.gguf" in out + assert "model-q8.gguf" in out + assert "Fits" not in out + assert "Tip:" in out + + print_model_info( + format_name="GGUF_group", + arch_name="Llama (32 layers)", + tensor_count=0, + footprint=footprint, + disk_size=0.0, + context_length=8192, + is_default_context=True, + tensors=tensors, + max_context=32768, + max_vram_gb=8.0, + gpu_name="RTX4080" + ) + + out, err = capsys.readouterr() + assert "model-q4.gguf" in out + assert "model-q8.gguf" in out + assert "Fits" in out + diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 10ebc91..e6cc4b6 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -82,3 +82,90 @@ def test_hf_endpoint_rejects_no_hostname(monkeypatch): monkeypatch.setenv("HF_ENDPOINT", "https:///repo") with pytest.raises(ValueError, match="must include a valid hostname"): _get_hf_endpoint() + + +def test_remote_gguf_parsing_single(monkeypatch): + """Test remote GGUF parsing when a single GGUF is found in the repository.""" + import json + from modelinfo.parsers import huggingface + + def fake_make_request(url, headers=None, limit=None, timeout=10.0): + if "/api/models/" in url: + return json.dumps({ + "siblings": [ + {"rfilename": "model-q4.gguf", "size": 1000000000} + ] + }).encode("utf-8") + elif "model-q4.gguf" in url: + import struct + header = b"GGUF" + struct.pack(" Date: Sat, 27 Jun 2026 11:18:31 -0400 Subject: [PATCH 02/12] split print_model_info test to comply with codacy method size limit --- tests/test_cli.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index fd17262..857a225 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -228,10 +228,7 @@ def fake_calculate_footprint(*args, **kwargs): assert len(info["tensors"]["__metadata__"]["gguf_variants"]) == 2 -def test_print_model_info_gguf_group(capsys): - """Test print_model_info renders a comparison table for GGUF groups.""" - from modelinfo.ui import print_model_info - +def _get_mock_gguf_group_data(): tensors = { "__metadata__": { "general.architecture": "llama", @@ -245,7 +242,6 @@ def test_print_model_info_gguf_group(capsys): "repo_id": "org/model-gguf" } } - footprint = { "total_params": 8000000000, "base_memory_bytes": 4000000000.0, @@ -259,7 +255,13 @@ def test_print_model_info_gguf_group(capsys): "penalty_percentage": 0.0, "vllm_metrics": {} } - + return tensors, footprint + + +def test_print_model_info_gguf_group_no_gpu(capsys): + """Test print_model_info renders comparison table without Fits column when no GPU target.""" + from modelinfo.ui import print_model_info + tensors, footprint = _get_mock_gguf_group_data() print_model_info( format_name="GGUF_group", arch_name="Llama (32 layers)", @@ -273,13 +275,17 @@ def test_print_model_info_gguf_group(capsys): max_vram_gb=8.0, gpu_name=None ) - - out, err = capsys.readouterr() + out, _ = capsys.readouterr() assert "model-q4.gguf" in out assert "model-q8.gguf" in out assert "Fits" not in out assert "Tip:" in out - + + +def test_print_model_info_gguf_group_with_gpu(capsys): + """Test print_model_info renders comparison table with Fits column when GPU target exists.""" + from modelinfo.ui import print_model_info + tensors, footprint = _get_mock_gguf_group_data() print_model_info( format_name="GGUF_group", arch_name="Llama (32 layers)", @@ -293,9 +299,9 @@ def test_print_model_info_gguf_group(capsys): max_vram_gb=8.0, gpu_name="RTX4080" ) - - out, err = capsys.readouterr() + out, _ = capsys.readouterr() assert "model-q4.gguf" in out assert "model-q8.gguf" in out assert "Fits" in out + From d0c5474a48b4cf4f3256b772cbf588a8c3c520e1 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:22:10 -0400 Subject: [PATCH 03/12] fix codacy issues: add read limit, honor gpu_util, modularize hf parser, add error tests --- src/modelinfo/parsers/huggingface.py | 208 +++++++++++++++------------ src/modelinfo/ui.py | 4 +- tests/test_parsers.py | 33 +++++ 3 files changed, 148 insertions(+), 97 deletions(-) diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py index 6ddc927..f63d212 100644 --- a/src/modelinfo/parsers/huggingface.py +++ b/src/modelinfo/parsers/huggingface.py @@ -5,7 +5,7 @@ import urllib.error import urllib.parse import urllib.request -from typing import Any, Dict, Tuple +from typing import Any, Dict, List, Tuple def _get_hf_endpoint() -> str: endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co").strip() @@ -126,6 +126,9 @@ def read(self, size: int = -1) -> bytes: raise NotImplementedError("Unlimited remote read is not supported.") end_pos = self.position + size + if end_pos > 50 * 1024 * 1024: + raise ValueError("Remote header read limit exceeded (50MB). File might be invalid or too large.") + while end_pos > len(self.buffer): start_bytes = len(self.buffer) end_bytes = start_bytes + self.chunk_size - 1 @@ -168,6 +171,108 @@ def close(self) -> None: pass +def _fetch_remote_gguf_single(real_repo_id: str, filename: str, fallback_size: float | None, timeout: float) -> Tuple[Dict[str, Any], float]: + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{filename}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + + size = fallback_size if fallback_size is not None else 0.0 + if size == 0.0: + size = _get_remote_file_size_fallback(real_repo_id, filename, timeout) + return tensors, size + + +def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]], timeout: float) -> Dict[str, Any]: + valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0] + if valid_sizes: + header_target = min(valid_sizes, key=lambda x: x["size"]) + else: + header_target = gguf_files[0] + + header_file = header_target["filename"] + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + + variants = [] + for g in gguf_files: + v_size = g["size"] + if v_size is None or v_size == 0: + v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout) + variants.append({ + "filename": g["filename"], + "size": float(v_size) + }) + + tensors["__metadata__"] = tensors.get("__metadata__", {}) + tensors["__metadata__"]["gguf_variants"] = variants + tensors["__metadata__"]["repo_id"] = real_repo_id + return tensors + + +def _fetch_remote_safetensors_sharded( + real_repo_id: str, + config: Dict[str, Any] | None, + fetch_tensors: bool, + timeout: float +) -> Tuple[Dict[str, Any], float]: + index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json" + index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8")) + + weight_map = index_data.get("weight_map", {}) + unique_shards = list(set(weight_map.values())) + total_size = index_data.get("metadata", {}).get("total_size", 0.0) + + tensors = {} + if config and not fetch_tensors and total_size > 0: + for tensor_name in weight_map.keys(): + tensors[tensor_name] = {"shape": [], "dtype": "BF16"} + + tensors["__metadata__"] = { + "missing_shards": 0, + "total_shards": len(unique_shards), + "is_sharded": True, + "lazy_fetch": True, + "total_size": total_size + } + else: + def fetch_shard(shard: str): + return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) + + with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: + future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} + for future in concurrent.futures.as_completed(future_to_shard): + shard, shard_header = future.result() + for k, v in shard_header.items(): + if k != "__metadata__": + tensors[k] = v + + tensors["__metadata__"] = { + "missing_shards": 0, + "total_shards": len(unique_shards), + "is_sharded": True + } + return tensors, float(total_size) + + +def _fetch_remote_safetensors_single(real_repo_id: str, timeout: float) -> Tuple[Dict[str, Any], float]: + total_size = 0.0 + req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD") + token = _get_hf_token() + if token: + req.add_header("Authorization", f"Bearer {token}") + try: + with urllib.request.urlopen(req, timeout=timeout) as response: + total_size = int(response.headers.get("Content-Length", 0)) + except Exception: + pass + + header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout) + return header, float(total_size) + + def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: float = 10.0) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]: """ Fetches the metadata directly from the Hugging Face Hub over the network. @@ -213,112 +318,25 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f target_sibling = next((g for g in gguf_files if g["filename"] == target_filename), None) if not target_sibling: raise FileNotFoundError(f"Could not find file '{target_filename}' in Hugging Face repository '{real_repo_id}'.") - - url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{target_filename}" - stream = RemoteFileStream(url, timeout=timeout) - from modelinfo.parsers.gguf import parse_gguf_header - tensors = parse_gguf_header(stream) - - size = target_sibling["size"] if target_sibling["size"] is not None else 0.0 - if size == 0.0: - size = _get_remote_file_size_fallback(real_repo_id, target_filename, timeout) - + tensors, size = _fetch_remote_gguf_single(real_repo_id, target_filename, target_sibling["size"], timeout) return tensors, config, "GGUF", float(size) # Fallback to SafeTensors checks if no specific file is target if "model.safetensors.index.json" in filenames: - index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json" - index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8")) - - weight_map = index_data.get("weight_map", {}) - unique_shards = list(set(weight_map.values())) - - total_size = index_data.get("metadata", {}).get("total_size", 0.0) - - tensors = {} - if config and not fetch_tensors and total_size > 0: - for tensor_name in weight_map.keys(): - tensors[tensor_name] = {"shape": [], "dtype": "BF16"} - - tensors["__metadata__"] = { - "missing_shards": 0, - "total_shards": len(unique_shards), - "is_sharded": True, - "lazy_fetch": True, - "total_size": total_size - } - else: - def fetch_shard(shard: str): - return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) - - with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: - future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} - for future in concurrent.futures.as_completed(future_to_shard): - shard, shard_header = future.result() - for k, v in shard_header.items(): - if k != "__metadata__": - tensors[k] = v - - tensors["__metadata__"] = { - "missing_shards": 0, - "total_shards": len(unique_shards), - "is_sharded": True - } - return tensors, config, "SafeTensors", float(total_size) + tensors, total_size = _fetch_remote_safetensors_sharded(real_repo_id, config, fetch_tensors, timeout) + return tensors, config, "SafeTensors", total_size elif "model.safetensors" in filenames: - total_size = 0.0 - req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD") - token = _get_hf_token() - if token: - req.add_header("Authorization", f"Bearer {token}") - try: - with urllib.request.urlopen(req, timeout=timeout) as response: - total_size = int(response.headers.get("Content-Length", 0)) - except Exception: - pass - - header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout) - return header, config, "SafeTensors", float(total_size) + header, total_size = _fetch_remote_safetensors_single(real_repo_id, timeout) + return header, config, "SafeTensors", total_size elif gguf_files: if len(gguf_files) == 1: single_file = gguf_files[0]["filename"] - url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{single_file}" - stream = RemoteFileStream(url, timeout=timeout) - from modelinfo.parsers.gguf import parse_gguf_header - tensors = parse_gguf_header(stream) - size = gguf_files[0]["size"] if gguf_files[0]["size"] is not None else 0.0 - if size == 0.0: - size = _get_remote_file_size_fallback(real_repo_id, single_file, timeout) + tensors, size = _fetch_remote_gguf_single(real_repo_id, single_file, gguf_files[0]["size"], timeout) return tensors, config, "GGUF", float(size) else: - valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0] - if valid_sizes: - header_target = min(valid_sizes, key=lambda x: x["size"]) - else: - header_target = gguf_files[0] - - header_file = header_target["filename"] - url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}" - stream = RemoteFileStream(url, timeout=timeout) - from modelinfo.parsers.gguf import parse_gguf_header - tensors = parse_gguf_header(stream) - - variants = [] - for g in gguf_files: - v_size = g["size"] - if v_size is None or v_size == 0: - v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout) - variants.append({ - "filename": g["filename"], - "size": float(v_size) - }) - - tensors["__metadata__"] = tensors.get("__metadata__", {}) - tensors["__metadata__"]["gguf_variants"] = variants - tensors["__metadata__"]["repo_id"] = real_repo_id - + tensors = _fetch_remote_gguf_group(real_repo_id, gguf_files, timeout) return tensors, config, "GGUF_group", 0.0 else: diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py index 44e127f..b4947bf 100644 --- a/src/modelinfo/ui.py +++ b/src/modelinfo/ui.py @@ -96,7 +96,7 @@ def print_model_info( row_data = [filename, file_size_str, kv_cache_str, total_vram_str] if show_fits: utilization = total_vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0 - if utilization <= 0.90: + if utilization <= gpu_util: fit_text = "[green]✓ Yes[/green]" elif utilization <= 0.99: fit_text = "[yellow]⚠ Warning[/yellow]" @@ -198,7 +198,7 @@ def print_model_info( summary.add_row("VRAM (est):", vram_display) if gpu_name: utilization = vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0 - if utilization <= 0.90: + if utilization <= gpu_util: fit_text = f"[green]✓ Fits comfortably in {gpu_name} ({max_vram_gb:.1f} GB)[/green]" elif utilization <= 0.99: fit_text = f"[yellow]⚠ Warning: Extreme hardware limit on {gpu_name}. High risk of fragmentation OOM.[/yellow]" diff --git a/tests/test_parsers.py b/tests/test_parsers.py index e6cc4b6..c1d0b6e 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -169,3 +169,36 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0): assert disk_size == 2000000000.0 assert called_gguf == ["q8"] + +def test_remote_gguf_parsing_unauthorized(monkeypatch): + """Test remote parsing raises PermissionError for gated/unauthorized (401) model repositories.""" + import urllib.error + from modelinfo.parsers import huggingface + + def fake_make_request(url, headers=None, limit=None, timeout=10.0): + raise urllib.error.HTTPError(url, 401, "Unauthorized", {}, None) + + monkeypatch.setattr(huggingface, "_make_request", fake_make_request) + + import pytest + with pytest.raises(PermissionError) as exc_info: + huggingface.fetch_huggingface_repo("org/gated-model") + assert "Gated/Private Model" in str(exc_info.value) + + +def test_remote_gguf_parsing_not_found(monkeypatch): + """Test remote parsing raises FileNotFoundError for missing (404) model repositories.""" + import urllib.error + from modelinfo.parsers import huggingface + + def fake_make_request(url, headers=None, limit=None, timeout=10.0): + raise urllib.error.HTTPError(url, 404, "Not Found", {}, None) + + monkeypatch.setattr(huggingface, "_make_request", fake_make_request) + + import pytest + with pytest.raises(FileNotFoundError) as exc_info: + huggingface.fetch_huggingface_repo("org/nonexistent-model") + assert "Could not find repository on Hugging Face" in str(exc_info.value) + + From bebe2c170f43ffd06716646b4ea263b27e4d10b7 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:24:08 -0400 Subject: [PATCH 04/12] refactor: split concurrent shards fetching to lower cyclomatic complexity --- src/modelinfo/parsers/huggingface.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py index f63d212..00f09e3 100644 --- a/src/modelinfo/parsers/huggingface.py +++ b/src/modelinfo/parsers/huggingface.py @@ -212,6 +212,21 @@ def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]] return tensors +def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Dict[str, Any]: + def fetch_shard(shard: str): + return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) + + tensors = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: + future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} + for future in concurrent.futures.as_completed(future_to_shard): + shard, shard_header = future.result() + for k, v in shard_header.items(): + if k != "__metadata__": + tensors[k] = v + return tensors + + def _fetch_remote_safetensors_sharded( real_repo_id: str, config: Dict[str, Any] | None, @@ -238,17 +253,7 @@ def _fetch_remote_safetensors_sharded( "total_size": total_size } else: - def fetch_shard(shard: str): - return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) - - with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: - future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} - for future in concurrent.futures.as_completed(future_to_shard): - shard, shard_header = future.result() - for k, v in shard_header.items(): - if k != "__metadata__": - tensors[k] = v - + tensors = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout) tensors["__metadata__"] = { "missing_shards": 0, "total_shards": len(unique_shards), @@ -257,6 +262,7 @@ def fetch_shard(shard: str): return tensors, float(total_size) + def _fetch_remote_safetensors_single(real_repo_id: str, timeout: float) -> Tuple[Dict[str, Any], float]: total_size = 0.0 req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD") From 6555e0ece345c6d674b0aa8c23ea3194d98ad67f Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:27:49 -0400 Subject: [PATCH 05/12] fix codacy issues: compute GGUF group variant overhead dynamically --- src/modelinfo/ui.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py index b4947bf..11431e1 100644 --- a/src/modelinfo/ui.py +++ b/src/modelinfo/ui.py @@ -79,13 +79,15 @@ def print_model_info( table.add_column("Fits", justify="left") kv_cache_bytes = footprint["kv_cache_bytes"] - overhead_bytes = footprint.get("overhead_bytes", 600 * 1024 * 1024) + penalty_percentage = footprint.get("penalty_percentage", 0.0) + cuda_overhead = 600 * 1024 * 1024 * gpu_count sorted_variants = sorted(variants, key=lambda x: x["size"], reverse=True) for var in sorted_variants: filename = var["filename"] size_bytes = var["size"] - total_vram_bytes = size_bytes + kv_cache_bytes + overhead_bytes + variant_overhead = cuda_overhead + (size_bytes * penalty_percentage) + total_vram_bytes = size_bytes + kv_cache_bytes + variant_overhead file_size_str = format_bytes(size_bytes) kv_cache_str = format_bytes(kv_cache_bytes) From 357ee166baaae5ccc5d86a1611986a34518bf2b9 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:31:19 -0400 Subject: [PATCH 06/12] docs: document remote gguf inspection options in README.md --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dfef3c..f5186a4 100644 --- a/README.md +++ b/README.md @@ -64,10 +64,17 @@ Inspect a local model checkpoint: modelinfo mistral-7b.safetensors ``` -Inspect a remote model directly from the Hugging Face Hub: +Inspect a remote model directly from the Hugging Face Hub (both SafeTensors and GGUF): ```bash +# Inspect a remote SafeTensors repository modelinfo meta-llama/Llama-2-7b-hf + +# Inspect a remote GGUF repository (shows a comparison table of all quantizations) +modelinfo bartowski/Meta-Llama-3-8B-Instruct-GGUF + +# Inspect a specific remote GGUF file in a repository +modelinfo bartowski/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf ``` For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens). From 5a8e6e276b64597ec7d74d577335df08f703eb39 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:50:08 -0400 Subject: [PATCH 07/12] fix: strip trailing slashes from model paths at entrypoint --- src/modelinfo/cli.py | 4 ++++ tests/test_cli.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py index cb4be02..f4785eb 100644 --- a/src/modelinfo/cli.py +++ b/src/modelinfo/cli.py @@ -240,6 +240,10 @@ def analyze_model( def main(argv: Sequence[str] | None = None) -> int: args = parse_args(argv) + # Strip trailing slashes from paths/repos to prevent empty basenames and routing issues + if args.file: + args.file = [path.rstrip("/") for path in args.file if path] + gpu_name_display = None gpu_vram_gb = None gpu_count = 1 diff --git a/tests/test_cli.py b/tests/test_cli.py index 857a225..97fd544 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -304,4 +304,47 @@ def test_print_model_info_gguf_group_with_gpu(capsys): assert "model-q8.gguf" in out assert "Fits" in out +def test_cli_strips_trailing_slashes_from_model_paths(monkeypatch): + captured_paths = [] + + def fake_analyze_model(file_path, *args, **kwargs): + captured_paths.append(file_path) + return { + "format_name": "GGUF", + "arch_name": "Llama", + "tensor_count": 10, + "footprint": { + "total_params": 100, + "base_memory_bytes": 200, + "kv_cache_bytes": 100, + "overhead_bytes": 50, + "total_memory_bytes": 350, + "num_layers": 1, + }, + "disk_size": 200, + "context_length": 128, + "is_default_context": True, + "tensors": {}, + "max_context": 512, + "is_lazy": False, + "gpu_count": 1, + "topology": "pcie4", + "strategy": "tp", + "is_vllm": False, + "gpu_vram_gb": 0.0, + "gpu_util": 0.9, + } + + monkeypatch.setattr(cli, "analyze_model", fake_analyze_model) + monkeypatch.setattr(cli, "print_compare_info", lambda models, max_vram, gpu_name: None) + monkeypatch.setattr(cli, "print_model_info", lambda *args, **kwargs: None) + + # Test single model path with trailing slash + cli.main(["meta-llama/Llama-2-7b-hf/"]) + assert captured_paths == ["meta-llama/Llama-2-7b-hf"] + + captured_paths.clear() + # Test multiple model paths with trailing slashes (side-by-side comparison) + cli.main(["meta-llama/Llama-2-7b-hf/", "mistralai/Mistral-7B-v0.1/"]) + assert captured_paths == ["meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1"] From 7dc85762a206b775724ba0c2db222d72bb02ca58 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:50:13 -0400 Subject: [PATCH 08/12] fix: handle reverse tensor shape ordering for gguf shape guessing --- src/modelinfo/architecture.py | 5 ++-- tests/test_calculator.py | 43 +++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/modelinfo/architecture.py b/src/modelinfo/architecture.py index b398213..bef7237 100644 --- a/src/modelinfo/architecture.py +++ b/src/modelinfo/architecture.py @@ -11,6 +11,7 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None) metadata = tensors.get("__metadata__", {}) gen_arch = metadata.get("general.architecture") + is_gguf = "general.architecture" in metadata or any(k.startswith("general.") for k in metadata.keys()) # 1. Attempt explicit GGUF metadata if gen_arch: @@ -68,14 +69,14 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None) found_k_proj = True shape = meta.get("shape", []) if len(shape) >= 2: - kv_dim = shape[0] + kv_dim = shape[-1] if is_gguf else shape[0] if "qkv_proj.weight" in name or "c_attn.weight" in name: found_fused = True if not found_k_proj: shape = meta.get("shape", []) if len(shape) >= 2: - kv_dim = shape[0] // 3 + kv_dim = (shape[-1] if is_gguf else shape[0]) // 3 num_layers = len(layers_set) if found_fused and not found_k_proj and kv_dim > 0: diff --git a/tests/test_calculator.py b/tests/test_calculator.py index 1ef701c..94cf3ea 100644 --- a/tests/test_calculator.py +++ b/tests/test_calculator.py @@ -169,3 +169,46 @@ def test_vllm_capacity_simulation(): bytes_per_token = 40960 expected_capacity = math.floor(metrics["paged_kv_pool"] / bytes_per_token) assert metrics["max_serving_capacity"] == expected_capacity + + +def test_gguf_shape_guessing_fallback(): + """Verify that shape guessing logic correctly extracts kv_dim using GGUF column-major ordering (shape[-1]) when metadata has no explicit keys.""" + from modelinfo.architecture import extract_architecture + + tensors = { + "__metadata__": { + "general.architecture": "llama", + }, + "model.layers.0.self_attn.k_proj.weight": { + "shape": [4096, 1024], + "dtype": "F16" + }, + "model.layers.1.self_attn.k_proj.weight": { + "shape": [4096, 1024], + "dtype": "F16" + } + } + + num_layers, kv_dim, is_estimate = extract_architecture(tensors) + assert num_layers == 2 + assert kv_dim == 1024 + assert is_estimate is False + +def test_gguf_shape_guessing_fallback_fused(): + """Verify that fused shape guessing extracts (shape[-1] // 3) for GGUF tensors.""" + from modelinfo.architecture import extract_architecture + + tensors = { + "__metadata__": { + "general.architecture": "gpt2", + }, + "model.layers.0.self_attn.qkv_proj.weight": { + "shape": [4096, 3072], + "dtype": "F16" + } + } + + num_layers, kv_dim, is_estimate = extract_architecture(tensors) + assert num_layers == 1 + assert kv_dim == 1024 + assert is_estimate is True From 0ef126b246c42cf76873d3a9f55742f87d2c1dc3 Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:50:13 -0400 Subject: [PATCH 09/12] fix: treat paths starting with local prefix as local files to prevent remote routing --- src/modelinfo/cli.py | 10 ++++++++-- tests/test_cli.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py index cb4be02..c9cb3ae 100644 --- a/src/modelinfo/cli.py +++ b/src/modelinfo/cli.py @@ -151,8 +151,14 @@ def analyze_model( is_remote = False if not os.path.exists(file_path): - if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): - is_remote = True + # ponytail: prevent routing explicit local paths or typos to HF + is_local_path = ( + file_path.startswith((".", "/", "~")) + or os.path.isabs(file_path) + ) + if not is_local_path: + if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): + is_remote = True if is_remote: from modelinfo.parsers.huggingface import fetch_huggingface_repo diff --git a/tests/test_cli.py b/tests/test_cli.py index 857a225..a1709bf 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -304,4 +304,33 @@ def test_print_model_info_gguf_group_with_gpu(capsys): assert "model-q8.gguf" in out assert "Fits" in out +def test_analyze_model_local_path_routing(monkeypatch): + """Test that analyze_model treats paths starting with local prefix as local, raising an error instead of routing to Hugging Face.""" + from modelinfo.parsers import huggingface + + hf_fetched = [] + def fake_fetch(repo_id, *, fetch_tensors, timeout): + hf_fetched.append(repo_id) + return {}, None, "SafeTensors", 0.0 + + monkeypatch.setattr(huggingface, "fetch_huggingface_repo", fake_fetch) + # Test cases that should NOT hit Hugging Face + local_paths = ["./missing.gguf", "../missing.safetensors", "/missing.bin", "~/missing.pt"] + for path in local_paths: + with pytest.raises((FileNotFoundError, ValueError, OSError)): + cli.analyze_model(path, context_override=128) + + assert len(hf_fetched) == 0, f"Hugging Face fetch was triggered for local paths: {hf_fetched}" + + # Test cases that SHOULD hit Hugging Face + remote_paths = ["meta-llama/Llama-2-7b-hf", "org/model"] + for path in remote_paths: + try: + cli.analyze_model(path, context_override=128) + except Exception: + # We don't care if calculation fails later because of empty dict from fake_fetch, + # we just care that it triggers fetch_huggingface_repo. + pass + + assert hf_fetched == remote_paths From b0b97445509c4ae653c92d1f7c02be5de5b82f4a Mon Sep 17 00:00:00 2001 From: Felipe Arce Date: Sat, 27 Jun 2026 11:50:21 -0400 Subject: [PATCH 10/12] fix: handle concurrent remote shard download failures gracefully --- src/modelinfo/parsers/huggingface.py | 26 +++++++++++------ tests/test_parsers.py | 43 ++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py index 00f09e3..b36dd7f 100644 --- a/src/modelinfo/parsers/huggingface.py +++ b/src/modelinfo/parsers/huggingface.py @@ -212,19 +212,27 @@ def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]] return tensors -def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Dict[str, Any]: +def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Tuple[Dict[str, Any], int]: def fetch_shard(shard: str): - return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) + try: + header = _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) + return shard, header, None + except Exception as e: + return shard, {}, e tensors = {} + missing_shards = 0 with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} for future in concurrent.futures.as_completed(future_to_shard): - shard, shard_header = future.result() - for k, v in shard_header.items(): - if k != "__metadata__": - tensors[k] = v - return tensors + shard, shard_header, error = future.result() + if error is not None: + missing_shards += 1 + else: + for k, v in shard_header.items(): + if k != "__metadata__": + tensors[k] = v + return tensors, missing_shards def _fetch_remote_safetensors_sharded( @@ -253,9 +261,9 @@ def _fetch_remote_safetensors_sharded( "total_size": total_size } else: - tensors = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout) + tensors, missing_shards = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout) tensors["__metadata__"] = { - "missing_shards": 0, + "missing_shards": missing_shards, "total_shards": len(unique_shards), "is_sharded": True } diff --git a/tests/test_parsers.py b/tests/test_parsers.py index c1d0b6e..a5d4a07 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -201,4 +201,47 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0): huggingface.fetch_huggingface_repo("org/nonexistent-model") assert "Could not find repository on Hugging Face" in str(exc_info.value) +def test_remote_shard_download_failure(monkeypatch): + """Test remote sharded safetensors parsing when one of the shard downloads fails.""" + import json + import struct + import urllib.error + from modelinfo.parsers import huggingface + def fake_make_request(url, headers=None, limit=None, timeout=10.0): + if "/api/models/" in url: + return json.dumps({ + "siblings": [ + {"rfilename": "model.safetensors.index.json"}, + {"rfilename": "model-00001-of-00002.safetensors"}, + {"rfilename": "model-00002-of-00002.safetensors"} + ] + }).encode("utf-8") + elif "model.safetensors.index.json" in url: + return json.dumps({ + "metadata": {"total_size": 2000000000}, + "weight_map": { + "layer1.weight": "model-00001-of-00002.safetensors", + "layer2.weight": "model-00002-of-00002.safetensors" + } + }).encode("utf-8") + elif "model-00001-of-00002.safetensors" in url: + header_json = json.dumps({"layer1.weight": {"dtype": "BF16", "shape": [1024, 1024]}}).encode("utf-8") + return struct.pack(" Date: Sat, 27 Jun 2026 11:50:35 -0400 Subject: [PATCH 11/12] fix: resolve safetensors shard index prefix splitting --- src/modelinfo/parsers/safetensors.py | 8 +++++- tests/test_parsers.py | 38 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/modelinfo/parsers/safetensors.py b/src/modelinfo/parsers/safetensors.py index 5d0289f..887f5c7 100644 --- a/src/modelinfo/parsers/safetensors.py +++ b/src/modelinfo/parsers/safetensors.py @@ -31,7 +31,13 @@ def parse_safetensors_header(path: str) -> dict[str, Any]: if path.endswith(".index.json"): is_index = True elif "-of-" in base_name and path.endswith(".safetensors"): - prefix = base_name.split("-")[0] + import re + match = re.match(r"^(.*?)-\d{5}-of-\d{5}\.safetensors$", base_name) + if match: + prefix = match.group(1) + else: + # Fallback to splitting in case of non-standard shard formatting + prefix = base_name.split("-")[0] potential_index = os.path.join(dir_path, f"{prefix}.safetensors.index.json") if os.path.exists(potential_index): index_path = potential_index diff --git a/tests/test_parsers.py b/tests/test_parsers.py index c1d0b6e..91b773f 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -202,3 +202,41 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0): assert "Could not find repository on Hugging Face" in str(exc_info.value) + + +def test_safetensors_sharded_with_hyphens(tmp_path): + """Test safetensors parser sharded index path resolution when filename contains hyphens.""" + import struct + import json + + index_file = tmp_path / "mock-llama-3-8b.safetensors.index.json" + shard_file = tmp_path / "mock-llama-3-8b-00001-of-00002.safetensors" + + index_data = { + "weight_map": { + "model.embed_tokens.weight": "mock-llama-3-8b-00001-of-00002.safetensors" + } + } + index_file.write_text(json.dumps(index_data), encoding="utf-8") + + header_data = { + "model.embed_tokens.weight": { + "dtype": "BF16", + "shape": [32000, 4096], + "data_offsets": [0, 262144000] + } + } + header_json = json.dumps(header_data).encode("utf-8") + header_len = len(header_json) + + with open(shard_file, "wb") as f: + f.write(struct.pack(" Date: Sat, 27 Jun 2026 11:56:14 -0400 Subject: [PATCH 12/12] fix: address codacy review feedback on disk size, regex, path parsing, and test helper --- src/modelinfo/cli.py | 13 +++++++++---- src/modelinfo/parsers/safetensors.py | 8 ++++++-- tests/test_cli.py | 14 +------------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py index b0b09b3..56c27d0 100644 --- a/src/modelinfo/cli.py +++ b/src/modelinfo/cli.py @@ -157,7 +157,8 @@ def analyze_model( or os.path.isabs(file_path) ) if not is_local_path: - if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): + # Treat as remote only if it contains a slash and does not end with a model extension + if "/" in file_path and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): is_remote = True if is_remote: @@ -218,8 +219,12 @@ def analyze_model( num_layers = footprint["num_layers"] arch_name = identify_architecture_name(tensors, num_layers, config) - if os.path.exists(file_path): - disk_size = os.path.getsize(file_path) + if not is_remote: + metadata = tensors.get("__metadata__", {}) + if metadata.get("is_sharded") and "disk_size" in metadata: + disk_size = metadata["disk_size"] + elif os.path.exists(file_path): + disk_size = os.path.getsize(file_path) tensor_count = len([k for k in tensors.keys() if k != "__metadata__"]) @@ -248,7 +253,7 @@ def main(argv: Sequence[str] | None = None) -> int: # Strip trailing slashes from paths/repos to prevent empty basenames and routing issues if args.file: - args.file = [path.rstrip("/") for path in args.file if path] + args.file = [path.rstrip("/\\") for path in args.file if path] gpu_name_display = None gpu_vram_gb = None diff --git a/src/modelinfo/parsers/safetensors.py b/src/modelinfo/parsers/safetensors.py index 887f5c7..2e7d705 100644 --- a/src/modelinfo/parsers/safetensors.py +++ b/src/modelinfo/parsers/safetensors.py @@ -32,7 +32,7 @@ def parse_safetensors_header(path: str) -> dict[str, Any]: is_index = True elif "-of-" in base_name and path.endswith(".safetensors"): import re - match = re.match(r"^(.*?)-\d{5}-of-\d{5}\.safetensors$", base_name) + match = re.match(r"^(.*?)-\d+-of-\d+\.safetensors$", base_name) if match: prefix = match.group(1) else: @@ -55,9 +55,12 @@ def parse_safetensors_header(path: str) -> dict[str, Any]: tensors = {} missing_shards = 0 total_shards = len(unique_shards) + total_size = 0 for shard in unique_shards: shard_path = os.path.join(dir_path, shard) + if os.path.exists(shard_path): + total_size += os.path.getsize(shard_path) try: shard_header = _read_single_header(shard_path) for k, v in shard_header.items(): @@ -69,7 +72,8 @@ def parse_safetensors_header(path: str) -> dict[str, Any]: tensors["__metadata__"] = { "missing_shards": missing_shards, "total_shards": total_shards, - "is_sharded": True + "is_sharded": True, + "disk_size": total_size } return tensors diff --git a/tests/test_cli.py b/tests/test_cli.py index 0d7618f..267b98d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -187,19 +187,7 @@ def fake_exists(path): return False def fake_fetch(repo_id, *, fetch_tensors, timeout): - tensors = { - "__metadata__": { - "general.architecture": "llama", - "llama.block_count": 32, - "llama.attention.head_count_kv": 8, - "llama.attention.key_length": 128, - "gguf_variants": [ - {"filename": "model-q4.gguf", "size": 1000000000}, - {"filename": "model-q8.gguf", "size": 2000000000} - ], - "repo_id": "org/model-gguf" - } - } + tensors, _ = _get_mock_gguf_group_data() return tensors, None, "GGUF_group", 0.0 monkeypatch.setattr(cli.os.path, "exists", fake_exists)