From 1b1a0905b04ff21829907f95d52b540bdecfded4 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:16:03 -0400
Subject: [PATCH 01/12] implement remote GGUF inspection on Hugging Face

---
 src/modelinfo/cli.py                 |  13 +-
 src/modelinfo/parsers/gguf.py        |  81 ++++++------
 src/modelinfo/parsers/huggingface.py | 179 +++++++++++++++++++++++----
 src/modelinfo/ui.py                  |  55 ++++++++
 tests/test_cli.py                    | 122 ++++++++++++++++++
 tests/test_parsers.py                |  87 +++++++++++++
 6 files changed, 473 insertions(+), 64 deletions(-)

diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py
index c89b6d7..cb4be02 100644
--- a/src/modelinfo/cli.py
+++ b/src/modelinfo/cli.py
@@ -149,7 +149,12 @@ def analyze_model(
     
     file_path_lower = file_path.lower()
     
-    if not os.path.exists(file_path) and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
+    is_remote = False
+    if not os.path.exists(file_path):
+        if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
+            is_remote = True
+
+    if is_remote:
         from modelinfo.parsers.huggingface import fetch_huggingface_repo
         tensors, config, format_name, disk_size = fetch_huggingface_repo(
             file_path, fetch_tensors=fetch_tensors, timeout=timeout
@@ -180,7 +185,7 @@ def analyze_model(
     max_context = None
     if config:
         max_context = config.get("max_position_embeddings")
-    elif format_name == "GGUF":
+    elif format_name in ("GGUF", "GGUF_group"):
         metadata = tensors.get("__metadata__", {})
         gen_arch = metadata.get("general.architecture")
         if gen_arch:
@@ -207,8 +212,8 @@ def analyze_model(
     num_layers = footprint["num_layers"]
     arch_name = identify_architecture_name(tensors, num_layers, config)
 
-    if format_name != "SafeTensors" or os.path.exists(file_path):
-        disk_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0.0
+    if os.path.exists(file_path):
+        disk_size = os.path.getsize(file_path)
         
     tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])
     
diff --git a/src/modelinfo/parsers/gguf.py b/src/modelinfo/parsers/gguf.py
index 5f3d210..3af2fb4 100644
--- a/src/modelinfo/parsers/gguf.py
+++ b/src/modelinfo/parsers/gguf.py
@@ -44,46 +44,53 @@ def _read_gguf_value(f: Any, val_type: int) -> Any:
         raise ValueError(f"Unknown GGUF value type: {val_type}")
 
 
-def parse_gguf_header(path: str) -> Dict[str, Any]:
+def parse_gguf_header(path_or_file: str | Any) -> Dict[str, Any]:
     """Parses a GGUF file header and extracts tensor information."""
+    if isinstance(path_or_file, str):
+        with open(path_or_file, "rb") as f:
+            return _parse_gguf_header_from_stream(f)
+    else:
+        return _parse_gguf_header_from_stream(path_or_file)
+
+
+def _parse_gguf_header_from_stream(f: Any) -> Dict[str, Any]:
     tensors: Dict[str, Any] = {}
+    magic = f.read(4)
+    if magic != b"GGUF":
+        raise ValueError("Invalid GGUF file: Magic bytes missing.")
+        
+    version = struct.unpack("<I", f.read(4))[0]
+    if version < 2:
+        raise ValueError(f"Unsupported GGUF version: {version}")
+        
+    tensor_count = struct.unpack("<Q", f.read(8))[0]
+    kv_count = struct.unpack("<Q", f.read(8))[0]
     
-    with open(path, "rb") as f:
-        magic = f.read(4)
-        if magic != b"GGUF":
-            raise ValueError("Invalid GGUF file: Magic bytes missing.")
-            
-        version = struct.unpack("<I", f.read(4))[0]
-        if version < 2:
-            raise ValueError(f"Unsupported GGUF version: {version}")
-            
-        tensor_count = struct.unpack("<Q", f.read(8))[0]
-        kv_count = struct.unpack("<Q", f.read(8))[0]
+    metadata = {}
+    for _ in range(kv_count):
+        key_len = struct.unpack("<Q", f.read(8))[0]
+        key_name = f.read(key_len).decode("utf-8")
+        val_type = struct.unpack("<I", f.read(4))[0]
+        metadata[key_name] = _read_gguf_value(f, val_type)
         
-        metadata = {}
-        for _ in range(kv_count):
-            key_len = struct.unpack("<Q", f.read(8))[0]
-            key_name = f.read(key_len).decode("utf-8")
-            val_type = struct.unpack("<I", f.read(4))[0]
-            metadata[key_name] = _read_gguf_value(f, val_type)
-            
-        tensors["__metadata__"] = metadata
-            
-        for _ in range(tensor_count):
-            name_len = struct.unpack("<Q", f.read(8))[0]
-            name = f.read(name_len).decode("utf-8")
-            
-            n_dims = struct.unpack("<I", f.read(4))[0]
-            shape = []
-            for _ in range(n_dims):
-                shape.append(struct.unpack("<Q", f.read(8))[0])
-            
-            t_type = struct.unpack("<I", f.read(4))[0]
-            f.read(8)  # skip offset bytes
-            
-            # Strict GGUF tensor type mapping
-            dtype = GGML_TYPE_MAP.get(t_type, "Unknown")
-                
-            tensors[name] = {"shape": shape, "dtype": dtype}
+    tensors["__metadata__"] = metadata
+        
+    for _ in range(tensor_count):
+        name_len = struct.unpack("<Q", f.read(8))[0]
+        name = f.read(name_len).decode("utf-8")
+        
+        n_dims = struct.unpack("<I", f.read(4))[0]
+        shape = []
+        for _ in range(n_dims):
+            shape.append(struct.unpack("<Q", f.read(8))[0])
+        
+        t_type = struct.unpack("<I", f.read(4))[0]
+        f.read(8)  # skip offset bytes
+        
+        # Strict GGUF tensor type mapping
+        dtype = GGML_TYPE_MAP.get(t_type, "Unknown")
             
+        tensors[name] = {"shape": shape, "dtype": dtype}
+        
     return tensors
+
diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py
index a070e41..6ddc927 100644
--- a/src/modelinfo/parsers/huggingface.py
+++ b/src/modelinfo/parsers/huggingface.py
@@ -101,19 +101,94 @@ def _fetch_safetensors_header(repo_id: str, filename: str, timeout: float = 10.0
         
     return json.loads(json_bytes)
 
+def _get_remote_file_size_fallback(repo_id: str, filename: str, timeout: float = 10.0) -> float:
+    req = urllib.request.Request(f"{_get_hf_endpoint()}/{repo_id}/resolve/main/{filename}", method="HEAD")
+    token = _get_hf_token()
+    if token:
+        req.add_header("Authorization", f"Bearer {token}")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            return float(response.headers.get("Content-Length", 0))
+    except Exception:
+        return 0.0
+
+
+class RemoteFileStream:
+    def __init__(self, url: str, chunk_size: int = 1024*1024, timeout: float = 10.0):
+        self.url = url
+        self.chunk_size = chunk_size
+        self.timeout = timeout
+        self.buffer = b""
+        self.position = 0
+
+    def read(self, size: int = -1) -> bytes:
+        if size == -1:
+            raise NotImplementedError("Unlimited remote read is not supported.")
+            
+        end_pos = self.position + size
+        while end_pos > len(self.buffer):
+            start_bytes = len(self.buffer)
+            end_bytes = start_bytes + self.chunk_size - 1
+            
+            headers = {"Range": f"bytes={start_bytes}-{end_bytes}"}
+            try:
+                chunk = _make_request(
+                    self.url,
+                    headers=headers,
+                    limit=self.chunk_size,
+                    timeout=self.timeout
+                )
+                if not chunk:
+                    break
+                self.buffer += chunk
+            except urllib.error.HTTPError as e:
+                if e.code == 416:
+                    break
+                raise
+            except Exception:
+                raise
+                
+        result = self.buffer[self.position:self.position+size]
+        self.position += len(result)
+        return result
+
+    def seek(self, offset: int, whence: int = 0) -> int:
+        if whence == 0:
+            self.position = offset
+        elif whence == 1:
+            self.position += offset
+        else:
+            raise NotImplementedError("Seek from end is not supported.")
+        return self.position
+
+    def tell(self) -> int:
+        return self.position
+
+    def close(self) -> None:
+        pass
+
+
 def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: float = 10.0) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]:
     """
     Fetches the metadata directly from the Hugging Face Hub over the network.
     Returns: (tensors, config, format_name, disk_size)
     """
-    api_url = f"{_get_hf_endpoint()}/api/models/{repo_id}"
+    target_filename = None
+    parts = repo_id.split("/")
+    if len(parts) >= 3 and parts[-1].lower().endswith(".gguf"):
+        real_repo_id = "/".join(parts[:2])
+        target_filename = "/".join(parts[2:])
+    else:
+        real_repo_id = repo_id
+
+    api_url = f"{_get_hf_endpoint()}/api/models/{real_repo_id}"
     try:
         api_data = json.loads(_make_request(api_url, timeout=timeout).decode("utf-8"))
     except urllib.error.HTTPError as e:
         if e.code == 401:
-            raise PermissionError(f"Gated/Private Model (401 Unauthorized). Set the HF_TOKEN environment variable to access {repo_id}")
+            raise PermissionError(f"Gated/Private Model (401 Unauthorized). Set the HF_TOKEN environment variable to access {real_repo_id}")
         if e.code == 404:
-             raise FileNotFoundError(f"Could not find repository on Hugging Face (404 Not Found): {repo_id}")
+             raise FileNotFoundError(f"Could not find repository on Hugging Face (404 Not Found): {real_repo_id}")
         raise
         
     siblings = api_data.get("siblings", [])
@@ -121,15 +196,38 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f
     
     config = None
     if "config.json" in filenames:
-        config_url = f"{_get_hf_endpoint()}/{repo_id}/resolve/main/config.json"
+        config_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/config.json"
         config = json.loads(_make_request(config_url, timeout=timeout).decode("utf-8"))
+
+    # Find GGUF siblings
+    gguf_files = []
+    for s in siblings:
+        fname = s.get("rfilename", "")
+        if fname.lower().endswith(".gguf"):
+            gguf_files.append({
+                "filename": fname,
+                "size": s.get("size")
+            })
+
+    if target_filename:
+        target_sibling = next((g for g in gguf_files if g["filename"] == target_filename), None)
+        if not target_sibling:
+            raise FileNotFoundError(f"Could not find file '{target_filename}' in Hugging Face repository '{real_repo_id}'.")
         
-    tensors = {}
-    total_size = 0.0
-    
+        url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{target_filename}"
+        stream = RemoteFileStream(url, timeout=timeout)
+        from modelinfo.parsers.gguf import parse_gguf_header
+        tensors = parse_gguf_header(stream)
+        
+        size = target_sibling["size"] if target_sibling["size"] is not None else 0.0
+        if size == 0.0:
+            size = _get_remote_file_size_fallback(real_repo_id, target_filename, timeout)
+            
+        return tensors, config, "GGUF", float(size)
+
+    # Fallback to SafeTensors checks if no specific file is target
     if "model.safetensors.index.json" in filenames:
-        # Sharded SafeTensors
-        index_url = f"{_get_hf_endpoint()}/{repo_id}/resolve/main/model.safetensors.index.json"
+        index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json"
         index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8"))
         
         weight_map = index_data.get("weight_map", {})
@@ -137,8 +235,8 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f
         
         total_size = index_data.get("metadata", {}).get("total_size", 0.0)
         
+        tensors = {}
         if config and not fetch_tensors and total_size > 0:
-            # Lazy Fetch Paradigm
             for tensor_name in weight_map.keys():
                 tensors[tensor_name] = {"shape": [], "dtype": "BF16"}
                 
@@ -151,7 +249,7 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f
             }
         else:
             def fetch_shard(shard: str):
-                return shard, _fetch_safetensors_header(repo_id, shard, timeout=timeout)
+                return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
                 
             with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
                 future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
@@ -166,13 +264,11 @@ def fetch_shard(shard: str):
                 "total_shards": len(unique_shards),
                 "is_sharded": True
             }
-        format_name = "SafeTensors"
+        return tensors, config, "SafeTensors", float(total_size)
         
     elif "model.safetensors" in filenames:
-        # Single SafeTensors
-        
-        # Determine total size first
-        req = urllib.request.Request(f"{_get_hf_endpoint()}/{repo_id}/resolve/main/model.safetensors", method="HEAD")
+        total_size = 0.0
+        req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD")
         token = _get_hf_token()
         if token:
             req.add_header("Authorization", f"Bearer {token}")
@@ -182,12 +278,49 @@ def fetch_shard(shard: str):
         except Exception:
             pass
 
-        header = _fetch_safetensors_header(repo_id, "model.safetensors", timeout=timeout)
-        tensors = header
+        header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout)
+        return header, config, "SafeTensors", float(total_size)
+
+    elif gguf_files:
+        if len(gguf_files) == 1:
+            single_file = gguf_files[0]["filename"]
+            url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{single_file}"
+            stream = RemoteFileStream(url, timeout=timeout)
+            from modelinfo.parsers.gguf import parse_gguf_header
+            tensors = parse_gguf_header(stream)
+            size = gguf_files[0]["size"] if gguf_files[0]["size"] is not None else 0.0
+            if size == 0.0:
+                size = _get_remote_file_size_fallback(real_repo_id, single_file, timeout)
+            return tensors, config, "GGUF", float(size)
+        else:
+            valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0]
+            if valid_sizes:
+                header_target = min(valid_sizes, key=lambda x: x["size"])
+            else:
+                header_target = gguf_files[0]
             
-        format_name = "SafeTensors"
-        
+            header_file = header_target["filename"]
+            url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}"
+            stream = RemoteFileStream(url, timeout=timeout)
+            from modelinfo.parsers.gguf import parse_gguf_header
+            tensors = parse_gguf_header(stream)
+            
+            variants = []
+            for g in gguf_files:
+                v_size = g["size"]
+                if v_size is None or v_size == 0:
+                    v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout)
+                variants.append({
+                    "filename": g["filename"],
+                    "size": float(v_size)
+                })
+            
+            tensors["__metadata__"] = tensors.get("__metadata__", {})
+            tensors["__metadata__"]["gguf_variants"] = variants
+            tensors["__metadata__"]["repo_id"] = real_repo_id
+            
+            return tensors, config, "GGUF_group", 0.0
+
     else:
-        raise ValueError(f"Repository {repo_id} does not contain SafeTensors weights.")
-        
-    return tensors, config, format_name, float(total_size)
+        raise ValueError(f"Repository {real_repo_id} does not contain SafeTensors or GGUF weights.")
+
diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py
index f69ce28..44e127f 100644
--- a/src/modelinfo/ui.py
+++ b/src/modelinfo/ui.py
@@ -56,6 +56,61 @@ def print_model_info(
     gpu_vram_gb: float = 0.0,
     gpu_util: float = 0.9
 ) -> None:
+    if format_name == "GGUF_group":
+        metadata = tensors.get("__metadata__", {})
+        variants = metadata.get("gguf_variants", [])
+        repo_id = metadata.get("repo_id", "")
+        
+        console.print(f"[bold]Repository:[/bold]      {repo_id}")
+        console.print("[bold]Format:[/bold]          GGUF (Multiple Quantizations)")
+        console.print(f"[bold]Architecture:[/bold]    {arch_name}")
+        if max_context:
+            console.print(f"[bold]Context Limit:[/bold]   {max_context:,} tokens")
+        console.print()
+        
+        table = Table(box=None, show_header=True, header_style="bold", pad_edge=False, padding=(0, 2))
+        table.add_column("Quantization File")
+        table.add_column("File Size", justify="right")
+        table.add_column("KV Cache", justify="right")
+        table.add_column("Total VRAM", justify="right")
+        
+        show_fits = gpu_name is not None
+        if show_fits:
+            table.add_column("Fits", justify="left")
+            
+        kv_cache_bytes = footprint["kv_cache_bytes"]
+        overhead_bytes = footprint.get("overhead_bytes", 600 * 1024 * 1024)
+        
+        sorted_variants = sorted(variants, key=lambda x: x["size"], reverse=True)
+        for var in sorted_variants:
+            filename = var["filename"]
+            size_bytes = var["size"]
+            total_vram_bytes = size_bytes + kv_cache_bytes + overhead_bytes
+            
+            file_size_str = format_bytes(size_bytes)
+            kv_cache_str = format_bytes(kv_cache_bytes)
+            
+            vram_color = get_vram_color(total_vram_bytes, max_vram_gb)
+            total_vram_str = f"[{vram_color}]~{format_bytes(total_vram_bytes)}[/{vram_color}]"
+            
+            row_data = [filename, file_size_str, kv_cache_str, total_vram_str]
+            if show_fits:
+                utilization = total_vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0
+                if utilization <= 0.90:
+                    fit_text = "[green]✓ Yes[/green]"
+                elif utilization <= 0.99:
+                    fit_text = "[yellow]⚠ Warning[/yellow]"
+                else:
+                    fit_text = "[red]✗ No[/red]"
+                row_data.append(fit_text)
+                
+            table.add_row(*row_data)
+            
+        console.print(table)
+        console.print()
+        console.print(f"[dim]Tip: To view details for a specific quantization, run: modelinfo {repo_id}/{sorted_variants[0]['filename']}[/dim]")
+        return
+
     summary = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2))
     summary.add_column("Property", style="bold")
     summary.add_column("Value")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a5792d3..fd17262 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -177,3 +177,125 @@ def fake_calculate_footprint(tensors, *, context_length, batch_size, **kwargs):
         "fetch_tensors": True,
         "timeout": 22.5,
     }
+
+
+def test_analyze_model_gguf_group(monkeypatch):
+    """Test that analyze_model correctly handles and propagates GGUF groups."""
+    from modelinfo.parsers import huggingface
+    
+    def fake_exists(path):
+        return False
+        
+    def fake_fetch(repo_id, *, fetch_tensors, timeout):
+        tensors = {
+            "__metadata__": {
+                "general.architecture": "llama",
+                "llama.block_count": 32,
+                "llama.attention.head_count_kv": 8,
+                "llama.attention.key_length": 128,
+                "gguf_variants": [
+                    {"filename": "model-q4.gguf", "size": 1000000000},
+                    {"filename": "model-q8.gguf", "size": 2000000000}
+                ],
+                "repo_id": "org/model-gguf"
+            }
+        }
+        return tensors, None, "GGUF_group", 0.0
+        
+    monkeypatch.setattr(cli.os.path, "exists", fake_exists)
+    monkeypatch.setattr(huggingface, "fetch_huggingface_repo", fake_fetch)
+    
+    def fake_calculate_footprint(*args, **kwargs):
+        return {
+            "total_params": 1000000,
+            "base_memory_bytes": 2000000.0,
+            "kv_cache_bytes": 1000000.0,
+            "overhead_bytes": 600000.0,
+            "total_memory_bytes": 3600000.0,
+            "num_layers": 32,
+            "kv_dim": 1024,
+            "primary_dtype": "Q4_0",
+            "kv_is_estimate": False,
+            "penalty_percentage": 0.0,
+            "vllm_metrics": {}
+        }
+    monkeypatch.setattr(cli, "calculate_footprint", fake_calculate_footprint)
+    
+    info = cli.analyze_model("org/model-gguf", context_override=128)
+    
+    assert info["format_name"] == "GGUF_group"
+    assert info["tensors"]["__metadata__"]["repo_id"] == "org/model-gguf"
+    assert len(info["tensors"]["__metadata__"]["gguf_variants"]) == 2
+
+
+def test_print_model_info_gguf_group(capsys):
+    """Test print_model_info renders a comparison table for GGUF groups."""
+    from modelinfo.ui import print_model_info
+    
+    tensors = {
+        "__metadata__": {
+            "general.architecture": "llama",
+            "llama.block_count": 32,
+            "llama.attention.head_count_kv": 8,
+            "llama.attention.key_length": 128,
+            "gguf_variants": [
+                {"filename": "model-q4.gguf", "size": 1000000000},
+                {"filename": "model-q8.gguf", "size": 2000000000}
+            ],
+            "repo_id": "org/model-gguf"
+        }
+    }
+    
+    footprint = {
+        "total_params": 8000000000,
+        "base_memory_bytes": 4000000000.0,
+        "kv_cache_bytes": 1000000000.0,
+        "overhead_bytes": 600000000.0,
+        "total_memory_bytes": 5600000000.0,
+        "num_layers": 32,
+        "kv_dim": 1024,
+        "primary_dtype": "Q4_0",
+        "kv_is_estimate": False,
+        "penalty_percentage": 0.0,
+        "vllm_metrics": {}
+    }
+    
+    print_model_info(
+        format_name="GGUF_group",
+        arch_name="Llama (32 layers)",
+        tensor_count=0,
+        footprint=footprint,
+        disk_size=0.0,
+        context_length=8192,
+        is_default_context=True,
+        tensors=tensors,
+        max_context=32768,
+        max_vram_gb=8.0,
+        gpu_name=None
+    )
+    
+    out, err = capsys.readouterr()
+    assert "model-q4.gguf" in out
+    assert "model-q8.gguf" in out
+    assert "Fits" not in out
+    assert "Tip:" in out
+    
+    print_model_info(
+        format_name="GGUF_group",
+        arch_name="Llama (32 layers)",
+        tensor_count=0,
+        footprint=footprint,
+        disk_size=0.0,
+        context_length=8192,
+        is_default_context=True,
+        tensors=tensors,
+        max_context=32768,
+        max_vram_gb=8.0,
+        gpu_name="RTX4080"
+    )
+    
+    out, err = capsys.readouterr()
+    assert "model-q4.gguf" in out
+    assert "model-q8.gguf" in out
+    assert "Fits" in out
+
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 10ebc91..e6cc4b6 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -82,3 +82,90 @@ def test_hf_endpoint_rejects_no_hostname(monkeypatch):
     monkeypatch.setenv("HF_ENDPOINT", "https:///repo")
     with pytest.raises(ValueError, match="must include a valid hostname"):
         _get_hf_endpoint()
+
+
+def test_remote_gguf_parsing_single(monkeypatch):
+    """Test remote GGUF parsing when a single GGUF is found in the repository."""
+    import json
+    from modelinfo.parsers import huggingface
+    
+    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
+        if "/api/models/" in url:
+            return json.dumps({
+                "siblings": [
+                    {"rfilename": "model-q4.gguf", "size": 1000000000}
+                ]
+            }).encode("utf-8")
+        elif "model-q4.gguf" in url:
+            import struct
+            header = b"GGUF" + struct.pack("<IQQ", 2, 0, 0)
+            return header
+        raise ValueError(f"Unexpected url: {url}")
+        
+    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)
+    
+    tensors, config, format_name, disk_size = huggingface.fetch_huggingface_repo("org/model-gguf")
+    
+    assert format_name == "GGUF"
+    assert disk_size == 1000000000.0
+    assert tensors.get("__metadata__") == {}
+
+
+def test_remote_gguf_parsing_group(monkeypatch):
+    """Test remote GGUF parsing when multiple GGUF files are present in the repository."""
+    import json
+    from modelinfo.parsers import huggingface
+    
+    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
+        if "/api/models/" in url:
+            return json.dumps({
+                "siblings": [
+                    {"rfilename": "model-q4.gguf", "size": 1000000000},
+                    {"rfilename": "model-q8.gguf", "size": 2000000000}
+                ]
+            }).encode("utf-8")
+        elif "model-q4.gguf" in url:
+            import struct
+            header = b"GGUF" + struct.pack("<IQQ", 2, 0, 0)
+            return header
+        raise ValueError(f"Unexpected url: {url}")
+        
+    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)
+    
+    tensors, config, format_name, disk_size = huggingface.fetch_huggingface_repo("org/model-gguf")
+    
+    assert format_name == "GGUF_group"
+    assert disk_size == 0.0
+    assert "gguf_variants" in tensors["__metadata__"]
+    assert len(tensors["__metadata__"]["gguf_variants"]) == 2
+
+
+def test_remote_gguf_parsing_explicit(monkeypatch):
+    """Test remote GGUF parsing when the user targets a specific GGUF file in the repo id."""
+    import json
+    from modelinfo.parsers import huggingface
+    
+    called_gguf = []
+    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
+        if "/api/models/" in url:
+            return json.dumps({
+                "siblings": [
+                    {"rfilename": "model-q4.gguf", "size": 1000000000},
+                    {"rfilename": "model-q8.gguf", "size": 2000000000}
+                ]
+            }).encode("utf-8")
+        elif "model-q8.gguf" in url:
+            called_gguf.append("q8")
+            import struct
+            header = b"GGUF" + struct.pack("<IQQ", 2, 0, 0)
+            return header
+        raise ValueError(f"Unexpected url: {url}")
+        
+    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)
+    
+    tensors, config, format_name, disk_size = huggingface.fetch_huggingface_repo("org/model-gguf/model-q8.gguf")
+    
+    assert format_name == "GGUF"
+    assert disk_size == 2000000000.0
+    assert called_gguf == ["q8"]
+

From 71ef3a3cc83fac164700f837457e3a7d06e9b9a4 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:18:31 -0400
Subject: [PATCH 02/12] split print_model_info test to comply with Codacy
 method size limit

---
 tests/test_cli.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index fd17262..857a225 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -228,10 +228,7 @@ def fake_calculate_footprint(*args, **kwargs):
     assert len(info["tensors"]["__metadata__"]["gguf_variants"]) == 2
 
 
-def test_print_model_info_gguf_group(capsys):
-    """Test print_model_info renders a comparison table for GGUF groups."""
-    from modelinfo.ui import print_model_info
-    
+def _get_mock_gguf_group_data():
     tensors = {
         "__metadata__": {
             "general.architecture": "llama",
@@ -245,7 +242,6 @@ def test_print_model_info_gguf_group(capsys):
             "repo_id": "org/model-gguf"
         }
     }
-    
     footprint = {
         "total_params": 8000000000,
         "base_memory_bytes": 4000000000.0,
@@ -259,7 +255,13 @@ def test_print_model_info_gguf_group(capsys):
         "penalty_percentage": 0.0,
         "vllm_metrics": {}
     }
-    
+    return tensors, footprint
+
+
+def test_print_model_info_gguf_group_no_gpu(capsys):
+    """Test print_model_info renders comparison table without Fits column when no GPU target."""
+    from modelinfo.ui import print_model_info
+    tensors, footprint = _get_mock_gguf_group_data()
     print_model_info(
         format_name="GGUF_group",
         arch_name="Llama (32 layers)",
@@ -273,13 +275,17 @@ def test_print_model_info_gguf_group(capsys):
         max_vram_gb=8.0,
         gpu_name=None
     )
-    
-    out, err = capsys.readouterr()
+    out, _ = capsys.readouterr()
     assert "model-q4.gguf" in out
     assert "model-q8.gguf" in out
     assert "Fits" not in out
     assert "Tip:" in out
-    
+
+
+def test_print_model_info_gguf_group_with_gpu(capsys):
+    """Test print_model_info renders comparison table with Fits column when GPU target exists."""
+    from modelinfo.ui import print_model_info
+    tensors, footprint = _get_mock_gguf_group_data()
     print_model_info(
         format_name="GGUF_group",
         arch_name="Llama (32 layers)",
@@ -293,9 +299,9 @@ def test_print_model_info_gguf_group(capsys):
         max_vram_gb=8.0,
         gpu_name="RTX4080"
     )
-    
-    out, err = capsys.readouterr()
+    out, _ = capsys.readouterr()
     assert "model-q4.gguf" in out
     assert "model-q8.gguf" in out
     assert "Fits" in out
 
+

From d0c5474a48b4cf4f3256b772cbf588a8c3c520e1 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:22:10 -0400
Subject: [PATCH 03/12] fix Codacy issues: add read limit, honor gpu_util,
 modularize HF parser, add error tests

---
 src/modelinfo/parsers/huggingface.py | 208 +++++++++++++++------------
 src/modelinfo/ui.py                  |   4 +-
 tests/test_parsers.py                |  33 +++++
 3 files changed, 148 insertions(+), 97 deletions(-)

diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py
index 6ddc927..f63d212 100644
--- a/src/modelinfo/parsers/huggingface.py
+++ b/src/modelinfo/parsers/huggingface.py
@@ -5,7 +5,7 @@
 import urllib.error
 import urllib.parse
 import urllib.request
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, List, Tuple
 
 def _get_hf_endpoint() -> str:
     endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co").strip()
@@ -126,6 +126,9 @@ def read(self, size: int = -1) -> bytes:
             raise NotImplementedError("Unlimited remote read is not supported.")
             
         end_pos = self.position + size
+        if end_pos > 50 * 1024 * 1024:
+            raise ValueError("Remote header read limit exceeded (50MB). File might be invalid or too large.")
+            
         while end_pos > len(self.buffer):
             start_bytes = len(self.buffer)
             end_bytes = start_bytes + self.chunk_size - 1
@@ -168,6 +171,108 @@ def close(self) -> None:
         pass
 
 
+def _fetch_remote_gguf_single(real_repo_id: str, filename: str, fallback_size: float | None, timeout: float) -> Tuple[Dict[str, Any], float]:
+    url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{filename}"
+    stream = RemoteFileStream(url, timeout=timeout)
+    from modelinfo.parsers.gguf import parse_gguf_header
+    tensors = parse_gguf_header(stream)
+    
+    size = fallback_size if fallback_size is not None else 0.0
+    if size == 0.0:
+        size = _get_remote_file_size_fallback(real_repo_id, filename, timeout)
+    return tensors, size
+
+
+def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]], timeout: float) -> Dict[str, Any]:
+    valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0]
+    if valid_sizes:
+        header_target = min(valid_sizes, key=lambda x: x["size"])
+    else:
+        header_target = gguf_files[0]
+    
+    header_file = header_target["filename"]
+    url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}"
+    stream = RemoteFileStream(url, timeout=timeout)
+    from modelinfo.parsers.gguf import parse_gguf_header
+    tensors = parse_gguf_header(stream)
+    
+    variants = []
+    for g in gguf_files:
+        v_size = g["size"]
+        if v_size is None or v_size == 0:
+            v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout)
+        variants.append({
+            "filename": g["filename"],
+            "size": float(v_size)
+        })
+    
+    tensors["__metadata__"] = tensors.get("__metadata__", {})
+    tensors["__metadata__"]["gguf_variants"] = variants
+    tensors["__metadata__"]["repo_id"] = real_repo_id
+    return tensors
+
+
+def _fetch_remote_safetensors_sharded(
+    real_repo_id: str,
+    config: Dict[str, Any] | None,
+    fetch_tensors: bool,
+    timeout: float
+) -> Tuple[Dict[str, Any], float]:
+    index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json"
+    index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8"))
+    
+    weight_map = index_data.get("weight_map", {})
+    unique_shards = list(set(weight_map.values()))
+    total_size = index_data.get("metadata", {}).get("total_size", 0.0)
+    
+    tensors = {}
+    if config and not fetch_tensors and total_size > 0:
+        for tensor_name in weight_map.keys():
+            tensors[tensor_name] = {"shape": [], "dtype": "BF16"}
+            
+        tensors["__metadata__"] = {
+            "missing_shards": 0,
+            "total_shards": len(unique_shards),
+            "is_sharded": True,
+            "lazy_fetch": True,
+            "total_size": total_size
+        }
+    else:
+        def fetch_shard(shard: str):
+            return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
+            
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
+            future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
+            for future in concurrent.futures.as_completed(future_to_shard):
+                shard, shard_header = future.result()
+                for k, v in shard_header.items():
+                    if k != "__metadata__":
+                        tensors[k] = v
+                        
+        tensors["__metadata__"] = {
+            "missing_shards": 0,
+            "total_shards": len(unique_shards),
+            "is_sharded": True
+        }
+    return tensors, float(total_size)
+
+
+def _fetch_remote_safetensors_single(real_repo_id: str, timeout: float) -> Tuple[Dict[str, Any], float]:
+    total_size = 0.0
+    req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD")
+    token = _get_hf_token()
+    if token:
+        req.add_header("Authorization", f"Bearer {token}")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as response:
+            total_size = int(response.headers.get("Content-Length", 0))
+    except Exception:
+        pass
+
+    header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout)
+    return header, float(total_size)
+
+
 def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: float = 10.0) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]:
     """
     Fetches the metadata directly from the Hugging Face Hub over the network.
@@ -213,112 +318,25 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f
         target_sibling = next((g for g in gguf_files if g["filename"] == target_filename), None)
         if not target_sibling:
             raise FileNotFoundError(f"Could not find file '{target_filename}' in Hugging Face repository '{real_repo_id}'.")
-        
-        url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{target_filename}"
-        stream = RemoteFileStream(url, timeout=timeout)
-        from modelinfo.parsers.gguf import parse_gguf_header
-        tensors = parse_gguf_header(stream)
-        
-        size = target_sibling["size"] if target_sibling["size"] is not None else 0.0
-        if size == 0.0:
-            size = _get_remote_file_size_fallback(real_repo_id, target_filename, timeout)
-            
+        tensors, size = _fetch_remote_gguf_single(real_repo_id, target_filename, target_sibling["size"], timeout)
         return tensors, config, "GGUF", float(size)
 
     # Fallback to SafeTensors checks if no specific file is target
     if "model.safetensors.index.json" in filenames:
-        index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json"
-        index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8"))
-        
-        weight_map = index_data.get("weight_map", {})
-        unique_shards = list(set(weight_map.values()))
-        
-        total_size = index_data.get("metadata", {}).get("total_size", 0.0)
-        
-        tensors = {}
-        if config and not fetch_tensors and total_size > 0:
-            for tensor_name in weight_map.keys():
-                tensors[tensor_name] = {"shape": [], "dtype": "BF16"}
-                
-            tensors["__metadata__"] = {
-                "missing_shards": 0,
-                "total_shards": len(unique_shards),
-                "is_sharded": True,
-                "lazy_fetch": True,
-                "total_size": total_size
-            }
-        else:
-            def fetch_shard(shard: str):
-                return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
-                
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
-                future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
-                for future in concurrent.futures.as_completed(future_to_shard):
-                    shard, shard_header = future.result()
-                    for k, v in shard_header.items():
-                        if k != "__metadata__":
-                            tensors[k] = v
-                            
-            tensors["__metadata__"] = {
-                "missing_shards": 0,
-                "total_shards": len(unique_shards),
-                "is_sharded": True
-            }
-        return tensors, config, "SafeTensors", float(total_size)
+        tensors, total_size = _fetch_remote_safetensors_sharded(real_repo_id, config, fetch_tensors, timeout)
+        return tensors, config, "SafeTensors", total_size
         
     elif "model.safetensors" in filenames:
-        total_size = 0.0
-        req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD")
-        token = _get_hf_token()
-        if token:
-            req.add_header("Authorization", f"Bearer {token}")
-        try:
-            with urllib.request.urlopen(req, timeout=timeout) as response:
-                total_size = int(response.headers.get("Content-Length", 0))
-        except Exception:
-            pass
-
-        header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout)
-        return header, config, "SafeTensors", float(total_size)
+        header, total_size = _fetch_remote_safetensors_single(real_repo_id, timeout)
+        return header, config, "SafeTensors", total_size
 
     elif gguf_files:
         if len(gguf_files) == 1:
             single_file = gguf_files[0]["filename"]
-            url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{single_file}"
-            stream = RemoteFileStream(url, timeout=timeout)
-            from modelinfo.parsers.gguf import parse_gguf_header
-            tensors = parse_gguf_header(stream)
-            size = gguf_files[0]["size"] if gguf_files[0]["size"] is not None else 0.0
-            if size == 0.0:
-                size = _get_remote_file_size_fallback(real_repo_id, single_file, timeout)
+            tensors, size = _fetch_remote_gguf_single(real_repo_id, single_file, gguf_files[0]["size"], timeout)
             return tensors, config, "GGUF", float(size)
         else:
-            valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0]
-            if valid_sizes:
-                header_target = min(valid_sizes, key=lambda x: x["size"])
-            else:
-                header_target = gguf_files[0]
-            
-            header_file = header_target["filename"]
-            url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}"
-            stream = RemoteFileStream(url, timeout=timeout)
-            from modelinfo.parsers.gguf import parse_gguf_header
-            tensors = parse_gguf_header(stream)
-            
-            variants = []
-            for g in gguf_files:
-                v_size = g["size"]
-                if v_size is None or v_size == 0:
-                    v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout)
-                variants.append({
-                    "filename": g["filename"],
-                    "size": float(v_size)
-                })
-            
-            tensors["__metadata__"] = tensors.get("__metadata__", {})
-            tensors["__metadata__"]["gguf_variants"] = variants
-            tensors["__metadata__"]["repo_id"] = real_repo_id
-            
+            tensors = _fetch_remote_gguf_group(real_repo_id, gguf_files, timeout)
             return tensors, config, "GGUF_group", 0.0
 
     else:
diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py
index 44e127f..b4947bf 100644
--- a/src/modelinfo/ui.py
+++ b/src/modelinfo/ui.py
@@ -96,7 +96,7 @@ def print_model_info(
             row_data = [filename, file_size_str, kv_cache_str, total_vram_str]
             if show_fits:
                 utilization = total_vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0
-                if utilization <= 0.90:
+                if utilization <= gpu_util:
                     fit_text = "[green]✓ Yes[/green]"
                 elif utilization <= 0.99:
                     fit_text = "[yellow]⚠ Warning[/yellow]"
@@ -198,7 +198,7 @@ def print_model_info(
         summary.add_row("VRAM (est):", vram_display)
         if gpu_name:
             utilization = vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0
-            if utilization <= 0.90:
+            if utilization <= gpu_util:
                 fit_text = f"[green]✓ Fits comfortably in {gpu_name} ({max_vram_gb:.1f} GB)[/green]"
             elif utilization <= 0.99:
                 fit_text = f"[yellow]⚠ Warning: Extreme hardware limit on {gpu_name}. High risk of fragmentation OOM.[/yellow]"
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index e6cc4b6..c1d0b6e 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -169,3 +169,36 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0):
     assert disk_size == 2000000000.0
     assert called_gguf == ["q8"]
 
+
+def test_remote_gguf_parsing_unauthorized(monkeypatch):
+    """Test remote parsing raises PermissionError for gated/unauthorized (401) model repositories."""
+    import urllib.error
+    from modelinfo.parsers import huggingface
+    
+    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
+        raise urllib.error.HTTPError(url, 401, "Unauthorized", {}, None)
+        
+    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)
+    
+    import pytest
+    with pytest.raises(PermissionError) as exc_info:
+        huggingface.fetch_huggingface_repo("org/gated-model")
+    assert "Gated/Private Model" in str(exc_info.value)
+
+
+def test_remote_gguf_parsing_not_found(monkeypatch):
+    """Test remote parsing raises FileNotFoundError for missing (404) model repositories."""
+    import urllib.error
+    from modelinfo.parsers import huggingface
+    
+    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
+        raise urllib.error.HTTPError(url, 404, "Not Found", {}, None)
+        
+    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)
+    
+    import pytest
+    with pytest.raises(FileNotFoundError) as exc_info:
+        huggingface.fetch_huggingface_repo("org/nonexistent-model")
+    assert "Could not find repository on Hugging Face" in str(exc_info.value)
+
+

From bebe2c170f43ffd06716646b4ea263b27e4d10b7 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:24:08 -0400
Subject: [PATCH 04/12] refactor: split concurrent shards fetching to lower
 cyclomatic complexity

---
 src/modelinfo/parsers/huggingface.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py
index f63d212..00f09e3 100644
--- a/src/modelinfo/parsers/huggingface.py
+++ b/src/modelinfo/parsers/huggingface.py
@@ -212,6 +212,21 @@ def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]]
     return tensors
 
 
+def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Dict[str, Any]:
+    def fetch_shard(shard: str):
+        return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
+        
+    tensors = {}
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
+        future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
+        for future in concurrent.futures.as_completed(future_to_shard):
+            shard, shard_header = future.result()
+            for k, v in shard_header.items():
+                if k != "__metadata__":
+                    tensors[k] = v
+    return tensors
+
+
 def _fetch_remote_safetensors_sharded(
     real_repo_id: str,
     config: Dict[str, Any] | None,
@@ -238,17 +253,7 @@ def _fetch_remote_safetensors_sharded(
             "total_size": total_size
         }
     else:
-        def fetch_shard(shard: str):
-            return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
-            
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
-            future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
-            for future in concurrent.futures.as_completed(future_to_shard):
-                shard, shard_header = future.result()
-                for k, v in shard_header.items():
-                    if k != "__metadata__":
-                        tensors[k] = v
-                        
+        tensors = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout)
         tensors["__metadata__"] = {
             "missing_shards": 0,
             "total_shards": len(unique_shards),
@@ -257,6 +262,7 @@ def fetch_shard(shard: str):
     return tensors, float(total_size)
 
 
+
 def _fetch_remote_safetensors_single(real_repo_id: str, timeout: float) -> Tuple[Dict[str, Any], float]:
     total_size = 0.0
     req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD")

From 6555e0ece345c6d674b0aa8c23ea3194d98ad67f Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:27:49 -0400
Subject: [PATCH 05/12] fix codacy issues: compute GGUF group variant overhead
 dynamically

---
 src/modelinfo/ui.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py
index b4947bf..11431e1 100644
--- a/src/modelinfo/ui.py
+++ b/src/modelinfo/ui.py
@@ -79,13 +79,15 @@ def print_model_info(
             table.add_column("Fits", justify="left")
             
         kv_cache_bytes = footprint["kv_cache_bytes"]
-        overhead_bytes = footprint.get("overhead_bytes", 600 * 1024 * 1024)
+        penalty_percentage = footprint.get("penalty_percentage", 0.0)
+        cuda_overhead = 600 * 1024 * 1024 * gpu_count
         
         sorted_variants = sorted(variants, key=lambda x: x["size"], reverse=True)
         for var in sorted_variants:
             filename = var["filename"]
             size_bytes = var["size"]
-            total_vram_bytes = size_bytes + kv_cache_bytes + overhead_bytes
+            variant_overhead = cuda_overhead + (size_bytes * penalty_percentage)
+            total_vram_bytes = size_bytes + kv_cache_bytes + variant_overhead
             
             file_size_str = format_bytes(size_bytes)
             kv_cache_str = format_bytes(kv_cache_bytes)

From 357ee166baaae5ccc5d86a1611986a34518bf2b9 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:31:19 -0400
Subject: [PATCH 06/12] docs: document remote gguf inspection options in
 README.md

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3dfef3c..f5186a4 100644
--- a/README.md
+++ b/README.md
@@ -64,10 +64,17 @@ Inspect a local model checkpoint:
 modelinfo mistral-7b.safetensors
 ```
 
-Inspect a remote model directly from the Hugging Face Hub:
+Inspect a remote model directly from the Hugging Face Hub (both SafeTensors and GGUF):
 
 ```bash
+# Inspect a remote SafeTensors repository
 modelinfo meta-llama/Llama-2-7b-hf
+
+# Inspect a remote GGUF repository (shows a comparison table of all quantizations)
+modelinfo bartowski/Meta-Llama-3-8B-Instruct-GGUF
+
+# Inspect a specific remote GGUF file in a repository
+modelinfo bartowski/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf
 ```
 
 For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens).

From 5a8e6e276b64597ec7d74d577335df08f703eb39 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:50:08 -0400
Subject: [PATCH 07/12] fix: strip trailing slashes from model paths at
 entrypoint

---
 src/modelinfo/cli.py |  4 ++++
 tests/test_cli.py    | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py
index cb4be02..f4785eb 100644
--- a/src/modelinfo/cli.py
+++ b/src/modelinfo/cli.py
@@ -240,6 +240,10 @@ def analyze_model(
 def main(argv: Sequence[str] | None = None) -> int:
     args = parse_args(argv)
 
+    # Strip trailing slashes from paths/repos to prevent empty basenames and routing issues
+    if args.file:
+        args.file = [path.rstrip("/") for path in args.file if path]
+
     gpu_name_display = None
     gpu_vram_gb = None
     gpu_count = 1
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 857a225..97fd544 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -304,4 +304,47 @@ def test_print_model_info_gguf_group_with_gpu(capsys):
     assert "model-q8.gguf" in out
     assert "Fits" in out
 
+def test_cli_strips_trailing_slashes_from_model_paths(monkeypatch):
+    captured_paths = []
+    
+    def fake_analyze_model(file_path, *args, **kwargs):
+        captured_paths.append(file_path)
+        return {
+            "format_name": "GGUF",
+            "arch_name": "Llama",
+            "tensor_count": 10,
+            "footprint": {
+                "total_params": 100,
+                "base_memory_bytes": 200,
+                "kv_cache_bytes": 100,
+                "overhead_bytes": 50,
+                "total_memory_bytes": 350,
+                "num_layers": 1,
+            },
+            "disk_size": 200,
+            "context_length": 128,
+            "is_default_context": True,
+            "tensors": {},
+            "max_context": 512,
+            "is_lazy": False,
+            "gpu_count": 1,
+            "topology": "pcie4",
+            "strategy": "tp",
+            "is_vllm": False,
+            "gpu_vram_gb": 0.0,
+            "gpu_util": 0.9,
+        }
+
+    monkeypatch.setattr(cli, "analyze_model", fake_analyze_model)
+    monkeypatch.setattr(cli, "print_compare_info", lambda models, max_vram, gpu_name: None)
+    monkeypatch.setattr(cli, "print_model_info", lambda *args, **kwargs: None)
+
+    # Test single model path with trailing slash
+    cli.main(["meta-llama/Llama-2-7b-hf/"])
+    assert captured_paths == ["meta-llama/Llama-2-7b-hf"]
+
+    captured_paths.clear()
 
+    # Test multiple model paths with trailing slashes (side-by-side comparison)
+    cli.main(["meta-llama/Llama-2-7b-hf/", "mistralai/Mistral-7B-v0.1/"])
+    assert captured_paths == ["meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1"]

From 7dc85762a206b775724ba0c2db222d72bb02ca58 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:50:13 -0400
Subject: [PATCH 08/12] fix: handle reverse tensor shape ordering for gguf
 shape guessing

---
 src/modelinfo/architecture.py |  5 ++--
 tests/test_calculator.py      | 43 +++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/modelinfo/architecture.py b/src/modelinfo/architecture.py
index b398213..bef7237 100644
--- a/src/modelinfo/architecture.py
+++ b/src/modelinfo/architecture.py
@@ -11,6 +11,7 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None)
 
     metadata = tensors.get("__metadata__", {})
     gen_arch = metadata.get("general.architecture")
+    is_gguf = "general.architecture" in metadata or any(k.startswith("general.") for k in metadata.keys())
 
     # 1. Attempt explicit GGUF metadata
     if gen_arch:
@@ -68,14 +69,14 @@ def extract_architecture(tensors: Dict[str, Any], config: Dict[str, Any] = None)
             found_k_proj = True
             shape = meta.get("shape", [])
             if len(shape) >= 2:
-                kv_dim = shape[0]
+                kv_dim = shape[-1] if is_gguf else shape[0]
 
         if "qkv_proj.weight" in name or "c_attn.weight" in name:
             found_fused = True
             if not found_k_proj:
                 shape = meta.get("shape", [])
                 if len(shape) >= 2:
-                    kv_dim = shape[0] // 3
+                    kv_dim = (shape[-1] if is_gguf else shape[0]) // 3
 
     num_layers = len(layers_set)
     if found_fused and not found_k_proj and kv_dim > 0:
diff --git a/tests/test_calculator.py b/tests/test_calculator.py
index 1ef701c..94cf3ea 100644
--- a/tests/test_calculator.py
+++ b/tests/test_calculator.py
@@ -169,3 +169,46 @@ def test_vllm_capacity_simulation():
     bytes_per_token = 40960
     expected_capacity = math.floor(metrics["paged_kv_pool"] / bytes_per_token)
     assert metrics["max_serving_capacity"] == expected_capacity
+
+
+def test_gguf_shape_guessing_fallback():
+    """Verify that shape guessing logic correctly extracts kv_dim using GGUF column-major ordering (shape[-1]) when metadata has no explicit keys."""
+    from modelinfo.architecture import extract_architecture
+
+    tensors = {
+        "__metadata__": {
+            "general.architecture": "llama",
+        },
+        "model.layers.0.self_attn.k_proj.weight": {
+            "shape": [4096, 1024],
+            "dtype": "F16"
+        },
+        "model.layers.1.self_attn.k_proj.weight": {
+            "shape": [4096, 1024],
+            "dtype": "F16"
+        }
+    }
+
+    num_layers, kv_dim, is_estimate = extract_architecture(tensors)
+    assert num_layers == 2
+    assert kv_dim == 1024
+    assert is_estimate is False
+
+def test_gguf_shape_guessing_fallback_fused():
+    """Verify that fused shape guessing extracts (shape[-1] // 3) for GGUF tensors."""
+    from modelinfo.architecture import extract_architecture
+
+    tensors = {
+        "__metadata__": {
+            "general.architecture": "gpt2",
+        },
+        "model.layers.0.self_attn.qkv_proj.weight": {
+            "shape": [4096, 3072],
+            "dtype": "F16"
+        }
+    }
+
+    num_layers, kv_dim, is_estimate = extract_architecture(tensors)
+    assert num_layers == 1
+    assert kv_dim == 1024
+    assert is_estimate is True

From 0ef126b246c42cf76873d3a9f55742f87d2c1dc3 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:50:13 -0400
Subject: [PATCH 09/12] fix: treat paths starting with a local prefix as local
 files to prevent remote routing

---
 src/modelinfo/cli.py | 10 ++++++++--
 tests/test_cli.py    | 29 +++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py
index cb4be02..c9cb3ae 100644
--- a/src/modelinfo/cli.py
+++ b/src/modelinfo/cli.py
@@ -151,8 +151,14 @@ def analyze_model(
     
     is_remote = False
     if not os.path.exists(file_path):
-        if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
-            is_remote = True
+        # NOTE: prevent routing explicit local paths or typos to Hugging Face
+        is_local_path = (
+            file_path.startswith((".", "/", "~"))
+            or os.path.isabs(file_path)
+        )
+        if not is_local_path:
+            if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
+                is_remote = True
 
     if is_remote:
         from modelinfo.parsers.huggingface import fetch_huggingface_repo
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 857a225..a1709bf 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -304,4 +304,33 @@ def test_print_model_info_gguf_group_with_gpu(capsys):
     assert "model-q8.gguf" in out
     assert "Fits" in out
 
+def test_analyze_model_local_path_routing(monkeypatch):
+    """Test that analyze_model treats paths starting with a local prefix as local, raising an error instead of routing to Hugging Face."""
+    from modelinfo.parsers import huggingface
+
+    hf_fetched = []
+    def fake_fetch(repo_id, *, fetch_tensors, timeout):
+        hf_fetched.append(repo_id)
+        return {}, None, "SafeTensors", 0.0
+
+    monkeypatch.setattr(huggingface, "fetch_huggingface_repo", fake_fetch)
 
+    # Test cases that should NOT hit Hugging Face
+    local_paths = ["./missing.gguf", "../missing.safetensors", "/missing.bin", "~/missing.pt"]
+    for path in local_paths:
+        with pytest.raises((FileNotFoundError, ValueError, OSError)):
+            cli.analyze_model(path, context_override=128)
+
+    assert len(hf_fetched) == 0, f"Hugging Face fetch was triggered for local paths: {hf_fetched}"
+
+    # Test cases that SHOULD hit Hugging Face
+    remote_paths = ["meta-llama/Llama-2-7b-hf", "org/model"]
+    for path in remote_paths:
+        try:
+            cli.analyze_model(path, context_override=128)
+        except Exception:
+            # We don't care if calculation fails later because of empty dict from fake_fetch,
+            # we just care that it triggers fetch_huggingface_repo.
+            pass
+
+    assert hf_fetched == remote_paths

From b0b97445509c4ae653c92d1f7c02be5de5b82f4a Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:50:21 -0400
Subject: [PATCH 10/12] fix: handle concurrent remote shard download failures
 gracefully

---
 src/modelinfo/parsers/huggingface.py | 26 +++++++++++------
 tests/test_parsers.py                | 43 ++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/src/modelinfo/parsers/huggingface.py b/src/modelinfo/parsers/huggingface.py
index 00f09e3..b36dd7f 100644
--- a/src/modelinfo/parsers/huggingface.py
+++ b/src/modelinfo/parsers/huggingface.py
@@ -212,19 +212,27 @@ def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]]
     return tensors
 
 
-def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Dict[str, Any]:
+def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Tuple[Dict[str, Any], int]:
     def fetch_shard(shard: str):
-        return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
+        try:
+            header = _fetch_safetensors_header(real_repo_id, shard, timeout=timeout)
+            return shard, header, None
+        except Exception as e:
+            return shard, {}, e
         
     tensors = {}
+    missing_shards = 0
     with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor:
         future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards}
         for future in concurrent.futures.as_completed(future_to_shard):
-            shard, shard_header = future.result()
-            for k, v in shard_header.items():
-                if k != "__metadata__":
-                    tensors[k] = v
-    return tensors
+            shard, shard_header, error = future.result()
+            if error is not None:
+                missing_shards += 1
+            else:
+                for k, v in shard_header.items():
+                    if k != "__metadata__":
+                        tensors[k] = v
+    return tensors, missing_shards
 
 
 def _fetch_remote_safetensors_sharded(
@@ -253,9 +261,9 @@ def _fetch_remote_safetensors_sharded(
             "total_size": total_size
         }
     else:
-        tensors = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout)
+        tensors, missing_shards = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout)
         tensors["__metadata__"] = {
-            "missing_shards": 0,
+            "missing_shards": missing_shards,
             "total_shards": len(unique_shards),
             "is_sharded": True
         }
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index c1d0b6e..a5d4a07 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -201,4 +201,47 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0):
         huggingface.fetch_huggingface_repo("org/nonexistent-model")
     assert "Could not find repository on Hugging Face" in str(exc_info.value)
 
+def test_remote_shard_download_failure(monkeypatch):
+    """Test remote sharded safetensors parsing when one of the shard downloads fails."""
+    import json
+    import struct
+    import urllib.error
+    from modelinfo.parsers import huggingface
 
+    def fake_make_request(url, headers=None, limit=None, timeout=10.0):
+        if "/api/models/" in url:
+            return json.dumps({
+                "siblings": [
+                    {"rfilename": "model.safetensors.index.json"},
+                    {"rfilename": "model-00001-of-00002.safetensors"},
+                    {"rfilename": "model-00002-of-00002.safetensors"}
+                ]
+            }).encode("utf-8")
+        elif "model.safetensors.index.json" in url:
+            return json.dumps({
+                "metadata": {"total_size": 2000000000},
+                "weight_map": {
+                    "layer1.weight": "model-00001-of-00002.safetensors",
+                    "layer2.weight": "model-00002-of-00002.safetensors"
+                }
+            }).encode("utf-8")
+        elif "model-00001-of-00002.safetensors" in url:
+            header_json = json.dumps({"layer1.weight": {"dtype": "BF16", "shape": [1024, 1024]}}).encode("utf-8")
+            return struct.pack("<Q", len(header_json)) + header_json
+        elif "model-00002-of-00002.safetensors" in url:
+            raise urllib.error.HTTPError(url, 502, "Bad Gateway", {}, None)
+        raise ValueError(f"Unexpected url: {url}")
+
+    monkeypatch.setattr(huggingface, "_make_request", fake_make_request)
+
+    tensors, config, format_name, disk_size = huggingface.fetch_huggingface_repo(
+        "org/sharded-safetensors-model", fetch_tensors=True
+    )
+
+    assert format_name == "SafeTensors"
+    assert disk_size == 2000000000.0
+    assert tensors["__metadata__"]["missing_shards"] == 1
+    assert tensors["__metadata__"]["total_shards"] == 2
+    assert tensors["__metadata__"]["is_sharded"] is True
+    assert "layer1.weight" in tensors
+    assert "layer2.weight" not in tensors

From 9ba40d006af6175d6de30965d9ff089b65e6936b Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:50:35 -0400
Subject: [PATCH 11/12] fix: resolve safetensors shard index prefix splitting

---
 src/modelinfo/parsers/safetensors.py |  8 +++++-
 tests/test_parsers.py                | 38 ++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/modelinfo/parsers/safetensors.py b/src/modelinfo/parsers/safetensors.py
index 5d0289f..887f5c7 100644
--- a/src/modelinfo/parsers/safetensors.py
+++ b/src/modelinfo/parsers/safetensors.py
@@ -31,7 +31,13 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
     if path.endswith(".index.json"):
         is_index = True
     elif "-of-" in base_name and path.endswith(".safetensors"):
-        prefix = base_name.split("-")[0]
+        import re
+        match = re.match(r"^(.*?)-\d{5}-of-\d{5}\.safetensors$", base_name)
+        if match:
+            prefix = match.group(1)
+        else:
+            # Fall back to the text before the first hyphen for non-standard shard names
+            prefix = base_name.split("-")[0]
         potential_index = os.path.join(dir_path, f"{prefix}.safetensors.index.json")
         if os.path.exists(potential_index):
             index_path = potential_index
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index c1d0b6e..91b773f 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -202,3 +202,41 @@ def fake_make_request(url, headers=None, limit=None, timeout=10.0):
     assert "Could not find repository on Hugging Face" in str(exc_info.value)
 
 
+
+
+def test_safetensors_sharded_with_hyphens(tmp_path):
+    """Test that the safetensors parser resolves the shard index path when the shard filename prefix contains hyphens."""
+    import struct
+    import json
+    
+    index_file = tmp_path / "mock-llama-3-8b.safetensors.index.json"
+    shard_file = tmp_path / "mock-llama-3-8b-00001-of-00002.safetensors"
+    
+    index_data = {
+        "weight_map": {
+            "model.embed_tokens.weight": "mock-llama-3-8b-00001-of-00002.safetensors"
+        }
+    }
+    index_file.write_text(json.dumps(index_data), encoding="utf-8")
+    
+    header_data = {
+        "model.embed_tokens.weight": {
+            "dtype": "BF16",
+            "shape": [32000, 4096],
+            "data_offsets": [0, 262144000]
+        }
+    }
+    header_json = json.dumps(header_data).encode("utf-8")
+    header_len = len(header_json)
+    
+    with open(shard_file, "wb") as f:
+        f.write(struct.pack("<Q", header_len))
+        f.write(header_json)
+        
+    tensors = parse_safetensors_header(str(shard_file))
+    
+    assert tensors.get("__metadata__", {}).get("is_sharded") is True
+    assert tensors.get("__metadata__", {}).get("total_shards") == 1
+    assert tensors.get("__metadata__", {}).get("missing_shards") == 0
+    assert "model.embed_tokens.weight" in tensors
+    assert tensors["model.embed_tokens.weight"]["dtype"] == "BF16"

From 32bf12bfca6fe28dc1b11f159c74577af0a38de8 Mon Sep 17 00:00:00 2001
From: Felipe Arce <felipearce.2004@gmail.com>
Date: Sat, 27 Jun 2026 11:56:14 -0400
Subject: [PATCH 12/12] fix: address codacy review feedback on disk size,
 regex, path parsing, and test helper

---
 src/modelinfo/cli.py                 | 13 +++++++++----
 src/modelinfo/parsers/safetensors.py |  8 ++++++--
 tests/test_cli.py                    | 14 +-------------
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py
index b0b09b3..56c27d0 100644
--- a/src/modelinfo/cli.py
+++ b/src/modelinfo/cli.py
@@ -157,7 +157,8 @@ def analyze_model(
             or os.path.isabs(file_path)
         )
         if not is_local_path:
-            if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
+            # Treat as remote only if it contains a slash and does not end with a model extension
+            if "/" in file_path and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")):
                 is_remote = True
 
     if is_remote:
@@ -218,8 +219,12 @@ def analyze_model(
     num_layers = footprint["num_layers"]
     arch_name = identify_architecture_name(tensors, num_layers, config)
 
-    if os.path.exists(file_path):
-        disk_size = os.path.getsize(file_path)
+    if not is_remote:
+        metadata = tensors.get("__metadata__", {})
+        if metadata.get("is_sharded") and "disk_size" in metadata:
+            disk_size = metadata["disk_size"]
+        elif os.path.exists(file_path):
+            disk_size = os.path.getsize(file_path)
         
     tensor_count = len([k for k in tensors.keys() if k != "__metadata__"])
     
@@ -248,7 +253,7 @@ def main(argv: Sequence[str] | None = None) -> int:
 
     # Strip trailing slashes from paths/repos to prevent empty basenames and routing issues
     if args.file:
-        args.file = [path.rstrip("/") for path in args.file if path]
+        args.file = [path.rstrip("/\\") for path in args.file if path]
 
     gpu_name_display = None
     gpu_vram_gb = None
diff --git a/src/modelinfo/parsers/safetensors.py b/src/modelinfo/parsers/safetensors.py
index 887f5c7..2e7d705 100644
--- a/src/modelinfo/parsers/safetensors.py
+++ b/src/modelinfo/parsers/safetensors.py
@@ -32,7 +32,7 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
         is_index = True
     elif "-of-" in base_name and path.endswith(".safetensors"):
         import re
-        match = re.match(r"^(.*?)-\d{5}-of-\d{5}\.safetensors$", base_name)
+        match = re.match(r"^(.*?)-\d+-of-\d+\.safetensors$", base_name)
         if match:
             prefix = match.group(1)
         else:
@@ -55,9 +55,12 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
     tensors = {}
     missing_shards = 0
     total_shards = len(unique_shards)
+    total_size = 0
     
     for shard in unique_shards:
         shard_path = os.path.join(dir_path, shard)
+        if os.path.exists(shard_path):
+            total_size += os.path.getsize(shard_path)
         try:
             shard_header = _read_single_header(shard_path)
             for k, v in shard_header.items():
@@ -69,7 +72,8 @@ def parse_safetensors_header(path: str) -> dict[str, Any]:
     tensors["__metadata__"] = {
         "missing_shards": missing_shards,
         "total_shards": total_shards,
-        "is_sharded": True
+        "is_sharded": True,
+        "disk_size": total_size
     }
     
     return tensors
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 0d7618f..267b98d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -187,19 +187,7 @@ def fake_exists(path):
         return False
         
     def fake_fetch(repo_id, *, fetch_tensors, timeout):
-        tensors = {
-            "__metadata__": {
-                "general.architecture": "llama",
-                "llama.block_count": 32,
-                "llama.attention.head_count_kv": 8,
-                "llama.attention.key_length": 128,
-                "gguf_variants": [
-                    {"filename": "model-q4.gguf", "size": 1000000000},
-                    {"filename": "model-q8.gguf", "size": 2000000000}
-                ],
-                "repo_id": "org/model-gguf"
-            }
-        }
+        tensors, _ = _get_mock_gguf_group_data()
         return tensors, None, "GGUF_group", 0.0
         
     monkeypatch.setattr(cli.os.path, "exists", fake_exists)