diff --git a/README.md b/README.md index 3dfef3c..f5186a4 100644 --- a/README.md +++ b/README.md @@ -64,10 +64,17 @@ Inspect a local model checkpoint: modelinfo mistral-7b.safetensors ``` -Inspect a remote model directly from the Hugging Face Hub: +Inspect a remote model directly from the Hugging Face Hub (both SafeTensors and GGUF): ```bash +# Inspect a remote SafeTensors repository modelinfo meta-llama/Llama-2-7b-hf + +# Inspect a remote GGUF repository (shows a comparison table of all quantizations) +modelinfo bartowski/Meta-Llama-3-8B-Instruct-GGUF + +# Inspect a specific remote GGUF file in a repository +modelinfo bartowski/Meta-Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf ``` For gated models (e.g., Llama 2), you must provide authentication by setting the `HF_TOKEN` environment variable. You can create a token in your [Hugging Face settings](https://huggingface.co/settings/tokens). diff --git a/src/modelinfo/cli.py b/src/modelinfo/cli.py index c89b6d7..cb4be02 100644 --- a/src/modelinfo/cli.py +++ b/src/modelinfo/cli.py @@ -149,7 +149,12 @@ def analyze_model( file_path_lower = file_path.lower() - if not os.path.exists(file_path) and not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): + is_remote = False + if not os.path.exists(file_path): + if "/" in file_path or not file_path_lower.endswith((".safetensors", ".gguf", ".pt", ".bin", ".index.json")): + is_remote = True + + if is_remote: from modelinfo.parsers.huggingface import fetch_huggingface_repo tensors, config, format_name, disk_size = fetch_huggingface_repo( file_path, fetch_tensors=fetch_tensors, timeout=timeout @@ -180,7 +185,7 @@ def analyze_model( max_context = None if config: max_context = config.get("max_position_embeddings") - elif format_name == "GGUF": + elif format_name in ("GGUF", "GGUF_group"): metadata = tensors.get("__metadata__", {}) gen_arch = metadata.get("general.architecture") if gen_arch: @@ -207,8 +212,8 @@ def analyze_model( num_layers = footprint["num_layers"] arch_name = identify_architecture_name(tensors, num_layers, config) - if format_name != "SafeTensors" or os.path.exists(file_path): - disk_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0.0 + if os.path.exists(file_path): + disk_size = os.path.getsize(file_path) tensor_count = len([k for k in tensors.keys() if k != "__metadata__"]) diff --git a/src/modelinfo/parsers/gguf.py b/src/modelinfo/parsers/gguf.py index 5f3d210..3af2fb4 100644 --- a/src/modelinfo/parsers/gguf.py +++ b/src/modelinfo/parsers/gguf.py @@ -44,46 +44,53 @@ def _read_gguf_value(f: Any, val_type: int) -> Any: raise ValueError(f"Unknown GGUF value type: {val_type}") -def parse_gguf_header(path: str) -> Dict[str, Any]: +def parse_gguf_header(path_or_file: str | Any) -> Dict[str, Any]: """Parses a GGUF file header and extracts tensor information.""" + if isinstance(path_or_file, str): + with open(path_or_file, "rb") as f: + return _parse_gguf_header_from_stream(f) + else: + return _parse_gguf_header_from_stream(path_or_file) + + +def _parse_gguf_header_from_stream(f: Any) -> Dict[str, Any]: tensors: Dict[str, Any] = {} + magic = f.read(4) + if magic != b"GGUF": + raise ValueError("Invalid GGUF file: Magic bytes missing.") + + version = struct.unpack(" str: endpoint = os.environ.get("HF_ENDPOINT", "https://huggingface.co").strip() @@ -101,19 +101,205 @@ def _fetch_safetensors_header(repo_id: str, filename: str, timeout: float = 10.0 return json.loads(json_bytes) +def _get_remote_file_size_fallback(repo_id: str, filename: str, timeout: float = 10.0) -> float: + req = urllib.request.Request(f"{_get_hf_endpoint()}/{repo_id}/resolve/main/{filename}", method="HEAD") + token = _get_hf_token() + if token: + req.add_header("Authorization", f"Bearer {token}") + try: + with urllib.request.urlopen(req, timeout=timeout) as response: + return float(response.headers.get("Content-Length", 0)) + except Exception: + return 0.0 + + +class RemoteFileStream: + def __init__(self, url: str, chunk_size: int = 1024*1024, timeout: float = 10.0): + self.url = url + self.chunk_size = chunk_size + self.timeout = timeout + self.buffer = b"" + self.position = 0 + + def read(self, size: int = -1) -> bytes: + if size == -1: + raise NotImplementedError("Unlimited remote read is not supported.") + + end_pos = self.position + size + if end_pos > 50 * 1024 * 1024: + raise ValueError("Remote header read limit exceeded (50MB). File might be invalid or too large.") + + while end_pos > len(self.buffer): + start_bytes = len(self.buffer) + end_bytes = start_bytes + self.chunk_size - 1 + + headers = {"Range": f"bytes={start_bytes}-{end_bytes}"} + try: + chunk = _make_request( + self.url, + headers=headers, + limit=self.chunk_size, + timeout=self.timeout + ) + if not chunk: + break + self.buffer += chunk + except urllib.error.HTTPError as e: + if e.code == 416: + break + raise + except Exception: + raise + + result = self.buffer[self.position:self.position+size] + self.position += len(result) + return result + + def seek(self, offset: int, whence: int = 0) -> int: + if whence == 0: + self.position = offset + elif whence == 1: + self.position += offset + else: + raise NotImplementedError("Seek from end is not supported.") + return self.position + + def tell(self) -> int: + return self.position + + def close(self) -> None: + pass + + +def _fetch_remote_gguf_single(real_repo_id: str, filename: str, fallback_size: float | None, timeout: float) -> Tuple[Dict[str, Any], float]: + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{filename}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + + size = fallback_size if fallback_size is not None else 0.0 + if size == 0.0: + size = _get_remote_file_size_fallback(real_repo_id, filename, timeout) + return tensors, size + + +def _fetch_remote_gguf_group(real_repo_id: str, gguf_files: List[Dict[str, Any]], timeout: float) -> Dict[str, Any]: + valid_sizes = [g for g in gguf_files if g["size"] is not None and g["size"] > 0] + if valid_sizes: + header_target = min(valid_sizes, key=lambda x: x["size"]) + else: + header_target = gguf_files[0] + + header_file = header_target["filename"] + url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/{header_file}" + stream = RemoteFileStream(url, timeout=timeout) + from modelinfo.parsers.gguf import parse_gguf_header + tensors = parse_gguf_header(stream) + + variants = [] + for g in gguf_files: + v_size = g["size"] + if v_size is None or v_size == 0: + v_size = _get_remote_file_size_fallback(real_repo_id, g["filename"], timeout) + variants.append({ + "filename": g["filename"], + "size": float(v_size) + }) + + tensors["__metadata__"] = tensors.get("__metadata__", {}) + tensors["__metadata__"]["gguf_variants"] = variants + tensors["__metadata__"]["repo_id"] = real_repo_id + return tensors + + +def _fetch_shards_concurrently(real_repo_id: str, unique_shards: List[str], timeout: float) -> Dict[str, Any]: + def fetch_shard(shard: str): + return shard, _fetch_safetensors_header(real_repo_id, shard, timeout=timeout) + + tensors = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: + future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} + for future in concurrent.futures.as_completed(future_to_shard): + shard, shard_header = future.result() + for k, v in shard_header.items(): + if k != "__metadata__": + tensors[k] = v + return tensors + + +def _fetch_remote_safetensors_sharded( + real_repo_id: str, + config: Dict[str, Any] | None, + fetch_tensors: bool, + timeout: float +) -> Tuple[Dict[str, Any], float]: + index_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors.index.json" + index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8")) + + weight_map = index_data.get("weight_map", {}) + unique_shards = list(set(weight_map.values())) + total_size = index_data.get("metadata", {}).get("total_size", 0.0) + + tensors = {} + if config and not fetch_tensors and total_size > 0: + for tensor_name in weight_map.keys(): + tensors[tensor_name] = {"shape": [], "dtype": "BF16"} + + tensors["__metadata__"] = { + "missing_shards": 0, + "total_shards": len(unique_shards), + "is_sharded": True, + "lazy_fetch": True, + "total_size": total_size + } + else: + tensors = _fetch_shards_concurrently(real_repo_id, unique_shards, timeout) + tensors["__metadata__"] = { + "missing_shards": 0, + "total_shards": len(unique_shards), + "is_sharded": True + } + return tensors, float(total_size) + + + +def _fetch_remote_safetensors_single(real_repo_id: str, timeout: float) -> Tuple[Dict[str, Any], float]: + total_size = 0.0 + req = urllib.request.Request(f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/model.safetensors", method="HEAD") + token = _get_hf_token() + if token: + req.add_header("Authorization", f"Bearer {token}") + try: + with urllib.request.urlopen(req, timeout=timeout) as response: + total_size = int(response.headers.get("Content-Length", 0)) + except Exception: + pass + + header = _fetch_safetensors_header(real_repo_id, "model.safetensors", timeout=timeout) + return header, float(total_size) + + def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: float = 10.0) -> Tuple[Dict[str, Any], Dict[str, Any] | None, str, float]: """ Fetches the metadata directly from the Hugging Face Hub over the network. Returns: (tensors, config, format_name, disk_size) """ - api_url = f"{_get_hf_endpoint()}/api/models/{repo_id}" + target_filename = None + parts = repo_id.split("/") + if len(parts) >= 3 and parts[-1].lower().endswith(".gguf"): + real_repo_id = "/".join(parts[:2]) + target_filename = "/".join(parts[2:]) + else: + real_repo_id = repo_id + + api_url = f"{_get_hf_endpoint()}/api/models/{real_repo_id}" try: api_data = json.loads(_make_request(api_url, timeout=timeout).decode("utf-8")) except urllib.error.HTTPError as e: if e.code == 401: - raise PermissionError(f"Gated/Private Model (401 Unauthorized). Set the HF_TOKEN environment variable to access {repo_id}") + raise PermissionError(f"Gated/Private Model (401 Unauthorized). Set the HF_TOKEN environment variable to access {real_repo_id}") if e.code == 404: - raise FileNotFoundError(f"Could not find repository on Hugging Face (404 Not Found): {repo_id}") + raise FileNotFoundError(f"Could not find repository on Hugging Face (404 Not Found): {real_repo_id}") raise siblings = api_data.get("siblings", []) @@ -121,73 +307,44 @@ def fetch_huggingface_repo(repo_id: str, fetch_tensors: bool = False, timeout: f config = None if "config.json" in filenames: - config_url = f"{_get_hf_endpoint()}/{repo_id}/resolve/main/config.json" + config_url = f"{_get_hf_endpoint()}/{real_repo_id}/resolve/main/config.json" config = json.loads(_make_request(config_url, timeout=timeout).decode("utf-8")) - - tensors = {} - total_size = 0.0 - + + # Find GGUF siblings + gguf_files = [] + for s in siblings: + fname = s.get("rfilename", "") + if fname.lower().endswith(".gguf"): + gguf_files.append({ + "filename": fname, + "size": s.get("size") + }) + + if target_filename: + target_sibling = next((g for g in gguf_files if g["filename"] == target_filename), None) + if not target_sibling: + raise FileNotFoundError(f"Could not find file '{target_filename}' in Hugging Face repository '{real_repo_id}'.") + tensors, size = _fetch_remote_gguf_single(real_repo_id, target_filename, target_sibling["size"], timeout) + return tensors, config, "GGUF", float(size) + + # Fallback to SafeTensors checks if no specific file is target if "model.safetensors.index.json" in filenames: - # Sharded SafeTensors - index_url = f"{_get_hf_endpoint()}/{repo_id}/resolve/main/model.safetensors.index.json" - index_data = json.loads(_make_request(index_url, timeout=timeout).decode("utf-8")) - - weight_map = index_data.get("weight_map", {}) - unique_shards = list(set(weight_map.values())) - - total_size = index_data.get("metadata", {}).get("total_size", 0.0) - - if config and not fetch_tensors and total_size > 0: - # Lazy Fetch Paradigm - for tensor_name in weight_map.keys(): - tensors[tensor_name] = {"shape": [], "dtype": "BF16"} - - tensors["__metadata__"] = { - "missing_shards": 0, - "total_shards": len(unique_shards), - "is_sharded": True, - "lazy_fetch": True, - "total_size": total_size - } - else: - def fetch_shard(shard: str): - return shard, _fetch_safetensors_header(repo_id, shard, timeout=timeout) - - with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, min(8, len(unique_shards)))) as executor: - future_to_shard = {executor.submit(fetch_shard, shard): shard for shard in unique_shards} - for future in concurrent.futures.as_completed(future_to_shard): - shard, shard_header = future.result() - for k, v in shard_header.items(): - if k != "__metadata__": - tensors[k] = v - - tensors["__metadata__"] = { - "missing_shards": 0, - "total_shards": len(unique_shards), - "is_sharded": True - } - format_name = "SafeTensors" + tensors, total_size = _fetch_remote_safetensors_sharded(real_repo_id, config, fetch_tensors, timeout) + return tensors, config, "SafeTensors", total_size elif "model.safetensors" in filenames: - # Single SafeTensors - - # Determine total size first - req = urllib.request.Request(f"{_get_hf_endpoint()}/{repo_id}/resolve/main/model.safetensors", method="HEAD") - token = _get_hf_token() - if token: - req.add_header("Authorization", f"Bearer {token}") - try: - with urllib.request.urlopen(req, timeout=timeout) as response: - total_size = int(response.headers.get("Content-Length", 0)) - except Exception: - pass + header, total_size = _fetch_remote_safetensors_single(real_repo_id, timeout) + return header, config, "SafeTensors", total_size + + elif gguf_files: + if len(gguf_files) == 1: + single_file = gguf_files[0]["filename"] + tensors, size = _fetch_remote_gguf_single(real_repo_id, single_file, gguf_files[0]["size"], timeout) + return tensors, config, "GGUF", float(size) + else: + tensors = _fetch_remote_gguf_group(real_repo_id, gguf_files, timeout) + return tensors, config, "GGUF_group", 0.0 - header = _fetch_safetensors_header(repo_id, "model.safetensors", timeout=timeout) - tensors = header - - format_name = "SafeTensors" - else: - raise ValueError(f"Repository {repo_id} does not contain SafeTensors weights.") - - return tensors, config, format_name, float(total_size) + raise ValueError(f"Repository {real_repo_id} does not contain SafeTensors or GGUF weights.") + diff --git a/src/modelinfo/ui.py b/src/modelinfo/ui.py index f69ce28..11431e1 100644 --- a/src/modelinfo/ui.py +++ b/src/modelinfo/ui.py @@ -56,6 +56,63 @@ def print_model_info( gpu_vram_gb: float = 0.0, gpu_util: float = 0.9 ) -> None: + if format_name == "GGUF_group": + metadata = tensors.get("__metadata__", {}) + variants = metadata.get("gguf_variants", []) + repo_id = metadata.get("repo_id", "") + + console.print(f"[bold]Repository:[/bold] {repo_id}") + console.print("[bold]Format:[/bold] GGUF (Multiple Quantizations)") + console.print(f"[bold]Architecture:[/bold] {arch_name}") + if max_context: + console.print(f"[bold]Context Limit:[/bold] {max_context:,} tokens") + console.print() + + table = Table(box=None, show_header=True, header_style="bold", pad_edge=False, padding=(0, 2)) + table.add_column("Quantization File") + table.add_column("File Size", justify="right") + table.add_column("KV Cache", justify="right") + table.add_column("Total VRAM", justify="right") + + show_fits = gpu_name is not None + if show_fits: + table.add_column("Fits", justify="left") + + kv_cache_bytes = footprint["kv_cache_bytes"] + penalty_percentage = footprint.get("penalty_percentage", 0.0) + cuda_overhead = 600 * 1024 * 1024 * gpu_count + + sorted_variants = sorted(variants, key=lambda x: x["size"], reverse=True) + for var in sorted_variants: + filename = var["filename"] + size_bytes = var["size"] + variant_overhead = cuda_overhead + (size_bytes * penalty_percentage) + total_vram_bytes = size_bytes + kv_cache_bytes + variant_overhead + + file_size_str = format_bytes(size_bytes) + kv_cache_str = format_bytes(kv_cache_bytes) + + vram_color = get_vram_color(total_vram_bytes, max_vram_gb) + total_vram_str = f"[{vram_color}]~{format_bytes(total_vram_bytes)}[/{vram_color}]" + + row_data = [filename, file_size_str, kv_cache_str, total_vram_str] + if show_fits: + utilization = total_vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0 + if utilization <= gpu_util: + fit_text = "[green]✓ Yes[/green]" + elif utilization <= 0.99: + fit_text = "[yellow]⚠ Warning[/yellow]" + else: + fit_text = "[red]✗ No[/red]" + row_data.append(fit_text) + + table.add_row(*row_data) + + console.print(table) + console.print() + console.print(f"[dim]Tip: To view details for a specific quantization, run: modelinfo {repo_id}/{sorted_variants[0]['filename']}[/dim]") + return + summary = Table(box=None, show_header=False, pad_edge=False, padding=(0, 2)) summary.add_column("Property", style="bold") summary.add_column("Value") @@ -143,7 +200,7 @@ def print_model_info( summary.add_row("VRAM (est):", vram_display) if gpu_name: utilization = vram_bytes / (max_vram_gb * 1024**3) if max_vram_gb > 0 else 2.0 - if utilization <= 0.90: + if utilization <= gpu_util: fit_text = f"[green]✓ Fits comfortably in {gpu_name} ({max_vram_gb:.1f} GB)[/green]" elif utilization <= 0.99: fit_text = f"[yellow]⚠ Warning: Extreme hardware limit on {gpu_name}. High risk of fragmentation OOM.[/yellow]" diff --git a/tests/test_cli.py b/tests/test_cli.py index a5792d3..857a225 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -177,3 +177,131 @@ def fake_calculate_footprint(tensors, *, context_length, batch_size, **kwargs): "fetch_tensors": True, "timeout": 22.5, } + + +def test_analyze_model_gguf_group(monkeypatch): + """Test that analyze_model correctly handles and propagates GGUF groups.""" + from modelinfo.parsers import huggingface + + def fake_exists(path): + return False + + def fake_fetch(repo_id, *, fetch_tensors, timeout): + tensors = { + "__metadata__": { + "general.architecture": "llama", + "llama.block_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.key_length": 128, + "gguf_variants": [ + {"filename": "model-q4.gguf", "size": 1000000000}, + {"filename": "model-q8.gguf", "size": 2000000000} + ], + "repo_id": "org/model-gguf" + } + } + return tensors, None, "GGUF_group", 0.0 + + monkeypatch.setattr(cli.os.path, "exists", fake_exists) + monkeypatch.setattr(huggingface, "fetch_huggingface_repo", fake_fetch) + + def fake_calculate_footprint(*args, **kwargs): + return { + "total_params": 1000000, + "base_memory_bytes": 2000000.0, + "kv_cache_bytes": 1000000.0, + "overhead_bytes": 600000.0, + "total_memory_bytes": 3600000.0, + "num_layers": 32, + "kv_dim": 1024, + "primary_dtype": "Q4_0", + "kv_is_estimate": False, + "penalty_percentage": 0.0, + "vllm_metrics": {} + } + monkeypatch.setattr(cli, "calculate_footprint", fake_calculate_footprint) + + info = cli.analyze_model("org/model-gguf", context_override=128) + + assert info["format_name"] == "GGUF_group" + assert info["tensors"]["__metadata__"]["repo_id"] == "org/model-gguf" + assert len(info["tensors"]["__metadata__"]["gguf_variants"]) == 2 + + +def _get_mock_gguf_group_data(): + tensors = { + "__metadata__": { + "general.architecture": "llama", + "llama.block_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.key_length": 128, + "gguf_variants": [ + {"filename": "model-q4.gguf", "size": 1000000000}, + {"filename": "model-q8.gguf", "size": 2000000000} + ], + "repo_id": "org/model-gguf" + } + } + footprint = { + "total_params": 8000000000, + "base_memory_bytes": 4000000000.0, + "kv_cache_bytes": 1000000000.0, + "overhead_bytes": 600000000.0, + "total_memory_bytes": 5600000000.0, + "num_layers": 32, + "kv_dim": 1024, + "primary_dtype": "Q4_0", + "kv_is_estimate": False, + "penalty_percentage": 0.0, + "vllm_metrics": {} + } + return tensors, footprint + + +def test_print_model_info_gguf_group_no_gpu(capsys): + """Test print_model_info renders comparison table without Fits column when no GPU target.""" + from modelinfo.ui import print_model_info + tensors, footprint = _get_mock_gguf_group_data() + print_model_info( + format_name="GGUF_group", + arch_name="Llama (32 layers)", + tensor_count=0, + footprint=footprint, + disk_size=0.0, + context_length=8192, + is_default_context=True, + tensors=tensors, + max_context=32768, + max_vram_gb=8.0, + gpu_name=None + ) + out, _ = capsys.readouterr() + assert "model-q4.gguf" in out + assert "model-q8.gguf" in out + assert "Fits" not in out + assert "Tip:" in out + + +def test_print_model_info_gguf_group_with_gpu(capsys): + """Test print_model_info renders comparison table with Fits column when GPU target exists.""" + from modelinfo.ui import print_model_info + tensors, footprint = _get_mock_gguf_group_data() + print_model_info( + format_name="GGUF_group", + arch_name="Llama (32 layers)", + tensor_count=0, + footprint=footprint, + disk_size=0.0, + context_length=8192, + is_default_context=True, + tensors=tensors, + max_context=32768, + max_vram_gb=8.0, + gpu_name="RTX4080" + ) + out, _ = capsys.readouterr() + assert "model-q4.gguf" in out + assert "model-q8.gguf" in out + assert "Fits" in out + + diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 10ebc91..c1d0b6e 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -82,3 +82,123 @@ def test_hf_endpoint_rejects_no_hostname(monkeypatch): monkeypatch.setenv("HF_ENDPOINT", "https:///repo") with pytest.raises(ValueError, match="must include a valid hostname"): _get_hf_endpoint() + + +def test_remote_gguf_parsing_single(monkeypatch): + """Test remote GGUF parsing when a single GGUF is found in the repository.""" + import json + from modelinfo.parsers import huggingface + + def fake_make_request(url, headers=None, limit=None, timeout=10.0): + if "/api/models/" in url: + return json.dumps({ + "siblings": [ + {"rfilename": "model-q4.gguf", "size": 1000000000} + ] + }).encode("utf-8") + elif "model-q4.gguf" in url: + import struct + header = b"GGUF" + struct.pack("