open-gitagent · shreyas-lyzr · Jun 21, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 20, 2026
diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@
 
 **A fine-tuning SDK. Any open model — with any method, on any hardware, for any harness.**
 
-Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) · `slm♥`
+Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) & [Shreyas Kapale](mailto:shreyas@lyzr.ai) · `slm♥`
 
 ```bash
 pip install shadowlm             # batteries included — the full training stack

diff --git a/examples/evaluate.py b/examples/evaluate.py
@@ -0,0 +1,33 @@
+"""Score a model on a task — quality, not training loss.
+
+`finetune` tells you the loss went down; `evaluate` tells you whether the model
+actually does the job. Point a loaded model at a dataset, pick a metric, get one
+number plus a per-row breakdown.
+
+    python examples/evaluate.py        # runs from any working directory
+"""
+
+from pathlib import Path
+
+import shadowlm as slm
+
+MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit"
+# Resolve the dataset next to this script, so the demo runs from any CWD.
+DATA = Path(__file__).resolve().parent / "sample_dataset.jsonl"
+
+# A dataset with a prompt column (instruction/question/...) and an answer column.
+ds = slm.Dataset.from_jsonl(DATA)
+model = slm.load(MODEL)
+
+# contains-match: 1.0 when the expected answer appears in the output, ignoring case/spacing
+res = slm.evaluate(model, ds, metric="contains")
+print(res)                       # EvalResult(metric='contains', score=..., n=...)
+print("per-row:", res.sparkline())
+
+# the three lowest-scoring rows, worst first ----------------------------------
+for ex in res.worst(3):
+    print(f"  {ex['score']:.1f}  {ex['input'][:50]!r} → {ex['output'][:50]!r}")
+
+# LLM-as-judge: judge= implies metric="judge" (here the model judges itself; use a stronger judge)
+judged = slm.evaluate(model, ds, judge=model)
+print("judge score:", round(judged.score, 3))
diff --git a/shadowlm/__init__.py b/shadowlm/__init__.py
@@ -20,6 +20,7 @@
 from .capture import CaptureProxy, capture
 from .checkpoints import Checkpoint
 from .data import Dataset
+from .eval import EvalResult, evaluate
 from .models import Model, Reply, load
 from .rl import Trajectory, TrajectoryGroup, judge_group
 from .training import Metric, TrainConfig, TrainingRun
@@ -29,6 +30,8 @@
 __all__ = [
     "APORun",
     "optimize_prompt",
+    "evaluate",
+    "EvalResult",
     "CaptureProxy",
     "capture",
     "Checkpoint",

diff --git a/shadowlm/apo.py b/shadowlm/apo.py
@@ -67,10 +67,14 @@ def _cols(row: dict) -> tuple[str, str | None]:
     return p, a
 
 
+def _norm(text: str) -> str:
+    """Return `text` lowercased with every whitespace run squeezed to a single space."""
+    return " ".join(str(text).split()).lower()
+
+
 def _contains_score(output: str, expected: str) -> float:
     """Default scorer: 1.0 if the expected answer appears in the output."""
-    o = " ".join(str(output).lower().split())
-    e = " ".join(str(expected).lower().split())
+    o, e = _norm(output), _norm(expected)
     return 1.0 if e and e in o else 0.0
 
 
@@ -218,15 +222,47 @@ def _propose(optimizer, current, failures, k, temperature, max_new_tokens) -> li
     return out
 
 
+# Shared single-answer judge: a short rubric plus a tolerant score parser. APO and
+# `evaluate` both score through `_judge_one`, so they agree on what a good answer
+# is. (The RL judge in rl.py is a *group-relative* ranker — it can't score a lone
+# eval row — so eval reuses this scorer, not judge_group.)
+_JUDGE_RUBRIC = (
+    "Reward correctness first, then helpfulness, then concision. "
+    "Penalize factual errors and ignored instructions."
+)
+
+
+def _parse_judge_score(raw: str) -> float:
+    """Tolerantly pull a 0–1 score out of a judge's reply.
+
+    Accepts a ratio ("7/10"), a percentage ("85%"), a 0–1 decimal ("0.7"), or a
+    rating on a 10/100-point scale ("8" → 0.8, "7.5" → 0.75, "85" → 0.85).
+    The result is clamped to [0, 1]; a reply with no number scores 0.0.
+    """
+    import re  # noqa: PLC0415
+
+    s = str(raw)
+    m = re.search(r"(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", s)  # "7/10"
+    if m:
+        num, den = float(m.group(1)), float(m.group(2))
+        return max(0.0, min(1.0, num / den)) if den else 0.0
+    m = re.search(r"(\d+(?:\.\d+)?)\s*%", s)  # an explicit percentage, "85%"
+    if m:
+        return max(0.0, min(1.0, float(m.group(1)) / 100.0))
+    m = re.search(r"\d+\.\d+", s) or re.search(r"\d+", s)  # decimal first, else integer
+    # Above 1 the judge left the 0–1 scale: read up to 10 as x/10, beyond as x/100.
+    v = float(m.group()) if m else 0.0
+    return max(0.0, min(1.0, v if v <= 1 else v / (10.0 if v <= 10 else 100.0)))
+
+
 def _judge_one(judge, question: str, output: str, expected: str) -> float:
     prompt = (
         "Score how well the ANSWER responds to the INPUT from 0.0 to 1.0.\n"
+        f"{_JUDGE_RUBRIC}\n"
         f"INPUT: {question}\nANSWER: {output}\n"
         + (f"REFERENCE: {expected}\n" if expected else "")
         + 'Reply with ONLY a number like 0.7.'
     )
     raw = str(judge.chat([{"role": "user", "content": prompt}],
                          temperature=0.0, max_new_tokens=8))
-    import re  # noqa: PLC0415
-    m = re.search(r"[01](?:\.\d+)?", raw)
-    return max(0.0, min(1.0, float(m.group()))) if m else 0.0
+    return _parse_judge_score(raw)
diff --git a/shadowlm/cli.py b/shadowlm/cli.py
@@ -378,6 +378,63 @@ def export(
     console.print(f"exported [slm]{format}[/slm] → [slm]{out}[/slm]")
 
 
+@app.command(name="eval", rich_help_panel="Models")
+def evaluate_cmd(
+    target: Annotated[str, typer.Argument(help="model name, or an adapter directory")],
+    dataset: Annotated[str, typer.Argument(
+        help="dataset (.jsonl/.json/.csv/.parquet) with a prompt + answer column")],
+    metric: Annotated[str, typer.Option(help="contains | exact | judge")] = "contains",
+    judge: Annotated[Optional[str], typer.Option(
+        help="judge model (HF id) — implies --metric judge")] = None,
+    system: Annotated[Optional[str], typer.Option(help="system prompt for every query")] = None,
+    sample: Annotated[Optional[int], typer.Option(help="evaluate only the first N rows")] = None,
+    show: Annotated[int, typer.Option(help="how many worst examples to show")] = 5,
+    model: Annotated[Optional[str], typer.Option("--model", "-m",
+        help="base model override for adapter dirs")] = None,
+    backend: Annotated[str, typer.Option(help="auto | mlx | torch")] = "auto",
+    load_in_4bit: Annotated[bool, typer.Option("--load-in-4bit")] = False,
+    max_new_tokens: Annotated[int, typer.Option("--max-new-tokens")] = 256,
+    hf_token: Annotated[Optional[str], typer.Option("--hf-token", envvar="HF_TOKEN")] = None,
+):
+    """Score a model on a dataset — task quality, not training loss."""
+    from .data import Dataset  # noqa: PLC0415
+    from .eval import evaluate as _evaluate  # noqa: PLC0415
+    from .models import load  # noqa: PLC0415
+
+    if metric not in ("contains", "exact", "judge"):
+        raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'")
+    # Fail fast: check the flags and read the dataset *before* loading any model,
+    # so a mistyped option or dataset path never costs a full model download.
+    if metric == "judge" and not judge:
+        raise typer.BadParameter("--metric judge needs a judge model: --judge <hf-id>")
+    _maybe_set_token(hf_token)
+    data = Dataset.load(dataset)
+    m = _resolve_target(target, model, backend, load_in_4bit)
+    judge_model = load(judge, backend=backend) if judge else None
+
+    result = _evaluate(m, data, metric=metric, judge=judge_model, system=system,
+                       sample=sample, max_new_tokens=max_new_tokens, verbose=False)
+
+    console.print(
+        f"[slm]{result.metric}[/slm]  score [ok]{result.score:.3f}[/ok]  "
+        f"over {result.n} rows  {result.sparkline()}")
+    worst = result.worst(max(show, 0))  # a negative --show must not slice from the end
+    if worst and result.score < 1.0:
+        table = Table(title="lowest-scoring examples", title_style="slm",
+                      header_style="slm", border_style="muted")
+        for col in ("score", "input", "expected", "output"):
+            table.add_column(col, no_wrap=(col == "score"))
+        for ex in worst:
+            table.add_row(f"{ex['score']:.2f}", _trunc(ex["input"]),
+                          _trunc(ex["expected"]), _trunc(ex["output"]))
+        console.print(table)
+
+
+def _trunc(text: str, n: int = 60) -> str:
+    flat = " ".join(str(text).split())  # collapse whitespace so the text stays on one line
+    return flat[: n - 1] + "…" if len(flat) > n else flat  # the ellipsis counts toward n
+
+
 # ---- runs / history ---------------------------------------------------------
 @app.command(rich_help_panel="Runs")
 def runs(

diff --git a/shadowlm/eval.py b/shadowlm/eval.py
@@ -0,0 +1,183 @@
+"""Evaluation — score a model on a task, not just its training loss.
+
+`finetune` reports next-token *loss*; this reports task *quality*. Point a loaded
+model at a dataset, pick a metric, and get one number plus a per-row breakdown:
+
+    res = slm.evaluate(model, "qa.jsonl")                 # contains-match
+    res = slm.evaluate(model, ds, metric="exact")          # exact-match
+    res = slm.evaluate(model, ds, judge=judge)             # LLM-as-judge
+    res = slm.evaluate(model, ds, metric=my_score_fn)      # custom scorer
+    print(res.score, res.sparkline())
+
+This is the front half of an "eval gate" — the same capture/judge primitives,
+turned toward *measuring* a model instead of training one. Pure ShadowLM: it
+only needs a loaded model's `.chat()`. The built-in scorers are reused from APO
+(`apo._contains_score`, `apo._judge_one`) so eval and prompt-optimization agree
+on what a good answer is.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from .data import Dataset
+
+
+def _exact_score(output: str, expected: str) -> float:
+    """Exact match after case/whitespace folding: 1.0 only for a non-empty expected answer."""
+    from .apo import _norm  # noqa: PLC0415
+
+    want = _norm(expected)
+    return float(bool(want) and _norm(output) == want)
+
+
+@dataclass
+class EvalResult:
+    """The outcome of an `evaluate` run: an aggregate score plus per-row detail."""
+
+    metric: str
+    score: float  # mean of `scores`
+    scores: list[float] = field(default_factory=list)
+    examples: list[dict] = field(default_factory=list)  # [{input, output, expected, score}]
+    n: int = 0
+
+    def sparkline(self) -> str:
+        """A tiny unicode bar of per-row scores on a fixed 0–1 scale (▁ = 0, █ = 1)."""
+        # Scale against the absolute [0, 1] range rather than the run's own
+        # min/max, so an all-correct run renders as all-█ (not all-▁) and bar
+        # heights stay comparable from one run to the next.
+        bars = "▁▂▃▄▅▆▇█"
+        top = len(bars) - 1
+        return "".join(bars[int(max(0.0, min(1.0, s)) * top)] for s in self.scores)
+
+    def worst(self, k: int = 5) -> list[dict]:
+        """The k lowest-scoring examples, worst first (an empty list when k <= 0)."""
+        return sorted(self.examples, key=lambda e: e["score"])[:max(k, 0)]
+
+    def to_dict(self) -> dict:
+        """A plain-dict view of the result; the lists are shared with this object, not copied."""
+        return {k: getattr(self, k) for k in ("metric", "score", "n", "scores", "examples")}
+
+    def __repr__(self) -> str:
+        return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})"
+
+
+def _row_io(row: dict, fmt: str) -> tuple[list[dict], str]:
+    """Pull (history, expected_answer) out of a row, by dataset format.
+
+    `history` is the conversation to feed the model — a full multi-turn prefix
+    for chat rows, or a single user turn for QA/preference rows. `expected` is
+    "" when the row has no reference answer or stores it as null (a null must
+    not become the literal "None"). Raises ValueError if no prompt column exists.
+    """
+    from .data import CHAT, PREFERENCE  # noqa: PLC0415
+
+    def text(v) -> str:  # null-safe str(): None → "", while 0 / False keep their text
+        return "" if v is None else str(v)
+
+    if fmt == CHAT or "messages" in row:
+        msgs = [{"role": m.get("role", "user"), "content": m.get("content") or ""}
+                for m in row.get("messages") or []]  # tolerate a missing or null list
+        # Everything up to the final assistant turn is context; that turn is the
+        # reference — so a multi-turn row is answered in its full conversation,
+        # not scored as "answer the opening question".
+        last_asst = next((i for i in range(len(msgs) - 1, -1, -1)
+                          if msgs[i]["role"] == "assistant"), None)
+        if last_asst is None:
+            return (msgs or [{"role": "user", "content": ""}]), ""
+        history = msgs[:last_asst] or [{"role": "user", "content": ""}]
+        return history, msgs[last_asst]["content"]
+    if fmt == PREFERENCE or ("chosen" in row and "prompt" in row):
+        return [{"role": "user", "content": text(row.get("prompt"))}], \
+            text(row.get("chosen"))
+    # instruction / QA / raw dict — auto-detect the prompt & answer columns
+    from .apo import _PROMPT_KEYS, _cols  # noqa: PLC0415
+
+    pcol, acol = _cols(row)
+    if not pcol:
+        raise ValueError(
+            f"no prompt column found in row (looked for {_PROMPT_KEYS}); "
+            "pass chat-format rows or a dataset with a prompt/question column")
+    prompt = text(row[pcol])
+    # alpaca-style extra context column, when distinct from the prompt
+    if pcol != "input" and row.get("input"):
+        prompt = f"{prompt}\n\n{row['input']}"
+    return [{"role": "user", "content": prompt}], (text(row.get(acol)) if acol else "")
+
+
+def _resolve_scorer(metric, judge):
+    """Resolve `metric` to `(scorer, name)`, scorer being `(output, expected, prompt) -> float`."""
+    if callable(metric):  # custom scorer: used as-is, labelled by its function name
+        return metric, getattr(metric, "__name__", "custom")
+    from .apo import _contains_score, _judge_one  # noqa: PLC0415
+
+    if metric == "judge":
+        if judge is None:
+            raise ValueError("metric='judge' needs a judge model: evaluate(..., judge=model)")
+        return (lambda answer, ref, prompt: _judge_one(judge, prompt, answer, ref)), "judge"
+    if metric == "exact":
+        return (lambda answer, ref, prompt: _exact_score(answer, ref)), "exact"
+    if metric == "contains":
+        return (lambda answer, ref, prompt: _contains_score(answer, ref)), "contains"
+    raise ValueError(
+        f"unknown metric {metric!r} (expected 'contains', 'exact', 'judge', or a callable)")
+
+
+def evaluate(
+    model,
+    data: Dataset | list[dict] | str,
+    *,
+    metric="contains",
+    judge=None,
+    system: str | None = None,
+    sample: int | None = None,
+    max_new_tokens: int = 256,
+    temperature: float = 0.0,
+    verbose: bool = True,
+) -> EvalResult:
+    """Score `model` on `data`, returning an `EvalResult`.
+
+    model: a loaded shadowlm Model (answers each row via `.chat`).
+    data: a Dataset, rows, or a str / os.PathLike path to a dataset file (jsonl/json/csv/parquet).
+    metric: "contains" (default — expected answer appears in the output), "exact"
+        (normalized equality), "judge" (LLM-as-judge, needs `judge=`), or a custom
+        callable `(output, expected, prompt) -> float in [0, 1]`.
+    judge: a Model that scores answers 0–1. Passing it defaults `metric` to "judge".
+    system: optional system prompt prepended to every query.
+    sample: evaluate only the first N rows; N <= 0 leaves none, which raises ValueError.
+    max_new_tokens, temperature: generation settings; temperature 0.0 (default) is deterministic.
+    verbose: print a one-line summary of the result.
+    """
+    import os  # noqa: PLC0415
+    if isinstance(data, (str, os.PathLike)):  # a path — plain str or pathlib.Path alike
+        data = Dataset.load(os.fspath(data))
+    fmt = data.format if isinstance(data, Dataset) else None
+    rows = list(data.rows if isinstance(data, Dataset) else data)
+    if sample is not None:
+        rows = rows[:max(sample, 0)]  # clamp: a negative slice would drop rows from the end
+    if not rows:
+        raise ValueError("evaluate needs at least one row")
+    if judge is not None and metric == "contains":
+        metric = "judge"  # passing a judge implies judge scoring
+    if fmt is None:
+        from .data import _detect_format  # noqa: PLC0415
+
+        fmt = _detect_format(rows)
+    scorer, metric_name = _resolve_scorer(metric, judge)
+
+    examples: list[dict] = []
+    for r in rows:
+        history, expected = _row_io(r, fmt)
+        msgs = ([{"role": "system", "content": system}] if system else []) + history
+        out = str(model.chat(msgs, temperature=temperature, max_new_tokens=max_new_tokens))
+        # the last user turn is the "question" passed to a judge / shown in output
+        question = next((m["content"] for m in reversed(history) if m["role"] == "user"), "")
+        s = max(0.0, min(1.0, float(scorer(out, expected, question))))
+        examples.append({"input": question, "output": out, "expected": expected, "score": s})
+
+    scores = [ex["score"] for ex in examples]
+    score = sum(scores) / len(scores)
+    if verbose:
+        print(f"[eval] {metric_name} · {score:.3f} over {len(scores)} rows", flush=True)
+    return EvalResult(metric=metric_name, score=score, scores=scores,
+                      examples=examples, n=len(scores))