From ca2dcb4b6ea1038e869af51872fcc49a0431d36d Mon Sep 17 00:00:00 2001
From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com>
Date: Thu, 18 Jun 2026 13:36:13 -0400
Subject: [PATCH 1/4] =?UTF-8?q?Add=20slm.evaluate()=20=E2=80=94=20a=20stan?=
 =?UTF-8?q?dalone=20eval=20harness=20(SDK=20+=20CLI)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ShadowLM had no way to score a model on a task — only train-time eval_loss
(next-token loss), not task quality. This adds the smallest meaningful slice
of an "eval gate": evaluate a loaded model over a dataset and get an aggregate
score plus a per-row breakdown.

- shadowlm/eval.py: evaluate(model, data, metric=...) + EvalResult. Metrics:
  contains (default), exact, judge (LLM-as-judge), or a custom
  (output, expected, prompt) -> float callable. Reuses APO's existing scorers
  (_contains_score, _judge_one) and column detection; handles chat /
  instruction / preference rows and dataset-path inputs.
- shadowlm/__init__.py: export evaluate, EvalResult.
- shadowlm/cli.py: `shadowlm eval <model> <dataset>` command.
- examples/evaluate.py: end-to-end demo.
---
 examples/evaluate.py |  29 ++++++++
 shadowlm/__init__.py |   3 +
 shadowlm/cli.py      |  53 ++++++++++++++
 shadowlm/eval.py     | 171 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 256 insertions(+)
 create mode 100644 examples/evaluate.py
 create mode 100644 shadowlm/eval.py
diff --git a/examples/evaluate.py b/examples/evaluate.py
new file mode 100644
index 0000000..bde5c83
--- /dev/null
+++ b/examples/evaluate.py
@@ -0,0 +1,29 @@
+"""Score a model on a task — quality, not training loss.
+
+`finetune` tells you the loss went down; `evaluate` tells you whether the model
+actually does the job. Point a loaded model at a dataset, pick a metric, get one
+number plus a per-row breakdown.
+
+    python examples/evaluate.py
+"""
+
+import shadowlm as slm
+
+MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit"
+
+# A dataset with a prompt column (instruction/question/...) and an answer column.
+ds = slm.Dataset.from_jsonl("examples/sample_dataset.jsonl")
+model = slm.load(MODEL)
+
+# contains-match: 1.0 when the expected answer appears in the output ----------
+res = slm.evaluate(model, ds, metric="contains")
+print(res)                       # EvalResult(metric='contains', score=..., n=...)
+print("per-row:", res.sparkline())
+
+# the rows it did worst on ----------------------------------------------------
+for ex in res.worst(3):
+    print(f"  {ex['score']:.1f}  {ex['input'][:50]!r} → {ex['output'][:50]!r}")
+
+# LLM-as-judge scoring (here the model judges itself; use a stronger judge for real)
+judged = slm.evaluate(model, ds, judge=model)
+print("judge score:", round(judged.score, 3))
diff --git a/shadowlm/__init__.py b/shadowlm/__init__.py
index bf452fa..96890d8 100644
--- a/shadowlm/__init__.py
+++ b/shadowlm/__init__.py
@@ -20,6 +20,7 @@
 from .capture import CaptureProxy, capture
 from .checkpoints import Checkpoint
 from .data import Dataset
+from .eval import EvalResult, evaluate
 from .models import Model, Reply, load
 from .rl import Trajectory, TrajectoryGroup, judge_group
 from .training import Metric, TrainConfig, TrainingRun
@@ -29,6 +30,8 @@
 __all__ = [
     "APORun",
     "optimize_prompt",
+    "evaluate",
+    "EvalResult",
     "CaptureProxy",
     "capture",
     "Checkpoint",
diff --git a/shadowlm/cli.py b/shadowlm/cli.py
index 916ea00..93bd50a 100644
--- a/shadowlm/cli.py
+++ b/shadowlm/cli.py
@@ -378,6 +378,59 @@ def export(
     console.print(f"exported [slm]{format}[/slm] → [slm]{out}[/slm]")
 
 
+@app.command(name="eval", rich_help_panel="Models")
+def evaluate_cmd(
+    target: Annotated[str, typer.Argument(help="model name, or an adapter directory")],
+    dataset: Annotated[str, typer.Argument(
+        help="dataset (.jsonl/.json/.csv/.parquet) with a prompt + answer column")],
+    metric: Annotated[str, typer.Option(help="contains | exact | judge")] = "contains",
+    judge: Annotated[Optional[str], typer.Option(
+        help="judge model (HF id) — implies --metric judge")] = None,
+    system: Annotated[Optional[str], typer.Option(help="system prompt for every query")] = None,
+    sample: Annotated[Optional[int], typer.Option(help="evaluate only the first N rows")] = None,
+    show: Annotated[int, typer.Option(help="how many worst examples to show")] = 5,
+    model: Annotated[Optional[str], typer.Option("--model", "-m",
+        help="base model override for adapter dirs")] = None,
+    backend: Annotated[str, typer.Option(help="auto | mlx | torch")] = "auto",
+    load_in_4bit: Annotated[bool, typer.Option("--load-in-4bit")] = False,
+    max_new_tokens: Annotated[int, typer.Option("--max-new-tokens")] = 256,
+    hf_token: Annotated[Optional[str], typer.Option("--hf-token", envvar="HF_TOKEN")] = None,
+):
+    """Score a model on a dataset — task quality, not training loss."""
+    from .data import Dataset  # noqa: PLC0415
+    from .eval import evaluate as _evaluate  # noqa: PLC0415
+    from .models import load  # noqa: PLC0415
+
+    if metric not in ("contains", "exact", "judge"):
+        raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'")
+    _maybe_set_token(hf_token)
+    m = _resolve_target(target, model, backend, load_in_4bit)
+    judge_model = load(judge, backend=backend) if judge else None
+    data = Dataset.load(dataset)
+
+    result = _evaluate(m, data, metric=metric, judge=judge_model, system=system,
+                       sample=sample, max_new_tokens=max_new_tokens, verbose=False)
+
+    console.print(
+        f"[slm]{result.metric}[/slm]  score [ok]{result.score:.3f}[/ok]  "
+        f"over {result.n} rows  {result.sparkline()}")
+    worst = result.worst(show)
+    if worst and result.score < 1.0:
+        table = Table(title="lowest-scoring examples", title_style="slm",
+                      header_style="slm", border_style="muted")
+        for col in ("score", "input", "expected", "output"):
+            table.add_column(col, no_wrap=(col == "score"))
+        for ex in worst:
+            table.add_row(f"{ex['score']:.2f}", _trunc(ex["input"]),
+                          _trunc(ex["expected"]), _trunc(ex["output"]))
+        console.print(table)
+
+
+def _trunc(text: str, n: int = 60) -> str:
+    text = " ".join(str(text).split())
+    return text if len(text) <= n else text[: n - 1] + "…"
+
+
 # ---- runs / history ---------------------------------------------------------
 @app.command(rich_help_panel="Runs")
 def runs(
diff --git a/shadowlm/eval.py b/shadowlm/eval.py
new file mode 100644
index 0000000..011ed5e
--- /dev/null
+++ b/shadowlm/eval.py
@@ -0,0 +1,171 @@
+"""Evaluation — score a model on a task, not just its training loss.
+
+`finetune` reports next-token *loss*; this reports task *quality*. Point a loaded
+model at a dataset, pick a metric, and get one number plus a per-row breakdown:
+
+    res = slm.evaluate(model, "qa.jsonl")                 # contains-match
+    res = slm.evaluate(model, ds, metric="exact")          # exact-match
+    res = slm.evaluate(model, ds, judge=judge)             # LLM-as-judge
+    res = slm.evaluate(model, ds, metric=my_score_fn)      # custom scorer
+    print(res.score, res.sparkline())
+
+This is the front half of an "eval gate" — the same capture/judge primitives,
+turned toward *measuring* a model instead of training one. Pure ShadowLM: it
+only needs a loaded model's `.chat()`. The built-in scorers are reused from APO
+(`apo._contains_score`, `apo._judge_one`) so eval and prompt-optimization agree
+on what a good answer is.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+from .data import Dataset
+
+
+def _exact_score(output: str, expected: str) -> float:
+    """1.0 when the output equals the expected answer (case/space-insensitive)."""
+    o = " ".join(str(output).lower().split())
+    e = " ".join(str(expected).lower().split())
+    return 1.0 if e and o == e else 0.0
+
+
+@dataclass
+class EvalResult:
+    """The outcome of an `evaluate` run: an aggregate score plus per-row detail."""
+
+    metric: str
+    score: float  # mean of `scores`
+    scores: list[float] = field(default_factory=list)
+    examples: list[dict] = field(default_factory=list)  # [{input, output, expected, score}]
+    n: int = 0
+
+    def sparkline(self) -> str:
+        """A tiny unicode bar of per-row scores — handy in a REPL or log line."""
+        if not self.scores:
+            return ""
+        bars = "▁▂▃▄▅▆▇█"
+        lo, hi = min(self.scores), max(self.scores)
+        rng = (hi - lo) or 1.0
+        return "".join(bars[min(7, int((s - lo) / rng * 7))] for s in self.scores)
+
+    def worst(self, k: int = 5) -> list[dict]:
+        """The k lowest-scoring examples (for eyeballing where the model fails)."""
+        return sorted(self.examples, key=lambda e: e["score"])[:k]
+
+    def to_dict(self) -> dict:
+        return {"metric": self.metric, "score": self.score, "n": self.n,
+                "scores": self.scores, "examples": self.examples}
+
+    def __repr__(self) -> str:
+        return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})"
+
+
+def _row_io(row: dict, fmt: str) -> tuple[str, str]:
+    """Pull (input_prompt, expected_answer) out of a row, by dataset format.
+
+    expected may be "" when the dataset carries no reference answer (e.g. judge
+    scoring on prompts alone).
+    """
+    from .data import CHAT, PREFERENCE  # noqa: PLC0415
+
+    if fmt == CHAT or "messages" in row:
+        msgs = row.get("messages", [])
+        prompt = next((m.get("content") or "" for m in msgs
+                       if m.get("role") == "user"), "")
+        expected = next((m.get("content") or "" for m in reversed(msgs)
+                         if m.get("role") == "assistant"), "")
+        return str(prompt), str(expected)
+    if fmt == PREFERENCE or ("chosen" in row and "prompt" in row):
+        return str(row.get("prompt", "")), str(row.get("chosen", ""))
+    # instruction / QA / raw dict — auto-detect the prompt & answer columns
+    from .apo import _cols  # noqa: PLC0415
+
+    pcol, acol = _cols(row)
+    if not pcol:
+        from .apo import _PROMPT_KEYS  # noqa: PLC0415
+
+        raise ValueError(
+            f"no prompt column found in row (looked for {_PROMPT_KEYS}); "
+            "pass chat-format rows or a dataset with a prompt/question column")
+    prompt = str(row[pcol])
+    # alpaca-style extra context column, when distinct from the prompt
+    if pcol != "input" and row.get("input"):
+        prompt = f"{prompt}\n\n{row['input']}"
+    return prompt, str(row.get(acol, "")) if acol else ""
+
+
+def _resolve_scorer(metric, judge):
+    """Map the metric arg to a scorer `(output, expected, prompt) -> float`."""
+    if callable(metric):
+        return metric, getattr(metric, "__name__", "custom")
+    from .apo import _contains_score, _judge_one  # noqa: PLC0415
+
+    if metric == "contains":
+        return (lambda out, exp, q: _contains_score(out, exp)), "contains"
+    if metric == "exact":
+        return (lambda out, exp, q: _exact_score(out, exp)), "exact"
+    if metric == "judge":
+        if judge is None:
+            raise ValueError("metric='judge' needs a judge model: evaluate(..., judge=model)")
+        return (lambda out, exp, q: _judge_one(judge, q, out, exp)), "judge"
+    raise ValueError(
+        f"unknown metric {metric!r} (expected 'contains', 'exact', 'judge', or a callable)")
+
+
+def evaluate(
+    model,
+    data: Dataset | list[dict] | str,
+    *,
+    metric="contains",
+    judge=None,
+    system: str | None = None,
+    sample: int | None = None,
+    max_new_tokens: int = 256,
+    temperature: float = 0.0,
+    verbose: bool = True,
+) -> EvalResult:
+    """Score `model` on `data`, returning an `EvalResult`.
+
+    model: a loaded shadowlm Model (answers each row via `.chat`).
+    data: a Dataset, rows, or a path to a dataset file (jsonl/json/csv/parquet).
+    metric: "contains" (default — expected answer appears in the output), "exact"
+        (normalized equality), "judge" (LLM-as-judge, needs `judge=`), or a custom
+        callable `(output, expected, prompt) -> float in [0, 1]`.
+    judge: a Model that scores answers 0–1. Passing it defaults `metric` to "judge".
+    system: optional system prompt prepended to every query.
+    sample: evaluate only the first N rows.
+    temperature: generation temperature — 0.0 (default) for deterministic scoring.
+    """
+    if isinstance(data, str):
+        data = Dataset.load(data)
+    fmt = data.format if isinstance(data, Dataset) else None
+    rows = list(data.rows if isinstance(data, Dataset) else data)
+    if sample:
+        rows = rows[:sample]
+    if not rows:
+        raise ValueError("evaluate needs at least one row")
+    if judge is not None and metric == "contains":
+        metric = "judge"  # passing a judge implies judge scoring
+    if fmt is None:
+        from .data import _detect_format  # noqa: PLC0415
+
+        fmt = _detect_format(rows)
+    scorer, metric_name = _resolve_scorer(metric, judge)
+
+    scores: list[float] = []
+    examples: list[dict] = []
+    for r in rows:
+        prompt, expected = _row_io(r, fmt)
+        msgs = ([{"role": "system", "content": system}] if system else []) + \
+               [{"role": "user", "content": prompt}]
+        out = str(model.chat(msgs, temperature=temperature, max_new_tokens=max_new_tokens))
+        s = max(0.0, min(1.0, float(scorer(out, expected, prompt))))
+        scores.append(s)
+        examples.append({"input": prompt, "output": out, "expected": expected, "score": s})
+
+    score = sum(scores) / len(scores)
+    if verbose:
+        print(f"[eval] {metric_name} · {score:.3f} over {len(scores)} rows", flush=True)
+    return EvalResult(metric=metric_name, score=score, scores=scores,
+                      examples=examples, n=len(scores))

From 1b5544cbb7ea465a9109fdfe0eb534fec4c28519 Mon Sep 17 00:00:00 2001
From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com>
Date: Thu, 18 Jun 2026 15:45:55 -0400
Subject: [PATCH 2/4] README: add Shreyas Kapale as second maintainer

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3dcf522..734bb57 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 
 **A fine-tuning SDK. Any open model — with any method, on any hardware, for any harness.**
 
-Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) · `slm♥`
+Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) & [Shreyas Kapale](mailto:shreyas@lyzr.ai) · `slm♥`
 
 ```bash
 pip install shadowlm             # batteries included — the full training stack

From 5ec20289686512a8dec81f4ddc6a9e5b57197bf4 Mon Sep 17 00:00:00 2001
From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com>
Date: Sat, 20 Jun 2026 09:20:07 -0400
Subject: [PATCH 3/4] Address review: judge scorer, multi-turn rows, CLI guard,
 sample=0, tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- apo._judge_one: add a rubric + tolerant number parse (_parse_judge_score
  handles "0.7", "7/10", and bare integers above 1 as x/10, e.g. "8"→0.8);
  evaluate routes through this shared scorer, so APO and eval agree on what a
  good answer is. Eval deliberately does not go through judge_group: that
  judge is group-relative (RL-only) and can't score a lone row.
- eval._row_io: feed the full conversation prefix for multi-turn chat rows and
  compare to the final assistant turn (was: first-user vs last-assistant).
- cli eval: reject --metric judge with no --judge up front (typer.BadParameter)
  instead of a raw ValueError after the model has loaded.
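- eval: slice on `if sample is not None`, so --sample 0 no longer means
  "evaluate everything" — it selects zero rows and raises the existing
  "evaluate needs at least one row" ValueError.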
- Pull shared whitespace/lowercase normalization into apo._norm (used by both
  _contains_score and _exact_score).
- tests/test_eval.py: no-GPU stub coverage for every metric, both error paths
  (metric="judge" without a judge model, a row with no prompt column),
  multi-turn context, preference rows, sample=0, and the judge parser.
---
 shadowlm/apo.py    |  46 ++++++++++++++--
 shadowlm/cli.py    |   4 ++
 shadowlm/eval.py   |  52 +++++++++++-------
 tests/test_eval.py | 129 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 206 insertions(+), 25 deletions(-)
 create mode 100644 tests/test_eval.py

diff --git a/shadowlm/apo.py b/shadowlm/apo.py
index ad9ef86..deaf5bc 100644
--- a/shadowlm/apo.py
+++ b/shadowlm/apo.py
@@ -67,10 +67,14 @@ def _cols(row: dict) -> tuple[str, str | None]:
     return p, a
 
 
+def _norm(text: str) -> str:
+    """Whitespace-collapsed, lowercased text for tolerant string matching."""
+    return " ".join(str(text).lower().split())
+
+
 def _contains_score(output: str, expected: str) -> float:
     """Default scorer: 1.0 if the expected answer appears in the output."""
-    o = " ".join(str(output).lower().split())
-    e = " ".join(str(expected).lower().split())
+    o, e = _norm(output), _norm(expected)
     return 1.0 if e and e in o else 0.0
 
 
@@ -218,15 +222,47 @@ def _propose(optimizer, current, failures, k, temperature, max_new_tokens) -> li
     return out
 
 
+# The shared single-answer judge: a short rubric + a tolerant number parse, used
+# by both APO and `evaluate` so they agree on what a good answer is. (The RL judge
+# in rl.py is a *group-relative* ranker — it can't score a lone eval row — so eval
+# reuses this scorer, not judge_group.)
+_JUDGE_RUBRIC = (
+    "Reward correctness first, then helpfulness, then concision. "
+    "Penalize factual errors and ignored instructions."
+)
+
+
+def _parse_judge_score(raw: str) -> float:
+    """Tolerantly pull a 0–1 score out of a judge's reply.
+
+    Small judges phrase scores many ways — a bare decimal ("0.7"), a ratio
+    ("7/10"), or an integer rating ("8" → 0.8). Handle all three, then clamp.
+    """
+    import re  # noqa: PLC0415
+
+    s = str(raw)
+    m = re.search(r"(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", s)  # "7/10"
+    if m:
+        num, den = float(m.group(1)), float(m.group(2))
+        return max(0.0, min(1.0, num / den)) if den else 0.0
+    m = re.search(r"\d+\.\d+", s)  # a decimal like "0.7"
+    if m:
+        return max(0.0, min(1.0, float(m.group())))
+    m = re.search(r"\d+", s)  # a bare integer — assume an x/10 rating above 1
+    if m:
+        v = float(m.group())
+        return max(0.0, min(1.0, v if v <= 1 else v / 10.0))
+    return 0.0
+
+
 def _judge_one(judge, question: str, output: str, expected: str) -> float:
     prompt = (
         "Score how well the ANSWER responds to the INPUT from 0.0 to 1.0.\n"
+        f"{_JUDGE_RUBRIC}\n"
         f"INPUT: {question}\nANSWER: {output}\n"
         + (f"REFERENCE: {expected}\n" if expected else "")
         + 'Reply with ONLY a number like 0.7.'
     )
     raw = str(judge.chat([{"role": "user", "content": prompt}],
                          temperature=0.0, max_new_tokens=8))
-    import re  # noqa: PLC0415
-    m = re.search(r"[01](?:\.\d+)?", raw)
-    return max(0.0, min(1.0, float(m.group()))) if m else 0.0
+    return _parse_judge_score(raw)
diff --git a/shadowlm/cli.py b/shadowlm/cli.py
index 93bd50a..8ef540b 100644
--- a/shadowlm/cli.py
+++ b/shadowlm/cli.py
@@ -403,6 +403,10 @@ def evaluate_cmd(
 
     if metric not in ("contains", "exact", "judge"):
         raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'")
+    # Validate before loading the model — otherwise the user waits for a full
+    # model download only to hit a scorer error.
+    if metric == "judge" and not judge:
+        raise typer.BadParameter("--metric judge needs a judge model: --judge <hf-id>")
     _maybe_set_token(hf_token)
     m = _resolve_target(target, model, backend, load_in_4bit)
     judge_model = load(judge, backend=backend) if judge else None
diff --git a/shadowlm/eval.py b/shadowlm/eval.py
index 011ed5e..1cae1c3 100644
--- a/shadowlm/eval.py
+++ b/shadowlm/eval.py
@@ -25,8 +25,9 @@
 
 def _exact_score(output: str, expected: str) -> float:
     """1.0 when the output equals the expected answer (case/space-insensitive)."""
-    o = " ".join(str(output).lower().split())
-    e = " ".join(str(expected).lower().split())
+    from .apo import _norm  # noqa: PLC0415
+
+    o, e = _norm(output), _norm(expected)
     return 1.0 if e and o == e else 0.0
 
 
@@ -61,23 +62,31 @@ def __repr__(self) -> str:
         return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})"
 
 
-def _row_io(row: dict, fmt: str) -> tuple[str, str]:
-    """Pull (input_prompt, expected_answer) out of a row, by dataset format.
+def _row_io(row: dict, fmt: str) -> tuple[list[dict], str]:
+    """Pull (history, expected_answer) out of a row, by dataset format.
 
-    expected may be "" when the dataset carries no reference answer (e.g. judge
-    scoring on prompts alone).
+    `history` is the conversation to feed the model — a full multi-turn prefix
+    for chat rows, or a single user turn for QA/preference rows. `expected` may
+    be "" when the dataset carries no reference answer (e.g. judge scoring on
+    prompts alone).
     """
     from .data import CHAT, PREFERENCE  # noqa: PLC0415
 
     if fmt == CHAT or "messages" in row:
-        msgs = row.get("messages", [])
-        prompt = next((m.get("content") or "" for m in msgs
-                       if m.get("role") == "user"), "")
-        expected = next((m.get("content") or "" for m in reversed(msgs)
-                         if m.get("role") == "assistant"), "")
-        return str(prompt), str(expected)
+        msgs = [{"role": m.get("role", "user"), "content": m.get("content") or ""}
+                for m in row.get("messages", [])]
+        # Everything up to the final assistant turn is context; that turn is the
+        # reference — so a multi-turn row is answered in its full conversation,
+        # not scored as "answer the opening question".
+        last_asst = next((i for i in range(len(msgs) - 1, -1, -1)
+                          if msgs[i]["role"] == "assistant"), None)
+        if last_asst is None:
+            return (msgs or [{"role": "user", "content": ""}]), ""
+        history = msgs[:last_asst] or [{"role": "user", "content": ""}]
+        return history, msgs[last_asst]["content"]
     if fmt == PREFERENCE or ("chosen" in row and "prompt" in row):
-        return str(row.get("prompt", "")), str(row.get("chosen", ""))
+        return [{"role": "user", "content": str(row.get("prompt", ""))}], \
+            str(row.get("chosen", ""))
     # instruction / QA / raw dict — auto-detect the prompt & answer columns
     from .apo import _cols  # noqa: PLC0415
 
@@ -92,7 +101,8 @@ def _row_io(row: dict, fmt: str) -> tuple[str, str]:
     # alpaca-style extra context column, when distinct from the prompt
     if pcol != "input" and row.get("input"):
         prompt = f"{prompt}\n\n{row['input']}"
-    return prompt, str(row.get(acol, "")) if acol else ""
+    return [{"role": "user", "content": prompt}], \
+        (str(row.get(acol, "")) if acol else "")
 
 
 def _resolve_scorer(metric, judge):
@@ -141,7 +151,7 @@ def evaluate(
         data = Dataset.load(data)
     fmt = data.format if isinstance(data, Dataset) else None
     rows = list(data.rows if isinstance(data, Dataset) else data)
-    if sample:
+    if sample is not None:
         rows = rows[:sample]
     if not rows:
         raise ValueError("evaluate needs at least one row")
@@ -156,13 +166,15 @@ def evaluate(
     scores: list[float] = []
     examples: list[dict] = []
     for r in rows:
-        prompt, expected = _row_io(r, fmt)
-        msgs = ([{"role": "system", "content": system}] if system else []) + \
-               [{"role": "user", "content": prompt}]
+        history, expected = _row_io(r, fmt)
+        msgs = ([{"role": "system", "content": system}] if system else []) + history
         out = str(model.chat(msgs, temperature=temperature, max_new_tokens=max_new_tokens))
-        s = max(0.0, min(1.0, float(scorer(out, expected, prompt))))
+        # the last user turn is the "question" passed to a judge / shown in output
+        question = next((m["content"] for m in reversed(history)
+                         if m["role"] == "user"), "")
+        s = max(0.0, min(1.0, float(scorer(out, expected, question))))
         scores.append(s)
-        examples.append({"input": prompt, "output": out, "expected": expected, "score": s})
+        examples.append({"input": question, "output": out, "expected": expected, "score": s})
 
     score = sum(scores) / len(scores)
     if verbose:
diff --git a/tests/test_eval.py b/tests/test_eval.py
new file mode 100644
index 0000000..6c96f46
--- /dev/null
+++ b/tests/test_eval.py
@@ -0,0 +1,129 @@
+"""No-GPU tests for `slm.evaluate` — scorers, format dispatch, and error paths.
+
+Uses a stub model (canned `.chat`) so nothing downloads. Runs under pytest, or
+standalone: `python tests/test_eval.py` (exit 0 = all passed).
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+
+from shadowlm.apo import _parse_judge_score  # noqa: E402
+from shadowlm.data import Dataset  # noqa: E402
+from shadowlm.eval import EvalResult, evaluate  # noqa: E402
+
+
+class Stub:
+    """A model whose `.chat` returns a canned reply and records what it saw."""
+
+    def __init__(self, reply: str) -> None:
+        self.reply = reply
+        self.seen: list[list[dict]] = []
+
+    def chat(self, messages, **kw):
+        self.seen.append(messages)
+        return self.reply
+
+
+QA = [{"question": "2+2?", "answer": "4"}, {"question": "cap of France?", "answer": "Paris"}]
+
+
+def test_contains_metric():
+    r = evaluate(Stub("the answer is 4"), QA, metric="contains", verbose=False)
+    assert isinstance(r, EvalResult) and r.metric == "contains"
+    assert r.scores == [1.0, 0.0] and r.n == 2
+    assert abs(r.score - 0.5) < 1e-9
+
+
+def test_exact_metric_normalizes():
+    r = evaluate(Stub("  PARIS "), QA, metric="exact", verbose=False)
+    assert r.scores == [0.0, 1.0]  # case/space-insensitive equality
+
+
+def test_custom_callable_scorer():
+    r = evaluate(Stub("xx"), QA, metric=lambda out, exp, q: len(out) / 10, verbose=False)
+    assert r.metric == "<lambda>" and r.scores == [0.2, 0.2]
+
+
+def test_judge_metric_and_implied_flip():
+    # passing judge= flips the default metric to "judge"
+    r = evaluate(Stub("4"), QA, judge=Stub("0.9"), verbose=False)
+    assert r.metric == "judge" and r.scores == [0.9, 0.9]
+
+
+def test_judge_without_model_raises():
+    try:
+        evaluate(Stub("x"), QA, metric="judge", verbose=False)
+    except ValueError as e:
+        assert "judge" in str(e)
+    else:
+        raise AssertionError("expected ValueError for metric='judge' without a judge")
+
+
+def test_missing_prompt_column_raises():
+    try:
+        evaluate(Stub("x"), [{"foo": "bar"}], metric="exact", verbose=False)
+    except ValueError as e:
+        assert "prompt column" in str(e)
+    else:
+        raise AssertionError("expected ValueError for a row with no prompt column")
+
+
+def test_chat_multiturn_keeps_context():
+    stub = Stub("blue")
+    row = {"messages": [
+        {"role": "user", "content": "pick a color"},
+        {"role": "assistant", "content": "ok"},
+        {"role": "user", "content": "now say it"},
+        {"role": "assistant", "content": "blue"},
+    ]}
+    r = evaluate(stub, Dataset.from_list([row]), metric="contains", verbose=False)
+    assert r.scores == [1.0]
+    # the model must have received the full prefix (3 turns), not just turn 1
+    sent = stub.seen[0]
+    assert [m["role"] for m in sent] == ["user", "assistant", "user"]
+    assert r.examples[0]["input"] == "now say it"  # last user turn is the question
+    assert r.examples[0]["expected"] == "blue"      # final assistant turn is the ref
+
+
+def test_preference_format():
+    ds = Dataset.from_list([{"prompt": "q", "chosen": "good", "rejected": "bad"}])
+    r = evaluate(Stub("this is good"), ds, metric="contains", verbose=False)
+    assert r.scores == [1.0]
+
+
+def test_sample_zero_is_not_whole_dataset():
+    # `--sample 0` must not silently mean "evaluate everything"
+    try:
+        evaluate(Stub("x"), QA, metric="exact", sample=0, verbose=False)
+    except ValueError as e:
+        assert "at least one row" in str(e)
+    else:
+        raise AssertionError("expected sample=0 to yield no rows, not the full set")
+
+
+def test_path_input_and_result_helpers():
+    path = str(Path(__file__).resolve().parents[1] / "examples" / "sample_dataset.jsonl")
+    r = evaluate(Stub("Paris"), path, metric="contains", sample=3, verbose=False)
+    assert r.n == 3 and len(r.sparkline()) == 3
+    assert sorted(r.to_dict()) == ["examples", "metric", "n", "score", "scores"]
+    assert len(r.worst(2)) == 2
+
+
+def test_judge_score_parser_tolerant():
+    assert _parse_judge_score("0.7") == 0.7
+    assert abs(_parse_judge_score("7/10") - 0.7) < 1e-9
+    assert _parse_judge_score("I'd rate this an 8") == 0.8   # x/10 rating
+    assert _parse_judge_score("score: 1") == 1.0
+    assert _parse_judge_score("nonsense") == 0.0
+
+
+if __name__ == "__main__":
+    fns = [v for k, v in sorted(globals().items()) if k.startswith("test_")]
+    for fn in fns:
+        fn()
+        print(f"ok  {fn.__name__}")
+    print(f"\n{len(fns)} tests passed")

From e42421ec1b4e8fb388b5b95538b89b9b53add3b5 Mon Sep 17 00:00:00 2001
From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com>
Date: Sat, 20 Jun 2026 15:28:05 -0400
Subject: [PATCH 4/4] Harden eval demo + tests: CWD-independent path, edge-case
 coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- examples/evaluate.py: resolve sample_dataset.jsonl relative to __file__ so
  `python examples/evaluate.py` runs from any working directory (the dataset is
  present and tracked; this removes the only real failure mode — a CWD-relative
  path).
- tests/test_eval.py: add regression tests for unknown-metric ValueError,
  out-of-range scorer clamping, and degenerate chat rows (None content,
  assistant-only, system-in-prefix). 14 tests, all no-GPU.
---
 examples/evaluate.py |  8 ++++++--
 tests/test_eval.py   | 29 +++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/examples/evaluate.py b/examples/evaluate.py
index bde5c83..d00384f 100644
--- a/examples/evaluate.py
+++ b/examples/evaluate.py
@@ -4,15 +4,19 @@
 actually does the job. Point a loaded model at a dataset, pick a metric, get one
 number plus a per-row breakdown.
 
-    python examples/evaluate.py
+    python examples/evaluate.py        # runs from any working directory
 """
 
+from pathlib import Path
+
 import shadowlm as slm
 
 MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit"
+# Resolve the dataset next to this script, so the demo runs from any CWD.
+DATA = Path(__file__).resolve().parent / "sample_dataset.jsonl"
 
 # A dataset with a prompt column (instruction/question/...) and an answer column.
-ds = slm.Dataset.from_jsonl("examples/sample_dataset.jsonl")
+ds = slm.Dataset.from_jsonl(DATA)
 model = slm.load(MODEL)
 
 # contains-match: 1.0 when the expected answer appears in the output ----------
diff --git a/tests/test_eval.py b/tests/test_eval.py
index 6c96f46..cb70a68 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -113,6 +113,35 @@ def test_path_input_and_result_helpers():
     assert len(r.worst(2)) == 2
 
 
+def test_unknown_metric_raises():
+    try:
+        evaluate(Stub("a"), QA, metric="bleu", verbose=False)
+    except ValueError as e:
+        assert "unknown metric" in str(e)
+    else:
+        raise AssertionError("expected ValueError for an unknown metric name")
+
+
+def test_scores_are_clamped_to_unit_interval():
+    assert all(s == 1.0 for s in evaluate(Stub("x"), QA, metric=lambda o, e, q: 5.0, verbose=False).scores)
+    assert all(s == 0.0 for s in evaluate(Stub("x"), QA, metric=lambda o, e, q: -3, verbose=False).scores)
+
+
+def test_degenerate_chat_rows_dont_crash():
+    from shadowlm.data import CHAT
+    from shadowlm.eval import _row_io
+
+    # None content is coerced to ""; an assistant-only row yields a placeholder turn
+    h, exp = _row_io({"messages": [{"role": "assistant", "content": None}]}, CHAT)
+    assert h == [{"role": "user", "content": ""}] and exp == ""
+    # a system turn is kept in the context prefix, not dropped
+    h, exp = _row_io({"messages": [
+        {"role": "system", "content": "be brief"},
+        {"role": "user", "content": "hi"},
+        {"role": "assistant", "content": "hello"}]}, CHAT)
+    assert [m["role"] for m in h] == ["system", "user"] and exp == "hello"
+
+
 def test_judge_score_parser_tolerant():
     assert _parse_judge_score("0.7") == 0.7
     assert abs(_parse_judge_score("7/10") - 0.7) < 1e-9