From ca2dcb4b6ea1038e869af51872fcc49a0431d36d Mon Sep 17 00:00:00 2001 From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com> Date: Thu, 18 Jun 2026 13:36:13 -0400 Subject: [PATCH 1/4] =?UTF-8?q?Add=20slm.evaluate()=20=E2=80=94=20a=20stan?= =?UTF-8?q?dalone=20eval=20harness=20(SDK=20+=20CLI)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ShadowLM had no way to score a model on a task — only train-time eval_loss (next-token loss), not task quality. This adds the smallest meaningful slice of an "eval gate": evaluate a loaded model over a dataset and get an aggregate score plus a per-row breakdown. - shadowlm/eval.py: evaluate(model, data, metric=...) + EvalResult. Metrics: contains (default), exact, judge (LLM-as-judge), or a custom (output, expected, prompt) -> float callable. Reuses APO's existing scorers (_contains_score, _judge_one) and column detection; handles chat / instruction / preference rows and dataset-path inputs. - shadowlm/__init__.py: export evaluate, EvalResult. - shadowlm/cli.py: `shadowlm eval <target> <dataset>` command. - examples/evaluate.py: end-to-end demo. --- examples/evaluate.py | 29 ++++++++ shadowlm/__init__.py | 3 + shadowlm/cli.py | 53 ++++++++++++++ shadowlm/eval.py | 171 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 256 insertions(+) create mode 100644 examples/evaluate.py create mode 100644 shadowlm/eval.py diff --git a/examples/evaluate.py b/examples/evaluate.py new file mode 100644 index 0000000..bde5c83 --- /dev/null +++ b/examples/evaluate.py @@ -0,0 +1,29 @@ +"""Score a model on a task — quality, not training loss. + +`finetune` tells you the loss went down; `evaluate` tells you whether the model +actually does the job. Point a loaded model at a dataset, pick a metric, get one +number plus a per-row breakdown. 
+ + python examples/evaluate.py +""" + +import shadowlm as slm + +MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit" + +# A dataset with a prompt column (instruction/question/...) and an answer column. +ds = slm.Dataset.from_jsonl("examples/sample_dataset.jsonl") +model = slm.load(MODEL) + +# contains-match: 1.0 when the expected answer appears in the output ---------- +res = slm.evaluate(model, ds, metric="contains") +print(res) # EvalResult(metric='contains', score=..., n=...) +print("per-row:", res.sparkline()) + +# the rows it did worst on ---------------------------------------------------- +for ex in res.worst(3): + print(f" {ex['score']:.1f} {ex['input'][:50]!r} → {ex['output'][:50]!r}") + +# LLM-as-judge scoring (here the model judges itself; use a stronger judge for real) +judged = slm.evaluate(model, ds, judge=model) +print("judge score:", round(judged.score, 3)) diff --git a/shadowlm/__init__.py b/shadowlm/__init__.py index bf452fa..96890d8 100644 --- a/shadowlm/__init__.py +++ b/shadowlm/__init__.py @@ -20,6 +20,7 @@ from .capture import CaptureProxy, capture from .checkpoints import Checkpoint from .data import Dataset +from .eval import EvalResult, evaluate from .models import Model, Reply, load from .rl import Trajectory, TrajectoryGroup, judge_group from .training import Metric, TrainConfig, TrainingRun @@ -29,6 +30,8 @@ __all__ = [ "APORun", "optimize_prompt", + "evaluate", + "EvalResult", "CaptureProxy", "capture", "Checkpoint", diff --git a/shadowlm/cli.py b/shadowlm/cli.py index 916ea00..93bd50a 100644 --- a/shadowlm/cli.py +++ b/shadowlm/cli.py @@ -378,6 +378,59 @@ def export( console.print(f"exported [slm]{format}[/slm] → [slm]{out}[/slm]") +@app.command(name="eval", rich_help_panel="Models") +def evaluate_cmd( + target: Annotated[str, typer.Argument(help="model name, or an adapter directory")], + dataset: Annotated[str, typer.Argument( + help="dataset (.jsonl/.json/.csv/.parquet) with a prompt + answer column")], + metric: Annotated[str, 
typer.Option(help="contains | exact | judge")] = "contains", + judge: Annotated[Optional[str], typer.Option( + help="judge model (HF id) — implies --metric judge")] = None, + system: Annotated[Optional[str], typer.Option(help="system prompt for every query")] = None, + sample: Annotated[Optional[int], typer.Option(help="evaluate only the first N rows")] = None, + show: Annotated[int, typer.Option(help="how many worst examples to show")] = 5, + model: Annotated[Optional[str], typer.Option("--model", "-m", + help="base model override for adapter dirs")] = None, + backend: Annotated[str, typer.Option(help="auto | mlx | torch")] = "auto", + load_in_4bit: Annotated[bool, typer.Option("--load-in-4bit")] = False, + max_new_tokens: Annotated[int, typer.Option("--max-new-tokens")] = 256, + hf_token: Annotated[Optional[str], typer.Option("--hf-token", envvar="HF_TOKEN")] = None, +): + """Score a model on a dataset — task quality, not training loss.""" + from .data import Dataset # noqa: PLC0415 + from .eval import evaluate as _evaluate # noqa: PLC0415 + from .models import load # noqa: PLC0415 + + if metric not in ("contains", "exact", "judge"): + raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'") + _maybe_set_token(hf_token) + m = _resolve_target(target, model, backend, load_in_4bit) + judge_model = load(judge, backend=backend) if judge else None + data = Dataset.load(dataset) + + result = _evaluate(m, data, metric=metric, judge=judge_model, system=system, + sample=sample, max_new_tokens=max_new_tokens, verbose=False) + + console.print( + f"[slm]{result.metric}[/slm] score [ok]{result.score:.3f}[/ok] " + f"over {result.n} rows {result.sparkline()}") + worst = result.worst(show) + if worst and result.score < 1.0: + table = Table(title="lowest-scoring examples", title_style="slm", + header_style="slm", border_style="muted") + for col in ("score", "input", "expected", "output"): + table.add_column(col, no_wrap=(col == "score")) + for ex in worst: + 
table.add_row(f"{ex['score']:.2f}", _trunc(ex["input"]), + _trunc(ex["expected"]), _trunc(ex["output"])) + console.print(table) + + +def _trunc(text: str, n: int = 60) -> str: + text = " ".join(str(text).split()) + return text if len(text) <= n else text[: n - 1] + "…" + + # ---- runs / history --------------------------------------------------------- @app.command(rich_help_panel="Runs") def runs( diff --git a/shadowlm/eval.py b/shadowlm/eval.py new file mode 100644 index 0000000..011ed5e --- /dev/null +++ b/shadowlm/eval.py @@ -0,0 +1,171 @@ +"""Evaluation — score a model on a task, not just its training loss. + +`finetune` reports next-token *loss*; this reports task *quality*. Point a loaded +model at a dataset, pick a metric, and get one number plus a per-row breakdown: + + res = slm.evaluate(model, "qa.jsonl") # contains-match + res = slm.evaluate(model, ds, metric="exact") # exact-match + res = slm.evaluate(model, ds, judge=judge) # LLM-as-judge + res = slm.evaluate(model, ds, metric=my_score_fn) # custom scorer + print(res.score, res.sparkline()) + +This is the front half of an "eval gate" — the same capture/judge primitives, +turned toward *measuring* a model instead of training one. Pure ShadowLM: it +only needs a loaded model's `.chat()`. The built-in scorers are reused from APO +(`apo._contains_score`, `apo._judge_one`) so eval and prompt-optimization agree +on what a good answer is. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from .data import Dataset + + +def _exact_score(output: str, expected: str) -> float: + """1.0 when the output equals the expected answer (case/space-insensitive).""" + o = " ".join(str(output).lower().split()) + e = " ".join(str(expected).lower().split()) + return 1.0 if e and o == e else 0.0 + + +@dataclass +class EvalResult: + """The outcome of an `evaluate` run: an aggregate score plus per-row detail.""" + + metric: str + score: float # mean of `scores` + scores: list[float] = field(default_factory=list) + examples: list[dict] = field(default_factory=list) # [{input, output, expected, score}] + n: int = 0 + + def sparkline(self) -> str: + """A tiny unicode bar of per-row scores — handy in a REPL or log line.""" + if not self.scores: + return "" + bars = "▁▂▃▄▅▆▇█" + lo, hi = min(self.scores), max(self.scores) + rng = (hi - lo) or 1.0 + return "".join(bars[min(7, int((s - lo) / rng * 7))] for s in self.scores) + + def worst(self, k: int = 5) -> list[dict]: + """The k lowest-scoring examples (for eyeballing where the model fails).""" + return sorted(self.examples, key=lambda e: e["score"])[:k] + + def to_dict(self) -> dict: + return {"metric": self.metric, "score": self.score, "n": self.n, + "scores": self.scores, "examples": self.examples} + + def __repr__(self) -> str: + return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})" + + +def _row_io(row: dict, fmt: str) -> tuple[str, str]: + """Pull (input_prompt, expected_answer) out of a row, by dataset format. + + expected may be "" when the dataset carries no reference answer (e.g. judge + scoring on prompts alone). 
+ """ + from .data import CHAT, PREFERENCE # noqa: PLC0415 + + if fmt == CHAT or "messages" in row: + msgs = row.get("messages", []) + prompt = next((m.get("content") or "" for m in msgs + if m.get("role") == "user"), "") + expected = next((m.get("content") or "" for m in reversed(msgs) + if m.get("role") == "assistant"), "") + return str(prompt), str(expected) + if fmt == PREFERENCE or ("chosen" in row and "prompt" in row): + return str(row.get("prompt", "")), str(row.get("chosen", "")) + # instruction / QA / raw dict — auto-detect the prompt & answer columns + from .apo import _cols # noqa: PLC0415 + + pcol, acol = _cols(row) + if not pcol: + from .apo import _PROMPT_KEYS # noqa: PLC0415 + + raise ValueError( + f"no prompt column found in row (looked for {_PROMPT_KEYS}); " + "pass chat-format rows or a dataset with a prompt/question column") + prompt = str(row[pcol]) + # alpaca-style extra context column, when distinct from the prompt + if pcol != "input" and row.get("input"): + prompt = f"{prompt}\n\n{row['input']}" + return prompt, str(row.get(acol, "")) if acol else "" + + +def _resolve_scorer(metric, judge): + """Map the metric arg to a scorer `(output, expected, prompt) -> float`.""" + if callable(metric): + return metric, getattr(metric, "__name__", "custom") + from .apo import _contains_score, _judge_one # noqa: PLC0415 + + if metric == "contains": + return (lambda out, exp, q: _contains_score(out, exp)), "contains" + if metric == "exact": + return (lambda out, exp, q: _exact_score(out, exp)), "exact" + if metric == "judge": + if judge is None: + raise ValueError("metric='judge' needs a judge model: evaluate(..., judge=model)") + return (lambda out, exp, q: _judge_one(judge, q, out, exp)), "judge" + raise ValueError( + f"unknown metric {metric!r} (expected 'contains', 'exact', 'judge', or a callable)") + + +def evaluate( + model, + data: Dataset | list[dict] | str, + *, + metric="contains", + judge=None, + system: str | None = None, + sample: int | None = 
None, + max_new_tokens: int = 256, + temperature: float = 0.0, + verbose: bool = True, +) -> EvalResult: + """Score `model` on `data`, returning an `EvalResult`. + + model: a loaded shadowlm Model (answers each row via `.chat`). + data: a Dataset, rows, or a path to a dataset file (jsonl/json/csv/parquet). + metric: "contains" (default — expected answer appears in the output), "exact" + (normalized equality), "judge" (LLM-as-judge, needs `judge=`), or a custom + callable `(output, expected, prompt) -> float in [0, 1]`. + judge: a Model that scores answers 0–1. Passing it defaults `metric` to "judge". + system: optional system prompt prepended to every query. + sample: evaluate only the first N rows. + temperature: generation temperature — 0.0 (default) for deterministic scoring. + """ + if isinstance(data, str): + data = Dataset.load(data) + fmt = data.format if isinstance(data, Dataset) else None + rows = list(data.rows if isinstance(data, Dataset) else data) + if sample: + rows = rows[:sample] + if not rows: + raise ValueError("evaluate needs at least one row") + if judge is not None and metric == "contains": + metric = "judge" # passing a judge implies judge scoring + if fmt is None: + from .data import _detect_format # noqa: PLC0415 + + fmt = _detect_format(rows) + scorer, metric_name = _resolve_scorer(metric, judge) + + scores: list[float] = [] + examples: list[dict] = [] + for r in rows: + prompt, expected = _row_io(r, fmt) + msgs = ([{"role": "system", "content": system}] if system else []) + \ + [{"role": "user", "content": prompt}] + out = str(model.chat(msgs, temperature=temperature, max_new_tokens=max_new_tokens)) + s = max(0.0, min(1.0, float(scorer(out, expected, prompt)))) + scores.append(s) + examples.append({"input": prompt, "output": out, "expected": expected, "score": s}) + + score = sum(scores) / len(scores) + if verbose: + print(f"[eval] {metric_name} · {score:.3f} over {len(scores)} rows", flush=True) + return EvalResult(metric=metric_name, 
score=score, scores=scores, + examples=examples, n=len(scores)) From 1b5544cbb7ea465a9109fdfe0eb534fec4c28519 Mon Sep 17 00:00:00 2001 From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com> Date: Thu, 18 Jun 2026 15:45:55 -0400 Subject: [PATCH 2/4] README: add Shreyas Kapale as second maintainer --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dcf522..734bb57 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ **A fine-tuning SDK. Any open model — with any method, on any hardware, for any harness.** -Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) · `slm♥` +Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) & [Shreyas Kapale](mailto:shreyas@lyzr.ai) · `slm♥` ```bash pip install shadowlm # batteries included — the full training stack From 5ec20289686512a8dec81f4ddc6a9e5b57197bf4 Mon Sep 17 00:00:00 2001 From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com> Date: Sat, 20 Jun 2026 09:20:07 -0400 Subject: [PATCH 3/4] Address review: judge scorer, multi-turn rows, CLI guard, sample=0, tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - apo._judge_one: add a rubric + tolerant number parse (_parse_judge_score handles "0.7", "7/10", "8"→0.8); evaluate routes through this shared scorer, so APO and eval agree on a good answer. Not routing eval through judge_group (it's group-relative / RL-only and can't score a lone row). - eval._row_io: feed the full conversation prefix for multi-turn chat rows and compare to the final assistant turn (was: first-user vs last-assistant). - cli eval: reject --metric judge with no --judge up front (typer.BadParameter) instead of a raw ValueError after the model has loaded. - eval: `if sample is not None` so --sample 0 doesn't mean "evaluate everything". 
- Pull shared whitespace/lowercase normalization into apo._norm (used by both _contains_score and _exact_score). - tests/test_eval.py: no-GPU stub coverage for every metric, both error paths, multi-turn context, preference rows, sample=0, and the judge parser. --- shadowlm/apo.py | 46 ++++++++++++++-- shadowlm/cli.py | 4 ++ shadowlm/eval.py | 52 +++++++++++------- tests/test_eval.py | 129 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 206 insertions(+), 25 deletions(-) create mode 100644 tests/test_eval.py diff --git a/shadowlm/apo.py b/shadowlm/apo.py index ad9ef86..deaf5bc 100644 --- a/shadowlm/apo.py +++ b/shadowlm/apo.py @@ -67,10 +67,14 @@ def _cols(row: dict) -> tuple[str, str | None]: return p, a +def _norm(text: str) -> str: + """Whitespace-collapsed, lowercased text for tolerant string matching.""" + return " ".join(str(text).lower().split()) + + def _contains_score(output: str, expected: str) -> float: """Default scorer: 1.0 if the expected answer appears in the output.""" - o = " ".join(str(output).lower().split()) - e = " ".join(str(expected).lower().split()) + o, e = _norm(output), _norm(expected) return 1.0 if e and e in o else 0.0 @@ -218,15 +222,47 @@ def _propose(optimizer, current, failures, k, temperature, max_new_tokens) -> li return out +# The shared single-answer judge: a short rubric + a tolerant number parse, used +# by both APO and `evaluate` so they agree on what a good answer is. (The RL judge +# in rl.py is a *group-relative* ranker — it can't score a lone eval row — so eval +# reuses this scorer, not judge_group.) +_JUDGE_RUBRIC = ( + "Reward correctness first, then helpfulness, then concision. " + "Penalize factual errors and ignored instructions." +) + + +def _parse_judge_score(raw: str) -> float: + """Tolerantly pull a 0–1 score out of a judge's reply. + + Small judges phrase scores many ways — a bare decimal ("0.7"), a ratio + ("7/10"), or an integer rating ("8" → 0.8). Handle all three, then clamp. 
+ """ + import re # noqa: PLC0415 + + s = str(raw) + m = re.search(r"(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", s) # "7/10" + if m: + num, den = float(m.group(1)), float(m.group(2)) + return max(0.0, min(1.0, num / den)) if den else 0.0 + m = re.search(r"\d+\.\d+", s) # a decimal like "0.7" + if m: + return max(0.0, min(1.0, float(m.group()))) + m = re.search(r"\d+", s) # a bare integer — assume an x/10 rating above 1 + if m: + v = float(m.group()) + return max(0.0, min(1.0, v if v <= 1 else v / 10.0)) + return 0.0 + + def _judge_one(judge, question: str, output: str, expected: str) -> float: prompt = ( "Score how well the ANSWER responds to the INPUT from 0.0 to 1.0.\n" + f"{_JUDGE_RUBRIC}\n" f"INPUT: {question}\nANSWER: {output}\n" + (f"REFERENCE: {expected}\n" if expected else "") + 'Reply with ONLY a number like 0.7.' ) raw = str(judge.chat([{"role": "user", "content": prompt}], temperature=0.0, max_new_tokens=8)) - import re # noqa: PLC0415 - m = re.search(r"[01](?:\.\d+)?", raw) - return max(0.0, min(1.0, float(m.group()))) if m else 0.0 + return _parse_judge_score(raw) diff --git a/shadowlm/cli.py b/shadowlm/cli.py index 93bd50a..8ef540b 100644 --- a/shadowlm/cli.py +++ b/shadowlm/cli.py @@ -403,6 +403,10 @@ def evaluate_cmd( if metric not in ("contains", "exact", "judge"): raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'") + # Validate before loading the model — otherwise the user waits for a full + # model download only to hit a scorer error. 
+ if metric == "judge" and not judge: + raise typer.BadParameter("--metric judge needs a judge model: --judge ") _maybe_set_token(hf_token) m = _resolve_target(target, model, backend, load_in_4bit) judge_model = load(judge, backend=backend) if judge else None diff --git a/shadowlm/eval.py b/shadowlm/eval.py index 011ed5e..1cae1c3 100644 --- a/shadowlm/eval.py +++ b/shadowlm/eval.py @@ -25,8 +25,9 @@ def _exact_score(output: str, expected: str) -> float: """1.0 when the output equals the expected answer (case/space-insensitive).""" - o = " ".join(str(output).lower().split()) - e = " ".join(str(expected).lower().split()) + from .apo import _norm # noqa: PLC0415 + + o, e = _norm(output), _norm(expected) return 1.0 if e and o == e else 0.0 @@ -61,23 +62,31 @@ def __repr__(self) -> str: return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})" -def _row_io(row: dict, fmt: str) -> tuple[str, str]: - """Pull (input_prompt, expected_answer) out of a row, by dataset format. +def _row_io(row: dict, fmt: str) -> tuple[list[dict], str]: + """Pull (history, expected_answer) out of a row, by dataset format. - expected may be "" when the dataset carries no reference answer (e.g. judge - scoring on prompts alone). + `history` is the conversation to feed the model — a full multi-turn prefix + for chat rows, or a single user turn for QA/preference rows. `expected` may + be "" when the dataset carries no reference answer (e.g. judge scoring on + prompts alone). 
""" from .data import CHAT, PREFERENCE # noqa: PLC0415 if fmt == CHAT or "messages" in row: - msgs = row.get("messages", []) - prompt = next((m.get("content") or "" for m in msgs - if m.get("role") == "user"), "") - expected = next((m.get("content") or "" for m in reversed(msgs) - if m.get("role") == "assistant"), "") - return str(prompt), str(expected) + msgs = [{"role": m.get("role", "user"), "content": m.get("content") or ""} + for m in row.get("messages", [])] + # Everything up to the final assistant turn is context; that turn is the + # reference — so a multi-turn row is answered in its full conversation, + # not scored as "answer the opening question". + last_asst = next((i for i in range(len(msgs) - 1, -1, -1) + if msgs[i]["role"] == "assistant"), None) + if last_asst is None: + return (msgs or [{"role": "user", "content": ""}]), "" + history = msgs[:last_asst] or [{"role": "user", "content": ""}] + return history, msgs[last_asst]["content"] if fmt == PREFERENCE or ("chosen" in row and "prompt" in row): - return str(row.get("prompt", "")), str(row.get("chosen", "")) + return [{"role": "user", "content": str(row.get("prompt", ""))}], \ + str(row.get("chosen", "")) # instruction / QA / raw dict — auto-detect the prompt & answer columns from .apo import _cols # noqa: PLC0415 @@ -92,7 +101,8 @@ def _row_io(row: dict, fmt: str) -> tuple[str, str]: # alpaca-style extra context column, when distinct from the prompt if pcol != "input" and row.get("input"): prompt = f"{prompt}\n\n{row['input']}" - return prompt, str(row.get(acol, "")) if acol else "" + return [{"role": "user", "content": prompt}], \ + (str(row.get(acol, "")) if acol else "") def _resolve_scorer(metric, judge): @@ -141,7 +151,7 @@ def evaluate( data = Dataset.load(data) fmt = data.format if isinstance(data, Dataset) else None rows = list(data.rows if isinstance(data, Dataset) else data) - if sample: + if sample is not None: rows = rows[:sample] if not rows: raise ValueError("evaluate needs at least 
one row") @@ -156,13 +166,15 @@ def evaluate( scores: list[float] = [] examples: list[dict] = [] for r in rows: - prompt, expected = _row_io(r, fmt) - msgs = ([{"role": "system", "content": system}] if system else []) + \ - [{"role": "user", "content": prompt}] + history, expected = _row_io(r, fmt) + msgs = ([{"role": "system", "content": system}] if system else []) + history out = str(model.chat(msgs, temperature=temperature, max_new_tokens=max_new_tokens)) - s = max(0.0, min(1.0, float(scorer(out, expected, prompt)))) + # the last user turn is the "question" passed to a judge / shown in output + question = next((m["content"] for m in reversed(history) + if m["role"] == "user"), "") + s = max(0.0, min(1.0, float(scorer(out, expected, question)))) scores.append(s) - examples.append({"input": prompt, "output": out, "expected": expected, "score": s}) + examples.append({"input": question, "output": out, "expected": expected, "score": s}) score = sum(scores) / len(scores) if verbose: diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 0000000..6c96f46 --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,129 @@ +"""No-GPU tests for `slm.evaluate` — scorers, format dispatch, and error paths. + +Uses a stub model (canned `.chat`) so nothing downloads. Runs under pytest, or +standalone: `python tests/test_eval.py` (exit 0 = all passed). 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from shadowlm.apo import _parse_judge_score # noqa: E402 +from shadowlm.data import Dataset # noqa: E402 +from shadowlm.eval import EvalResult, evaluate # noqa: E402 + + +class Stub: + """A model whose `.chat` returns a canned reply and records what it saw.""" + + def __init__(self, reply: str) -> None: + self.reply = reply + self.seen: list[list[dict]] = [] + + def chat(self, messages, **kw): + self.seen.append(messages) + return self.reply + + +QA = [{"question": "2+2?", "answer": "4"}, {"question": "cap of France?", "answer": "Paris"}] + + +def test_contains_metric(): + r = evaluate(Stub("the answer is 4"), QA, metric="contains", verbose=False) + assert isinstance(r, EvalResult) and r.metric == "contains" + assert r.scores == [1.0, 0.0] and r.n == 2 + assert abs(r.score - 0.5) < 1e-9 + + +def test_exact_metric_normalizes(): + r = evaluate(Stub(" PARIS "), QA, metric="exact", verbose=False) + assert r.scores == [0.0, 1.0] # case/space-insensitive equality + + +def test_custom_callable_scorer(): + r = evaluate(Stub("xx"), QA, metric=lambda out, exp, q: len(out) / 10, verbose=False) + assert r.metric == "" and r.scores == [0.2, 0.2] + + +def test_judge_metric_and_implied_flip(): + # passing judge= flips the default metric to "judge" + r = evaluate(Stub("4"), QA, judge=Stub("0.9"), verbose=False) + assert r.metric == "judge" and r.scores == [0.9, 0.9] + + +def test_judge_without_model_raises(): + try: + evaluate(Stub("x"), QA, metric="judge", verbose=False) + except ValueError as e: + assert "judge" in str(e) + else: + raise AssertionError("expected ValueError for metric='judge' without a judge") + + +def test_missing_prompt_column_raises(): + try: + evaluate(Stub("x"), [{"foo": "bar"}], metric="exact", verbose=False) + except ValueError as e: + assert "prompt column" in str(e) + else: + raise AssertionError("expected 
ValueError for a row with no prompt column") + + +def test_chat_multiturn_keeps_context(): + stub = Stub("blue") + row = {"messages": [ + {"role": "user", "content": "pick a color"}, + {"role": "assistant", "content": "ok"}, + {"role": "user", "content": "now say it"}, + {"role": "assistant", "content": "blue"}, + ]} + r = evaluate(stub, Dataset.from_list([row]), metric="contains", verbose=False) + assert r.scores == [1.0] + # the model must have received the full prefix (3 turns), not just turn 1 + sent = stub.seen[0] + assert [m["role"] for m in sent] == ["user", "assistant", "user"] + assert r.examples[0]["input"] == "now say it" # last user turn is the question + assert r.examples[0]["expected"] == "blue" # final assistant turn is the ref + + +def test_preference_format(): + ds = Dataset.from_list([{"prompt": "q", "chosen": "good", "rejected": "bad"}]) + r = evaluate(Stub("this is good"), ds, metric="contains", verbose=False) + assert r.scores == [1.0] + + +def test_sample_zero_is_not_whole_dataset(): + # `--sample 0` must not silently mean "evaluate everything" + try: + evaluate(Stub("x"), QA, metric="exact", sample=0, verbose=False) + except ValueError as e: + assert "at least one row" in str(e) + else: + raise AssertionError("expected sample=0 to yield no rows, not the full set") + + +def test_path_input_and_result_helpers(): + path = str(Path(__file__).resolve().parents[1] / "examples" / "sample_dataset.jsonl") + r = evaluate(Stub("Paris"), path, metric="contains", sample=3, verbose=False) + assert r.n == 3 and len(r.sparkline()) == 3 + assert sorted(r.to_dict()) == ["examples", "metric", "n", "score", "scores"] + assert len(r.worst(2)) == 2 + + +def test_judge_score_parser_tolerant(): + assert _parse_judge_score("0.7") == 0.7 + assert abs(_parse_judge_score("7/10") - 0.7) < 1e-9 + assert _parse_judge_score("I'd rate this an 8") == 0.8 # x/10 rating + assert _parse_judge_score("score: 1") == 1.0 + assert _parse_judge_score("nonsense") == 0.0 + + +if 
__name__ == "__main__": + fns = [v for k, v in sorted(globals().items()) if k.startswith("test_")] + for fn in fns: + fn() + print(f"ok {fn.__name__}") + print(f"\n{len(fns)} tests passed") From e42421ec1b4e8fb388b5b95538b89b9b53add3b5 Mon Sep 17 00:00:00 2001 From: shreyas-lyzr <141219160+shreyas-lyzr@users.noreply.github.com> Date: Sat, 20 Jun 2026 15:28:05 -0400 Subject: [PATCH 4/4] Harden eval demo + tests: CWD-independent path, edge-case coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - examples/evaluate.py: resolve sample_dataset.jsonl relative to __file__ so `python examples/evaluate.py` runs from any working directory (the dataset is present and tracked; this removes the only real failure mode — a CWD-relative path). - tests/test_eval.py: add regression tests for unknown-metric ValueError, out-of-range scorer clamping, and degenerate chat rows (None content, assistant-only, system-in-prefix). 14 tests, all no-GPU. --- examples/evaluate.py | 8 ++++++-- tests/test_eval.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/examples/evaluate.py b/examples/evaluate.py index bde5c83..d00384f 100644 --- a/examples/evaluate.py +++ b/examples/evaluate.py @@ -4,15 +4,19 @@ actually does the job. Point a loaded model at a dataset, pick a metric, get one number plus a per-row breakdown. - python examples/evaluate.py + python examples/evaluate.py # runs from any working directory """ +from pathlib import Path + import shadowlm as slm MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit" +# Resolve the dataset next to this script, so the demo runs from any CWD. +DATA = Path(__file__).resolve().parent / "sample_dataset.jsonl" # A dataset with a prompt column (instruction/question/...) and an answer column. 
-ds = slm.Dataset.from_jsonl("examples/sample_dataset.jsonl") +ds = slm.Dataset.from_jsonl(DATA) model = slm.load(MODEL) # contains-match: 1.0 when the expected answer appears in the output ---------- diff --git a/tests/test_eval.py b/tests/test_eval.py index 6c96f46..cb70a68 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -113,6 +113,35 @@ def test_path_input_and_result_helpers(): assert len(r.worst(2)) == 2 +def test_unknown_metric_raises(): + try: + evaluate(Stub("a"), QA, metric="bleu", verbose=False) + except ValueError as e: + assert "unknown metric" in str(e) + else: + raise AssertionError("expected ValueError for an unknown metric name") + + +def test_scores_are_clamped_to_unit_interval(): + assert all(s == 1.0 for s in evaluate(Stub("x"), QA, metric=lambda o, e, q: 5.0, verbose=False).scores) + assert all(s == 0.0 for s in evaluate(Stub("x"), QA, metric=lambda o, e, q: -3, verbose=False).scores) + + +def test_degenerate_chat_rows_dont_crash(): + from shadowlm.data import CHAT + from shadowlm.eval import _row_io + + # None content is coerced to ""; an assistant-only row yields a placeholder turn + h, exp = _row_io({"messages": [{"role": "assistant", "content": None}]}, CHAT) + assert h == [{"role": "user", "content": ""}] and exp == "" + # a system turn is kept in the context prefix, not dropped + h, exp = _row_io({"messages": [ + {"role": "system", "content": "be brief"}, + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "hello"}]}, CHAT) + assert [m["role"] for m in h] == ["system", "user"] and exp == "hello" + + def test_judge_score_parser_tolerant(): assert _parse_judge_score("0.7") == 0.7 assert abs(_parse_judge_score("7/10") - 0.7) < 1e-9