diff --git a/README.md b/README.md index 3dcf522..734bb57 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ **A fine-tuning SDK. Any open model — with any method, on any hardware, for any harness.** -Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) · `slm♥` +Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) & [Shreyas Kapale](mailto:shreyas@lyzr.ai) · `slm♥` ```bash pip install shadowlm # batteries included — the full training stack diff --git a/examples/evaluate.py b/examples/evaluate.py new file mode 100644 index 0000000..d00384f --- /dev/null +++ b/examples/evaluate.py @@ -0,0 +1,33 @@ +"""Score a model on a task — quality, not training loss. + +`finetune` tells you the loss went down; `evaluate` tells you whether the model +actually does the job. Point a loaded model at a dataset, pick a metric, get one +number plus a per-row breakdown. + + python examples/evaluate.py # runs from any working directory +""" + +from pathlib import Path + +import shadowlm as slm + +MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit" +# Resolve the dataset next to this script, so the demo runs from any CWD. +DATA = Path(__file__).resolve().parent / "sample_dataset.jsonl" + +# A dataset with a prompt column (instruction/question/...) and an answer column. +ds = slm.Dataset.from_jsonl(DATA) +model = slm.load(MODEL) + +# contains-match: 1.0 when the expected answer appears in the output ---------- +res = slm.evaluate(model, ds, metric="contains") +print(res) # EvalResult(metric='contains', score=..., n=...) 
+print("per-row:", res.sparkline()) + +# the rows it did worst on ---------------------------------------------------- +for ex in res.worst(3): + print(f" {ex['score']:.1f} {ex['input'][:50]!r} → {ex['output'][:50]!r}") + +# LLM-as-judge scoring (here the model judges itself; use a stronger judge for real) +judged = slm.evaluate(model, ds, judge=model) +print("judge score:", round(judged.score, 3)) diff --git a/shadowlm/__init__.py b/shadowlm/__init__.py index bf452fa..96890d8 100644 --- a/shadowlm/__init__.py +++ b/shadowlm/__init__.py @@ -20,6 +20,7 @@ from .capture import CaptureProxy, capture from .checkpoints import Checkpoint from .data import Dataset +from .eval import EvalResult, evaluate from .models import Model, Reply, load from .rl import Trajectory, TrajectoryGroup, judge_group from .training import Metric, TrainConfig, TrainingRun @@ -29,6 +30,8 @@ __all__ = [ "APORun", "optimize_prompt", + "evaluate", + "EvalResult", "CaptureProxy", "capture", "Checkpoint", diff --git a/shadowlm/apo.py b/shadowlm/apo.py index ad9ef86..deaf5bc 100644 --- a/shadowlm/apo.py +++ b/shadowlm/apo.py @@ -67,10 +67,14 @@ def _cols(row: dict) -> tuple[str, str | None]: return p, a +def _norm(text: str) -> str: + """Whitespace-collapsed, lowercased text for tolerant string matching.""" + return " ".join(str(text).lower().split()) + + def _contains_score(output: str, expected: str) -> float: """Default scorer: 1.0 if the expected answer appears in the output.""" - o = " ".join(str(output).lower().split()) - e = " ".join(str(expected).lower().split()) + o, e = _norm(output), _norm(expected) return 1.0 if e and e in o else 0.0 @@ -218,15 +222,47 @@ def _propose(optimizer, current, failures, k, temperature, max_new_tokens) -> li return out +# The shared single-answer judge: a short rubric + a tolerant number parse, used +# by both APO and `evaluate` so they agree on what a good answer is. 
(The RL judge +# in rl.py is a *group-relative* ranker — it can't score a lone eval row — so eval +# reuses this scorer, not judge_group.) +_JUDGE_RUBRIC = ( + "Reward correctness first, then helpfulness, then concision. " + "Penalize factual errors and ignored instructions." +) + + +def _parse_judge_score(raw: str) -> float: + """Tolerantly pull a 0–1 score out of a judge's reply. + + Small judges phrase scores many ways — a bare decimal ("0.7"), a ratio + ("7/10"), or an integer rating ("8" → 0.8). Handle all three, then clamp. + """ + import re # noqa: PLC0415 + + s = str(raw) + m = re.search(r"(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", s) # "7/10" + if m: + num, den = float(m.group(1)), float(m.group(2)) + return max(0.0, min(1.0, num / den)) if den else 0.0 + m = re.search(r"\d+\.\d+", s) # a decimal like "0.7" + if m: + return max(0.0, min(1.0, float(m.group()))) + m = re.search(r"\d+", s) # a bare integer — assume an x/10 rating above 1 + if m: + v = float(m.group()) + return max(0.0, min(1.0, v if v <= 1 else v / 10.0)) + return 0.0 + + def _judge_one(judge, question: str, output: str, expected: str) -> float: prompt = ( "Score how well the ANSWER responds to the INPUT from 0.0 to 1.0.\n" + f"{_JUDGE_RUBRIC}\n" f"INPUT: {question}\nANSWER: {output}\n" + (f"REFERENCE: {expected}\n" if expected else "") + 'Reply with ONLY a number like 0.7.' 
# `slm eval <target> <dataset>` — CLI front end for `shadowlm.eval.evaluate`.
# The function docstring below is left as-is on purpose: typer shows a command's
# docstring as its help text, so editing it would change user-facing output.
@app.command(name="eval", rich_help_panel="Models")
def evaluate_cmd(
    target: Annotated[str, typer.Argument(help="model name, or an adapter directory")],
    dataset: Annotated[str, typer.Argument(
        help="dataset (.jsonl/.json/.csv/.parquet) with a prompt + answer column")],
    metric: Annotated[str, typer.Option(help="contains | exact | judge")] = "contains",
    judge: Annotated[Optional[str], typer.Option(
        help="judge model (HF id) — implies --metric judge")] = None,
    system: Annotated[Optional[str], typer.Option(help="system prompt for every query")] = None,
    sample: Annotated[Optional[int], typer.Option(help="evaluate only the first N rows")] = None,
    show: Annotated[int, typer.Option(help="how many worst examples to show")] = 5,
    model: Annotated[Optional[str], typer.Option("--model", "-m",
        help="base model override for adapter dirs")] = None,
    backend: Annotated[str, typer.Option(help="auto | mlx | torch")] = "auto",
    load_in_4bit: Annotated[bool, typer.Option("--load-in-4bit")] = False,
    max_new_tokens: Annotated[int, typer.Option("--max-new-tokens")] = 256,
    hf_token: Annotated[Optional[str], typer.Option("--hf-token", envvar="HF_TOKEN")] = None,
):
    """Score a model on a dataset — task quality, not training loss."""
    # Imported inside the command rather than at module top — presumably to keep
    # CLI start-up light; confirm against the rest of cli.py.
    from .data import Dataset  # noqa: PLC0415
    from .eval import evaluate as _evaluate  # noqa: PLC0415
    from .models import load  # noqa: PLC0415

    # The CLI only accepts the three named metrics; custom callables are a
    # Python-API-only feature of `evaluate`.
    if metric not in ("contains", "exact", "judge"):
        raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'")
    # Validate before loading the model — otherwise the user waits for a full
    # model download only to hit a scorer error.
    if metric == "judge" and not judge:
        # NOTE(review): the message ends right after "--judge " — it reads as if
        # a placeholder for the model id was lost; confirm the intended text.
        raise typer.BadParameter("--metric judge needs a judge model: --judge ")
    _maybe_set_token(hf_token)
    # `_resolve_target` is defined elsewhere in cli.py; it receives the target
    # plus the base-model override, backend and 4-bit flag and yields the model
    # object handed to `evaluate` below.
    m = _resolve_target(target, model, backend, load_in_4bit)
    # The judge is loaded whenever --judge is given, whatever --metric says.
    # NOTE(review): `evaluate` only switches to judge scoring when the metric is
    # still "contains", so `--metric exact --judge X` loads a judge that is never
    # used — confirm whether that combination should be rejected or skipped.
    judge_model = load(judge, backend=backend) if judge else None
    data = Dataset.load(dataset)

    # verbose=False: the library's own summary print is suppressed because the
    # command prints its own rich-formatted summary right below.
    result = _evaluate(m, data, metric=metric, judge=judge_model, system=system,
                       sample=sample, max_new_tokens=max_new_tokens, verbose=False)

    console.print(
        f"[slm]{result.metric}[/slm] score [ok]{result.score:.3f}[/ok] "
        f"over {result.n} rows  {result.sparkline()}")
    worst = result.worst(show)
    # Only show the failure table when there is something imperfect to look at.
    if worst and result.score < 1.0:
        table = Table(title="lowest-scoring examples", title_style="slm",
                      header_style="slm", border_style="muted")
        for col in ("score", "input", "expected", "output"):
            # Keep the short numeric column on one line; text columns may wrap.
            table.add_column(col, no_wrap=(col == "score"))
        for ex in worst:
            table.add_row(f"{ex['score']:.2f}", _trunc(ex["input"]),
                          _trunc(ex["expected"]), _trunc(ex["output"]))
        console.print(table)
Point a loaded +model at a dataset, pick a metric, and get one number plus a per-row breakdown: + + res = slm.evaluate(model, "qa.jsonl") # contains-match + res = slm.evaluate(model, ds, metric="exact") # exact-match + res = slm.evaluate(model, ds, judge=judge) # LLM-as-judge + res = slm.evaluate(model, ds, metric=my_score_fn) # custom scorer + print(res.score, res.sparkline()) + +This is the front half of an "eval gate" — the same capture/judge primitives, +turned toward *measuring* a model instead of training one. Pure ShadowLM: it +only needs a loaded model's `.chat()`. The built-in scorers are reused from APO +(`apo._contains_score`, `apo._judge_one`) so eval and prompt-optimization agree +on what a good answer is. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from .data import Dataset + + +def _exact_score(output: str, expected: str) -> float: + """1.0 when the output equals the expected answer (case/space-insensitive).""" + from .apo import _norm # noqa: PLC0415 + + o, e = _norm(output), _norm(expected) + return 1.0 if e and o == e else 0.0 + + +@dataclass +class EvalResult: + """The outcome of an `evaluate` run: an aggregate score plus per-row detail.""" + + metric: str + score: float # mean of `scores` + scores: list[float] = field(default_factory=list) + examples: list[dict] = field(default_factory=list) # [{input, output, expected, score}] + n: int = 0 + + def sparkline(self) -> str: + """A tiny unicode bar of per-row scores — handy in a REPL or log line.""" + if not self.scores: + return "" + bars = "▁▂▃▄▅▆▇█" + lo, hi = min(self.scores), max(self.scores) + rng = (hi - lo) or 1.0 + return "".join(bars[min(7, int((s - lo) / rng * 7))] for s in self.scores) + + def worst(self, k: int = 5) -> list[dict]: + """The k lowest-scoring examples (for eyeballing where the model fails).""" + return sorted(self.examples, key=lambda e: e["score"])[:k] + + def to_dict(self) -> dict: + return {"metric": self.metric, "score": 
self.score, "n": self.n, + "scores": self.scores, "examples": self.examples} + + def __repr__(self) -> str: + return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})" + + +def _row_io(row: dict, fmt: str) -> tuple[list[dict], str]: + """Pull (history, expected_answer) out of a row, by dataset format. + + `history` is the conversation to feed the model — a full multi-turn prefix + for chat rows, or a single user turn for QA/preference rows. `expected` may + be "" when the dataset carries no reference answer (e.g. judge scoring on + prompts alone). + """ + from .data import CHAT, PREFERENCE # noqa: PLC0415 + + if fmt == CHAT or "messages" in row: + msgs = [{"role": m.get("role", "user"), "content": m.get("content") or ""} + for m in row.get("messages", [])] + # Everything up to the final assistant turn is context; that turn is the + # reference — so a multi-turn row is answered in its full conversation, + # not scored as "answer the opening question". + last_asst = next((i for i in range(len(msgs) - 1, -1, -1) + if msgs[i]["role"] == "assistant"), None) + if last_asst is None: + return (msgs or [{"role": "user", "content": ""}]), "" + history = msgs[:last_asst] or [{"role": "user", "content": ""}] + return history, msgs[last_asst]["content"] + if fmt == PREFERENCE or ("chosen" in row and "prompt" in row): + return [{"role": "user", "content": str(row.get("prompt", ""))}], \ + str(row.get("chosen", "")) + # instruction / QA / raw dict — auto-detect the prompt & answer columns + from .apo import _cols # noqa: PLC0415 + + pcol, acol = _cols(row) + if not pcol: + from .apo import _PROMPT_KEYS # noqa: PLC0415 + + raise ValueError( + f"no prompt column found in row (looked for {_PROMPT_KEYS}); " + "pass chat-format rows or a dataset with a prompt/question column") + prompt = str(row[pcol]) + # alpaca-style extra context column, when distinct from the prompt + if pcol != "input" and row.get("input"): + prompt = f"{prompt}\n\n{row['input']}" + return 
def evaluate(
    model,
    data: Dataset | list[dict] | str,
    *,
    metric="contains",
    judge=None,
    system: str | None = None,
    sample: int | None = None,
    max_new_tokens: int = 256,
    temperature: float = 0.0,
    verbose: bool = True,
) -> EvalResult:
    """Score `model` on `data`, returning an `EvalResult`.

    model: a loaded shadowlm Model (answers each row via `.chat`).
    data: a Dataset, rows, or a path to a dataset file (jsonl/json/csv/parquet).
        The path may be a `str` or any `os.PathLike` such as `pathlib.Path`.
    metric: "contains" (default — expected answer appears in the output), "exact"
        (normalized equality), "judge" (LLM-as-judge, needs `judge=`), or a custom
        callable `(output, expected, prompt) -> float in [0, 1]`.
    judge: a Model that scores answers 0–1. Passing it while `metric` is still
        "contains" switches the metric to "judge".
    system: optional system prompt prepended to every query.
    sample: evaluate only the first N rows; must not be negative.
    max_new_tokens: generation budget passed to `model.chat` for each row.
    temperature: generation temperature — 0.0 (default) for deterministic scoring.
    verbose: print a one-line summary when done.

    Raises:
        ValueError: if `sample` is negative, no rows remain to evaluate, the
            metric is unknown, or metric="judge" is requested without a judge.
    """
    import os  # noqa: PLC0415

    # Reject a negative sample up front: `rows[:sample]` would otherwise drop
    # rows from the *end* and quietly evaluate the wrong subset.
    if sample is not None and sample < 0:
        raise ValueError(f"sample must be >= 0, got {sample}")
    # Accept Path-like objects as well as plain strings; without this a
    # pathlib.Path would fall through to `list(data)` and raise a TypeError.
    if isinstance(data, (str, os.PathLike)):
        data = Dataset.load(os.fspath(data))
    fmt = data.format if isinstance(data, Dataset) else None
    rows = list(data.rows if isinstance(data, Dataset) else data)
    if sample is not None:
        rows = rows[:sample]
    if not rows:
        raise ValueError("evaluate needs at least one row")
    if judge is not None and metric == "contains":
        metric = "judge"  # passing a judge implies judge scoring
    if fmt is None:
        # Plain row lists carry no format tag, so detect it from the rows.
        from .data import _detect_format  # noqa: PLC0415

        fmt = _detect_format(rows)
    scorer, metric_name = _resolve_scorer(metric, judge)

    system_turn = [{"role": "system", "content": system}] if system else []
    scores: list[float] = []
    examples: list[dict] = []
    for row in rows:
        history, expected = _row_io(row, fmt)
        out = str(model.chat(system_turn + history, temperature=temperature,
                             max_new_tokens=max_new_tokens))
        # the last user turn is the "question" passed to a judge / shown in output
        question = next((m["content"] for m in reversed(history)
                         if m["role"] == "user"), "")
        # Clamp so a misbehaving custom scorer cannot push the mean outside [0, 1].
        s = max(0.0, min(1.0, float(scorer(out, expected, question))))
        scores.append(s)
        examples.append({"input": question, "output": out, "expected": expected, "score": s})

    score = sum(scores) / len(scores)
    if verbose:
        print(f"[eval] {metric_name} · {score:.3f} over {len(scores)} rows", flush=True)
    return EvalResult(metric=metric_name, score=score, scores=scores,
                      examples=examples, n=len(scores))
def test_custom_callable_scorer():
    """A callable metric is used as the scorer and reported under its `__name__`."""
    r = evaluate(Stub("xx"), QA, metric=lambda out, exp, q: len(out) / 10, verbose=False)
    # `evaluate` labels a callable metric with its `__name__`; for a lambda that
    # is the literal string "<lambda>", never the empty string.
    assert r.metric == "<lambda>" and r.scores == [0.2, 0.2]
def test_chat_multiturn_keeps_context():
    """A multi-turn chat row is answered in its full context.

    Everything before the final assistant turn is sent to the model, and that
    final turn becomes the reference answer.
    """
    turns = [
        ("user", "pick a color"),
        ("assistant", "ok"),
        ("user", "now say it"),
        ("assistant", "blue"),
    ]
    conversation = {"messages": [{"role": role, "content": text} for role, text in turns]}
    model = Stub("blue")

    result = evaluate(model, Dataset.from_list([conversation]), metric="contains", verbose=False)

    assert result.scores == [1.0]
    # the model must have received the full prefix (3 turns), not just turn 1
    first_call = model.seen[0]
    assert [turn["role"] for turn in first_call] == ["user", "assistant", "user"]
    example = result.examples[0]
    assert example["input"] == "now say it"  # last user turn is the question
    assert example["expected"] == "blue"  # final assistant turn is the ref
def test_judge_score_parser_tolerant():
    """The judge-reply parser copes with decimals, ratios, ratings and junk."""
    assert _parse_judge_score("0.7") == 0.7
    assert abs(_parse_judge_score("7/10") - 0.7) < 1e-9
    assert _parse_judge_score("I'd rate this an 8") == 0.8  # x/10 rating
    assert _parse_judge_score("score: 1") == 1.0
    assert _parse_judge_score("nonsense") == 0.0
    # guard branches: zero denominator, over-range ratio, and replies with no number
    assert _parse_judge_score("3/0") == 0.0
    assert _parse_judge_score("12/10") == 1.0  # clamped to the unit interval
    assert _parse_judge_score("") == 0.0
    assert _parse_judge_score(None) == 0.0  # stringified to "None": no digits