Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

**A fine-tuning SDK. Any open model — with any method, on any hardware, for any harness.**

Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) · `slm♥`
Open source · built by [Lyzr Research Labs](https://lyzr.ai) · maintained by [Khush Patel](mailto:khush@lyzr.ai) & [Shreyas Kapale](mailto:shreyas@lyzr.ai) · `slm♥`

```bash
pip install shadowlm # batteries included — the full training stack
Expand Down
33 changes: 33 additions & 0 deletions examples/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Score a model on a task — quality, not training loss.

`finetune` tells you the loss went down; `evaluate` tells you whether the model
actually does the job. Point a loaded model at a dataset, pick a metric, get one
number plus a per-row breakdown.

python examples/evaluate.py # runs from any working directory
"""

from pathlib import Path

import shadowlm as slm

MODEL = "mlx-community/Qwen2.5-0.5B-Instruct-4bit"
# The sample dataset sits beside this script; anchoring on __file__ keeps the
# demo runnable from any working directory.
DATA = Path(__file__).resolve().with_name("sample_dataset.jsonl")

# Rows need a prompt column (instruction/question/...) and an answer column.
dataset = slm.Dataset.from_jsonl(DATA)
model = slm.load(MODEL)

# 1) contains-match: a row scores 1.0 when the expected answer appears in the output.
report = slm.evaluate(model, dataset, metric="contains")
print(report)  # EvalResult(metric='contains', score=..., n=...)
print("per-row:", report.sparkline())

# 2) eyeball the three rows the model did worst on.
for row in report.worst(3):
    prompt_head = row["input"][:50]
    reply_head = row["output"][:50]
    print(f" {row['score']:.1f} {prompt_head!r} → {reply_head!r}")

# 3) LLM-as-judge scoring (here the model judges itself; use a stronger judge for real).
judged_report = slm.evaluate(model, dataset, judge=model)
print("judge score:", round(judged_report.score, 3))
3 changes: 3 additions & 0 deletions shadowlm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .capture import CaptureProxy, capture
from .checkpoints import Checkpoint
from .data import Dataset
from .eval import EvalResult, evaluate
from .models import Model, Reply, load
from .rl import Trajectory, TrajectoryGroup, judge_group
from .training import Metric, TrainConfig, TrainingRun
Expand All @@ -29,6 +30,8 @@
__all__ = [
"APORun",
"optimize_prompt",
"evaluate",
"EvalResult",
"CaptureProxy",
"capture",
"Checkpoint",
Expand Down
46 changes: 41 additions & 5 deletions shadowlm/apo.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,14 @@ def _cols(row: dict) -> tuple[str, str | None]:
return p, a


def _norm(text: str) -> str:
"""Whitespace-collapsed, lowercased text for tolerant string matching."""
return " ".join(str(text).lower().split())


def _contains_score(output: str, expected: str) -> float:
    """Default scorer: 1.0 if the expected answer appears in the output.

    Both strings are normalized with `_norm` (lowercased, whitespace collapsed)
    before the substring test. An empty expected answer never matches, so rows
    without a reference score 0.0 instead of trivially passing.
    """
    o, e = _norm(output), _norm(expected)
    return 1.0 if e and e in o else 0.0


Expand Down Expand Up @@ -218,15 +222,47 @@ def _propose(optimizer, current, failures, k, temperature, max_new_tokens) -> li
return out


# The shared single-answer judge: a short rubric + a tolerant number parse, used
# by both APO and `evaluate` so they agree on what a good answer is. (The RL judge
# in rl.py is a *group-relative* ranker — it can't score a lone eval row — so eval
# reuses this scorer, not judge_group.)
_JUDGE_RUBRIC = (
"Reward correctness first, then helpfulness, then concision. "
"Penalize factual errors and ignored instructions."
)


def _parse_judge_score(raw: str) -> float:
"""Tolerantly pull a 0–1 score out of a judge's reply.

Small judges phrase scores many ways — a bare decimal ("0.7"), a ratio
("7/10"), or an integer rating ("8" → 0.8). Handle all three, then clamp.
"""
import re # noqa: PLC0415

s = str(raw)
m = re.search(r"(\d+(?:\.\d+)?)\s*/\s*(\d+(?:\.\d+)?)", s) # "7/10"
if m:
num, den = float(m.group(1)), float(m.group(2))
return max(0.0, min(1.0, num / den)) if den else 0.0
m = re.search(r"\d+\.\d+", s) # a decimal like "0.7"
if m:
return max(0.0, min(1.0, float(m.group())))
m = re.search(r"\d+", s) # a bare integer — assume an x/10 rating above 1
if m:
v = float(m.group())
return max(0.0, min(1.0, v if v <= 1 else v / 10.0))
return 0.0


def _judge_one(judge, question: str, output: str, expected: str) -> float:
    """Ask `judge` to grade one answer and return its score in [0, 1].

    judge: any object with a `.chat(messages, temperature=..., max_new_tokens=...)`
        method; its reply is coerced with `str`.
    question: the input the answer responds to.
    output: the answer being graded.
    expected: optional reference answer; the REFERENCE line is omitted when empty.

    The reply is parsed by `_parse_judge_score`, so ratio ("7/10") and
    integer-rating ("8") replies are scored too, not only bare decimals.
    """
    prompt = (
        "Score how well the ANSWER responds to the INPUT from 0.0 to 1.0.\n"
        f"{_JUDGE_RUBRIC}\n"
        f"INPUT: {question}\nANSWER: {output}\n"
        + (f"REFERENCE: {expected}\n" if expected else "")
        + 'Reply with ONLY a number like 0.7.'
    )
    # temperature 0 and a tiny token budget: only a short numeric reply is wanted.
    raw = str(judge.chat([{"role": "user", "content": prompt}],
                         temperature=0.0, max_new_tokens=8))
    return _parse_judge_score(raw)
57 changes: 57 additions & 0 deletions shadowlm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,63 @@ def export(
console.print(f"exported [slm]{format}[/slm] → [slm]{out}[/slm]")


# The `eval` command: check the cheap inputs, load the dataset, then load the
# model under test (and the judge, when given), score, and print a one-line
# summary plus a table of the lowest-scoring rows.
# NOTE: the function docstring is kept to a single line on purpose — Typer shows
# it as the command's help text.
@app.command(name="eval", rich_help_panel="Models")
def evaluate_cmd(
    target: Annotated[str, typer.Argument(help="model name, or an adapter directory")],
    dataset: Annotated[str, typer.Argument(
        help="dataset (.jsonl/.json/.csv/.parquet) with a prompt + answer column")],
    metric: Annotated[str, typer.Option(help="contains | exact | judge")] = "contains",
    judge: Annotated[Optional[str], typer.Option(
        help="judge model (HF id) — implies --metric judge")] = None,
    system: Annotated[Optional[str], typer.Option(help="system prompt for every query")] = None,
    sample: Annotated[Optional[int], typer.Option(help="evaluate only the first N rows")] = None,
    show: Annotated[int, typer.Option(help="how many worst examples to show")] = 5,
    model: Annotated[Optional[str], typer.Option("--model", "-m",
        help="base model override for adapter dirs")] = None,
    backend: Annotated[str, typer.Option(help="auto | mlx | torch")] = "auto",
    load_in_4bit: Annotated[bool, typer.Option("--load-in-4bit")] = False,
    max_new_tokens: Annotated[int, typer.Option("--max-new-tokens")] = 256,
    hf_token: Annotated[Optional[str], typer.Option("--hf-token", envvar="HF_TOKEN")] = None,
):
    """Score a model on a dataset — task quality, not training loss."""
    from .data import Dataset  # noqa: PLC0415
    from .eval import evaluate as _evaluate  # noqa: PLC0415
    from .models import load  # noqa: PLC0415

    # Validate everything cheap before loading a model — otherwise the user
    # waits for a full model download only to hit an argument or scorer error.
    if metric not in ("contains", "exact", "judge"):
        raise typer.BadParameter("--metric must be 'contains', 'exact', or 'judge'")
    if metric == "judge" and not judge:
        raise typer.BadParameter("--metric judge needs a judge model: --judge <hf-id>")
    if sample is not None and sample < 1:
        raise typer.BadParameter("--sample must be a positive integer")
    _maybe_set_token(hf_token)
    # The dataset goes first for the same reason: a bad path or malformed file
    # should fail before any model weights are fetched.
    data = Dataset.load(dataset)
    m = _resolve_target(target, model, backend, load_in_4bit)
    judge_model = load(judge, backend=backend) if judge else None

    result = _evaluate(m, data, metric=metric, judge=judge_model, system=system,
                       sample=sample, max_new_tokens=max_new_tokens, verbose=False)

    console.print(
        f"[slm]{result.metric}[/slm] score [ok]{result.score:.3f}[/ok] "
        f"over {result.n} rows {result.sparkline()}")
    worst = result.worst(show)
    # A perfect run has no failures worth listing, so the table is skipped.
    if worst and result.score < 1.0:
        table = Table(title="lowest-scoring examples", title_style="slm",
                      header_style="slm", border_style="muted")
        for col in ("score", "input", "expected", "output"):
            table.add_column(col, no_wrap=(col == "score"))
        for ex in worst:
            table.add_row(f"{ex['score']:.2f}", _trunc(ex["input"]),
                          _trunc(ex["expected"]), _trunc(ex["output"]))
        console.print(table)


def _trunc(text: str, n: int = 60) -> str:
text = " ".join(str(text).split())
return text if len(text) <= n else text[: n - 1] + "…"


# ---- runs / history ---------------------------------------------------------
@app.command(rich_help_panel="Runs")
def runs(
Expand Down
183 changes: 183 additions & 0 deletions shadowlm/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Evaluation — score a model on a task, not just its training loss.

`finetune` reports next-token *loss*; this reports task *quality*. Point a loaded
model at a dataset, pick a metric, and get one number plus a per-row breakdown:

res = slm.evaluate(model, "qa.jsonl") # contains-match
res = slm.evaluate(model, ds, metric="exact") # exact-match
res = slm.evaluate(model, ds, judge=judge) # LLM-as-judge
res = slm.evaluate(model, ds, metric=my_score_fn) # custom scorer
print(res.score, res.sparkline())

This is the front half of an "eval gate" — the same capture/judge primitives,
turned toward *measuring* a model instead of training one. Pure ShadowLM: it
only needs a loaded model's `.chat()`. The built-in scorers are reused from APO
(`apo._contains_score`, `apo._judge_one`) so eval and prompt-optimization agree
on what a good answer is.
"""

from __future__ import annotations

from dataclasses import dataclass, field

from .data import Dataset


def _exact_score(output: str, expected: str) -> float:
    """Exact-match scorer: 1.0 only when output and expected are equal.

    Both sides are normalized with `apo._norm` first, so case and whitespace
    differences are ignored. An empty expected answer always scores 0.0.
    """
    from .apo import _norm  # noqa: PLC0415

    want = _norm(expected)
    if not want:
        return 0.0
    got = _norm(output)
    return 1.0 if got == want else 0.0


@dataclass
class EvalResult:
    """The outcome of an `evaluate` run: an aggregate score plus per-row detail."""

    metric: str  # name of the scorer that produced the numbers
    score: float  # mean of `scores`
    scores: list[float] = field(default_factory=list)  # one entry per evaluated row
    examples: list[dict] = field(default_factory=list)  # [{input, output, expected, score}]
    n: int = 0  # number of rows evaluated

    def sparkline(self) -> str:
        """A tiny unicode bar of per-row scores — handy in a REPL or log line.

        Bars are drawn on the absolute 0–1 scale (scores outside it are
        clamped), not relative to this run's min/max. A uniformly perfect run
        therefore renders as full bars rather than looking identical to a
        uniformly failing one, and bars are comparable across runs. Returns ""
        when there are no scores.
        """
        if not self.scores:
            return ""
        bars = "▁▂▃▄▅▆▇█"
        top = len(bars) - 1
        return "".join(
            bars[min(top, int(max(0.0, min(1.0, s)) * top))] for s in self.scores)

    def worst(self, k: int = 5) -> list[dict]:
        """The k lowest-scoring examples (for eyeballing where the model fails)."""
        return sorted(self.examples, key=lambda e: e["score"])[:k]

    def to_dict(self) -> dict:
        """Return the result as a plain dict (the lists are shared, not copied)."""
        return {"metric": self.metric, "score": self.score, "n": self.n,
                "scores": self.scores, "examples": self.examples}

    def __repr__(self) -> str:
        # Deliberately compact: the per-row lists are left out of the repr.
        return f"EvalResult(metric={self.metric!r}, score={self.score:.4f}, n={self.n})"


def _row_io(row: dict, fmt: str) -> tuple[list[dict], str]:
    """Pull (history, expected_answer) out of a row, by dataset format.

    `history` is the conversation to feed the model — a full multi-turn prefix
    for chat rows, or a single user turn for QA/preference rows. `expected` may
    be "" when the dataset carries no reference answer (e.g. judge scoring on
    prompts alone). A null (`None`) prompt/answer value is treated as missing
    and becomes "", never the literal string "None".

    Raises ValueError when a non-chat, non-preference row has no recognizable
    prompt column.
    """
    from .data import CHAT, PREFERENCE  # noqa: PLC0415

    def _text(value) -> str:
        # JSON null must read as "no value", not as the four letters "None" —
        # otherwise a contains-match would reward any output mentioning "none".
        return "" if value is None else str(value)

    if fmt == CHAT or "messages" in row:
        # `or []` also covers an explicit `"messages": null`.
        msgs = [{"role": m.get("role", "user"), "content": m.get("content") or ""}
                for m in (row.get("messages") or [])]
        # Everything up to the final assistant turn is context; that turn is the
        # reference — so a multi-turn row is answered in its full conversation,
        # not scored as "answer the opening question".
        last_asst = next((i for i in range(len(msgs) - 1, -1, -1)
                          if msgs[i]["role"] == "assistant"), None)
        if last_asst is None:
            return (msgs or [{"role": "user", "content": ""}]), ""
        history = msgs[:last_asst] or [{"role": "user", "content": ""}]
        return history, msgs[last_asst]["content"]
    if fmt == PREFERENCE or ("chosen" in row and "prompt" in row):
        return [{"role": "user", "content": _text(row.get("prompt"))}], \
            _text(row.get("chosen"))
    # instruction / QA / raw dict — auto-detect the prompt & answer columns
    from .apo import _cols  # noqa: PLC0415

    pcol, acol = _cols(row)
    if not pcol:
        from .apo import _PROMPT_KEYS  # noqa: PLC0415

        raise ValueError(
            f"no prompt column found in row (looked for {_PROMPT_KEYS}); "
            "pass chat-format rows or a dataset with a prompt/question column")
    prompt = str(row[pcol])
    # alpaca-style extra context column, when distinct from the prompt
    if pcol != "input" and row.get("input"):
        prompt = f"{prompt}\n\n{row['input']}"
    return [{"role": "user", "content": prompt}], \
        (_text(row.get(acol)) if acol else "")


def _resolve_scorer(metric, judge):
"""Map the metric arg to a scorer `(output, expected, prompt) -> float`."""
if callable(metric):
return metric, getattr(metric, "__name__", "custom")
from .apo import _contains_score, _judge_one # noqa: PLC0415

if metric == "contains":
return (lambda out, exp, q: _contains_score(out, exp)), "contains"
if metric == "exact":
return (lambda out, exp, q: _exact_score(out, exp)), "exact"
if metric == "judge":
if judge is None:
raise ValueError("metric='judge' needs a judge model: evaluate(..., judge=model)")
return (lambda out, exp, q: _judge_one(judge, q, out, exp)), "judge"
raise ValueError(
f"unknown metric {metric!r} (expected 'contains', 'exact', 'judge', or a callable)")


def evaluate(
    model,
    data: Dataset | list[dict] | str,
    *,
    metric="contains",
    judge=None,
    system: str | None = None,
    sample: int | None = None,
    max_new_tokens: int = 256,
    temperature: float = 0.0,
    verbose: bool = True,
) -> EvalResult:
    """Score `model` on `data`, returning an `EvalResult`.

    model: a loaded shadowlm Model (answers each row via `.chat`).
    data: a Dataset, rows, or a path to a dataset file (jsonl/json/csv/parquet).
      The path may be a `str` or any `os.PathLike` such as `pathlib.Path`.
    metric: "contains" (default — expected answer appears in the output), "exact"
      (normalized equality), "judge" (LLM-as-judge, needs `judge=`), or a custom
      callable `(output, expected, prompt) -> float in [0, 1]`.
    judge: a Model that scores answers 0–1. Passing it defaults `metric` to "judge".
    system: optional system prompt prepended to every query.
    sample: evaluate only the first N rows (N must be at least 1).
    max_new_tokens: generation budget for each answer.
    temperature: generation temperature — 0.0 (default) for deterministic scoring.
    verbose: print a one-line summary when done.

    Raises ValueError when `sample` is below 1, when there are no rows to
    evaluate, or when the metric cannot be resolved.
    """
    import os  # noqa: PLC0415

    # Reject a bad `sample` before any work: a negative value would otherwise
    # silently slice rows off the *end* instead of taking the first N.
    if sample is not None and sample < 1:
        raise ValueError(f"sample must be a positive integer, got {sample!r}")
    if isinstance(data, (str, os.PathLike)):
        data = Dataset.load(os.fspath(data))
    fmt = data.format if isinstance(data, Dataset) else None
    rows = list(data.rows if isinstance(data, Dataset) else data)
    if sample is not None:
        rows = rows[:sample]
    if not rows:
        raise ValueError("evaluate needs at least one row")
    if judge is not None and metric == "contains":
        metric = "judge"  # passing a judge implies judge scoring
    if fmt is None:
        from .data import _detect_format  # noqa: PLC0415

        fmt = _detect_format(rows)
    scorer, metric_name = _resolve_scorer(metric, judge)

    scores: list[float] = []
    examples: list[dict] = []
    for r in rows:
        history, expected = _row_io(r, fmt)
        msgs = ([{"role": "system", "content": system}] if system else []) + history
        out = str(model.chat(msgs, temperature=temperature, max_new_tokens=max_new_tokens))
        # the last user turn is the "question" passed to a judge / shown in output
        question = next((m["content"] for m in reversed(history)
                         if m["role"] == "user"), "")
        # clamp so a custom scorer cannot push the mean outside [0, 1]
        s = max(0.0, min(1.0, float(scorer(out, expected, question))))
        scores.append(s)
        examples.append({"input": question, "output": out, "expected": expected, "score": s})

    score = sum(scores) / len(scores)
    if verbose:
        print(f"[eval] {metric_name} · {score:.3f} over {len(scores)} rows", flush=True)
    return EvalResult(metric=metric_name, score=score, scores=scores,
                      examples=examples, n=len(scores))
Loading