Skip to content

Commit 2b8bbc4

Browse files
ctawiah and cursoragent committed
feat: add manual judge evaluation (Judge, Evaluator, createJudge) (AIC-2665)
Implements the AIEVALS manual-only evaluation path: - Runner SPI and RunnerResult for caller-supplied model invocation - Judge: sampling decided before invocation, well-known input format, score/reasoning parsing with range validation, invocation tracked via trackMetricsOf (does not emit trackJudgeResult; caller's responsibility) - Evaluator: per-judge fault isolation and per-judge timeout, order-preserving results, noop() returns an empty list; sampling-rate normalization on Judge - LDAIClient.createJudge: fires only $ld:ai:usage:create-judge, resolves the judge config via the internal evaluate path, returns null when disabled or when no runner is supplied Automatic judge auto-attachment and provider runners are deferred past v1.0. README documents the manual-only flow and the auto-attach descope. Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 19d0f4f commit 2b8bbc4

10 files changed

Lines changed: 941 additions & 2 deletions

File tree

lib/sdk/server-ai/README.md

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,44 @@ The companion `agentConfig`/`agentConfigs` and `judgeConfig` methods retrieve ag
4343
configs respectively. Within a prompt message or agent instruction, the evaluation context is
4444
available as `{{ldctx}}` (for example `{{ldctx.key}}`).
4545

46-
Metric tracking and manual judge evaluation will be added as the SDK is built out (see epic
47-
AIC-2629).
46+
## Tracking AI runs
47+
48+
Every retrieved config exposes a tracker via `config.createTracker()`. Use it to record duration,
49+
time-to-first-token, success/error, token usage, tool calls, and feedback for an AI run. Trackers
50+
are thread-safe, and at-most-once metrics (duration, time-to-first-token, outcome, feedback, tokens)
51+
emit a single event even under concurrent calls. A run can be correlated across processes with
52+
`tracker.getResumptionToken()` and rebuilt later via `aiClient.createTracker(token, context)`.
53+
54+
## Evaluating responses with judges (manual)
55+
56+
A judge is an AI Config with `mode: judge` that scores another config's output against an evaluation
57+
metric.
58+
59+
In `v1.0`, evaluation is **manual only**. The SDK parses `judgeConfiguration` and exposes it on
60+
configs, but it does **not** automatically invoke judges on completion or agent calls. Sample-rate
61+
driven auto-attachment is deferred past `v1.0`. Because no provider-specific runners ship yet, you
62+
supply your own `Runner` that calls your model and returns structured `{score, reasoning}` output.
63+
64+
```java
65+
Runner runner = input -> {
66+
// Call your model with `input`, then return its score/reasoning as structured output.
67+
// metrics carries success/tokens/duration for the invocation.
68+
return RunnerResult.builder(Metrics.builder(true).build())
69+
.parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "grounded").build())
70+
.build();
71+
};
72+
73+
Judge judge = aiClient.createJudge("my-judge-key", context, null, variables, runner, 1.0);
74+
if (judge != null) {
75+
JudgeResult result = judge.evaluate(originalInput, modelOutput);
76+
// Recording the result is the caller's responsibility:
77+
completionTracker.trackJudgeResult(result);
78+
}
79+
```
80+
81+
`Evaluator` runs several judges over the same input/output with per-judge fault isolation and a
82+
per-judge timeout, returning one `JudgeResult` per judge in order. `Evaluator.noop()` returns an
83+
empty result list.
4884

4985
## Internal API convention
5086

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
package com.launchdarkly.sdk.server.ai;
2+
3+
import com.launchdarkly.logging.LDLogger;
4+
import com.launchdarkly.logging.Logs;
5+
import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
6+
7+
import java.time.Duration;
8+
import java.util.ArrayList;
9+
import java.util.Collections;
10+
import java.util.List;
11+
import java.util.Objects;
12+
import java.util.concurrent.ExecutionException;
13+
import java.util.concurrent.ExecutorService;
14+
import java.util.concurrent.Executors;
15+
import java.util.concurrent.Future;
16+
import java.util.concurrent.TimeUnit;
17+
import java.util.concurrent.TimeoutException;
18+
19+
/**
20+
* Runs a fixed set of {@link Judge}s against one input/output pair and collects their results.
21+
* <p>
22+
* Each judge runs with <strong>fault isolation</strong>: a judge that throws or times out yields a
23+
* failed {@link JudgeResult} for that judge while every other judge's result is preserved, in the
24+
* original order. Judges run concurrently and each is bounded by a per-judge timeout so a single
25+
* hung judge cannot stall the whole evaluation.
26+
* <p>
27+
* The evaluator does not record results; recording the returned {@link JudgeResult}s (for example
28+
* via a tracker) is the caller's responsibility. Instances are immutable and thread-safe.
29+
*/
30+
public final class Evaluator {
31+
/**
32+
* Default per-judge timeout used when one is not supplied.
33+
*/
34+
public static final Duration DEFAULT_PER_JUDGE_TIMEOUT = Duration.ofSeconds(30);
35+
36+
private final List<Judge> judges;
37+
private final Duration perJudgeTimeout;
38+
private final LDLogger logger;
39+
40+
/**
41+
* Creates an evaluator using the {@link #DEFAULT_PER_JUDGE_TIMEOUT default per-judge timeout}.
42+
*
43+
* @param judges the judges to run; must not be {@code null}
44+
* @param logger the logger; must not be {@code null}
45+
*/
46+
public Evaluator(List<Judge> judges, LDLogger logger) {
47+
this(judges, DEFAULT_PER_JUDGE_TIMEOUT, Objects.requireNonNull(logger, "logger"));
48+
}
49+
50+
/**
51+
* Creates an evaluator with an explicit per-judge timeout.
52+
*
53+
* @param judges the judges to run; must not be {@code null}
54+
* @param perJudgeTimeout the maximum time to wait for each judge; must not be {@code null}
55+
* @param logger the logger; must not be {@code null}
56+
*/
57+
public Evaluator(List<Judge> judges, Duration perJudgeTimeout, LDLogger logger) {
58+
this.judges = Collections.unmodifiableList(new ArrayList<>(Objects.requireNonNull(judges, "judges")));
59+
this.perJudgeTimeout = Objects.requireNonNull(perJudgeTimeout, "perJudgeTimeout");
60+
this.logger = Objects.requireNonNull(logger, "logger");
61+
}
62+
63+
/**
64+
* Returns an evaluator with no judges. Its {@link #evaluate} returns an empty list and logs
65+
* nothing.
66+
*
67+
* @return a no-op evaluator
68+
*/
69+
public static Evaluator noop() {
70+
return new Evaluator(
71+
Collections.emptyList(), DEFAULT_PER_JUDGE_TIMEOUT, LDLogger.withAdapter(Logs.none(), ""));
72+
}
73+
74+
/**
75+
* Runs every judge against the given input and output.
76+
*
77+
* @param input the input that was provided to the AI being evaluated
78+
* @param output the AI-generated response to score
79+
* @return one {@link JudgeResult} per judge, in the judges' order; empty when there are no judges
80+
*/
81+
public List<JudgeResult> evaluate(String input, String output) {
82+
if (judges.isEmpty()) {
83+
return Collections.emptyList();
84+
}
85+
86+
ExecutorService pool = Executors.newFixedThreadPool(judges.size());
87+
try {
88+
List<Future<JudgeResult>> futures = new ArrayList<>(judges.size());
89+
for (Judge judge : judges) {
90+
futures.add(pool.submit(() -> judge.evaluate(input, output)));
91+
}
92+
93+
List<JudgeResult> results = new ArrayList<>(judges.size());
94+
for (int i = 0; i < judges.size(); i++) {
95+
results.add(awaitResult(judges.get(i), futures.get(i)));
96+
}
97+
return results;
98+
} finally {
99+
pool.shutdownNow();
100+
}
101+
}
102+
103+
private JudgeResult awaitResult(Judge judge, Future<JudgeResult> future) {
104+
String key = judge.getAIConfig().getKey();
105+
try {
106+
return future.get(perJudgeTimeout.toMillis(), TimeUnit.MILLISECONDS);
107+
} catch (TimeoutException e) {
108+
future.cancel(true);
109+
logger.warn("Judge {} timed out after {} ms", key, perJudgeTimeout.toMillis());
110+
return failed(key, "Judge evaluation timed out");
111+
} catch (ExecutionException e) {
112+
Throwable cause = e.getCause() != null ? e.getCause() : e;
113+
logger.error("Judge {} failed: {}", key, cause.toString());
114+
return failed(key, cause.getMessage() != null ? cause.getMessage() : "Unknown error");
115+
} catch (InterruptedException e) {
116+
Thread.currentThread().interrupt();
117+
future.cancel(true);
118+
return failed(key, "Judge evaluation interrupted");
119+
}
120+
}
121+
122+
private static JudgeResult failed(String key, String message) {
123+
return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
124+
}
125+
}

0 commit comments

Comments
 (0)