launchdarkly
diff --git a/‎lib/sdk/server-ai/README.md‎
Lines changed: 38 additions & 2 deletions b/‎lib/sdk/server-ai/README.md‎
Lines changed: 38 additions & 2 deletions
diff --git a/‎lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java‎
Lines changed: 125 additions & 0 deletions b/‎lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java‎
Lines changed: 125 additions & 0 deletions
@@ -43,8 +43,44 @@ The companion `agentConfig`/`agentConfigs` and `judgeConfig` methods retrieve ag
 configs respectively. Within a prompt message or agent instruction, the evaluation context is
 available as `{{ldctx}}` (for example `{{ldctx.key}}`).
 
-Metric tracking and manual judge evaluation will be added as the SDK is built out (see epic
-AIC-2629).
+## Tracking AI runs
+
+Every retrieved config exposes a tracker via `config.createTracker()`. Use it to record duration,
+time-to-first-token, success/error, token usage, tool calls, and feedback for an AI run. Trackers
+are thread-safe, and at-most-once metrics (duration, time-to-first-token, outcome, feedback, tokens)
+emit a single event even under concurrent calls. A run can be correlated across processes with
+`tracker.getResumptionToken()` and rebuilt later via `aiClient.createTracker(token, context)`.
+
+## Evaluating responses with judges (manual)
+
+A judge is an AI Config with `mode: judge` that scores another config's output against an evaluation
+metric.
+
+In `v1.0`, evaluation is **manual only**. The SDK parses `judgeConfiguration` and exposes it on
+configs, but it does **not** automatically invoke judges on completion or agent calls.
+Sample-rate-driven auto-attachment is deferred past `v1.0`. Because no provider-specific runners
+ship yet, you supply your own `Runner` that calls your model and returns structured `{score, reasoning}` output.
+
+```java
+Runner runner = input -> {
+    // Call your model with `input`, then return its score/reasoning as structured output.
+    // metrics carries success/tokens/duration for the invocation.
+    return RunnerResult.builder(Metrics.builder(true).build())
+        .parsed(LDValue.buildObject().put("score", 0.9).put("reasoning", "grounded").build())
+        .build();
+};
+
+Judge judge = aiClient.createJudge("my-judge-key", context, null, variables, runner, 1.0);
+if (judge != null) {
+    JudgeResult result = judge.evaluate(originalInput, modelOutput);
+    // Recording the result is the caller's responsibility:
+    completionTracker.trackJudgeResult(result);
+}
+```
+
+`Evaluator` runs several judges over the same input/output with per-judge fault isolation and a
+per-judge timeout, returning one `JudgeResult` per judge in order. `Evaluator.noop()` returns an
+empty result list.
 
 ## Internal API convention
 
 
@@ -0,0 +1,125 @@
+package com.launchdarkly.sdk.server.ai;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Runs a fixed set of {@link Judge}s against one input/output pair and collects their results.
+ * <p>
+ * Each judge runs with <strong>fault isolation</strong>: a judge that throws or times out yields a
+ * failed {@link JudgeResult} for that judge while every other judge's result is preserved, in the
+ * original order. Judges run concurrently and each is bounded by a per-judge timeout so a single
+ * hung judge cannot stall the whole evaluation.
+ * <p>
+ * The evaluator does not record results; recording the returned {@link JudgeResult}s (for example
+ * via a tracker) is the caller's responsibility. Instances are immutable and thread-safe.
+ */
+public final class Evaluator {
+  /**
+   * Default per-judge timeout used when one is not supplied.
+   */
+  public static final Duration DEFAULT_PER_JUDGE_TIMEOUT = Duration.ofSeconds(30);
+
+  private final List<Judge> judges;
+  private final Duration perJudgeTimeout;
+  private final LDLogger logger;
+
+  /**
+   * Creates an evaluator using the {@link #DEFAULT_PER_JUDGE_TIMEOUT default per-judge timeout}.
+   *
+   * @param judges the judges to run; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   */
+  public Evaluator(List<Judge> judges, LDLogger logger) {
+    this(judges, DEFAULT_PER_JUDGE_TIMEOUT, Objects.requireNonNull(logger, "logger"));
+  }
+
+  /**
+   * Creates an evaluator with an explicit per-judge timeout.
+   *
+   * @param judges the judges to run; must not be {@code null}
+   * @param perJudgeTimeout the maximum time to wait for each judge; must not be {@code null}
+   * @param logger the logger; must not be {@code null}
+   */
+  public Evaluator(List<Judge> judges, Duration perJudgeTimeout, LDLogger logger) {
+    this.judges = Collections.unmodifiableList(new ArrayList<>(Objects.requireNonNull(judges, "judges")));
+    this.perJudgeTimeout = Objects.requireNonNull(perJudgeTimeout, "perJudgeTimeout");
+    this.logger = Objects.requireNonNull(logger, "logger");
+  }
+
+  /**
+   * Returns an evaluator with no judges. Its {@link #evaluate} returns an empty list and logs
+   * nothing.
+   *
+   * @return a no-op evaluator
+   */
+  public static Evaluator noop() {
+    return new Evaluator(
+        Collections.emptyList(), DEFAULT_PER_JUDGE_TIMEOUT, LDLogger.withAdapter(Logs.none(), ""));
+  }
+
+  /**
+   * Runs every judge against the given input and output.
+   *
+   * @param input the input that was provided to the AI being evaluated
+   * @param output the AI-generated response to score
+   * @return one {@link JudgeResult} per judge, in the judges' order; empty when there are no judges
+   */
+  public List<JudgeResult> evaluate(String input, String output) {
+    if (judges.isEmpty()) {
+      return Collections.emptyList();
+    }
+
+    ExecutorService pool = Executors.newFixedThreadPool(judges.size());
+    try {
+      List<Future<JudgeResult>> futures = new ArrayList<>(judges.size());
+      for (Judge judge : judges) {
+        futures.add(pool.submit(() -> judge.evaluate(input, output)));
+      }
+
+      List<JudgeResult> results = new ArrayList<>(judges.size());
+      for (int i = 0; i < judges.size(); i++) {
+        results.add(awaitResult(judges.get(i), futures.get(i)));
+      }
+      return results;
+    } finally {
+      pool.shutdownNow();
+    }
+  }
+
+  private JudgeResult awaitResult(Judge judge, Future<JudgeResult> future) {
+    String key = judge.getAIConfig().getKey();
+    try {
+      return future.get(perJudgeTimeout.toMillis(), TimeUnit.MILLISECONDS);
+    } catch (TimeoutException e) {
+      future.cancel(true);
+      logger.warn("Judge {} timed out after {} ms", key, perJudgeTimeout.toMillis());
+      return failed(key, "Judge evaluation timed out");
+    } catch (ExecutionException e) {
+      Throwable cause = e.getCause() != null ? e.getCause() : e;
+      logger.error("Judge {} failed: {}", key, cause.toString());
+      return failed(key, cause.getMessage() != null ? cause.getMessage() : "Unknown error");
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      future.cancel(true);
+      return failed(key, "Judge evaluation interrupted");
+    }
+  }
+
+  private static JudgeResult failed(String key, String message) {
+    return JudgeResult.builder(true, false).judgeConfigKey(key).errorMessage(message).build();
+  }
+}