diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIAgentConfig.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIAgentConfig.java index 5df6b067..b71625a1 100644 --- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIAgentConfig.java +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIAgentConfig.java @@ -30,8 +30,9 @@ public final class AIAgentConfig extends AIConfig { String instructions, JudgeConfiguration judgeConfiguration, Map tools, - Supplier trackerFactory) { - super(key, enabled, Mode.AGENT, model, provider, trackerFactory); + Supplier trackerFactory, + Evaluator evaluator) { + super(key, enabled, Mode.AGENT, model, provider, trackerFactory, evaluator); this.instructions = instructions; this.judgeConfiguration = judgeConfiguration; this.tools = tools == null ? null : Collections.unmodifiableMap(tools); diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AICompletionConfig.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AICompletionConfig.java index 0a15aca0..e13801b4 100644 --- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AICompletionConfig.java +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AICompletionConfig.java @@ -32,8 +32,9 @@ public final class AICompletionConfig extends AIConfig { List messages, JudgeConfiguration judgeConfiguration, Map tools, - Supplier trackerFactory) { - super(key, enabled, Mode.COMPLETION, model, provider, trackerFactory); + Supplier trackerFactory, + Evaluator evaluator) { + super(key, enabled, Mode.COMPLETION, model, provider, trackerFactory, evaluator); this.messages = messages == null ? null : Collections.unmodifiableList(messages); this.judgeConfiguration = judgeConfiguration; this.tools = tools == null ? 
null : Collections.unmodifiableMap(tools); diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIConfig.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIConfig.java index 22820f08..f39c264b 100644 --- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIConfig.java +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIConfig.java @@ -24,6 +24,7 @@ public abstract class AIConfig { private final Model model; private final Provider provider; private final Supplier trackerFactory; + private final Evaluator evaluator; AIConfig( String key, @@ -31,13 +32,15 @@ public abstract class AIConfig { Mode mode, Model model, Provider provider, - Supplier trackerFactory) { + Supplier trackerFactory, + Evaluator evaluator) { this.key = key; this.enabled = enabled; this.mode = mode; this.model = model; this.provider = provider; this.trackerFactory = Objects.requireNonNull(trackerFactory, "trackerFactory"); + this.evaluator = Objects.requireNonNull(evaluator, "evaluator"); } /** @@ -102,4 +105,17 @@ public Provider getProvider() { public LDAIConfigTracker createTracker() { return trackerFactory.get(); } + + /** + * Returns the evaluator that coordinates judge execution for this configuration. + *

+ * For {@link AIJudgeConfig} this is always {@link Evaluator#noop()}. For + * {@link AICompletionConfig} and {@link AIAgentConfig} it is the evaluator supplied at + * construction time (also {@link Evaluator#noop()} unless a custom one is wired in). + * + * @return the evaluator, never {@code null} + */ + public Evaluator getEvaluator() { + return evaluator; + } } diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIJudgeConfig.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIJudgeConfig.java index 0c6245b1..2c905886 100644 --- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIJudgeConfig.java +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/AIJudgeConfig.java @@ -29,7 +29,7 @@ public final class AIJudgeConfig extends AIConfig { List messages, String evaluationMetricKey, Supplier trackerFactory) { - super(key, enabled, Mode.JUDGE, model, provider, trackerFactory); + super(key, enabled, Mode.JUDGE, model, provider, trackerFactory, Evaluator.noop()); this.messages = messages == null ? 
null : Collections.unmodifiableList(messages); this.evaluationMetricKey = evaluationMetricKey; } diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java new file mode 100644 index 00000000..287a7a36 --- /dev/null +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Evaluator.java @@ -0,0 +1,94 @@ +package com.launchdarkly.sdk.server.ai; + +import com.launchdarkly.logging.LDLogger; +import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.JudgeConfiguration; +import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.CompletableFuture; + +/** + * Coordinates evaluation of an AI Config output by running a set of {@link Judge} instances. + *

+ * An {@code Evaluator} is attached to an {@link AICompletionConfig} or {@link AIAgentConfig} and + * invoked by managed AI types (plan 4). In v1.0, the evaluator returned by the config retrieval + * methods is always a noop that returns an empty list immediately. + *

+ * Instances are immutable and thread-safe. + */ +public final class Evaluator { + private static final Evaluator NOOP = new Evaluator(); + + private final Map judges; + private final JudgeConfiguration judgeConfiguration; + private final LDLogger logger; + private final boolean isNoop; + + private Evaluator() { + this.judges = Collections.emptyMap(); + this.judgeConfiguration = null; + this.logger = null; + this.isNoop = true; + } + + /** + * Constructs an evaluator with the given judges and configuration. + * + * @param judges a map from judge config key to {@link Judge} instance; must not be {@code null} + * @param judgeConfiguration the judge configuration listing which judges to run and their sampling + * rates; must not be {@code null} + * @param logger the logger; must not be {@code null} + */ + public Evaluator(Map judges, JudgeConfiguration judgeConfiguration, LDLogger logger) { + this.judges = Collections.unmodifiableMap(new HashMap<>(Objects.requireNonNull(judges, "judges"))); + this.judgeConfiguration = Objects.requireNonNull(judgeConfiguration, "judgeConfiguration"); + this.logger = Objects.requireNonNull(logger, "logger"); + this.isNoop = false; + } + + /** + * Returns the shared noop evaluator, which immediately returns an empty result list without + * logging any warnings. + * + * @return the noop singleton, never {@code null} + */ + public static Evaluator noop() { + return NOOP; + } + + /** + * Runs all configured judges against the given input/output pair and returns their results. + *

+ * When this is the noop evaluator, returns a completed future holding an empty list immediately. + * Otherwise, judges are run sequentially in the order specified by the {@link JudgeConfiguration}. + * Judges referenced in the configuration but absent from the judges map are skipped with a + * warning; this is not an error. + *

+ * This method does NOT call {@code trackJudgeResult} — that is the caller's responsibility. + * + * @param input the message history or prompt that was sent to the model + * @param output the model's response to evaluate + * @return a completed future holding the list of judge results; never {@code null} + */ + public CompletableFuture> evaluate(String input, String output) { + if (isNoop) { + return CompletableFuture.completedFuture(Collections.emptyList()); + } + + List results = new ArrayList<>(); + for (JudgeConfiguration.Judge entry : judgeConfiguration.getJudges()) { + Judge judge = judges.get(entry.getKey()); + if (judge == null) { + logger.warn("Evaluator: no judge found for key '{}', skipping", entry.getKey()); + continue; + } + results.add(judge.evaluate(input, output, entry.getSamplingRate())); + } + return CompletableFuture.completedFuture(results); + } +} diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java new file mode 100644 index 00000000..91e2855c --- /dev/null +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Judge.java @@ -0,0 +1,210 @@ +package com.launchdarkly.sdk.server.ai; + +import com.launchdarkly.logging.LDLogger; +import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message; +import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ThreadLocalRandom; +import java.util.stream.Collectors; + +/** + * Evaluates an AI model output against a judge prompt, returning a scored {@link JudgeResult}. + *

+ * A {@code Judge} wraps an {@link AIJudgeConfig} and a {@link Runner}. Each call to + * {@link #evaluate} or {@link #evaluateMessages} invokes the runner with a formatted evaluation + * prompt and parses the structured {@code {score, reasoning}} response. Evaluation can be sampled + * to reduce cost: pass a {@code samplingRate} of {@code 0.0} to always skip, or {@code 1.0} to + * always run. + *

+ * Instances are immutable and thread-safe. + */ +public final class Judge { + /** + * JSON-Schema fragment sent to the runner as the {@code outputType}, requesting structured + * {@code {score, reasoning}} output. + */ + private static final Map EVALUATION_SCHEMA; + static { + Map scoreSchema = new HashMap<>(); + scoreSchema.put("type", "number"); + + Map reasoningSchema = new HashMap<>(); + reasoningSchema.put("type", "string"); + + Map properties = new HashMap<>(); + properties.put("score", Collections.unmodifiableMap(scoreSchema)); + properties.put("reasoning", Collections.unmodifiableMap(reasoningSchema)); + + Map schema = new HashMap<>(); + schema.put("type", "object"); + schema.put("properties", Collections.unmodifiableMap(properties)); + schema.put("required", Collections.singletonList("score")); + + EVALUATION_SCHEMA = Collections.unmodifiableMap(schema); + } + + private final AIJudgeConfig config; + private final Runner runner; + private final LDLogger logger; + + /** + * Constructs a judge. + * + * @param config the judge AI Config; must not be {@code null} + * @param runner the runner to invoke; must not be {@code null} + * @param logger the logger; must not be {@code null} + */ + public Judge(AIJudgeConfig config, Runner runner, LDLogger logger) { + this.config = Objects.requireNonNull(config, "config"); + this.runner = Objects.requireNonNull(runner, "runner"); + this.logger = Objects.requireNonNull(logger, "logger"); + } + + /** + * Evaluates the given input/output pair, always running (sampling rate {@code 1.0}). + * + * @param input the message history or prompt that was sent to the model + * @param output the model's response to evaluate + * @return the evaluation result; never {@code null} + */ + public JudgeResult evaluate(String input, String output) { + return evaluate(input, output, 1.0); + } + + /** + * Evaluates the given input/output pair, subject to the given sampling rate. 
+ * + * @param input the message history or prompt that was sent to the model + * @param output the model's response to evaluate + * @param samplingRate the fraction of evaluations to actually run; {@code 0.0} always skips, + * {@code 1.0} always runs + * @return the evaluation result; never {@code null} + */ + public JudgeResult evaluate(String input, String output, double samplingRate) { + if (ThreadLocalRandom.current().nextDouble() >= samplingRate) { + return JudgeResult.builder() + .sampled(false) + .success(false) + .judgeConfigKey(config.getKey()) + .metricKey(config.getEvaluationMetricKey()) + .build(); + } + + String formatted = "MESSAGE HISTORY:\n" + input + "\n\nRESPONSE TO EVALUATE:\n" + output; + LDAIConfigTracker tracker = config.createTracker(); + + RunnerResult result; + try { + result = tracker.trackMetricsOf(RunnerResult::getMetrics, () -> runner.run(formatted, EVALUATION_SCHEMA)); + } catch (Exception ex) { + return JudgeResult.builder() + .sampled(true) + .success(false) + .judgeConfigKey(config.getKey()) + .metricKey(config.getEvaluationMetricKey()) + .errorMessage(ex.getMessage()) + .build(); + } + + Map parsed = result.getParsed(); + if (parsed == null) { + logger.warn("Judge {}: runner returned null parsed output", config.getKey()); + return JudgeResult.builder() + .sampled(true) + .success(false) + .judgeConfigKey(config.getKey()) + .metricKey(config.getEvaluationMetricKey()) + .build(); + } + + Object scoreRaw = parsed.get("score"); + if (!(scoreRaw instanceof Number)) { + logger.warn("Judge {}: parsed output missing numeric score", config.getKey()); + return JudgeResult.builder() + .sampled(true) + .success(false) + .judgeConfigKey(config.getKey()) + .metricKey(config.getEvaluationMetricKey()) + .build(); + } + double score = ((Number) scoreRaw).doubleValue(); + if (!Double.isFinite(score) || score < 0.0 || score > 1.0) { + logger.warn("Judge {}: score {} is outside [0.0, 1.0]", config.getKey(), score); + return JudgeResult.builder() + 
.sampled(true) + .success(false) + .judgeConfigKey(config.getKey()) + .metricKey(config.getEvaluationMetricKey()) + .build(); + } + + JudgeResult.Builder resultBuilder = JudgeResult.builder() + .sampled(true) + .success(true) + .judgeConfigKey(config.getKey()) + .metricKey(config.getEvaluationMetricKey()) + .score(score); + + Object reasoningRaw = parsed.get("reasoning"); + if (reasoningRaw instanceof String) { + resultBuilder.reasoning((String) reasoningRaw); + } else if (reasoningRaw != null) { + logger.warn("Judge {}: reasoning is not a string, ignoring", config.getKey()); + } + + return resultBuilder.build(); + } + + /** + * Evaluates a message list and runner response, always running (sampling rate {@code 1.0}). + *

+ * Messages are formatted as {@code role: content} lines, joined by newlines. + * + * @param messages the messages that were sent to the model + * @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated + * @return the evaluation result; never {@code null} + */ + public JudgeResult evaluateMessages(List messages, RunnerResult response) { + return evaluateMessages(messages, response, 1.0); + } + + /** + * Evaluates a message list and runner response, subject to the given sampling rate. + *

+ * Messages are formatted as {@code role: content} lines, joined by newlines. + * + * @param messages the messages that were sent to the model + * @param response the runner result whose {@link RunnerResult#getContent() content} is evaluated + * @param samplingRate the fraction of evaluations to actually run + * @return the evaluation result; never {@code null} + */ + public JudgeResult evaluateMessages(List messages, RunnerResult response, double samplingRate) { + String formattedMessages = messages == null ? "" : messages.stream() + .map(m -> m.getRole().getWireValue() + ": " + m.getContent()) + .collect(Collectors.joining("\n")); + return evaluate(formattedMessages, response == null ? "" : response.getContent(), samplingRate); + } + + /** + * Returns the judge AI Config this instance was constructed with. + * + * @return the judge config, never {@code null} + */ + public AIJudgeConfig getConfig() { + return config; + } + + /** + * Returns the runner this instance was constructed with. 
+ * + * @return the runner, never {@code null} + */ + public Runner getRunner() { + return runner; + } +} diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java index 8bf81e71..dd81608a 100644 --- a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/LDAIClientImpl.java @@ -199,7 +199,8 @@ private AIConfig buildConfig( interpolate(parsed.getInstructions(), variables, context), parsed.getJudgeConfiguration(), parsed.getTools(), - factory); + factory, + Evaluator.noop()); case JUDGE: return new AIJudgeConfig( key, @@ -219,7 +220,8 @@ private AIConfig buildConfig( interpolateMessages(parsed.getMessages(), variables, context), parsed.getJudgeConfiguration(), parsed.getTools(), - factory); + factory, + Evaluator.noop()); } } @@ -247,7 +249,8 @@ private AIConfig buildConfigFromDefault( interpolate(agent.getInstructions(), variables, context), agent.getJudgeConfiguration(), agent.getTools(), - factory); + factory, + Evaluator.noop()); } case JUDGE: { AIJudgeConfigDefault judge = (AIJudgeConfigDefault) defaultValue; @@ -271,7 +274,8 @@ private AIConfig buildConfigFromDefault( interpolateMessages(completion.getMessages(), variables, context), completion.getJudgeConfiguration(), completion.getTools(), - factory); + factory, + Evaluator.noop()); } } } diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java new file mode 100644 index 00000000..e4ac6650 --- /dev/null +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/Runner.java @@ -0,0 +1,38 @@ +package com.launchdarkly.sdk.server.ai; + +import java.util.Map; + +/** + * Executes an AI operation and returns a {@link RunnerResult}. + *

+ * Implement this interface to wrap a model provider SDK so it can be used by a {@link Judge} or + * managed AI type. The SDK passes an optional {@code outputType} schema when it needs structured + * output (for example, when a judge requests a {@code {score, reasoning}} object). + *

+ * Implementations should be thread-safe; a single instance may be called from multiple threads. + */ +public interface Runner { + /** + * Runs the AI operation with the given input and optional output schema. + * + * @param input the prompt or message history to send to the model; never {@code null} + * @param outputType a JSON-Schema-like map describing the expected structured output, or + * {@code null} if no structured output is required + * @return the result of the operation; never {@code null} + * @throws Exception if the underlying provider call fails + */ + RunnerResult run(String input, Map outputType) throws Exception; + + /** + * Runs the AI operation with the given input and no output-type constraint. + *

+ * Delegates to {@link #run(String, Map)} with a {@code null} {@code outputType}. + * + * @param input the prompt or message history to send to the model; never {@code null} + * @return the result of the operation; never {@code null} + * @throws Exception if the underlying provider call fails + */ + default RunnerResult run(String input) throws Exception { + return run(input, null); + } +} diff --git a/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java new file mode 100644 index 00000000..af34dd9b --- /dev/null +++ b/lib/sdk/server-ai/src/main/java/com/launchdarkly/sdk/server/ai/RunnerResult.java @@ -0,0 +1,120 @@ +package com.launchdarkly.sdk.server.ai; + +import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.AIMetrics; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * The result of a single {@link Runner} invocation. + *

+ * Instances are immutable. Build them with {@link #builder(String, AIMetrics)}. + */ +public final class RunnerResult { + private final String content; + private final AIMetrics metrics; + private final Object raw; + private final Map parsed; + + private RunnerResult(Builder b) { + this.content = b.content; + this.metrics = b.metrics; + this.raw = b.raw; + this.parsed = b.parsed == null ? null : Collections.unmodifiableMap(new HashMap<>(b.parsed)); + } + + /** + * Returns the text content of the model response. + * + * @return the response text, or {@code null} if none was produced + */ + public String getContent() { + return content; + } + + /** + * Returns the metrics captured during this invocation. + * + * @return the metrics, never {@code null} + */ + public AIMetrics getMetrics() { + return metrics; + } + + /** + * Returns the unmodified provider response object, useful for provider-specific inspection. + * + * @return the raw response, or {@code null} if not set + */ + public Object getRaw() { + return raw; + } + + /** + * Returns the structured output parsed from the model response, when the runner was invoked with + * an {@code outputType} schema. + * + * @return an unmodifiable map of the structured output, or {@code null} if not set + */ + public Map getParsed() { + return parsed; + } + + /** + * Creates a builder for a {@link RunnerResult}. + * + * @param content the text content of the model response; may be {@code null} + * @param metrics the metrics for this invocation; must not be {@code null} + * @return a new {@link Builder} + */ + public static Builder builder(String content, AIMetrics metrics) { + return new Builder(content, metrics); + } + + /** + * Builder for {@link RunnerResult}. 
+ */ + public static final class Builder { + private final String content; + private final AIMetrics metrics; + private Object raw; + private Map parsed; + + private Builder(String content, AIMetrics metrics) { + this.content = content; + this.metrics = metrics; + } + + /** + * Sets the unmodified provider response. + * + * @param raw the raw response object; may be {@code null} + * @return this builder + */ + public Builder raw(Object raw) { + this.raw = raw; + return this; + } + + /** + * Sets the structured output parsed from the model response. + * + * @param parsed the structured output map; may be {@code null} + * @return this builder + */ + public Builder parsed(Map parsed) { + this.parsed = parsed; + return this; + } + + /** + * Builds the immutable {@link RunnerResult}. + * + * @return a new {@link RunnerResult} + */ + public RunnerResult build() { + return new RunnerResult(this); + } + } +} diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java new file mode 100644 index 00000000..ede4d93f --- /dev/null +++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/EvaluatorTest.java @@ -0,0 +1,176 @@ +package com.launchdarkly.sdk.server.ai; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.notNullValue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import com.launchdarkly.logging.LDLogger; +import com.launchdarkly.logging.Logs; +import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.JudgeConfiguration; +import 
com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.AIMetrics; +import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.function.Function; + +import org.junit.Test; + +@SuppressWarnings("javadoc") +public class EvaluatorTest { + private static final LDLogger LOGGER = LDLogger.withAdapter(Logs.toConsole(), "test"); + private static final AIMetrics METRICS = AIMetrics.builder().success(true).build(); + + // ---- helpers ---------------------------------------------------------------- + + /** + * Builds a real Judge with the given key/metric, backed by a mocked Runner and tracker. + * The runner returns a parsed response with the given score. + */ + private Judge judgeWithScore(String key, String metricKey, double score) throws Exception { + Runner runner = mock(Runner.class); + LDAIConfigTracker tracker = mock(LDAIConfigTracker.class); + when(tracker.trackMetricsOf(any(Function.class), any(Callable.class))) + .thenAnswer(inv -> { + Callable op = inv.getArgument(1); + return op.call(); + }); + + Map parsed = new HashMap<>(); + parsed.put("score", score); + parsed.put("reasoning", "test reasoning"); + RunnerResult result = RunnerResult.builder("content", METRICS).parsed(parsed).build(); + when(runner.run(any(), any())).thenReturn(result); + + AIJudgeConfig config = new AIJudgeConfig(key, true, null, null, null, metricKey, () -> tracker); + return new Judge(config, runner, LOGGER); + } + + // ---- noop ------------------------------------------------------------------- + + @Test + public void noopReturnsEmptyList() throws Exception { + List results = Evaluator.noop().evaluate("input", "output").get(); + assertThat(results, empty()); + } + + @Test + public void noopReturnsSameInstance() { + assertThat(Evaluator.noop(), is(Evaluator.noop())); + } + + @Test 
+ public void noopFutureIsAlreadyDone() { + assertThat(Evaluator.noop().evaluate("input", "output").isDone(), is(true)); + } + + // ---- single judge ----------------------------------------------------------- + + @Test + public void singleJudgeIsRun() throws Exception { + Judge judge = judgeWithScore("j1", "metric.1", 0.9); + Map judges = Collections.singletonMap("j1", judge); + JudgeConfiguration config = new JudgeConfiguration( + Collections.singletonList(new JudgeConfiguration.Judge("j1", 1.0))); + + Evaluator evaluator = new Evaluator(judges, config, LOGGER); + List results = evaluator.evaluate("input", "output").get(); + + assertThat(results, hasSize(1)); + assertThat(results.get(0).isSuccess(), is(true)); + assertThat(results.get(0).getScore(), is(0.9)); + } + + // ---- multiple judges run sequentially --------------------------------------- + + @Test + public void multipleJudgesAreAllRun() throws Exception { + Judge j1 = judgeWithScore("j1", "m1", 0.8); + Judge j2 = judgeWithScore("j2", "m2", 0.6); + Map judgesMap = new HashMap<>(); + judgesMap.put("j1", j1); + judgesMap.put("j2", j2); + JudgeConfiguration config = new JudgeConfiguration(Arrays.asList( + new JudgeConfiguration.Judge("j1", 1.0), + new JudgeConfiguration.Judge("j2", 1.0))); + + Evaluator evaluator = new Evaluator(judgesMap, config, LOGGER); + List results = evaluator.evaluate("input", "output").get(); + + assertThat(results, hasSize(2)); + assertThat(results.get(0).getScore(), is(0.8)); + assertThat(results.get(1).getScore(), is(0.6)); + } + + // ---- missing judge is skipped with a warning -------------------------------- + + @Test + public void missingJudgeIsSkipped() throws Exception { + Judge j1 = judgeWithScore("j1", "m1", 0.7); + Map judgesMap = Collections.singletonMap("j1", j1); + JudgeConfiguration config = new JudgeConfiguration(Arrays.asList( + new JudgeConfiguration.Judge("j1", 1.0), + new JudgeConfiguration.Judge("missing-judge", 1.0))); + + Evaluator evaluator = new 
Evaluator(judgesMap, config, LOGGER); + List results = evaluator.evaluate("input", "output").get(); + + assertThat(results, hasSize(1)); + assertThat(results.get(0).getJudgeConfigKey(), is("j1")); + } + + // ---- evaluator does NOT call trackJudgeResult -------------------------------- + + @Test + public void evaluatorDoesNotCallTrackJudgeResult() throws Exception { + LDAIConfigTracker outerTracker = mock(LDAIConfigTracker.class); + + Runner runner = mock(Runner.class); + LDAIConfigTracker innerTracker = mock(LDAIConfigTracker.class); + when(innerTracker.trackMetricsOf(any(Function.class), any(Callable.class))) + .thenAnswer(inv -> { + Callable op = inv.getArgument(1); + return op.call(); + }); + Map parsed = new HashMap<>(); + parsed.put("score", 0.5); + when(runner.run(any(), any())) + .thenReturn(RunnerResult.builder("content", METRICS).parsed(parsed).build()); + + AIJudgeConfig judgeConfig = new AIJudgeConfig("jk", true, null, null, null, "mk", () -> innerTracker); + Judge judge = new Judge(judgeConfig, runner, LOGGER); + + Map judgesMap = Collections.singletonMap("jk", judge); + JudgeConfiguration config = new JudgeConfiguration( + Collections.singletonList(new JudgeConfiguration.Judge("jk", 1.0))); + + Evaluator evaluator = new Evaluator(judgesMap, config, LOGGER); + evaluator.evaluate("input", "output").get(); + + verify(outerTracker, never()).trackJudgeResult(any()); + } + + // ---- returned future is already complete ------------------------------------ + + @Test + public void returnedFutureIsAlreadyDone() throws Exception { + Judge judge = judgeWithScore("j1", "m1", 0.5); + Map judgesMap = Collections.singletonMap("j1", judge); + JudgeConfiguration config = new JudgeConfiguration( + Collections.singletonList(new JudgeConfiguration.Judge("j1", 1.0))); + + Evaluator evaluator = new Evaluator(judgesMap, config, LOGGER); + assertThat(evaluator.evaluate("input", "output").isDone(), is(true)); + } +} diff --git 
a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
new file mode 100644
index 00000000..b607059a
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/JudgeTest.java
@@ -0,0 +1,253 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.nullValue;
+import static org.hamcrest.Matchers.notNullValue;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import com.launchdarkly.logging.LDLogger;
+import com.launchdarkly.logging.Logs;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAIConfigTypes.Message.Role;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.AIMetrics;
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.JudgeResult;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.concurrent.Callable;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link Judge}, run against a mocked {@link Runner} and a mocked
+ * {@link LDAIConfigTracker}.
+ *
+ * <p>Covers score parsing and range checks, failure handling, sampling, message-based
+ * evaluation, tracker delegation, and the accessors.
+ */
+@SuppressWarnings("javadoc")
+public class JudgeTest {
+  private Runner runner;
+  private LDAIConfigTracker tracker;
+  private AIJudgeConfig judgeConfig;
+  private Judge judge;
+  private static final LDLogger LOGGER = LDLogger.withAdapter(Logs.toConsole(), "test");
+  private static final AIMetrics METRICS = AIMetrics.builder().success(true).build();
+
+  // Builds a fresh Judge for every test, backed by mocks that each test stubs as needed.
+  @Before
+  public void setUp() throws Exception {
+    runner = mock(Runner.class);
+    tracker = mock(LDAIConfigTracker.class);
+    // By default trackMetricsOf delegates to the callable
+    when(tracker.trackMetricsOf(any(Function.class), any(Callable.class)))
+        .thenAnswer(inv -> {
+          Callable op = inv.getArgument(1);
+          return op.call();
+        });
+    judgeConfig = makeJudgeConfig("judge-key", "my.metric", tracker);
+    judge = new Judge(judgeConfig, runner, LOGGER);
+  }
+
+  // Creates a judge config for the given keys whose tracker supplier always returns configTracker.
+  private AIJudgeConfig makeJudgeConfig(String key, String metricKey, LDAIConfigTracker configTracker) {
+    return new AIJudgeConfig(key, true, null, null, null, metricKey, () -> configTracker);
+  }
+
+  // Wraps the given parsed map in a RunnerResult with fixed content and metrics.
+  private RunnerResult resultWithParsed(Map parsed) {
+    return RunnerResult.builder("output", METRICS).parsed(parsed).build();
+  }
+
+  // ---- successful evaluation --------------------------------------------------
+
+  @Test
+  public void successfulEvaluationReturnsCorrectScore() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.85);
+    parsed.put("reasoning", "Looks good");
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(true));
+    assertThat(result.getScore(), is(0.85));
+    assertThat(result.getReasoning(), is("Looks good"));
+    assertThat(result.getJudgeConfigKey(), is("judge-key"));
+    assertThat(result.getMetricKey(), is("my.metric"));
+  }
+
+  @Test
+  public void scoreBoundaryZeroIsValid() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.0);
+    parsed.put("reasoning", "Terrible");
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSuccess(), is(true));
+    assertThat(result.getScore(), is(0.0));
+  }
+
+  @Test
+  public void scoreBoundaryOneIsValid() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 1.0);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSuccess(), is(true));
+    assertThat(result.getScore(), is(1.0));
+  }
+
+  @Test
+  public void reasoningIsOptional() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.5);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSuccess(), is(true));
+    assertThat(result.getReasoning(), nullValue());
+  }
+
+  // ---- error paths ------------------------------------------------------------
+
+  @Test
+  public void runnerExceptionResultsInFailure() throws Exception {
+    when(runner.run(any(), any())).thenThrow(new RuntimeException("boom"));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+    assertThat(result.getErrorMessage(), is("boom"));
+  }
+
+  @Test
+  public void nullParsedResultsInFailure() throws Exception {
+    when(runner.run(any(), any())).thenReturn(RunnerResult.builder("content", METRICS).build());
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSampled(), is(true));
+    assertThat(result.isSuccess(), is(false));
+  }
+
+  @Test
+  public void missingScoreResultsInFailure() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("reasoning", "No score here");
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSuccess(), is(false));
+  }
+
+  @Test
+  public void scoreAboveOneResultsInFailure() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 1.5);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSuccess(), is(false));
+  }
+
+  @Test
+  public void scoreBelowZeroResultsInFailure() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", -0.1);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output");
+    assertThat(result.isSuccess(), is(false));
+  }
+
+  // ---- sampling ---------------------------------------------------------------
+
+  @Test
+  public void zeroSamplingRateAlwaysSkips() throws Exception {
+    JudgeResult result = judge.evaluate("input", "output", 0.0);
+
+    assertThat(result.isSampled(), is(false));
+    assertThat(result.isSuccess(), is(false));
+    verify(runner, never()).run(any(), any());
+  }
+
+  @Test
+  public void fullSamplingRateAlwaysRuns() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.5);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluate("input", "output", 1.0);
+    assertThat(result.isSampled(), is(true));
+    verify(runner).run(any(), any());
+  }
+
+  // ---- evaluateMessages -------------------------------------------------------
+
+  // NOTE(review): despite the name, this only checks that the runner is invoked once; the
+  // arguments passed to the runner (i.e. the formatted messages) are not asserted.
+  @Test
+  public void evaluateMessagesFormatsCorrectly() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.9);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    List messages = Arrays.asList(
+        new Message(Role.USER, "Hello"),
+        new Message(Role.ASSISTANT, "Hi there"));
+    RunnerResult response = RunnerResult.builder("Hi there", METRICS).build();
+    JudgeResult result = judge.evaluateMessages(messages, response);
+
+    assertThat(result.isSuccess(), is(true));
+    verify(runner).run(any(), any());
+  }
+
+  @Test
+  public void evaluateMessagesWithNullMessagesDoesNotThrow() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.5);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    JudgeResult result = judge.evaluateMessages(null, RunnerResult.builder("content", METRICS).build());
+    assertThat(result, notNullValue());
+  }
+
+  // ---- tracker delegation -----------------------------------------------------
+
+  @Test
+  public void trackerMetricsOfIsInvoked() throws Exception {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.7);
+    when(runner.run(any(), any())).thenReturn(resultWithParsed(parsed));
+
+    judge.evaluate("input", "output");
+
+    verify(tracker).trackMetricsOf(any(Function.class), any(Callable.class));
+  }
+
+  // ---- accessors --------------------------------------------------------------
+
+  @Test
+  public void getConfigReturnsConfig() {
+    assertThat(judge.getConfig(), is(judgeConfig));
+  }
+
+  @Test
+  public void getRunnerReturnsRunner() {
+    assertThat(judge.getRunner(), is(runner));
+  }
+}
diff --git a/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/RunnerResultTest.java b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/RunnerResultTest.java
new file mode 100644
index 00000000..3b84e270
--- /dev/null
+++ b/lib/sdk/server-ai/src/test/java/com/launchdarkly/sdk/server/ai/RunnerResultTest.java
@@ -0,0 +1,87 @@
+package com.launchdarkly.sdk.server.ai;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.nullValue;
+import static org.hamcrest.Matchers.notNullValue;
+
+import com.launchdarkly.sdk.server.ai.datamodel.LDAITrackingTypes.AIMetrics;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for the {@link RunnerResult} builder: stored values, null defaults, and
+ * isolation and immutability of the parsed map.
+ */
+@SuppressWarnings("javadoc")
+public class RunnerResultTest {
+  private static final AIMetrics METRICS = AIMetrics.builder().success(true).build();
+
+  @Test
+  public void builderSetsContent() {
+    RunnerResult result = RunnerResult.builder("hello", METRICS).build();
+    assertThat(result.getContent(), is("hello"));
+  }
+
+  @Test
+  public void builderSetsMetrics() {
+    RunnerResult result = RunnerResult.builder(null, METRICS).build();
+    assertThat(result.getMetrics(), is(METRICS));
+  }
+
+  @Test
+  public void rawIsNullByDefault() {
+    RunnerResult result = RunnerResult.builder("content", METRICS).build();
+    assertThat(result.getRaw(), nullValue());
+  }
+
+  @Test
+  public void parsedIsNullByDefault() {
+    RunnerResult result = RunnerResult.builder("content", METRICS).build();
+    assertThat(result.getParsed(), nullValue());
+  }
+
+  @Test
+  public void builderSetsRaw() {
+    Object raw = new Object();
+    RunnerResult result = RunnerResult.builder("content", METRICS).raw(raw).build();
+    assertThat(result.getRaw(), is(raw));
+  }
+
+  @Test
+  public void builderSetsParsed() {
+    Map parsed = new HashMap<>();
+    parsed.put("score", 0.8);
+    RunnerResult result = RunnerResult.builder("content", METRICS).parsed(parsed).build();
+    assertThat(result.getParsed(), notNullValue());
+    assertThat(result.getParsed().get("score"), is(0.8));
+  }
+
+  @Test(expected = UnsupportedOperationException.class)
+  public void parsedMapIsImmutable() {
+    Map parsed = new HashMap<>();
+    parsed.put("key", "value");
+    RunnerResult result = RunnerResult.builder("content", METRICS).parsed(parsed).build();
+    // Writing through the returned map must throw; JUnit fails the test if it does not.
+    result.getParsed().put("extra", "should fail");
+  }
+
+  @Test
+  public void mutatingOriginalMapDoesNotAffectResult() {
+    Map parsed = new HashMap<>();
+    parsed.put("key", "original");
+    RunnerResult result = RunnerResult.builder("content", METRICS).parsed(parsed).build();
+    parsed.put("key", "mutated");
+    assertThat(result.getParsed().get("key"), is("original"));
+  }
+
+  @Test
+  public void contentCanBeNull() {
+    RunnerResult result = RunnerResult.builder(null, METRICS).build();
+    assertThat(result.getContent(), nullValue());
+  }
+}