From 640211051b88171ebf103b51227c35034b9c54d6 Mon Sep 17 00:00:00 2001 From: Raja Date: Wed, 3 Jun 2026 22:32:14 +0000 Subject: [PATCH] fix: flush OTEL spans between sequential studies and fix transform defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs fixed: 1. OTEL span loss in multi-study workflows: When running N+1 transform comparisons, multi-attack campaigns, or category sweeps, only the first study's traces were reliably exported to the platform. The BatchSpanProcessor buffers spans on a background thread — later studies' spans were still in the buffer when dn.shutdown() raced against process teardown. Added explicit force_flush() between sequential assessment.run() calls in all 3 multi-study templates. 2. Unwanted baseline run: When the user specified transforms (e.g. 'run crescendo with Telugu'), the agent was forced to set compare_transforms=true, creating an N+1 study with an unrequested baseline run. Changed default to compare_transforms=false — transforms are now applied directly. N+1 comparison triggers only when the user explicitly asks to 'compare' or 'benchmark' transforms. Bump version: 1.3.2 → 1.3.5 --- .../agents/ai-red-teaming-agent.md | 12 ++++++-- capabilities/ai-red-teaming/capability.yaml | 2 +- .../ai-red-teaming/scripts/attack_runner.py | 30 +++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md b/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md index b5ca529..c30e8b0 100644 --- a/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md +++ b/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md @@ -103,7 +103,7 @@ Complete requests that don't need clarification: 5. Never report failure without first running diagnostic tools AND asking for clarification. **Parameter handling:** -- When user specifies transforms (e.g. 
"using 3 transforms", "with base64, caesar, authority"), ALWAYS set `compare_transforms=true`. This produces N+1 runs (baseline + each transform). Set `compare_transforms=false` ONLY if the user explicitly says "bundle transforms" or "apply all together". +- When user specifies transforms (e.g. "with base64", "using Telugu"), set `compare_transforms=false` by default. The transforms are applied to the attack — no baseline run is added. Only set `compare_transforms=true` if the user explicitly asks to "compare transforms", "benchmark transforms", "compare against baseline", or uses similar comparison language. `compare_transforms=true` creates N+1 runs (1 baseline without transforms + 1 per transform individually). - "max trials N", "N trials", "max_trials N", or "iterations N" → set `n_iterations=N`. - Pass the user's model name as `target_model` verbatim. The tool resolves aliases internally. @@ -406,11 +406,17 @@ Use `generate_image_attack` when the user wants to attack a traditional ML model User: "run TAP on gpt-4o, goal: extract system prompt" → `generate_attack(attack_type="tap", goal="extract system prompt", target_model="gpt-4o")` -### Transform Comparison +### Transform (applied, no baseline) + +User: "run TAP with base64 and caesar transforms" +→ `generate_attack(attack_type="tap", ..., transforms=["base64","caesar"], compare_transforms=false)` +Transforms are applied to the attack. No baseline run. + +### Transform Comparison (explicit) User: "run TAP with base64, caesar, authority transforms, compare them" → `generate_attack(attack_type="tap", ..., transforms=["base64","caesar","authority"], compare_transforms=true)` -This generates N+1 runs: 1 baseline + 1 per transform. +This generates N+1 runs: 1 baseline + 1 per transform. Only when user asks to "compare". 
### Campaign (multiple attacks) diff --git a/capabilities/ai-red-teaming/capability.yaml b/capabilities/ai-red-teaming/capability.yaml index c35dd9a..5d9afb7 100644 --- a/capabilities/ai-red-teaming/capability.yaml +++ b/capabilities/ai-red-teaming/capability.yaml @@ -1,6 +1,6 @@ schema: 1 name: ai-red-teaming -version: "1.3.2" +version: "1.3.5" description: > Probe the security and safety of AI applications, agents, and foundation models. Orchestrates adversarial attack workflows to discover vulnerabilities in LLMs, diff --git a/capabilities/ai-red-teaming/scripts/attack_runner.py b/capabilities/ai-red-teaming/scripts/attack_runner.py index cd9cb73..3d414c3 100644 --- a/capabilities/ai-red-teaming/scripts/attack_runner.py +++ b/capabilities/ai-red-teaming/scripts/attack_runner.py @@ -3059,6 +3059,16 @@ async def main(): print(f"--- end {{label}} ---") sys.stdout.flush() + # Flush OTEL spans between studies so each study's traces + # are exported to the platform before starting the next one. + try: + from dreadnode.app.main import DEFAULT_INSTANCE + _provider = DEFAULT_INSTANCE._logfire._tracer_provider + if hasattr(_provider, 'force_flush'): + _provider.force_flush(timeout_millis=10_000) + except Exception: + pass + except Exception as e: print(f"\\nERROR in study '{{label}}': {{e}}") traceback.print_exc() @@ -3144,6 +3154,16 @@ async def main(): await assessment.run(_{var}_study) print(f"{canon} completed successfully") sys.stdout.flush() + + # Flush OTEL spans between attacks so each attack's traces + # are exported to the platform before starting the next one. 
+ try: + from dreadnode.app.main import DEFAULT_INSTANCE + _provider = DEFAULT_INSTANCE._logfire._tracer_provider + if hasattr(_provider, 'force_flush'): + _provider.force_flush(timeout_millis=10_000) + except Exception: + pass except Exception as e: print(f"\\nERROR in {canon}: {{e}}") traceback.print_exc() @@ -3420,6 +3440,16 @@ async def main(): print(f"completed") sys.stdout.flush() + # Flush OTEL spans between goals so each goal's traces + # are exported to the platform before starting the next one. + try: + from dreadnode.app.main import DEFAULT_INSTANCE + _provider = DEFAULT_INSTANCE._logfire._tracer_provider + if hasattr(_provider, 'force_flush'): + _provider.force_flush(timeout_millis=10_000) + except Exception: + pass + except Exception as e: print(f"ERROR: {{e}}") traceback.print_exc()