Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ Probe the security and safety of AI applications, agents, and foundation models.
- `"Run GOAT with MCP tool poisoning transforms"` — test MCP server security
- `"Run HopSkipJump against my image classifier at https://my-model.sagemaker.aws/predict"` — traditional ML adversarial attack

**What happens when you launch an attack (5 steps):**

1. **Plan** — register the assessment (target, goal, attack type) so progress is tracked.
2. **Generate** — build the attack workflow script for your goal + target.
3. **Run** — execute the workflow; the attacker model probes the target over N iterations.
4. **Score** — each attempt is judged and the success rate (ASR) is computed.
5. **Report** — validate results and show you the metrics.

*Metric: **ASR (attack success rate)** is the share of attack attempts that succeeded, from 0–100%. Higher = more vulnerable.*

---

Then wait for the user's request. Optional supporting skills (workflow-patterns,
Expand All @@ -56,6 +66,10 @@ YOU ARE A PARAMETER EXTRACTOR. Extract what the user wants and call the appropri

**Core sequence (applies to every attack flow):**

Before launching, print a short one-line plan so the user can follow along, e.g.:
`Plan → Generate → Run → Score → Report. Launching TAP on gpt-4o (goal: extract system prompt)…`
Keep it to a single line; don't pad it.

1. Pick the right generator for the target type:
- LLM with a specific goal → `generate_attack`
- LLM by harm category / sweep → `generate_category_attack`
Expand All @@ -68,7 +82,7 @@ YOU ARE A PARAMETER EXTRACTOR. Extract what the user wants and call the appropri
6. Call `save_session_context` so follow-up requests can reuse target / goal / configuration via `get_session_context`.

**Platform-data-only rule:**
`get_assessment_status` returns summary metrics (ASR %, risk score, status, notes). It does NOT include trial details, best scores, severity breakdowns, or scorer outputs. Report only what the platform returns — never interpret, never invent numbers, never explain what ASR/risk means. For deeper analysis, direct users to the platform web interface.
`get_assessment_status` returns summary metrics (ASR % = success rate / probability, status, notes). It does NOT include trial details, best scores, severity breakdowns, or scorer outputs. Report only what the platform returns — never interpret, never invent numbers. The headline metric is **ASR (the attack success probability, 0–100%)**; the severity-weighted /10 risk score is no longer surfaced to users. For deeper analysis, direct users to the platform web interface.

**Category mode:**
You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories.
Expand Down
2 changes: 1 addition & 1 deletion capabilities/ai-red-teaming/capability.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
schema: 1
name: ai-red-teaming
version: "1.3.5"
version: "1.3.6"
description: >
Probe the security and safety of AI applications, agents, and foundation models.
Orchestrates adversarial attack workflows to discover vulnerabilities in LLMs,
Expand Down
179 changes: 151 additions & 28 deletions capabilities/ai-red-teaming/scripts/attack_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,16 @@ def _resolve_platform_env() -> dict[str, str]:
"""
env = os.environ.copy()

# If platform env vars are already set (sandbox), use as-is
# If the runtime already provides platform credentials in any of the
# forms the SDK understands, pass the env through untouched -- the
# generated script self-configures via dn.configure(), whose precedence
# is: explicit args > env vars > saved profile.
# - DREADNODE_SERVER + DREADNODE_API_KEY (classic platform env)
# - DREADNODE_LLM_BASE + DREADNODE_LLM_API_KEY (runtime LLM proxy env)
if env.get("DREADNODE_SERVER") and env.get("DREADNODE_API_KEY"):
return env
if env.get("DREADNODE_LLM_BASE") and env.get("DREADNODE_LLM_API_KEY"):
return env

# Fall back to saved profile (TUI/CLI mode)
# Profile lives at ~/.dreadnode/config.yaml (YAML format)
Expand Down Expand Up @@ -2775,6 +2782,9 @@ def _build_imports(attacks: list[dict], transforms: list[dict], has_scorers: boo

lines.append("from dreadnode.airt.assessment import Assessment")
lines.append("from dreadnode.airt.analytics.types import GoalCategory")
# analyze() powers the local analytics JSON written at end of each run
# (consumed by inspect_results / validate_attack_results / get_analytics_summary).
lines.append("from dreadnode.airt.analytics import analyze")

if transforms:
module_names: dict[str, list[str]] = {}
Expand All @@ -2800,31 +2810,127 @@ def _build_configure() -> str:
"""
return """
# -- Connect SDK to platform --
# In sandbox: env vars are set by the platform (DREADNODE_SERVER, DREADNODE_API_KEY, etc.)
# In TUI/CLI: falls back to saved profile from ~/.cache/dreadnode/config.yaml
_server = os.environ.get("DREADNODE_SERVER")
_api_key = os.environ.get("DREADNODE_API_KEY")
_org = os.environ.get("DREADNODE_ORGANIZATION")
_ws = os.environ.get("DREADNODE_WORKSPACE")
_project = os.environ.get("DREADNODE_PROJECT")

if _server and _api_key:
# Explicit env vars (sandbox mode)
dn.configure(server=_server, api_key=_api_key, organization=_org, workspace=_ws, project=_project)
print(f"SDK configured (env): server={_server}")
else:
# Fall back to saved profile (TUI/CLI mode)
try:
dn.configure(organization=_org, workspace=_ws, project=_project)
print(f"SDK configured (profile): server={dn.server}")
except Exception as e:
print(f"FATAL: Could not configure SDK: {e}")
print(" Set DREADNODE_SERVER + DREADNODE_API_KEY env vars, or login via `dreadnode login`.")
sys.exit(1)
# Let the SDK resolve credentials itself. Per dn.configure()'s documented
# precedence, it reads: explicit args > environment variables > saved
# profile (~/.dreadnode/config.yaml). This works across sandbox AND TUI/CLI
# without the script having to know which env vars the runtime injects
# (DREADNODE_SERVER/_API_KEY, DREADNODE_LLM_*, or none at all).
#
# Only forward scope overrides (org/workspace/project) that are actually
# present in the environment; everything else is resolved by the SDK.
_scope = {
k: v
for k, v in (
("organization", os.environ.get("DREADNODE_ORGANIZATION")),
("workspace", os.environ.get("DREADNODE_WORKSPACE")),
("project", os.environ.get("DREADNODE_PROJECT")),
)
if v
}
try:
# configure() returns the configured SDK *instance*; read .server off it.
# NOTE: do NOT use `dn.server` -- the `dreadnode` module has no `server`
# attribute (it lives on the instance), and referencing it raises
# AttributeError, which previously surfaced as a misleading FATAL.
_dn = dn.configure(**_scope)
_resolved_server = (
getattr(_dn, "server", None)
or os.environ.get("DREADNODE_SERVER")
or "<saved profile>"
)
print(f"SDK configured: server={_resolved_server}")
except Exception as e:
print(f"FATAL: Could not configure SDK: {e}")
print(" Authenticate via `dreadnode login` (or set DREADNODE_SERVER + DREADNODE_API_KEY).")
sys.exit(1)
sys.stdout.flush()
"""


def _build_analytics_writer() -> str:
    """Build the local-analytics writer block.

    Returns source text that defines ``_write_local_analytics(assessment, ...)``
    in the generated script. That function runs the SDK's ``analyze()``
    pipeline over ``assessment.attack_results`` and writes a
    ``*_analytics.json`` file under ``~/.dreadnode/airt/<org>/<workspace>/``.
    This is the artifact consumed by ``inspect_results``,
    ``validate_attack_results`` and ``get_analytics_summary``.

    Metrics are computed by the SDK (real ASR / risk_score / severity) — the
    script never invents numbers. If there are no attack results (e.g. the
    study produced no finished trials) it writes nothing and says so.

    The generated function is best-effort: every failure (unreadable
    results, ``analyze()`` errors, an uncreatable output directory, a failed
    write) is reported on stdout and yields ``None`` instead of aborting a
    run that has already completed.

    The emitted code expects ``os``, ``Path`` and ``analyze`` to already be
    in scope in the generated script.
    """
    return """
import json as _json
from datetime import datetime, timezone

def _write_local_analytics(assessment, *, target_model=None, attacker_model=None, evaluator_model=None):
    \"\"\"Run the SDK analytics pipeline and persist a local *_analytics.json.

    Returns the output path, or None if there were no results to analyze
    or the file could not be produced.
    \"\"\"
    try:
        attack_results = list(getattr(assessment, "attack_results", []) or [])
    except Exception as _e:
        print(f" [analytics] could not read assessment.attack_results: {_e}")
        return None
    if not attack_results:
        print(" [analytics] no attack results to analyze (0 finished trials); "
              "skipping local analytics file. Platform metrics may still be available.")
        return None
    try:
        _analytics = analyze(
            attack_results,
            target_model=target_model,
            attacker_model=attacker_model,
            evaluator_model=evaluator_model,
        )
        _data = _analytics.to_dict()
    except Exception as _e:
        print(f" [analytics] analyze() failed: {_e}")
        return None

    # Resolve org/workspace the SAME way the results tools do, so the file
    # lands in the dir they scan: ~/.dreadnode/airt/<org>/<workspace>/.
    # Precedence: env vars > saved profile (UserConfig) > "default"/"main".
    _org = os.environ.get("DREADNODE_ORGANIZATION")
    _ws = os.environ.get("DREADNODE_WORKSPACE")
    if not (_org and _ws):
        try:
            from dreadnode.app.config import UserConfig
            _profile_data = UserConfig.read().active_profile
            if _profile_data:
                _, _profile = _profile_data
                _org = _org or _profile.organization
                _ws = _ws or _profile.workspace
        except Exception:
            # Profile lookup is optional; fall through to the defaults below.
            pass
    _org = _org or "default"
    _ws = _ws or "main"
    _out_dir = Path.home() / ".dreadnode" / "airt" / _org / _ws

    # One timestamp for both the filename and the envelope so they agree.
    _now = datetime.now(timezone.utc)
    _ts = _now.strftime("%Y%m%d_%H%M%S")
    _aid = getattr(assessment, "assessment_id", None) or "local"
    _envelope = {
        "assessment_id": str(_aid),
        "generated_at": _now.isoformat(),
        "target_model": target_model,
        "attacker_model": attacker_model,
        "evaluator_model": evaluator_model,
        "analytics": _data,
    }
    _path = _out_dir / f"{_aid}_{_ts}_analytics.json"
    try:
        # Directory creation is guarded together with the write: a read-only
        # or missing home directory must not crash a finished assessment.
        _out_dir.mkdir(parents=True, exist_ok=True)
        _path.write_text(_json.dumps(_envelope, indent=2, default=str), encoding="utf-8")
        print(f" [analytics] wrote local analytics: {_path}")
        return str(_path)
    except Exception as _e:
        print(f" [analytics] failed to write analytics file: {_e}")
        return None
"""


def _build_proxy_routing() -> str:
"""Build the LiteLLM proxy routing block.

Expand Down Expand Up @@ -3079,6 +3185,7 @@ async def main():
print("\\nFATAL: No studies completed successfully!")
sys.exit(1)

_write_local_analytics(assessment, target_model=TARGET_MODEL, attacker_model=ATTACKER_MODEL, evaluator_model=JUDGE_MODEL)
print(f"\\nAssessment complete. {{completed}}/{{len(STUDIES)}} studies succeeded.")
sys.stdout.flush()

Expand Down Expand Up @@ -3128,6 +3235,7 @@ async def main():
await assessment.fail(str(e))
sys.exit(1)

_write_local_analytics(assessment, target_model=TARGET_MODEL, attacker_model=ATTACKER_MODEL, evaluator_model=JUDGE_MODEL)
print(f"\\nAssessment complete.")
sys.stdout.flush()

Expand Down Expand Up @@ -3172,6 +3280,7 @@ async def main():

_CAMPAIGN_FOOTER = """\

_write_local_analytics(assessment, target_model=TARGET_MODEL, attacker_model=ATTACKER_MODEL, evaluator_model=JUDGE_MODEL)
print(f"\\nAssessment complete.")
sys.stdout.flush()

Expand All @@ -3196,6 +3305,7 @@ def _generate_transform_study(config: dict) -> str:

imports = _build_imports([atk], transforms, has_scorers)
configure = _build_configure()
analytics_writer = _build_analytics_writer()
cfg = _build_config_section(config)
proxy = _build_proxy_routing()
tgt = _build_target()
Expand Down Expand Up @@ -3233,7 +3343,7 @@ def _generate_transform_study(config: dict) -> str:
tag_alias=_tag_alias(canon),
)

return "\n".join([imports, configure, cfg, proxy, "", tgt, body])
return "\n".join([imports, configure, analytics_writer, cfg, proxy, "", tgt, body])


def _generate_single(config: dict) -> str:
Expand All @@ -3244,6 +3354,7 @@ def _generate_single(config: dict) -> str:

imports = _build_imports([atk], transforms, has_scorers)
configure = _build_configure()
analytics_writer = _build_analytics_writer()
cfg = _build_config_section(config)
proxy = _build_proxy_routing()
tgt = _build_target()
Expand All @@ -3269,7 +3380,7 @@ def _generate_single(config: dict) -> str:
transforms_applied=repr(transform_names),
)

return "\n".join([imports, configure, cfg, proxy, "", tgt, body])
return "\n".join([imports, configure, analytics_writer, cfg, proxy, "", tgt, body])


def _generate_campaign(config: dict) -> str:
Expand All @@ -3280,6 +3391,7 @@ def _generate_campaign(config: dict) -> str:

imports = _build_imports(attacks, transforms, has_scorers)
configure = _build_configure()
analytics_writer = _build_analytics_writer()
cfg = _build_config_section(config)
proxy = _build_proxy_routing()
tgt = _build_target()
Expand Down Expand Up @@ -3326,7 +3438,7 @@ async def main():
async with assessment.trace():
""".format(kwargs=assessment_kwargs)

parts = [imports, configure, cfg, proxy, "", tgt, campaign_header]
parts = [imports, configure, analytics_writer, cfg, proxy, "", tgt, campaign_header]
parts.extend(attack_blocks)
parts.append(_CAMPAIGN_FOOTER)

Expand Down Expand Up @@ -3460,6 +3572,7 @@ async def main():
print("\\nFATAL: No goals completed!")
sys.exit(1)

_write_local_analytics(assessment, target_model=TARGET_MODEL, attacker_model=ATTACKER_MODEL, evaluator_model=JUDGE_MODEL)
print(f"\\nAssessment complete. {{completed}} goals succeeded.")
sys.stdout.flush()

Expand Down Expand Up @@ -3490,6 +3603,7 @@ def _generate_category_attack(config: dict) -> str:

imports = _build_imports(attacks, transforms, has_scorers)
configure = _build_configure()
analytics_writer = _build_analytics_writer()
proxy = _build_proxy_routing()

# Config section — no GOAL constant since goals are embedded below
Expand Down Expand Up @@ -3599,7 +3713,7 @@ def _generate_category_attack(config: dict) -> str:
transforms_applied=transforms_applied,
)

return "\n".join([imports, configure, cfg, proxy, "", tgt, body])
return "\n".join([imports, configure, analytics_writer, cfg, proxy, "", tgt, body])


def generate_category_attack(params: dict) -> dict:
Expand Down Expand Up @@ -4064,6 +4178,9 @@ def _build_agentic_imports(attacks: list[dict], transforms: list[dict], has_scor

lines.append("from dreadnode.airt.assessment import Assessment")
lines.append("from dreadnode.airt.analytics.types import GoalCategory")
# analyze() powers the local analytics JSON written at end of each run
# (consumed by inspect_results / validate_attack_results / get_analytics_summary).
lines.append("from dreadnode.airt.analytics import analyze")

if transforms:
module_names: dict[str, list[str]] = {}
Expand Down Expand Up @@ -4141,6 +4258,7 @@ async def main():
await assessment.fail(str(e))
sys.exit(1)

_write_local_analytics(assessment, target_model=TARGET_MODEL, attacker_model=ATTACKER_MODEL, evaluator_model=JUDGE_MODEL)
print(f"\\nAssessment complete.")
sys.stdout.flush()

Expand All @@ -4163,6 +4281,7 @@ def _generate_agentic_single(config: dict, agent_config: dict) -> str:

imports = _build_agentic_imports([atk], transforms, has_scorers, agent_config)
configure = _build_configure()
analytics_writer = _build_analytics_writer()
cfg = _build_config_section(config)
proxy = _build_proxy_routing()
tgt = _build_agent_target_code(agent_config)
Expand Down Expand Up @@ -4190,7 +4309,7 @@ def _generate_agentic_single(config: dict, agent_config: dict) -> str:
agent_url=_safe_str(agent_config["agent_url"]),
)

parts = [imports, configure, cfg, proxy]
parts = [imports, configure, analytics_writer, cfg, proxy]
if scorers_code:
parts.append(scorers_code)
parts.extend(["", tgt, body])
Expand Down Expand Up @@ -4624,6 +4743,7 @@ async def main():
await assessment.fail(str(e))
sys.exit(1)

_write_local_analytics(assessment)
print(f"\\nAssessment complete.")
sys.stdout.flush()

Expand Down Expand Up @@ -4682,6 +4802,7 @@ def generate_image_attack(params: dict) -> dict:
# Build script
imports = _build_image_imports(attack_func)
configure = _build_configure()
analytics_writer = _build_analytics_writer()

# Config section
config_lines = [
Expand Down Expand Up @@ -4762,7 +4883,7 @@ def generate_image_attack(params: dict) -> dict:
attack_params=attack_params_str,
)

script = "\n".join([imports, configure, config_section, "", target_code, body])
script = "\n".join([imports, configure, analytics_writer, config_section, "", target_code, body])

# Syntax check
try:
Expand Down Expand Up @@ -4892,6 +5013,7 @@ def generate_tabular_attack(params: dict) -> dict:

imports = _build_tabular_imports(attack_func)
configure = _build_configure()
analytics_writer = _build_analytics_writer()

script = '''{imports}

Expand Down Expand Up @@ -5046,6 +5168,7 @@ async def main():
await assessment.fail(str(e))
raise

_write_local_analytics(assessment)
print(f"\\nAssessment complete.")
sys.stdout.flush()

Expand Down
Loading
Loading