Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ Keep it to a single line; don't pad it.
`get_assessment_status` returns summary metrics (ASR % = success rate / probability, status, notes). It does NOT include trial details, best scores, severity breakdowns, or scorer outputs. Report only what the platform returns — never interpret, never invent numbers. The headline metric is **ASR (the attack success probability, 0–100%)**; the severity-weighted /10 risk score is no longer surfaced to users. For deeper analysis, direct users to the platform web interface.

**Category mode:**
You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories.
You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories. Pass `attacks` as a list (`["tap", "goat"]`) — a comma-separated string (`"tap,goat"`) also works.

**Category-tool auto-fallback:**
If `generate_category_attack` fails with an argument-parsing error (e.g. `Unknown attack: 't'`, or any other error that names a single-character attack), do NOT keep retrying formats. Immediately fall back to running the category via per-goal `generate_attack` calls with `goal_category=<slug>`, and tell the user that you did so and why (so the result's coverage is transparent). Then file the parsing failure as a capability note.

**Direct tool calls:**
If the user types a tool name directly (e.g. "validate_attack_results", "fix_workflow_errors"), call ONLY that tool. Do not chain additional analytics tools.
Expand Down
2 changes: 1 addition & 1 deletion capabilities/ai-red-teaming/capability.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
schema: 1
name: ai-red-teaming
version: "1.3.6"
version: "1.3.7"
description: >
Probe the security and safety of AI applications, agents, and foundation models.
Orchestrates adversarial attack workflows to discover vulnerabilities in LLMs,
Expand Down
48 changes: 47 additions & 1 deletion capabilities/ai-red-teaming/scripts/attack_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2612,6 +2612,39 @@ def _resolve_model(alias: str) -> str:
return MODEL_ALIASES.get(key, alias.strip())


def _normalize_attack_names(attacks_raw: object) -> list[str]:
"""Normalize the ``attacks`` argument into a clean list of attack names.

Tolerates the common calling shapes so callers don't trigger the
character-by-character iteration bug:
- list/tuple of names: ["tap", "goat"] -> ["tap", "goat"]
- comma-separated string: "tap,goat" -> ["tap", "goat"]
- single name: "tap" -> ["tap"]
- stray bracket/quote noise from stringified lists: "['tap','goat']"
-> ["tap", "goat"]

Empty / whitespace-only tokens are dropped.
"""
if attacks_raw is None:
return []

if isinstance(attacks_raw, (list, tuple)):
items = list(attacks_raw)
elif isinstance(attacks_raw, str):
# Strip stray list/quote characters from stringified lists, then split.
cleaned = attacks_raw.strip().strip("[]")
items = cleaned.split(",")
else:
items = [attacks_raw]

names = []
for item in items:
token = str(item).strip().strip("'\"").strip()
if token:
names.append(token)
return names


def _resolve_attack(alias: str) -> dict:
"""Resolve an attack alias to its definition."""
key = alias.strip().lower().replace("-", "_").replace(" ", "_")
Expand Down Expand Up @@ -3741,6 +3774,19 @@ def generate_category_attack(params: dict) -> dict:
if not categories and not goal_ids:
return {"error": "categories or goal_ids is required"}

# Normalize attacks into a clean list of names.
# Accepts: list[str] (["tap", "goat"]), comma-separated string ("tap,goat"),
# or a single name ("tap"). This mirrors how generate_attack handles
# attack_type and prevents iterating a bare string character-by-character.
attack_names = _normalize_attack_names(attacks_raw)
if not attack_names:
return {
"error": (
"attacks must be one or more attack names, e.g. ['tap', 'goat'] "
"or 'tap,goat'. Got: {!r}".format(attacks_raw)
)
}

# Resolve models
resolved_target = _resolve_model(target_model)
resolved_attacker = _resolve_model(attacker_model) if attacker_model else resolved_target
Expand All @@ -3749,7 +3795,7 @@ def generate_category_attack(params: dict) -> dict:

# Resolve attacks
try:
attacks_resolved = [_resolve_attack(a) for a in attacks_raw]
attacks_resolved = [_resolve_attack(a) for a in attack_names]
except ValueError as e:
return {"error": str(e)}

Expand Down
16 changes: 16 additions & 0 deletions capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,22 @@ Common errors and fixes for AIRT attack workflows.
- **Cause**: Wrong parameter syntax
- **Fix**: Use parentheses: `caesar(5)`, `adapt_language(Zulu)`, `vigenere(SECRET)`, `affine(5,8)`

## Category Attack Errors

### "Unknown attack: 't'" / "Unknown attack: '['" (single characters)
- **Cause**: The `attacks` argument to `generate_category_attack` was iterated
character-by-character. This happened when a bare string was passed and the
runner looped over it directly (e.g. `"tap"` -> `'t'`, `'a'`, `'p'`). A
stringified list such as `"['tap']"` fails the same way, starting with `'['`.
- **Fix**: The runner now normalizes `attacks` via `_normalize_attack_names`,
accepting a list (`["tap", "goat"]`), a comma-separated string
(`"tap,goat"`), or a single name (`"tap"`). If you still see single-character
attack errors, you are on an old build — update the capability to version
1.3.7 or later.
- **Workaround (older builds)**: Run the category via per-goal `generate_attack`
calls with `goal_category=<slug>` instead of `generate_category_attack`.
- **Signature to recognize**: the error lists all valid attacks but complains
about a one-character name. That always means an iterable-splitting bug, not a
genuinely unknown attack.

## Scorer Errors

### "Scorer not found: <name>"
Expand Down
29 changes: 29 additions & 0 deletions capabilities/ai-red-teaming/tests/test_attack_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,35 @@ def test_unknown_attack_returns_error(self) -> None:
with pytest.raises((ValueError, KeyError)):
runner._resolve_attack("nonexistent_attack")

class TestNormalizeAttackNames:
    """Regression: generate_category_attack must not iterate a bare string
    character-by-character (which produced 'Unknown attack: t' errors)."""

    @pytest.mark.parametrize(
        "raw,expected",
        [
            # Bare and comma-separated strings.
            ("tap", ["tap"]),
            ("tap,goat", ["tap", "goat"]),
            ("tap, goat , pair", ["tap", "goat", "pair"]),
            ("tap,,goat", ["tap", "goat"]),
            ("tap_attack", ["tap_attack"]),
            # Real sequences.
            (["tap"], ["tap"]),
            (["tap", "goat"], ["tap", "goat"]),
            (("tap", "goat"), ["tap", "goat"]),
            # Stringified lists, Python-repr and JSON style quoting.
            ("['tap']", ["tap"]),
            ("['tap','goat']", ["tap", "goat"]),
            ('["tap", "goat"]', ["tap", "goat"]),
            # Nothing usable.
            ("", []),
            ("   ", []),
            (None, []),
        ],
    )
    def test_normalize(self, raw, expected) -> None:
        """Each supported calling shape normalizes to the expected names."""
        assert runner._normalize_attack_names(raw) == expected

    def test_bare_string_does_not_split_to_chars(self) -> None:
        """A single bare name stays one name rather than one per character."""
        # The exact bug: "tap" must NOT become ['t', 'a', 'p'].
        result = runner._normalize_attack_names("tap")
        assert result == ["tap"]
        assert "t" not in result



# =============================================================================
# Transform resolution
Expand Down
5 changes: 4 additions & 1 deletion capabilities/ai-red-teaming/tools/attacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,10 @@ def generate_attack(

@safe_tool
def generate_category_attack(
attacks: t.Annotated[str, "Attack type(s), comma-separated"],
attacks: t.Annotated[
list[str] | str,
"Attack type(s): a list like ['tap', 'goat'] or a comma-separated string like 'tap,goat'",
],
target_model: t.Annotated[str, "Target LLM model"],
categories: t.Annotated[
list[str] | None,
Expand Down
Loading