From a4fe5f1f681ed1cdb5653fe6ced07c642b31eea6 Mon Sep 17 00:00:00 2001
From: rdheekonda <raja@dreadnode.io>
Date: Thu, 4 Jun 2026 00:31:21 +0000
Subject: [PATCH 1/3] fix(airt): normalize attacks arg in
 generate_category_attack

The `attacks` parameter was iterated character-by-character when passed
as a bare string, producing cryptic "Unknown attack: 't'" errors and
making the entire category-sweep path unusable.

- Add _normalize_attack_names() to accept a list, a comma-separated
  string, a single name, or a stringified list such as "['tap','goat']";
  mirrors generate_attack's attack_type.split(",") handling.
- Return a clear validation error when `attacks` is empty or normalizes
  to no names, instead of failing later on a single-character name.
- Widen the tool annotation to list[str] | str.
- Add TestNormalizeAttackNames regression coverage.
- Document the failure signature in error-troubleshooting skill.
- Add category-tool auto-fallback guidance to the agent instructions.
---
 .../agents/ai-red-teaming-agent.md            |  5 +-
 .../ai-red-teaming/scripts/attack_runner.py   | 48 ++++++++++++++++++-
 .../skills/error-troubleshooting/SKILL.md     | 16 +++++++
 .../tests/test_attack_runner.py               | 29 +++++++++++
 capabilities/ai-red-teaming/tools/attacks.py  |  5 +-
 5 files changed, 100 insertions(+), 3 deletions(-)
diff --git a/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md b/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md
index f8d096a..2bde639 100644
--- a/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md
+++ b/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md
@@ -85,7 +85,10 @@ Keep it to a single line; don't pad it.
 `get_assessment_status` returns summary metrics (ASR % = success rate / probability, status, notes). It does NOT include trial details, best scores, severity breakdowns, or scorer outputs. Report only what the platform returns — never interpret, never invent numbers. The headline metric is **ASR (the attack success probability, 0–100%)**; the severity-weighted /10 risk score is no longer surfaced to users. For deeper analysis, direct users to the platform web interface.
 
 **Category mode:**
-You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories.
+You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories. Pass `attacks` as a list (`["tap", "goat"]`) — a comma-separated string (`"tap,goat"`) also works.
+
+**Category-tool auto-fallback:**
+If `generate_category_attack` fails with an argument-parsing error (e.g. `Unknown attack: 't'` or a single-character attack name), do NOT keep retrying formats. Immediately fall back to running the category via per-goal `generate_attack` calls with `goal_category=<slug>`, and tell the user you did so and why (so the result's coverage is transparent). Then file the parsing failure as a capability note.
 
 **Direct tool calls:**
 If the user types a tool name directly (e.g. "validate_attack_results", "fix_workflow_errors"), call ONLY that tool. Do not chain additional analytics tools.
diff --git a/capabilities/ai-red-teaming/scripts/attack_runner.py b/capabilities/ai-red-teaming/scripts/attack_runner.py
index 5c5777e..9ba04a3 100644
--- a/capabilities/ai-red-teaming/scripts/attack_runner.py
+++ b/capabilities/ai-red-teaming/scripts/attack_runner.py
@@ -2612,6 +2612,39 @@ def _resolve_model(alias: str) -> str:
     return MODEL_ALIASES.get(key, alias.strip())
 
 
+def _normalize_attack_names(attacks_raw: t.Any) -> list[str]:
+    """Normalize the ``attacks`` argument into a clean list of attack names.
+
+    Tolerates the common calling shapes so callers don't trigger the
+    character-by-character iteration bug:
+      - list/tuple of names: ["tap", "goat"] -> ["tap", "goat"]
+      - comma-separated string: "tap,goat" -> ["tap", "goat"]
+      - single name: "tap" -> ["tap"]
+      - stray bracket/quote noise from stringified lists: "['tap','goat']"
+        -> ["tap", "goat"]
+
+    Empty / whitespace-only tokens are dropped.
+    """
+    if attacks_raw is None:
+        return []
+
+    if isinstance(attacks_raw, (list, tuple)):
+        items = list(attacks_raw)
+    elif isinstance(attacks_raw, str):
+        # Strip stray list/quote characters from stringified lists, then split.
+        cleaned = attacks_raw.strip().strip("[]")
+        items = cleaned.split(",")
+    else:
+        items = [attacks_raw]
+
+    names = []
+    for item in items:
+        token = str(item).strip().strip("'\"").strip()
+        if token:
+            names.append(token)
+    return names
+
+
 def _resolve_attack(alias: str) -> dict:
     """Resolve an attack alias to its definition."""
     key = alias.strip().lower().replace("-", "_").replace(" ", "_")
@@ -3741,6 +3774,19 @@ def generate_category_attack(params: dict) -> dict:
     if not categories and not goal_ids:
         return {"error": "categories or goal_ids is required"}
 
+    # Normalize attacks into a clean list of names.
+    # Accepts: list[str] (["tap", "goat"]), comma-separated string ("tap,goat"),
+    # or a single name ("tap"). This mirrors how generate_attack handles
+    # attack_type and prevents iterating a bare string character-by-character.
+    attack_names = _normalize_attack_names(attacks_raw)
+    if not attack_names:
+        return {
+            "error": (
+                "attacks must be one or more attack names, e.g. ['tap', 'goat'] "
+                "or 'tap,goat'. Got: {!r}".format(attacks_raw)
+            )
+        }
+
     # Resolve models
     resolved_target = _resolve_model(target_model)
     resolved_attacker = _resolve_model(attacker_model) if attacker_model else resolved_target
@@ -3749,7 +3795,7 @@ def generate_category_attack(params: dict) -> dict:
 
     # Resolve attacks
     try:
-        attacks_resolved = [_resolve_attack(a) for a in attacks_raw]
+        attacks_resolved = [_resolve_attack(a) for a in attack_names]
     except ValueError as e:
         return {"error": str(e)}
 
diff --git a/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md b/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md
index 009493e..3de81bc 100644
--- a/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md
+++ b/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md
@@ -45,6 +45,22 @@ Common errors and fixes for AIRT attack workflows.
 - **Cause**: Wrong parameter syntax
 - **Fix**: Use parentheses: `caesar(5)`, `adapt_language(Zulu)`, `vigenere(SECRET)`, `affine(5,8)`
 
+## Category Attack Errors
+
+### "Unknown attack: 't'" / "Unknown attack: '['" (single characters)
+- **Cause**: The `attacks` argument to `generate_category_attack` was iterated
+  character-by-character. This happened when a bare string was passed and the
+  runner looped over it directly (e.g. `"tap"` -> `'t'`, `'a'`, `'p'`).
+- **Fix**: The runner now normalizes `attacks` via `_normalize_attack_names`,
+  accepting a list (`["tap", "goat"]`), a comma-separated string
+  (`"tap,goat"`), or a single name (`"tap"`). If you still see single-character
+  attack errors, you are on an old build — update the capability.
+- **Workaround (older builds)**: Run the category via per-goal `generate_attack`
+  calls with `goal_category=<slug>` instead of `generate_category_attack`.
+- **Signature to recognize**: the error lists all valid attacks but complains
+  about a one-character name. That always means an iterable-splitting bug, not a
+  genuinely unknown attack.
+
 ## Scorer Errors
 
 ### "Scorer not found: <name>"
diff --git a/capabilities/ai-red-teaming/tests/test_attack_runner.py b/capabilities/ai-red-teaming/tests/test_attack_runner.py
index 46564e8..9656f76 100644
--- a/capabilities/ai-red-teaming/tests/test_attack_runner.py
+++ b/capabilities/ai-red-teaming/tests/test_attack_runner.py
@@ -101,6 +101,35 @@ def test_unknown_attack_returns_error(self) -> None:
         with pytest.raises((ValueError, KeyError)):
             runner._resolve_attack("nonexistent_attack")
 
+class TestNormalizeAttackNames:
+    """Regression: generate_category_attack must not iterate a bare string
+    character-by-character (which produced 'Unknown attack: t' errors)."""
+
+    @pytest.mark.parametrize(
+        "raw,expected",
+        [
+            ("tap", ["tap"]),
+            ("tap,goat", ["tap", "goat"]),
+            ("tap, goat , pair", ["tap", "goat", "pair"]),
+            (["tap"], ["tap"]),
+            (["tap", "goat"], ["tap", "goat"]),
+            ("tap_attack", ["tap_attack"]),
+            ("['tap']", ["tap"]),
+            ("['tap','goat']", ["tap", "goat"]),
+            ("", []),
+            (None, []),
+        ],
+    )
+    def test_normalize(self, raw, expected) -> None:
+        assert runner._normalize_attack_names(raw) == expected
+
+    def test_bare_string_does_not_split_to_chars(self) -> None:
+        # The exact bug: "tap" must NOT become ['t', 'a', 'p'].
+        result = runner._normalize_attack_names("tap")
+        assert result == ["tap"]
+        assert "t" not in result
+
+
 
 # =============================================================================
 # Transform resolution
diff --git a/capabilities/ai-red-teaming/tools/attacks.py b/capabilities/ai-red-teaming/tools/attacks.py
index f5fb096..4e5d8da 100644
--- a/capabilities/ai-red-teaming/tools/attacks.py
+++ b/capabilities/ai-red-teaming/tools/attacks.py
@@ -148,7 +148,10 @@ def generate_attack(
 
 @safe_tool
 def generate_category_attack(
-    attacks: t.Annotated[str, "Attack type(s), comma-separated"],
+    attacks: t.Annotated[
+        list[str] | str,
+        "Attack type(s): a list like ['tap', 'goat'] or a comma-separated string like 'tap,goat'",
+    ],
     target_model: t.Annotated[str, "Target LLM model"],
     categories: t.Annotated[
         list[str] | None,

From 849f98b9b3a7a241ba953b7faadac268612218e7 Mon Sep 17 00:00:00 2001
From: rdheekonda <raja@dreadnode.io>
Date: Thu, 4 Jun 2026 00:33:16 +0000
Subject: [PATCH 2/3] fix(airt): use object instead of t.Any in
 _normalize_attack_names

attack_runner.py does not import typing as t, so the t.Any annotation
tripped ruff F821 (undefined name) in CI. Use the builtin object
annotation, which needs no import.
---
 capabilities/ai-red-teaming/scripts/attack_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capabilities/ai-red-teaming/scripts/attack_runner.py b/capabilities/ai-red-teaming/scripts/attack_runner.py
index 9ba04a3..55a3204 100644
--- a/capabilities/ai-red-teaming/scripts/attack_runner.py
+++ b/capabilities/ai-red-teaming/scripts/attack_runner.py
@@ -2612,7 +2612,7 @@ def _resolve_model(alias: str) -> str:
     return MODEL_ALIASES.get(key, alias.strip())
 
 
-def _normalize_attack_names(attacks_raw: t.Any) -> list[str]:
+def _normalize_attack_names(attacks_raw: object) -> list[str]:
     """Normalize the ``attacks`` argument into a clean list of attack names.
 
     Tolerates the common calling shapes so callers don't trigger the

From 073885a4ccc308f86c7be39b3427a295c58b2747 Mon Sep 17 00:00:00 2001
From: rdheekonda <raja@dreadnode.io>
Date: Thu, 4 Jun 2026 00:33:53 +0000
Subject: [PATCH 3/3] chore(ai-red-teaming): bump version 1.3.6 -> 1.3.7

---
 capabilities/ai-red-teaming/capability.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capabilities/ai-red-teaming/capability.yaml b/capabilities/ai-red-teaming/capability.yaml
index 5dc855e..2c91ea6 100644
--- a/capabilities/ai-red-teaming/capability.yaml
+++ b/capabilities/ai-red-teaming/capability.yaml
@@ -1,6 +1,6 @@
 schema: 1
 name: ai-red-teaming
-version: "1.3.6"
+version: "1.3.7"
 description: >
   Probe the security and safety of AI applications, agents, and foundation models.
   Orchestrates adversarial attack workflows to discover vulnerabilities in LLMs,