From a4fe5f1f681ed1cdb5653fe6ced07c642b31eea6 Mon Sep 17 00:00:00 2001 From: rdheekonda Date: Thu, 4 Jun 2026 00:31:21 +0000 Subject: [PATCH 1/3] fix(airt): normalize attacks arg in generate_category_attack The `attacks` parameter was iterated character-by-character when passed as a bare string, producing cryptic "Unknown attack: 't'" errors and making the entire category-sweep path unusable. - Add _normalize_attack_names() to accept list, comma-separated string, single name, or stringified-list noise; mirrors generate_attack's attack_type.split(",") handling. - Return a clear validation error instead of a single-character failure. - Widen the tool annotation to list[str] | str. - Add TestNormalizeAttackNames regression coverage. - Document the failure signature in error-troubleshooting skill. - Add category-tool auto-fallback guidance to the agent instructions. --- .../agents/ai-red-teaming-agent.md | 5 +- .../ai-red-teaming/scripts/attack_runner.py | 48 ++++++++++++++++++- .../skills/error-troubleshooting/SKILL.md | 16 +++++++ .../tests/test_attack_runner.py | 29 +++++++++++ capabilities/ai-red-teaming/tools/attacks.py | 5 +- 5 files changed, 100 insertions(+), 3 deletions(-) diff --git a/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md b/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md index f8d096a..2bde639 100644 --- a/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md +++ b/capabilities/ai-red-teaming/agents/ai-red-teaming-agent.md @@ -85,7 +85,10 @@ Keep it to a single line; don't pad it. `get_assessment_status` returns summary metrics (ASR % = success rate / probability, status, notes). It does NOT include trial details, best scores, severity breakdowns, or scorer outputs. Report only what the platform returns — never interpret, never invent numbers. The headline metric is **ASR (the attack success probability, 0–100%)**; the severity-weighted /10 risk score is no longer surfaced to users. For deeper analysis, direct users to the platform web interface. **Category mode:** -You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories. +You NEVER see goal text in category mode. Work only with category names, goal IDs, and numeric results — the tool loads goals internally. Use `list_goal_categories` first to show available categories. Pass `attacks` as a list (`["tap", "goat"]`) — a comma-separated string (`"tap,goat"`) also works. + +**Category-tool auto-fallback:** +If `generate_category_attack` fails with an argument-parsing error (e.g. `Unknown attack: 't'` or a single-character attack name), do NOT keep retrying formats. Immediately fall back to running the category via per-goal `generate_attack` calls with `goal_category=`, and tell the user you did so and why (so the result's coverage is transparent). Then file the parsing failure as a capability note. **Direct tool calls:** If the user types a tool name directly (e.g. "validate_attack_results", "fix_workflow_errors"), call ONLY that tool. Do not chain additional analytics tools. diff --git a/capabilities/ai-red-teaming/scripts/attack_runner.py b/capabilities/ai-red-teaming/scripts/attack_runner.py index 5c5777e..9ba04a3 100644 --- a/capabilities/ai-red-teaming/scripts/attack_runner.py +++ b/capabilities/ai-red-teaming/scripts/attack_runner.py @@ -2612,6 +2612,39 @@ def _resolve_model(alias: str) -> str: return MODEL_ALIASES.get(key, alias.strip()) +def _normalize_attack_names(attacks_raw: t.Any) -> list[str]: + """Normalize the ``attacks`` argument into a clean list of attack names. + + Tolerates the common calling shapes so callers don't trigger the + character-by-character iteration bug: + - list/tuple of names: ["tap", "goat"] -> ["tap", "goat"] + - comma-separated string: "tap,goat" -> ["tap", "goat"] + - single name: "tap" -> ["tap"] + - stray bracket/quote noise from stringified lists: "['tap','goat']" + -> ["tap", "goat"] + + Empty / whitespace-only tokens are dropped. + """ + if attacks_raw is None: + return [] + + if isinstance(attacks_raw, (list, tuple)): + items = list(attacks_raw) + elif isinstance(attacks_raw, str): + # Strip stray list/quote characters from stringified lists, then split. + cleaned = attacks_raw.strip().strip("[]") + items = cleaned.split(",") + else: + items = [attacks_raw] + + names = [] + for item in items: + token = str(item).strip().strip("'\"").strip() + if token: + names.append(token) + return names + + def _resolve_attack(alias: str) -> dict: """Resolve an attack alias to its definition.""" key = alias.strip().lower().replace("-", "_").replace(" ", "_") @@ -3741,6 +3774,19 @@ def generate_category_attack(params: dict) -> dict: if not categories and not goal_ids: return {"error": "categories or goal_ids is required"} + # Normalize attacks into a clean list of names. + # Accepts: list[str] (["tap", "goat"]), comma-separated string ("tap,goat"), + # or a single name ("tap"). This mirrors how generate_attack handles + # attack_type and prevents iterating a bare string character-by-character. + attack_names = _normalize_attack_names(attacks_raw) + if not attack_names: + return { + "error": ( + "attacks must be one or more attack names, e.g. ['tap', 'goat'] " + "or 'tap,goat'. Got: {!r}".format(attacks_raw) + ) + } + # Resolve models resolved_target = _resolve_model(target_model) resolved_attacker = _resolve_model(attacker_model) if attacker_model else resolved_target @@ -3749,7 +3795,7 @@ def generate_category_attack(params: dict) -> dict: # Resolve attacks try: - attacks_resolved = [_resolve_attack(a) for a in attacks_raw] + attacks_resolved = [_resolve_attack(a) for a in attack_names] except ValueError as e: return {"error": str(e)} diff --git a/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md b/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md index 009493e..3de81bc 100644 --- a/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md +++ b/capabilities/ai-red-teaming/skills/error-troubleshooting/SKILL.md @@ -45,6 +45,22 @@ Common errors and fixes for AIRT attack workflows. - **Cause**: Wrong parameter syntax - **Fix**: Use parentheses: `caesar(5)`, `adapt_language(Zulu)`, `vigenere(SECRET)`, `affine(5,8)` +## Category Attack Errors + +### "Unknown attack: 't'" / "Unknown attack: '['" (single characters) +- **Cause**: The `attacks` argument to `generate_category_attack` was iterated + character-by-character. This happened when a bare string was passed and the + runner looped over it directly (e.g. `"tap"` -> `'t'`, `'a'`, `'p'`). +- **Fix**: The runner now normalizes `attacks` via `_normalize_attack_names`, + accepting a list (`["tap", "goat"]`), a comma-separated string + (`"tap,goat"`), or a single name (`"tap"`). If you still see single-character + attack errors, you are on an old build — update the capability. +- **Workaround (older builds)**: Run the category via per-goal `generate_attack` + calls with `goal_category=` instead of `generate_category_attack`. +- **Signature to recognize**: the error lists all valid attacks but complains + about a one-character name. That always means an iterable-splitting bug, not a + genuinely unknown attack. + ## Scorer Errors ### "Scorer not found: " diff --git a/capabilities/ai-red-teaming/tests/test_attack_runner.py b/capabilities/ai-red-teaming/tests/test_attack_runner.py index 46564e8..9656f76 100644 --- a/capabilities/ai-red-teaming/tests/test_attack_runner.py +++ b/capabilities/ai-red-teaming/tests/test_attack_runner.py @@ -101,6 +101,35 @@ def test_unknown_attack_returns_error(self) -> None: with pytest.raises((ValueError, KeyError)): runner._resolve_attack("nonexistent_attack") +class TestNormalizeAttackNames: + """Regression: generate_category_attack must not iterate a bare string + character-by-character (which produced 'Unknown attack: t' errors).""" + + @pytest.mark.parametrize( + "raw,expected", + [ + ("tap", ["tap"]), + ("tap,goat", ["tap", "goat"]), + ("tap, goat , pair", ["tap", "goat", "pair"]), + (["tap"], ["tap"]), + (["tap", "goat"], ["tap", "goat"]), + ("tap_attack", ["tap_attack"]), + ("['tap']", ["tap"]), + ("['tap','goat']", ["tap", "goat"]), + ("", []), + (None, []), + ], + ) + def test_normalize(self, raw, expected) -> None: + assert runner._normalize_attack_names(raw) == expected + + def test_bare_string_does_not_split_to_chars(self) -> None: + # The exact bug: "tap" must NOT become ['t', 'a', 'p']. + result = runner._normalize_attack_names("tap") + assert result == ["tap"] + assert "t" not in result + + # ============================================================================= # Transform resolution diff --git a/capabilities/ai-red-teaming/tools/attacks.py b/capabilities/ai-red-teaming/tools/attacks.py index f5fb096..4e5d8da 100644 --- a/capabilities/ai-red-teaming/tools/attacks.py +++ b/capabilities/ai-red-teaming/tools/attacks.py @@ -148,7 +148,10 @@ def generate_attack( @safe_tool def generate_category_attack( - attacks: t.Annotated[str, "Attack type(s), comma-separated"], + attacks: t.Annotated[ + list[str] | str, + "Attack type(s): a list like ['tap', 'goat'] or a comma-separated string like 'tap,goat'", + ], target_model: t.Annotated[str, "Target LLM model"], categories: t.Annotated[ list[str] | None, From 849f98b9b3a7a241ba953b7faadac268612218e7 Mon Sep 17 00:00:00 2001 From: rdheekonda Date: Thu, 4 Jun 2026 00:33:16 +0000 Subject: [PATCH 2/3] fix(airt): use object instead of t.Any in _normalize_attack_names attack_runner.py does not import typing as t, so the t.Any annotation tripped ruff F821 (undefined name) in CI. Use the builtin object annotation, which needs no import. --- capabilities/ai-red-teaming/scripts/attack_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capabilities/ai-red-teaming/scripts/attack_runner.py b/capabilities/ai-red-teaming/scripts/attack_runner.py index 9ba04a3..55a3204 100644 --- a/capabilities/ai-red-teaming/scripts/attack_runner.py +++ b/capabilities/ai-red-teaming/scripts/attack_runner.py @@ -2612,7 +2612,7 @@ def _resolve_model(alias: str) -> str: return MODEL_ALIASES.get(key, alias.strip()) -def _normalize_attack_names(attacks_raw: t.Any) -> list[str]: +def _normalize_attack_names(attacks_raw: object) -> list[str]: """Normalize the ``attacks`` argument into a clean list of attack names. Tolerates the common calling shapes so callers don't trigger the From 073885a4ccc308f86c7be39b3427a295c58b2747 Mon Sep 17 00:00:00 2001 From: rdheekonda Date: Thu, 4 Jun 2026 00:33:53 +0000 Subject: [PATCH 3/3] chore(ai-red-teaming): bump version 1.3.6 -> 1.3.7 --- capabilities/ai-red-teaming/capability.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capabilities/ai-red-teaming/capability.yaml b/capabilities/ai-red-teaming/capability.yaml index 5dc855e..2c91ea6 100644 --- a/capabilities/ai-red-teaming/capability.yaml +++ b/capabilities/ai-red-teaming/capability.yaml @@ -1,6 +1,6 @@ schema: 1 name: ai-red-teaming -version: "1.3.6" +version: "1.3.7" description: > Probe the security and safety of AI applications, agents, and foundation models. Orchestrates adversarial attack workflows to discover vulnerabilities in LLMs,