Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
022f70a
Scaffolding
Jan 26, 2026
e85cdb9
Precommit
Jan 26, 2026
fc260c3
fixtures and basic tests
Jan 27, 2026
89a8079
basic tests
Jan 27, 2026
b18f224
basic tests
Jan 27, 2026
96ddf6c
last test
Jan 28, 2026
eb4e936
jailbreak format test
Jan 28, 2026
243ea0a
sample jailbreak prompt
Jan 28, 2026
946fdde
Merge branch 'main' into jailbreak
ValbuenaVC Jan 28, 2026
132caf5
real jailbreaks added
Jan 28, 2026
c4e625f
Merge branch 'main' into jailbreak
ValbuenaVC Jan 28, 2026
79d1a64
Merge branch 'main' into jailbreak
ValbuenaVC Jan 29, 2026
cb28fda
changing dataset name
Jan 29, 2026
f399b6d
moved jailbreak discovery
Jan 29, 2026
75436ea
changed path resolution
Jan 29, 2026
c0022f6
minor changes
Jan 29, 2026
9f579f2
minor bug
Jan 29, 2026
ccf7025
Merge branch 'main' into jailbreak
ValbuenaVC Jan 29, 2026
349cc6b
old dataset name
Jan 30, 2026
9fa6430
precommit
Jan 30, 2026
513cbf3
random jailbreak selection
Jan 30, 2026
b57b35a
error handling
Jan 30, 2026
999a0c6
error handling docstring
Jan 30, 2026
f3ec8bb
Merge branch 'Azure:main' into jailbreak2
ValbuenaVC Jan 30, 2026
89fd8bd
scaffolding
Jan 30, 2026
66650a6
scaffolding for subset
Jan 30, 2026
fa5b01a
scaffolding
Jan 30, 2026
44bc05c
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 5, 2026
db5270c
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 5, 2026
9d9666f
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 7, 2026
302101f
subset
Feb 9, 2026
9c7b757
tweaking
Feb 10, 2026
737aabe
new strategy template
Feb 10, 2026
472bd20
types
Feb 10, 2026
b07e197
adversarial
Feb 10, 2026
c31d088
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 10, 2026
6dcf318
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 11, 2026
ec9d731
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 11, 2026
163e582
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 11, 2026
a503a4b
unit test fixes
Feb 11, 2026
af32046
Merge branch 'jailbreak2' of https://github.com/ValbuenaVC/PyRIT into…
Feb 11, 2026
6da95f9
unit test fix
Feb 11, 2026
73d77a6
mypy
Feb 11, 2026
827ec0e
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 12, 2026
8168db8
params
Feb 12, 2026
5ac7651
tweaks
Feb 12, 2026
20ef0c3
dataset_size
Feb 12, 2026
06bb694
k_jailbreak bug
Feb 13, 2026
03a1e9b
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 13, 2026
6a67ac4
tests
Feb 13, 2026
4b441d4
new strategies
Feb 14, 2026
b14f564
adversarial chat
Feb 14, 2026
07b6142
roleplay path
Feb 14, 2026
36b6b95
roleplay
Feb 14, 2026
f39aecd
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 17, 2026
a43eeaf
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 17, 2026
be5045a
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 18, 2026
11347d9
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 19, 2026
4a4f77a
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 19, 2026
0e27829
instance names
Feb 19, 2026
ec19a5d
tweaks
Feb 19, 2026
1a410e3
tests
Feb 19, 2026
4210a3d
precommit
Feb 19, 2026
f773ec3
max_dataset_size
Feb 19, 2026
b7cb6c0
comments
Feb 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pyrit/datasets/jailbreak/text_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,12 @@ def __init__(
self.template.value = self.template.render_template_value_silent(**kwargs)

@classmethod
def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
def get_jailbreak_templates(cls, num_templates: Optional[int] = None) -> List[str]:
"""
Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH.

Args:
n (int, optional): Number of jailbreak templates to return. None to get all.
num_templates (int, optional): Number of jailbreak templates to return. None to get all.

Returns:
List[str]: List of jailbreak template file names.
Expand All @@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
if not jailbreak_template_names:
raise ValueError("No jailbreak templates found in the jailbreak directory")

if n:
if n > len(jailbreak_template_names):
if num_templates:
if num_templates > len(jailbreak_template_names):
raise ValueError(
f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
f"Attempted to pull {num_templates} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
)
jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
jailbreak_template_names = random.choices(jailbreak_template_names, k=num_templates)
return jailbreak_template_names

def get_jailbreak_system_prompt(self) -> str:
Expand Down
161 changes: 125 additions & 36 deletions pyrit/scenario/scenarios/airt/jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,26 @@

import os
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Union

from pyrit.common import apply_defaults
from pyrit.datasets import TextJailBreak
from pyrit.executor.attack.core.attack_config import (
AttackConverterConfig,
AttackScoringConfig,
)
from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
from pyrit.executor.attack.single_turn.role_play import RolePlayAttack, RolePlayPaths
from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack
from pyrit.models import SeedAttackGroup
from pyrit.prompt_converter import TextJailbreakConverter
from pyrit.prompt_normalizer import PromptConverterConfiguration
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.scenario.core.atomic_attack import AtomicAttack
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
from pyrit.scenario.core.scenario_strategy import (
ScenarioStrategy,
)
from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
from pyrit.score import (
SelfAskRefusalScorer,
TrueFalseInverterScorer,
Expand All @@ -31,13 +32,41 @@

class JailbreakStrategy(ScenarioStrategy):
"""
Strategy for single-turn jailbreak attacks.
Strategy for jailbreak attacks.

The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to
expose an obvious way of using this scenario without worrying about additional tweaks and changes
to the prompt.

There is currently only one, running all jailbreaks.
COMPLEX strategies use additional techniques to enhance the jailbreak like modifying the
system prompt or probing the target model for an additional vulnerability (e.g. the SkeletonKeyAttack).
They are meant to provide a sense of how well a jailbreak generalizes to slight changes in the delivery
method.
"""

# Aggregate members (special markers that expand to strategies with matching tags)
ALL = ("all", {"all"})
PYRIT = ("pyrit", {"pyrit"})
SIMPLE = ("simple", {"simple"})
COMPLEX = ("complex", {"complex"})

# Simple strategies
PromptSending = ("prompt_sending", {"simple"})

# Complex strategies
ManyShot = ("many_shot", {"complex"})
SkeletonKey = ("skeleton", {"complex"})
RolePlay = ("role_play", {"complex"})

@classmethod
def get_aggregate_tags(cls) -> set[str]:
"""
Get the set of tags that represent aggregate categories.

Returns:
set[str]: Set of tags that are aggregate markers.
"""
# Include base class aggregates ("all") and add scenario-specific ones
return super().get_aggregate_tags() | {"simple", "complex"}


class Jailbreak(Scenario):
Expand Down Expand Up @@ -67,9 +96,9 @@ def get_default_strategy(cls) -> ScenarioStrategy:
Get the default strategy used when no strategies are specified.

Returns:
ScenarioStrategy: JailbreakStrategy.ALL.
ScenarioStrategy: JailbreakStrategy.PromptSending.
"""
return JailbreakStrategy.ALL
return JailbreakStrategy.SIMPLE

@classmethod
def required_datasets(cls) -> list[str]:
Expand All @@ -93,7 +122,9 @@ def __init__(
objective_scorer: Optional[TrueFalseScorer] = None,
include_baseline: bool = False,
scenario_result_id: Optional[str] = None,
n_jailbreaks: Optional[int] = 3,
num_templates: Optional[int] = None,
num_attempts: int = 1,
jailbreak_names: List[str] = [],
) -> None:
"""
Initialize the jailbreak scenario.
Expand All @@ -104,13 +135,45 @@ def __init__(
include_baseline (bool): Whether to include a baseline atomic attack that sends all
objectives without modifications. Defaults to False.
scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
num_templates (Optional[int]): Choose num_templates random jailbreaks rather than using all of them.
num_attempts (int): Number of times to try each jailbreak. Defaults to 1.
jailbreak_names (List[str]): List of jailbreak names, from the template list under datasets, to use.
Defaults to an empty list, meaning all discovered templates are used.

Raises:
ValueError: If both jailbreak_names and num_templates are provided, as random selection
is incompatible with a predetermined list.
ValueError: If the jailbreak_names list contains a jailbreak that isn't in the listed
templates.

"""
if jailbreak_names and num_templates:
raise ValueError(
"Please provide only one of `num_templates` (random selection) or `jailbreak_names` (specific selection)."
)

if not objective_scorer:
objective_scorer = self._get_default_objective_scorer()
self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)

self._n = n_jailbreaks
self._num_templates = num_templates
self._num_attempts = num_attempts

# Note that num_templates and jailbreak_names are mutually exclusive.
# If self._num_templates is None, then this returns all discoverable jailbreak templates.
# If self._num_templates has some value, then all_templates is a subset of all available
# templates, but jailbreak_names is guaranteed to be [], so diff = {}.
all_templates = TextJailBreak.get_jailbreak_templates(num_templates=self._num_templates)

# Example: if jailbreak_names is {'a', 'b', 'c'}, and all_templates is {'b', 'c', 'd'},
# then diff = {'a'}, which raises the error as 'a' was not discovered in all_templates.
diff = set(jailbreak_names) - set(all_templates)
if len(diff) > 0:
raise ValueError(f"Error: could not find templates `{diff}`!")

# If jailbreak_names has some value, then `if jailbreak_names` passes, and self._jailbreaks
# is set to jailbreak_names. Otherwise we use all_templates.
self._jailbreaks = jailbreak_names if jailbreak_names else all_templates

super().__init__(
name="Jailbreak",
Expand Down Expand Up @@ -146,6 +209,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
)
return refusal_scorer

def _get_default_adversarial_target(self) -> OpenAIChatTarget:
"""
Create and retrieve the default adversarial target.

Returns:
OpenAIChatTarget: Default adversarial target using an unfiltered endpoint.
"""
return OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
temperature=1.2,
)

def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
"""
Resolve seed groups from dataset configuration.
Expand All @@ -161,23 +238,14 @@ def _resolve_seed_groups(self) -> List[SeedAttackGroup]:

return list(seed_groups)

def _get_all_jailbreak_templates(self) -> List[str]:
"""
Retrieve all available jailbreak templates.

Returns:
List[str]: List of jailbreak template file names.
"""
if not self._n:
return TextJailBreak.get_all_jailbreak_templates()
else:
return TextJailBreak.get_all_jailbreak_templates(n=self._n)

async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
async def _get_atomic_attack_from_strategy_async(
self, *, strategy: str, jailbreak_template_name: str
) -> AtomicAttack:
"""
Create an atomic attack for a specific jailbreak template.

Args:
strategy (str): JailbreakStrategy to use.
jailbreak_template_name (str): Name of the jailbreak template file.

Returns:
Expand All @@ -202,12 +270,28 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
)

# Create the attack
attack = PromptSendingAttack(
objective_target=self._objective_target,
attack_scoring_config=self._scorer_config,
attack_converter_config=converter_config,
)
attack: Optional[Union[ManyShotJailbreakAttack, PromptSendingAttack, RolePlayAttack, SkeletonKeyAttack]] = None
args = {
"objective_target": self._objective_target,
"attack_scoring_config": self._scorer_config,
"attack_converter_config": converter_config,
}
match strategy:
case "many_shot":
attack = ManyShotJailbreakAttack(**args)
case "prompt_sending":
attack = PromptSendingAttack(**args)
case "skeleton":
attack = SkeletonKeyAttack(**args)
case "role_play":
args["adversarial_chat"] = self._get_default_adversarial_target()
args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value
attack = RolePlayAttack(**args)
case _:
raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")

if not attack:
raise ValueError(f"Attack cannot be None!")

# Extract template name without extension for the atomic attack name
template_name = Path(jailbreak_template_name).stem
Expand All @@ -230,11 +314,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
# Retrieve seed prompts based on selected strategies
self._seed_groups = self._resolve_seed_groups()

# Get all jailbreak template names
jailbreak_template_names = self._get_all_jailbreak_templates()
strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
composites=self._scenario_composites, strategy_type=JailbreakStrategy
)

for template_name in jailbreak_template_names:
atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
atomic_attacks.append(atomic_attack)
for strategy in strategies:
for template_name in self._jailbreaks:
for _ in range(0, self._num_attempts):
atomic_attack = await self._get_atomic_attack_from_strategy_async(
strategy=strategy, jailbreak_template_name=template_name
)
atomic_attacks.append(atomic_attack)

return atomic_attacks
Loading