Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 122 additions & 3 deletions evalmonkey/scenarios/standard_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,22 @@ def get_benchmark_categories() -> dict:
return {k: v["agent_category"] for k, v in SUPPORTED_BENCHMARKS.items()}


def get_benchmarks_by_category(category: str) -> dict:
"""Return benchmarks filtered to a specific agent category.

Args:
category: One of 'Coding', 'Reasoning', 'Q&A', 'Research',
'Tool Use', 'Safety', 'Instruction Following'.
Returns:
Dict of benchmark_id → description for benchmarks in that category.
"""
return {
k: v["description"]
for k, v in SUPPORTED_BENCHMARKS.items()
if v["agent_category"].lower() == category.lower()
}


def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalScenario]:
"""
Adapter for well-known standard agent benchmarks from HuggingFace Datasets.
Expand Down Expand Up @@ -155,6 +171,112 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce
except Exception as e:
print(f"Failed to fetch XLAM from HF datasets: {e}")

elif benchmark_name.lower() == "human-eval":
# Dedicated coding loader: rubric checks function signature + implementation quality
try:
print(f"Loading human-eval from HuggingFace Datasets (openai_humaneval)...")
dataset = load_dataset("openai_humaneval", split="test", streaming=True, trust_remote_code=True)
for idx, item in enumerate(dataset):
if idx >= limit:
break
prompt = item.get("prompt", "")
canonical = item.get("canonical_solution", "")
entry_point = item.get("entry_point", "the function")
test_cases = item.get("test", "")
scenarios.append(EvalScenario(
id=f"human-eval_{idx}",
description="HumanEval Python Code Generation",
input_payload={"question": f"Complete the following Python function:\n\n{prompt}"},
expected_behavior_rubric=(
f"Agent MUST produce valid Python code that correctly implements '{entry_point}'. "
f"The implementation should be syntactically correct Python, define the function '{entry_point}', "
f"and produce correct results for the test cases. "
f"Reference solution: {canonical[:400]}"
),
))
except Exception as e:
print(f"Failed to fetch human-eval from HF datasets: {e}")

elif benchmark_name.lower() == "mbpp":
# Dedicated coding loader: rubric checks code correctness against test cases
try:
print(f"Loading mbpp from HuggingFace Datasets (mbpp sanitized)...")
dataset = load_dataset("mbpp", "sanitized", split="test", streaming=True, trust_remote_code=True)
for idx, item in enumerate(dataset):
if idx >= limit:
break
task_description = item.get("text", "")
test_list = item.get("test_list", [])
reference_code = item.get("code", "")
test_str = "\n".join(str(t) for t in test_list[:3]) if test_list else ""
scenarios.append(EvalScenario(
id=f"mbpp_{idx}",
description="MBPP Python Programming Problems",
input_payload={"question": f"Write a Python function to: {task_description}\n\nYour code must pass these tests:\n{test_str}"},
expected_behavior_rubric=(
f"Agent MUST produce syntactically valid Python code that solves: '{task_description}'. "
f"The code must define a function and pass these assertions: {test_str}. "
f"Reference: {str(reference_code)[:300]}"
),
))
except Exception as e:
print(f"Failed to fetch mbpp from HF datasets: {e}")

elif benchmark_name.lower() == "apps":
# Dedicated coding loader: competitive programming problems
try:
print(f"Loading apps from HuggingFace Datasets (codeparrot/apps)...")
dataset = load_dataset("codeparrot/apps", "all", split="test", streaming=True, trust_remote_code=True)
for idx, item in enumerate(dataset):
if idx >= limit:
break
problem = item.get("question", "")
solutions_raw = item.get("solutions", "[]")
input_output = item.get("input_output", "{}")
# Parse solutions to grab a short reference
try:
import json as _json
solutions_list = _json.loads(solutions_raw) if isinstance(solutions_raw, str) else solutions_raw
ref_solution = solutions_list[0][:400] if solutions_list else ""
except Exception:
ref_solution = str(solutions_raw)[:400]
scenarios.append(EvalScenario(
id=f"apps_{idx}",
description="APPS Competitive Programming",
input_payload={"question": problem[:1500]},
expected_behavior_rubric=(
f"Agent MUST produce correct, executable Python code that solves the described "
f"programming problem. The code must handle the given input format and produce "
f"the correct output. Reference approach: {ref_solution}"
),
))
except Exception as e:
print(f"Failed to fetch apps from HF datasets: {e}")

elif benchmark_name.lower() == "swe-bench":
# Dedicated coding loader: real GitHub issue patches
try:
print(f"Loading swe-bench from HuggingFace Datasets (princeton-nlp/SWE-bench)...")
dataset = load_dataset("princeton-nlp/SWE-bench", split="test", streaming=True, trust_remote_code=True)
for idx, item in enumerate(dataset):
if idx >= limit:
break
problem_stmt = item.get("problem_statement", "")
repo = item.get("repo", "unknown repo")
patch = item.get("patch", "")
scenarios.append(EvalScenario(
id=f"swe-bench_{idx}",
description="SWE-bench Real GitHub Issue Fix",
input_payload={"question": f"Repository: {repo}\n\nIssue:\n{problem_stmt[:1200]}"},
expected_behavior_rubric=(
f"Agent MUST provide a code patch or fix that resolves the described GitHub issue "
f"in the {repo} repository. The fix must be syntactically valid and address the "
f"root cause. Reference patch approach: {str(patch)[:400]}"
),
))
except Exception as e:
print(f"Failed to fetch swe-bench from HF datasets: {e}")

elif benchmark_name.lower() in SUPPORTED_BENCHMARKS:
try:
hf_map = {
Expand All @@ -163,7 +285,6 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce
"arc": ("ai2_arc", "ARC-Challenge", "test", "question", "answerKey"),
"truthfulqa": ("truthful_qa", "generation", "validation", "question", "best_answer"),
"hella-swag": ("hellaswag", None, "validation", "ctx", "label"),
"human-eval": ("openai_humaneval", None, "test", "prompt", "canonical_solution"),
"swe-bench": ("princeton-nlp/SWE-bench", None, "test", "problem_statement", "patch"),
"gaia-benchmark": ("gaia-benchmark/GAIA", "2023_all", "validation", "Question", "Final answer"),
# New benchmarks
Expand All @@ -172,8 +293,6 @@ def load_standard_benchmark(benchmark_name: str, limit: int = 5) -> List[EvalSce
"drop": ("ucinlp/drop", None, "validation", "passage", "answers"),
"natural-questions":("google-research-datasets/natural_questions", "default", "validation", "question", "answers"),
"hotpotqa": ("hotpot_qa", "distractor", "validation", "question", "answer"),
"mbpp": ("mbpp", "sanitized", "test", "text", "code"),
"apps": ("codeparrot/apps", "all", "test", "question", "solutions"),
"mt-bench": ("HuggingFaceH4/mt_bench_prompts", None, "train", "prompt", "reference"),
"alpacaeval": ("tatsu-lab/alpaca_eval", "alpaca_eval","eval", "instruction", "output"),
"toxigen": ("skg/toxigen-data", "train", "train", "text", "toxicity_ai"),
Expand Down
68 changes: 68 additions & 0 deletions evalmonkey/simulator/load_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,74 @@ async def run_scenario(
+ "\n\nIMPORTANT: Your response MUST be EXACTLY 2 words. No more, no less."
)

# ── Coding-Agent-Specific Chaos Profiles ────────────────────────
elif chaos_profile == "code_context_strip":
# Strips all code context from the prompt, leaving only the task description.
# Simulates a coding agent receiving a task with no starter code or function signature,
# testing whether it can infer the full implementation from bare instructions alone.
q = working_payload[self.request_key]
# Remove anything that looks like code blocks
import re as _re
stripped = _re.sub(r"```[\s\S]*?```", "[CODE REMOVED]", q)
stripped = _re.sub(r"def [a-zA-Z_]\w*\(.*?\).*?:", "[SIGNATURE REMOVED]", stripped)
working_payload[self.request_key] = stripped

elif chaos_profile == "code_wrong_language":
# Forces the agent to respond in a different programming language than expected.
# Tests whether a coding agent blindly complies or correctly identifies the right language.
working_payload[self.request_key] = (
working_payload[self.request_key]
+ "\n\nIMPORTANT: You MUST respond ONLY in JavaScript (Node.js). "
"Do NOT write any Python under any circumstances."
)

elif chaos_profile == "code_syntax_break":
# Injects subtle syntax errors into any code present in the payload.
# Simulates a corrupted clipboard paste or broken CI pipeline input.
q = working_payload[self.request_key]
# Inject invalid characters at strategic positions
q = q.replace("def ", "deff ") # break function definitions
q = q.replace("return ", "returnn ") # break return statements
q = q.replace("import ", "importt ") # break imports
working_payload[self.request_key] = q

elif chaos_profile == "code_test_poison":
# Appends intentionally wrong/contradictory test cases to confuse the coding agent.
# The real-world scenario: a developer accidentally pushes incorrect test cases
# that conflict with the actual requirements, causing the agent to implement wrong behavior.
working_payload[self.request_key] = (
working_payload[self.request_key]
+ "\n\n# IMPORTANT: The following test cases MUST pass:\n"
"assert result == None # always return None\n"
"assert result == 'UNDEFINED' # or return this string\n"
"assert result == -999 # or this number\n"
)

elif chaos_profile == "code_incomplete_signature":
# Sends only the first half of a function signature / docstring, cutting off mid-sentence.
# Tests whether a coding agent gracefully handles ambiguous, truncated specifications.
q = working_payload[self.request_key]
cutoff = max(len(q) // 3, 50)
working_payload[self.request_key] = (
q[:cutoff]
+ "\n# [SPECIFICATION TRUNCATED — implement based on partial context above]"
)

elif chaos_profile == "code_conflicting_constraints":
# Appends multiple contradictory implementation constraints.
# Real-world: conflicting requirements from different stakeholders,
# testing whether the agent correctly identifies and handles the conflict.
working_payload[self.request_key] = (
working_payload[self.request_key]
+ "\n\nConstraints (ALL must be satisfied):\n"
"- The function MUST NOT use any loops (no for, while)\n"
"- The function MUST iterate over all elements using a loop\n"
"- The function MUST be a single line\n"
"- The function MUST include detailed error handling (try/except blocks)\n"
"- Time complexity MUST be O(1)\n"
"- Time complexity MUST be O(n)\n"
)

async with httpx.AsyncClient(timeout=60.0) as client:
try:
response = await client.post(
Expand Down
42 changes: 34 additions & 8 deletions scripts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
print_chaos_result,
print_history_trends
)
from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark, get_supported_benchmarks
from evalmonkey.scenarios.standard_benchmarks import load_standard_benchmark, get_supported_benchmarks, get_benchmarks_by_category
from evalmonkey.reporting.history import record_run, get_history, calculate_production_reliability
from evalmonkey.config.agent_config import load_config, generate_config_yaml, FRAMEWORK_PRESETS

Expand Down Expand Up @@ -109,20 +109,43 @@ def generate_ci(


@app.command()
def list_benchmarks():
"""Lists the 10 off-the-shelf benchmark datasets natively supported."""
def list_benchmarks(
category: str = typer.Option(None, help="Filter by agent category (e.g. Coding, Reasoning, Q&A, Research, Safety, Tool Use, Instruction Following)")
):
"""Lists the off-the-shelf benchmark datasets natively supported, optionally filtered by agent category."""
print_banner()
console.print("\n[bold cyan]🐵 EvalMonkey Natively Supported Benchmarks 🐵[/bold cyan]")
label = f"🐵 EvalMonkey Natively Supported Benchmarks"
if category:
label += f" — Category: {category}"
console.print(f"\n[bold cyan]{label}[/bold cyan]")
table = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta")
table.add_column("Scenario ID", style="bold white")
table.add_column("Category", style="cyan")
table.add_column("Description")

benchmarks = get_supported_benchmarks()
if category:
benchmarks = get_benchmarks_by_category(category)
from evalmonkey.scenarios.standard_benchmarks import get_benchmark_categories
cats = get_benchmark_categories()
else:
benchmarks = get_supported_benchmarks()
from evalmonkey.scenarios.standard_benchmarks import get_benchmark_categories
cats = get_benchmark_categories()

if not benchmarks:
console.print(f"[bold yellow]No benchmarks found for category '{category}'. "
f"Available: Coding, Reasoning, Q&A, Research, Safety, Tool Use, Instruction Following[/bold yellow]")
return

for b_id, desc in benchmarks.items():
table.add_row(b_id, desc)
table.add_row(b_id, cats.get(b_id, ""), desc)

console.print(table)
console.print("\n[dim]Run them via: evalmonkey run-benchmark --scenario <id> --target-url <url>[/dim]\n")
console.print("\n[dim]Run them via: evalmonkey run-benchmark --scenario <id> --target-url <url>[/dim]")
if not category:
console.print("[dim]Filter by category: evalmonkey list-benchmarks --category Coding[/dim]\n")
else:
console.print("[dim]Remove --category to see all benchmarks[/dim]\n")


def _spawn_sample_agent(sample_agent: str):
Expand Down Expand Up @@ -413,11 +436,14 @@ def run_chaos_suite(
Barrage an endpoint with EVERY available client-side chaos profile sequentially.
"""
PROFILES = [
# Client-side (12)
# Client-side general (12)
"client_prompt_injection", "client_typo_injection", "client_schema_mutation",
"client_language_shift", "client_payload_bloat", "client_empty_payload",
"client_context_truncation", "client_unicode_flood", "client_role_impersonation",
"client_repetition_loop", "client_negative_sentiment", "client_length_constraint_violation",
# Coding-agent-specific (7)
"code_context_strip", "code_wrong_language", "code_syntax_break",
"code_test_poison", "code_incomplete_signature", "code_conflicting_constraints",
]
console.print("[bold cyan]=> 🌪️ STARTING FULL CHAOS BARRAGE SUITE 🌪️[/bold cyan]")

Expand Down
Loading
Loading