From b53cde9f1b4f8eeafcccf4aa7aa3518d2167fd23 Mon Sep 17 00:00:00 2001 From: bgagent Date: Thu, 4 Jun 2026 13:39:08 -0400 Subject: [PATCH 1/2] feat(agent): add in-pipeline pre-PR self-review phase (#262) Adds an optional self-review phase between agent execution and post-hooks where the LLM critiques its own cumulative diff before the PR is created. This improves first-pass PR quality by catching bugs, style issues, and test gaps before human review. - New self_review.py orchestration module with run_self_review() - New prompts/self_review.py with focused review prompt template - TaskConfig extended with self_review_enabled and self_review_max_turns - Fields threaded through build_config, get_config, server, pipeline - Fail-open: self-review errors never block PR creation - Uses remaining turns/budget from original allocation (capped) - Feature is opt-in (disabled by default) --- agent/src/config.py | 7 + agent/src/models.py | 3 + agent/src/pipeline.py | 21 ++ agent/src/prompts/__init__.py | 1 + agent/src/prompts/self_review.py | 40 ++++ agent/src/self_review.py | 198 +++++++++++++++++++ agent/src/server.py | 9 + agent/tests/test_self_review.py | 316 +++++++++++++++++++++++++++++++ 8 files changed, 595 insertions(+) create mode 100644 agent/src/prompts/self_review.py create mode 100644 agent/src/self_review.py create mode 100644 agent/tests/test_self_review.py diff --git a/agent/src/config.py b/agent/src/config.py index 04fa162d..78e683ea 100644 --- a/agent/src/config.py +++ b/agent/src/config.py @@ -337,6 +337,8 @@ def build_config( initial_approval_gate_count: int = 0, approval_gate_cap: int | None = None, attachments: list[dict] | None = None, + self_review_enabled: bool = False, + self_review_max_turns: int = 5, ) -> TaskConfig: """Build and validate configuration from explicit parameters. @@ -407,6 +409,8 @@ def build_config( initial_approval_gate_count=initial_approval_gate_count, approval_gate_cap=approval_gate_cap, attachments=validated_attachments, + self_review_enabled=self_review_enabled, + self_review_max_turns=self_review_max_turns, ) @@ -431,6 +435,9 @@ def get_config() -> TaskConfig: # an unreachable ``traces//`` key. trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"), user_id=os.environ.get("USER_ID", ""), + self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower() + in ("1", "true", "yes"), + self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")), ) except ValueError as e: print(f"ERROR: {e}", file=sys.stderr) diff --git a/agent/src/models.py b/agent/src/models.py index 007b94bc..2d0ddabd 100644 --- a/agent/src/models.py +++ b/agent/src/models.py @@ -186,6 +186,9 @@ class TaskConfig(BaseModel): # Attachments from the orchestrator payload (Phase 3). Validated as # AttachmentConfig models. Empty list for tasks without attachments. attachments: list[AttachmentConfig] = Field(default_factory=list) + # Self-review: optional LLM diff critique before PR creation. + self_review_enabled: bool = False + self_review_max_turns: int = 5 # Cap on turns allocated to self-review @model_validator(mode="after") def _validate_trace_requires_user_id(self) -> Self: diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index e476a11c..9881eb9c 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -29,6 +29,7 @@ from progress_writer import _ProgressWriter from prompt_builder import build_system_prompt, discover_project_config from runner import run_agent +from self_review import run_self_review from shell import log, log_error_cw from system_prompt import SYSTEM_PROMPT from telemetry import ( @@ -279,6 +280,8 @@ def run_task( trace: bool = False, user_id: str = "", attachments: list[dict] | None = None, + self_review_enabled: bool = False, + self_review_max_turns: int = 5, ) -> dict: """Run the full agent pipeline and return a serialized result dict. @@ -318,6 +321,8 @@ def run_task( initial_approval_gate_count=initial_approval_gate_count, approval_gate_cap=approval_gate_cap, attachments=attachments, + self_review_enabled=self_review_enabled, + self_review_max_turns=self_review_max_turns, ) # Inject Cedar policies into config for the PolicyEngine in runner.py @@ -623,6 +628,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: "turns_attempted": agent_result.num_turns or agent_result.turns, } + # Self-review phase: LLM critiques its own diff before PR creation. + # Runs between cancel-check and post-hooks. Fail-open: errors here + # never block PR creation. + with task_span("task.self_review"): + review_result = run_self_review( + config, setup, agent_result, trajectory, progress + ) + if review_result is not None: + # Accumulate turns and cost from the review phase + agent_result.turns += review_result.turns + agent_result.num_turns += review_result.num_turns or review_result.turns + if review_result.cost_usd is not None: + agent_result.cost_usd = ( + (agent_result.cost_usd or 0.0) + review_result.cost_usd + ) + # Post-hooks (agent_result is guaranteed set by the try/except above) with task_span("task.post_hooks") as post_span: # Safety net: commit any uncommitted tracked changes (skip for read-only tasks) diff --git a/agent/src/prompts/__init__.py b/agent/src/prompts/__init__.py index a864e013..2dd586a5 100644 --- a/agent/src/prompts/__init__.py +++ b/agent/src/prompts/__init__.py @@ -4,6 +4,7 @@ from .new_task import NEW_TASK_WORKFLOW from .pr_iteration import PR_ITERATION_WORKFLOW from .pr_review import PR_REVIEW_WORKFLOW +from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT _PROMPTS = { "new_task": BASE_PROMPT.replace("{workflow}", NEW_TASK_WORKFLOW), diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py new file mode 100644 index 00000000..9a6d109a --- /dev/null +++ b/agent/src/prompts/self_review.py @@ -0,0 +1,40 @@ +"""Self-review prompt template for pre-PR diff critique.""" + +SELF_REVIEW_PROMPT = """\ +You are reviewing your own work before it becomes a pull request. Below is the \ +cumulative diff of all changes on this branch compared to the base branch. + + +{diff} + + +## Task context + +{task_description} + +## Review checklist + +Examine the diff carefully for: + +1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \ +incorrect assumptions about data shapes or API contracts. +2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \ +race conditions. +3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \ +insecure defaults, OWASP Top 10 issues. +4. **Style & consistency** — Naming conventions, code style violations relative to \ +the surrounding codebase, unnecessary complexity. +5. **Test gaps** — Important behaviour that is untested, assertions that don't \ +verify the right thing, missing edge-case coverage. + +## Instructions + +- If you find issues, fix them directly: edit the files, run the build/tests to \ +verify your fixes, and commit the changes. +- If no issues are found, stop immediately — do not make changes for the sake of \ +making changes. +- Do NOT refactor code that was not part of the original diff unless it has a \ +concrete bug or security issue. +- Keep fixes minimal and focused. Each fix should be a separate commit with a \ +clear message. +""" diff --git a/agent/src/self_review.py b/agent/src/self_review.py new file mode 100644 index 00000000..20814190 --- /dev/null +++ b/agent/src/self_review.py @@ -0,0 +1,198 @@ +"""Self-review orchestration: LLM critiques its own diff before PR creation.""" + +from __future__ import annotations + +import asyncio +import subprocess +from typing import TYPE_CHECKING + +from prompts.self_review import SELF_REVIEW_PROMPT +from shell import log + +if TYPE_CHECKING: + from models import AgentResult, RepoSetup, TaskConfig + from progress_writer import _ProgressWriter + from telemetry import _TrajectoryWriter + +# Diff truncation limit (characters). Large diffs are cut at hunk boundaries. +_MAX_DIFF_CHARS = 60_000 + +# Minimal system prompt for the self-review agent invocation. +_REVIEW_SYSTEM_PROMPT = """\ +You are a code reviewer working inside the repository {repo_url} on branch {branch_name}. +Your working directory is {repo_dir}. + +You have full access to the filesystem and can run commands. Fix any issues you \ +find directly — edit files, run the build, and commit fixes. Keep changes minimal \ +and focused. + +Do NOT open a pull request or push. Just fix issues and commit locally. +""" + + +def _get_diff(repo_dir: str, default_branch: str) -> str: + """Generate the cumulative diff of the branch vs origin/{default_branch}.""" + try: + result = subprocess.run( + ["git", "diff", f"origin/{default_branch}...HEAD"], + cwd=repo_dir, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode != 0: + log("WARN", f"self_review: git diff failed (exit {result.returncode})") + return "" + return result.stdout + except (subprocess.TimeoutExpired, OSError) as e: + log("WARN", f"self_review: git diff error: {type(e).__name__}: {e}") + return "" + + +def _truncate_diff(diff: str, max_chars: int = _MAX_DIFF_CHARS) -> str: + """Truncate diff at a hunk boundary if it exceeds max_chars. + + Cuts at the last complete hunk (line starting with '@@') that fits + within the limit, appending a truncation notice. + """ + if len(diff) <= max_chars: + return diff + + # Find the last hunk header that starts before max_chars + truncated = diff[:max_chars] + last_hunk = truncated.rfind("\n@@") + if last_hunk > 0: + # Cut just before this hunk header + truncated = truncated[:last_hunk] + else: + # No hunk boundary found — hard-cut at max_chars + last_newline = truncated.rfind("\n") + if last_newline > 0: + truncated = truncated[:last_newline] + + total_lines = diff.count("\n") + kept_lines = truncated.count("\n") + truncated += ( + f"\n\n... [diff truncated: showing ~{kept_lines} of ~{total_lines} lines; " + f"{len(diff) - len(truncated)} chars omitted] ..." + ) + return truncated + + +def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str: + """Build a minimal system prompt for the self-review agent.""" + return _REVIEW_SYSTEM_PROMPT.format( + repo_url=config.repo_url, + branch_name=setup.branch, + repo_dir=setup.repo_dir, + ) + + +def run_self_review( + config: TaskConfig, + setup: RepoSetup, + agent_result: AgentResult, + trajectory: _TrajectoryWriter, + progress: _ProgressWriter, +) -> AgentResult | None: + """Run the self-review phase: LLM critiques its own diff and fixes issues. + + Returns the AgentResult from the review phase, or None if skipped. + Fail-open: errors are logged but never block the pipeline. + """ + # Skip condition: feature disabled + if not config.self_review_enabled: + log("TASK", "self_review: disabled (self_review_enabled=False)") + return None + + # Skip condition: pr_review is read-only (no diff to review) + if config.task_type == "pr_review": + log("TASK", "self_review: skipped for pr_review task type") + return None + + # Compute remaining turns + used_turns = agent_result.turns or 0 + remaining_turns = config.max_turns - used_turns + review_turns = min(remaining_turns, config.self_review_max_turns) + if review_turns <= 0: + log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})") + return None + + # Compute remaining budget + review_budget: float | None = None + if config.max_budget_usd is not None: + used_cost = agent_result.cost_usd or 0.0 + remaining_budget = config.max_budget_usd - used_cost + if remaining_budget <= 0: + log( + "TASK", + f"self_review: no remaining budget " + f"(used=${used_cost:.2f}, max=${config.max_budget_usd:.2f})", + ) + return None + review_budget = remaining_budget + + # Get the diff + diff = _get_diff(setup.repo_dir, setup.default_branch) + if not diff.strip(): + log("TASK", "self_review: no diff found — skipping") + return None + + # Truncate if needed + diff = _truncate_diff(diff) + + # Build the review prompt + task_desc = config.task_description or f"Issue #{config.issue_number}" + user_prompt = SELF_REVIEW_PROMPT.format(diff=diff, task_description=task_desc) + system_prompt = _build_review_system_prompt(config, setup) + + # Build a modified config for the review run + review_config = config.model_copy( + update={ + "max_turns": review_turns, + "max_budget_usd": review_budget, + } + ) + + log( + "TASK", + f"self_review: starting (turns={review_turns}, " + f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, " + f"diff_chars={len(diff)})", + ) + progress.write_agent_milestone( + "self_review_started", + f"turns={review_turns} diff_chars={len(diff)}", + ) + + try: + from runner import run_agent + + review_result = asyncio.run( + run_agent( + user_prompt, + system_prompt, + review_config, + cwd=setup.repo_dir, + trajectory=trajectory, + ) + ) + except Exception as e: + # Fail-open: self-review errors never block the pipeline + log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}") + progress.write_agent_milestone( + "self_review_complete", + f"status=error error={type(e).__name__}: {e}", + ) + return None + + log( + "TASK", + f"self_review: complete (status={review_result.status}, " + f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})", + ) + progress.write_agent_milestone( + "self_review_complete", + f"status={review_result.status} turns={review_result.turns}", + ) + return review_result diff --git a/agent/src/server.py b/agent/src/server.py index b01df8da..55c3779b 100644 --- a/agent/src/server.py +++ b/agent/src/server.py @@ -386,6 +386,8 @@ def _run_task_background( user_id: str = "", workload_access_token: str = "", attachments: list[dict] | None = None, + self_review_enabled: bool = False, + self_review_max_turns: int = 5, ) -> None: """Run the agent task in a background thread.""" global _background_pipeline_failed @@ -469,6 +471,8 @@ def _run_task_background( trace=trace, user_id=user_id, attachments=attachments, + self_review_enabled=self_review_enabled, + self_review_max_turns=self_review_max_turns, ) _background_pipeline_failed = False except Exception as e: @@ -557,6 +561,9 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict: channel_source = inp.get("channel_source", "") or "" channel_metadata = inp.get("channel_metadata") or {} attachments = inp.get("attachments") or [] + # Self-review (opt-in): LLM critiques its own diff before PR creation. + self_review_enabled = inp.get("self_review_enabled") is True + self_review_max_turns = int(inp.get("self_review_max_turns", 5)) # ``trace`` is strictly opt-in (design §10.1). Accept only real # booleans from the orchestrator — a string "false" would otherwise # flip the flag on. @@ -630,6 +637,8 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict: "user_id": user_id, "workload_access_token": workload_access_token, "attachments": attachments, + "self_review_enabled": self_review_enabled, + "self_review_max_turns": self_review_max_turns, } diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py new file mode 100644 index 00000000..bd39741a --- /dev/null +++ b/agent/tests/test_self_review.py @@ -0,0 +1,316 @@ +"""Unit tests for self_review.py — self-review orchestration module.""" + +from unittest.mock import MagicMock, patch + +from models import AgentResult, RepoSetup, TaskConfig +from self_review import _MAX_DIFF_CHARS, _get_diff, _truncate_diff, run_self_review + + +def _make_config(**overrides) -> TaskConfig: + """Create a minimal TaskConfig for testing.""" + defaults = { + "repo_url": "owner/repo", + "github_token": "ghp_test123", + "aws_region": "us-east-1", + "task_description": "Fix the bug", + "max_turns": 10, + "self_review_enabled": True, + "self_review_max_turns": 5, + "task_type": "new_task", + } + defaults.update(overrides) + return TaskConfig(**defaults) + + +def _make_setup(**overrides) -> RepoSetup: + """Create a minimal RepoSetup for testing.""" + defaults = { + "repo_dir": "/workspace/repo", + "branch": "feat/123-fix", + "default_branch": "main", + } + defaults.update(overrides) + return RepoSetup(**defaults) + + +def _make_agent_result(**overrides) -> AgentResult: + """Create a minimal AgentResult for testing.""" + defaults = { + "status": "success", + "turns": 5, + "num_turns": 5, + "cost_usd": 0.50, + } + defaults.update(overrides) + return AgentResult(**defaults) + + +class TestSkipConditions: + """Test all conditions that cause self-review to be skipped.""" + + def test_skip_when_disabled(self): + config = _make_config(self_review_enabled=False) + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_for_pr_review_task_type(self): + config = _make_config( + task_type="pr_review", + pr_number="42", + task_description="", + issue_number="", + ) + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_when_no_remaining_turns(self): + config = _make_config(max_turns=5, self_review_max_turns=5) + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + def test_skip_when_no_remaining_budget(self): + config = _make_config(max_budget_usd=1.0) + setup = _make_setup() + agent_result = _make_agent_result(cost_usd=1.0) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + @patch("self_review._get_diff", return_value="") + def test_skip_when_empty_diff(self, mock_diff): + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + @patch("self_review._get_diff", return_value=" \n \n ") + def test_skip_when_whitespace_only_diff(self, mock_diff): + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + +class TestGetDiff: + """Test _get_diff helper.""" + + @patch("self_review.subprocess.run") + def test_returns_diff_output(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="diff --git a/f\n+line\n") + result = _get_diff("/repo", "main") + assert result == "diff --git a/f\n+line\n" + mock_run.assert_called_once_with( + ["git", "diff", "origin/main...HEAD"], + cwd="/repo", + capture_output=True, + text=True, + timeout=60, + ) + + @patch("self_review.subprocess.run") + def test_returns_empty_on_failure(self, mock_run): + mock_run.return_value = MagicMock(returncode=1, stdout="") + result = _get_diff("/repo", "main") + assert result == "" + + @patch("self_review.subprocess.run") + def test_returns_empty_on_timeout(self, mock_run): + import subprocess + + mock_run.side_effect = subprocess.TimeoutExpired(cmd="git", timeout=60) + result = _get_diff("/repo", "main") + assert result == "" + + @patch("self_review.subprocess.run") + def test_uses_custom_default_branch(self, mock_run): + mock_run.return_value = MagicMock(returncode=0, stdout="some diff") + _get_diff("/repo", "develop") + mock_run.assert_called_once_with( + ["git", "diff", "origin/develop...HEAD"], + cwd="/repo", + capture_output=True, + text=True, + timeout=60, + ) + + +class TestTruncateDiff: + """Test _truncate_diff helper.""" + + def test_no_truncation_needed(self): + short_diff = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n+new line\n" + result = _truncate_diff(short_diff) + assert result == short_diff + + def test_truncates_at_hunk_boundary(self): + # Build a diff that exceeds the limit with multiple hunks + hunk1 = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n" + "+line\n" * 100 + hunk2 = "\n@@ -100,3 +101,4 @@\n" + "+more\n" * 100 + big_diff = hunk1 + hunk2 + result = _truncate_diff(big_diff, max_chars=len(hunk1) + 10) + # Should truncate before hunk2's @@ marker + assert "@@ -100,3" not in result + assert "truncated" in result + + def test_hard_cut_when_no_hunk_boundary(self): + # Single hunk that exceeds max + big_diff = "+x\n" * 30000 + result = _truncate_diff(big_diff, max_chars=100) + assert len(result) < len(big_diff) + assert "truncated" in result + + def test_exact_limit_no_truncation(self): + diff = "a" * _MAX_DIFF_CHARS + result = _truncate_diff(diff) + assert result == diff + + def test_one_over_limit_truncates(self): + diff = "a" * (_MAX_DIFF_CHARS + 1) + result = _truncate_diff(diff) + assert "truncated" in result + + +class TestBudgetAndTurnComputation: + """Test that budget and turn limits are computed correctly.""" + + @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") + @patch("self_review.asyncio.run") + def test_review_turns_capped_at_self_review_max_turns(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult(status="success", turns=2, num_turns=2) + config = _make_config(max_turns=100, self_review_max_turns=3) + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # The review config passed to run_agent should have max_turns=3 + call_args = mock_asyncio_run.call_args + coro = call_args[0][0] + # Close the coroutine to avoid warnings + coro.close() + + @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n") + @patch("self_review.asyncio.run") + def test_review_turns_uses_remaining_when_less_than_cap(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult(status="success", turns=1, num_turns=1) + config = _make_config(max_turns=8, self_review_max_turns=5) + setup = _make_setup() + agent_result = _make_agent_result(turns=6) # Only 2 remaining + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # Should use min(2, 5) = 2 turns + call_args = mock_asyncio_run.call_args + coro = call_args[0][0] + coro.close() + + +class TestHappyPath: + """Test the full happy path with mocked run_agent.""" + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_returns_review_result(self, mock_asyncio_run, mock_diff): + review_agent_result = AgentResult( + status="success", turns=2, num_turns=2, cost_usd=0.10 + ) + mock_asyncio_run.return_value = review_agent_result + + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result(turns=5) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + + assert result is not None + assert result.status == "success" + assert result.turns == 2 + assert result.cost_usd == 0.10 + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_writes_progress_milestones(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult( + status="success", turns=1, num_turns=1, cost_usd=0.05 + ) + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + run_self_review(config, setup, agent_result, trajectory, progress) + + # Should write both started and complete milestones + milestone_calls = progress.write_agent_milestone.call_args_list + assert len(milestone_calls) == 2 + assert milestone_calls[0][0][0] == "self_review_started" + assert milestone_calls[1][0][0] == "self_review_complete" + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_fail_open_on_agent_error(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.side_effect = RuntimeError("SDK crashed") + + config = _make_config() + setup = _make_setup() + agent_result = _make_agent_result() + trajectory = MagicMock() + progress = MagicMock() + + # Should not raise + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is None + + # Should still write milestones + milestone_calls = progress.write_agent_milestone.call_args_list + assert milestone_calls[0][0][0] == "self_review_started" + assert milestone_calls[1][0][0] == "self_review_complete" + assert "error" in milestone_calls[1][0][1] + + @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n") + @patch("self_review.asyncio.run") + def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff): + mock_asyncio_run.return_value = AgentResult( + status="success", turns=1, num_turns=1, cost_usd=0.03 + ) + config = _make_config(max_budget_usd=None) + setup = _make_setup() + agent_result = _make_agent_result(cost_usd=None) + trajectory = MagicMock() + progress = MagicMock() + + result = run_self_review(config, setup, agent_result, trajectory, progress) + assert result is not None + assert result.status == "success" From 19b763c06f42a835a1bb74c17c79a80eefabc535 Mon Sep 17 00:00:00 2001 From: bgagent Date: Thu, 4 Jun 2026 15:29:18 -0400 Subject: [PATCH 2/2] feat(agent): post self-review summary as PR comment (#263) After the self-review phase completes, the agent writes a structured summary of findings to `.self-review-summary.md`. The pipeline reads this file, posts it as a PR comment via `gh pr comment`, then deletes it so it never appears in the PR diff. Fail-open: if the file is missing or the comment fails to post, the pipeline continues normally. --- agent/src/pipeline.py | 5 + agent/src/post_hooks.py | 62 +++++++++++ agent/src/prompts/self_review.py | 21 ++++ agent/src/self_review.py | 41 ++++++++ agent/tests/test_self_review.py | 170 ++++++++++++++++++++++++++++++- 5 files changed, 298 insertions(+), 1 deletion(-) diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index 9881eb9c..783d12c7 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -23,6 +23,7 @@ _extract_agent_notes, ensure_committed, ensure_pr, + post_self_review_comment, verify_build, verify_lint, ) @@ -664,6 +665,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: if pr_url: progress.write_agent_milestone("pr_created", pr_url) + # Post self-review summary as PR comment (if self-review ran and produced findings) + if pr_url and review_result is not None: + post_self_review_comment(setup.repo_dir, pr_url, config) + # Memory write — capture task episode and repo learnings memory_written = False effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "") diff --git a/agent/src/post_hooks.py b/agent/src/post_hooks.py index e66ae9bb..5cbcce6f 100644 --- a/agent/src/post_hooks.py +++ b/agent/src/post_hooks.py @@ -327,6 +327,68 @@ def ensure_pr( return None +def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool: + """Post the self-review summary as a PR comment. + + Reads the summary file written by the self-review agent, formats it as a + comment, and posts it via `gh pr comment`. Fail-open: exceptions are logged + but never propagated. + + Returns True if a comment was posted, False otherwise. + """ + from self_review import read_self_review_summary + + try: + summary = read_self_review_summary(repo_dir) + except Exception as e: + log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}") + return False + + if not summary: + log("POST", "post_self_review_comment: no summary file found — skipping") + return False + + # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123) + match = re.search(r"/pull/(\d+)", pr_url) + if not match: + log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}") + return False + pr_number = match.group(1) + + comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}" + + try: + result = subprocess.run( + [ + "gh", + "pr", + "comment", + pr_number, + "--repo", + config.repo_url, + "--body", + comment_body, + ], + cwd=repo_dir, + capture_output=True, + text=True, + timeout=60, + ) + if result.returncode == 0: + log("POST", f"Self-review summary posted as comment on PR #{pr_number}") + return True + stderr = result.stderr.strip()[:200] if result.stderr else "" + log( + "WARN", + f"post_self_review_comment: gh pr comment failed " + f"(rc={result.returncode}): {stderr}", + ) + return False + except (subprocess.TimeoutExpired, OSError) as e: + log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}") + return False + + def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None: """Extract the "## Agent notes" section from the PR body. diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py index 9a6d109a..25cf47a2 100644 --- a/agent/src/prompts/self_review.py +++ b/agent/src/prompts/self_review.py @@ -37,4 +37,25 @@ concrete bug or security issue. - Keep fixes minimal and focused. Each fix should be a separate commit with a \ clear message. + +## Summary output + +After completing your review (whether you made fixes or not), write a file \ +`.self-review-summary.md` in the repository root with your findings in this format: + +```markdown +### Self-Review Summary + +**Findings:** +**Fixes applied:** + +#### Issues found + +- : +``` + +If no issues were found, write the file with: "No issues found — code looks good." + +This file is a pipeline artifact and will be deleted automatically — it will NOT \ +appear in the pull request. """ diff --git a/agent/src/self_review.py b/agent/src/self_review.py index 20814190..bb53e3b7 100644 --- a/agent/src/self_review.py +++ b/agent/src/self_review.py @@ -3,6 +3,8 @@ from __future__ import annotations import asyncio +import contextlib +import os import subprocess from typing import TYPE_CHECKING @@ -196,3 +198,42 @@ def run_self_review( f"status={review_result.status} turns={review_result.turns}", ) return review_result + + +_SUMMARY_FILENAME = ".self-review-summary.md" + + +def read_self_review_summary(repo_dir: str) -> str | None: + """Read and delete the self-review summary file. + + The self-review agent writes `.self-review-summary.md` in the repo root. + This function reads the content, removes the file (so it never appears in + the PR), and returns the content. Returns None if the file doesn't exist. + """ + summary_path = os.path.join(repo_dir, _SUMMARY_FILENAME) + if not os.path.isfile(summary_path): + return None + + try: + with open(summary_path) as f: + content = f.read() + except OSError as e: + log("WARN", f"self_review: failed to read summary file: {type(e).__name__}: {e}") + return None + + # Remove the file so it doesn't end up in the PR + try: + os.remove(summary_path) + except OSError as e: + log("WARN", f"self_review: failed to delete summary file: {type(e).__name__}: {e}") + + # If the file was staged by the agent, unstage it + with contextlib.suppress(subprocess.TimeoutExpired, OSError): + subprocess.run( + ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME], + cwd=repo_dir, + capture_output=True, + timeout=30, + ) + + return content.strip() if content.strip() else None diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py index bd39741a..e2a817f1 100644 --- a/agent/tests/test_self_review.py +++ b/agent/tests/test_self_review.py @@ -1,9 +1,17 @@ """Unit tests for self_review.py — self-review orchestration module.""" +import os from unittest.mock import MagicMock, patch from models import AgentResult, RepoSetup, TaskConfig -from self_review import _MAX_DIFF_CHARS, _get_diff, _truncate_diff, run_self_review +from self_review import ( + _MAX_DIFF_CHARS, + _SUMMARY_FILENAME, + _get_diff, + _truncate_diff, + read_self_review_summary, + run_self_review, +) def _make_config(**overrides) -> TaskConfig: @@ -314,3 +322,163 @@ def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff): result = run_self_review(config, setup, agent_result, trajectory, progress) assert result is not None assert result.status == "success" + + +class TestReadSelfReviewSummary: + """Tests for read_self_review_summary — reads and cleans up the summary file.""" + + def test_returns_content_when_file_exists(self, tmp_path): + summary_content = ( + "### Self-Review Summary\n\n" + "**Findings:** 2\n" + "**Fixes applied:** 1\n\n" + "#### Issues found\n\n" + "- Security: hardcoded token — fixed\n" + "- Style: inconsistent naming — not fixed (cosmetic)\n" + ) + (tmp_path / _SUMMARY_FILENAME).write_text(summary_content) + + result = read_self_review_summary(str(tmp_path)) + + assert result == summary_content.strip() + + def test_returns_none_when_file_missing(self, tmp_path): + result = read_self_review_summary(str(tmp_path)) + assert result is None + + def test_deletes_file_after_reading(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("No issues found — code looks good.") + + read_self_review_summary(str(tmp_path)) + + assert not (tmp_path / _SUMMARY_FILENAME).exists() + + def test_returns_none_for_empty_file(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text(" \n\n ") + + result = read_self_review_summary(str(tmp_path)) + + assert result is None + + def test_returns_none_for_whitespace_only(self, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("\t\n ") + + result = read_self_review_summary(str(tmp_path)) + + assert result is None + + @patch("self_review.subprocess.run") + def test_runs_git_rm_cached_for_cleanup(self, mock_run, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("Some findings") + mock_run.return_value = MagicMock(returncode=0) + + read_self_review_summary(str(tmp_path)) + + mock_run.assert_called_once_with( + ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME], + cwd=str(tmp_path), + capture_output=True, + timeout=30, + ) + + @patch("self_review.subprocess.run", side_effect=OSError("git not found")) + def test_git_rm_failure_does_not_block(self, mock_run, tmp_path): + (tmp_path / _SUMMARY_FILENAME).write_text("Findings here") + + # Should not raise even if git rm fails + result = read_self_review_summary(str(tmp_path)) + + assert result == "Findings here" + + +class TestPostSelfReviewComment: + """Tests for post_hooks.post_self_review_comment.""" + + def test_posts_comment_on_success(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1\n**Fixes applied:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/42" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is True + call_args = mock_run.call_args[0][0] + assert call_args[0:3] == ["gh", "pr", "comment"] + assert "42" in call_args + assert "owner/repo" in call_args + + def test_returns_false_when_no_summary(self, tmp_path): + from post_hooks import post_self_review_comment + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/42" + + result = post_self_review_comment(str(tmp_path), pr_url, config) + assert result is False + + def test_returns_false_on_invalid_pr_url(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("Some findings") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/issues/42" + + result = post_self_review_comment(str(tmp_path), pr_url, config) + assert result is False + + def test_returns_false_on_gh_failure(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/99" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found") + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is False + + def test_fail_open_on_exception(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/5" + + with patch("post_hooks.subprocess.run", side_effect=OSError("network error")): + result = post_self_review_comment(str(tmp_path), pr_url, config) + + assert result is False + + def test_comment_body_includes_header(self, tmp_path): + from post_hooks import post_self_review_comment + + (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 0\nNo issues.") + + config = MagicMock() + config.repo_url = "owner/repo" + pr_url = "https://github.com/owner/repo/pull/7" + + with patch("post_hooks.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + post_self_review_comment(str(tmp_path), pr_url, config) + + call_args = mock_run.call_args[0][0] + body_idx = call_args.index("--body") + 1 + body = call_args[body_idx] + assert "Self-Review Summary" in body + assert "**Findings:** 0" in body