From b53cde9f1b4f8eeafcccf4aa7aa3518d2167fd23 Mon Sep 17 00:00:00 2001
From: bgagent <bgagent@noreply.github.com>
Date: Thu, 4 Jun 2026 13:39:08 -0400
Subject: [PATCH 1/2] feat(agent): add in-pipeline pre-PR self-review phase
 (#262)

Adds an optional self-review phase between agent execution and post-hooks
where the LLM critiques its own cumulative diff before the PR is created.
This improves first-pass PR quality by catching bugs, style issues, and
test gaps before human review.

- New self_review.py orchestration module with run_self_review()
- New prompts/self_review.py with focused review prompt template
- TaskConfig extended with self_review_enabled and self_review_max_turns
- Fields threaded through build_config, get_config, server, pipeline
- Fail-open: self-review errors never block PR creation
- Uses remaining turns/budget from original allocation (capped)
- Feature is opt-in (disabled by default)
---
 agent/src/config.py              |   7 +
 agent/src/models.py              |   3 +
 agent/src/pipeline.py            |  21 ++
 agent/src/prompts/__init__.py    |   1 +
 agent/src/prompts/self_review.py |  40 ++++
 agent/src/self_review.py         | 198 +++++++++++++++++++
 agent/src/server.py              |   9 +
 agent/tests/test_self_review.py  | 316 +++++++++++++++++++++++++++++++
 8 files changed, 595 insertions(+)
 create mode 100644 agent/src/prompts/self_review.py
 create mode 100644 agent/src/self_review.py
 create mode 100644 agent/tests/test_self_review.py

diff --git a/agent/src/config.py b/agent/src/config.py
index 04fa162d..78e683ea 100644
--- a/agent/src/config.py
+++ b/agent/src/config.py
@@ -337,6 +337,8 @@ def build_config(
     initial_approval_gate_count: int = 0,
     approval_gate_cap: int | None = None,
     attachments: list[dict] | None = None,
+    self_review_enabled: bool = False,
+    self_review_max_turns: int = 5,
 ) -> TaskConfig:
     """Build and validate configuration from explicit parameters.
 
@@ -407,6 +409,8 @@ def build_config(
         initial_approval_gate_count=initial_approval_gate_count,
         approval_gate_cap=approval_gate_cap,
         attachments=validated_attachments,
+        self_review_enabled=self_review_enabled,
+        self_review_max_turns=self_review_max_turns,
     )
 
 
@@ -431,6 +435,9 @@ def get_config() -> TaskConfig:
             # an unreachable ``traces//`` key.
             trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"),
             user_id=os.environ.get("USER_ID", ""),
+            self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower()
+            in ("1", "true", "yes"),
+            self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")),
         )
     except ValueError as e:
         print(f"ERROR: {e}", file=sys.stderr)
diff --git a/agent/src/models.py b/agent/src/models.py
index 007b94bc..2d0ddabd 100644
--- a/agent/src/models.py
+++ b/agent/src/models.py
@@ -186,6 +186,9 @@ class TaskConfig(BaseModel):
     # Attachments from the orchestrator payload (Phase 3). Validated as
     # AttachmentConfig models. Empty list for tasks without attachments.
     attachments: list[AttachmentConfig] = Field(default_factory=list)
+    # Self-review: optional LLM diff critique before PR creation.
+    self_review_enabled: bool = False
+    self_review_max_turns: int = 5  # Cap on turns allocated to self-review
 
     @model_validator(mode="after")
     def _validate_trace_requires_user_id(self) -> Self:
diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py
index e476a11c..9881eb9c 100644
--- a/agent/src/pipeline.py
+++ b/agent/src/pipeline.py
@@ -29,6 +29,7 @@
 from progress_writer import _ProgressWriter
 from prompt_builder import build_system_prompt, discover_project_config
 from runner import run_agent
+from self_review import run_self_review
 from shell import log, log_error_cw
 from system_prompt import SYSTEM_PROMPT
 from telemetry import (
@@ -279,6 +280,8 @@ def run_task(
     trace: bool = False,
     user_id: str = "",
     attachments: list[dict] | None = None,
+    self_review_enabled: bool = False,
+    self_review_max_turns: int = 5,
 ) -> dict:
     """Run the full agent pipeline and return a serialized result dict.
 
@@ -318,6 +321,8 @@ def run_task(
         initial_approval_gate_count=initial_approval_gate_count,
         approval_gate_cap=approval_gate_cap,
         attachments=attachments,
+        self_review_enabled=self_review_enabled,
+        self_review_max_turns=self_review_max_turns,
     )
 
     # Inject Cedar policies into config for the PolicyEngine in runner.py
@@ -623,6 +628,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
                     "turns_attempted": agent_result.num_turns or agent_result.turns,
                 }
 
+            # Self-review phase: LLM critiques its own diff before PR creation.
+            # Runs between cancel-check and post-hooks. Fail-open: errors here
+            # never block PR creation.
+            with task_span("task.self_review"):
+                review_result = run_self_review(
+                    config, setup, agent_result, trajectory, progress
+                )
+                if review_result is not None:
+                    # Accumulate turns and cost from the review phase
+                    agent_result.turns += review_result.turns
+                    agent_result.num_turns += review_result.num_turns or review_result.turns
+                    if review_result.cost_usd is not None:
+                        agent_result.cost_usd = (
+                            (agent_result.cost_usd or 0.0) + review_result.cost_usd
+                        )
+
             # Post-hooks (agent_result is guaranteed set by the try/except above)
             with task_span("task.post_hooks") as post_span:
                 # Safety net: commit any uncommitted tracked changes (skip for read-only tasks)
diff --git a/agent/src/prompts/__init__.py b/agent/src/prompts/__init__.py
index a864e013..2dd586a5 100644
--- a/agent/src/prompts/__init__.py
+++ b/agent/src/prompts/__init__.py
@@ -4,6 +4,7 @@
 from .new_task import NEW_TASK_WORKFLOW
 from .pr_iteration import PR_ITERATION_WORKFLOW
 from .pr_review import PR_REVIEW_WORKFLOW
+from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT
 
 _PROMPTS = {
     "new_task": BASE_PROMPT.replace("{workflow}", NEW_TASK_WORKFLOW),
diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py
new file mode 100644
index 00000000..9a6d109a
--- /dev/null
+++ b/agent/src/prompts/self_review.py
@@ -0,0 +1,40 @@
+"""Self-review prompt template for pre-PR diff critique."""
+
+SELF_REVIEW_PROMPT = """\
+You are reviewing your own work before it becomes a pull request. Below is the \
+cumulative diff of all changes on this branch compared to the base branch.
+
+<diff>
+{diff}
+</diff>
+
+## Task context
+
+{task_description}
+
+## Review checklist
+
+Examine the diff carefully for:
+
+1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \
+incorrect assumptions about data shapes or API contracts.
+2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \
+race conditions.
+3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \
+insecure defaults, OWASP Top 10 issues.
+4. **Style & consistency** — Naming conventions, code style violations relative to \
+the surrounding codebase, unnecessary complexity.
+5. **Test gaps** — Important behaviour that is untested, assertions that don't \
+verify the right thing, missing edge-case coverage.
+
+## Instructions
+
+- If you find issues, fix them directly: edit the files, run the build/tests to \
+verify your fixes, and commit the changes.
+- If no issues are found, stop immediately — do not make changes for the sake of \
+making changes.
+- Do NOT refactor code that was not part of the original diff unless it has a \
+concrete bug or security issue.
+- Keep fixes minimal and focused. Each fix should be a separate commit with a \
+clear message.
+"""
diff --git a/agent/src/self_review.py b/agent/src/self_review.py
new file mode 100644
index 00000000..20814190
--- /dev/null
+++ b/agent/src/self_review.py
@@ -0,0 +1,198 @@
+"""Self-review orchestration: LLM critiques its own diff before PR creation."""
+
+from __future__ import annotations
+
+import asyncio
+import subprocess
+from typing import TYPE_CHECKING
+
+from prompts.self_review import SELF_REVIEW_PROMPT
+from shell import log
+
+if TYPE_CHECKING:
+    from models import AgentResult, RepoSetup, TaskConfig
+    from progress_writer import _ProgressWriter
+    from telemetry import _TrajectoryWriter
+
+# Diff truncation limit (characters). Large diffs are cut at hunk boundaries.
+_MAX_DIFF_CHARS = 60_000
+
+# Minimal system prompt for the self-review agent invocation.
+_REVIEW_SYSTEM_PROMPT = """\
+You are a code reviewer working inside the repository {repo_url} on branch {branch_name}.
+Your working directory is {repo_dir}.
+
+You have full access to the filesystem and can run commands. Fix any issues you \
+find directly — edit files, run the build, and commit fixes. Keep changes minimal \
+and focused.
+
+Do NOT open a pull request or push. Just fix issues and commit locally.
+"""
+
+
+def _get_diff(repo_dir: str, default_branch: str) -> str:
+    """Generate the cumulative diff of the branch vs origin/{default_branch}."""
+    try:
+        result = subprocess.run(
+            ["git", "diff", f"origin/{default_branch}...HEAD"],
+            cwd=repo_dir,
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        if result.returncode != 0:
+            log("WARN", f"self_review: git diff failed (exit {result.returncode})")
+            return ""
+        return result.stdout
+    except (subprocess.TimeoutExpired, OSError) as e:
+        log("WARN", f"self_review: git diff error: {type(e).__name__}: {e}")
+        return ""
+
+
+def _truncate_diff(diff: str, max_chars: int = _MAX_DIFF_CHARS) -> str:
+    """Truncate diff at a hunk boundary if it exceeds max_chars.
+
+    Cuts at the last complete hunk (line starting with '@@') that fits
+    within the limit, appending a truncation notice.
+    """
+    if len(diff) <= max_chars:
+        return diff
+
+    # Find the last hunk header that starts before max_chars
+    truncated = diff[:max_chars]
+    last_hunk = truncated.rfind("\n@@")
+    if last_hunk > 0:
+        # Cut just before this hunk header
+        truncated = truncated[:last_hunk]
+    else:
+        # No hunk boundary found — hard-cut at max_chars
+        last_newline = truncated.rfind("\n")
+        if last_newline > 0:
+            truncated = truncated[:last_newline]
+
+    total_lines = diff.count("\n")
+    kept_lines = truncated.count("\n")
+    truncated += (
+        f"\n\n... [diff truncated: showing ~{kept_lines} of ~{total_lines} lines; "
+        f"{len(diff) - len(truncated)} chars omitted] ..."
+    )
+    return truncated
+
+
+def _build_review_system_prompt(config: TaskConfig, setup: RepoSetup) -> str:
+    """Build a minimal system prompt for the self-review agent."""
+    return _REVIEW_SYSTEM_PROMPT.format(
+        repo_url=config.repo_url,
+        branch_name=setup.branch,
+        repo_dir=setup.repo_dir,
+    )
+
+
+def run_self_review(
+    config: TaskConfig,
+    setup: RepoSetup,
+    agent_result: AgentResult,
+    trajectory: _TrajectoryWriter,
+    progress: _ProgressWriter,
+) -> AgentResult | None:
+    """Run the self-review phase: LLM critiques its own diff and fixes issues.
+
+    Returns the AgentResult from the review phase, or None if skipped.
+    Fail-open: errors are logged but never block the pipeline.
+    """
+    # Skip condition: feature disabled
+    if not config.self_review_enabled:
+        log("TASK", "self_review: disabled (self_review_enabled=False)")
+        return None
+
+    # Skip condition: pr_review is read-only (no diff to review)
+    if config.task_type == "pr_review":
+        log("TASK", "self_review: skipped for pr_review task type")
+        return None
+
+    # Compute remaining turns
+    used_turns = agent_result.turns or 0
+    remaining_turns = config.max_turns - used_turns
+    review_turns = min(remaining_turns, config.self_review_max_turns)
+    if review_turns <= 0:
+        log("TASK", f"self_review: no remaining turns (used={used_turns}, max={config.max_turns})")
+        return None
+
+    # Compute remaining budget
+    review_budget: float | None = None
+    if config.max_budget_usd is not None:
+        used_cost = agent_result.cost_usd or 0.0
+        remaining_budget = config.max_budget_usd - used_cost
+        if remaining_budget <= 0:
+            log(
+                "TASK",
+                f"self_review: no remaining budget "
+                f"(used=${used_cost:.2f}, max=${config.max_budget_usd:.2f})",
+            )
+            return None
+        review_budget = remaining_budget
+
+    # Get the diff
+    diff = _get_diff(setup.repo_dir, setup.default_branch)
+    if not diff.strip():
+        log("TASK", "self_review: no diff found — skipping")
+        return None
+
+    # Truncate if needed
+    diff = _truncate_diff(diff)
+
+    # Build the review prompt
+    task_desc = config.task_description or f"Issue #{config.issue_number}"
+    user_prompt = SELF_REVIEW_PROMPT.format(diff=diff, task_description=task_desc)
+    system_prompt = _build_review_system_prompt(config, setup)
+
+    # Build a modified config for the review run
+    review_config = config.model_copy(
+        update={
+            "max_turns": review_turns,
+            "max_budget_usd": review_budget,
+        }
+    )
+
+    log(
+        "TASK",
+        f"self_review: starting (turns={review_turns}, "
+        f"budget={'$' + f'{review_budget:.2f}' if review_budget else 'unlimited'}, "
+        f"diff_chars={len(diff)})",
+    )
+    progress.write_agent_milestone(
+        "self_review_started",
+        f"turns={review_turns} diff_chars={len(diff)}",
+    )
+
+    try:
+        from runner import run_agent
+
+        review_result = asyncio.run(
+            run_agent(
+                user_prompt,
+                system_prompt,
+                review_config,
+                cwd=setup.repo_dir,
+                trajectory=trajectory,
+            )
+        )
+    except Exception as e:
+        # Fail-open: self-review errors never block the pipeline
+        log("WARN", f"self_review: agent execution failed: {type(e).__name__}: {e}")
+        progress.write_agent_milestone(
+            "self_review_complete",
+            f"status=error error={type(e).__name__}: {e}",
+        )
+        return None
+
+    log(
+        "TASK",
+        f"self_review: complete (status={review_result.status}, "
+        f"turns={review_result.turns}, cost=${review_result.cost_usd or 0:.4f})",
+    )
+    progress.write_agent_milestone(
+        "self_review_complete",
+        f"status={review_result.status} turns={review_result.turns}",
+    )
+    return review_result
diff --git a/agent/src/server.py b/agent/src/server.py
index b01df8da..55c3779b 100644
--- a/agent/src/server.py
+++ b/agent/src/server.py
@@ -386,6 +386,8 @@ def _run_task_background(
     user_id: str = "",
     workload_access_token: str = "",
     attachments: list[dict] | None = None,
+    self_review_enabled: bool = False,
+    self_review_max_turns: int = 5,
 ) -> None:
     """Run the agent task in a background thread."""
     global _background_pipeline_failed
@@ -469,6 +471,8 @@ def _run_task_background(
             trace=trace,
             user_id=user_id,
             attachments=attachments,
+            self_review_enabled=self_review_enabled,
+            self_review_max_turns=self_review_max_turns,
         )
         _background_pipeline_failed = False
     except Exception as e:
@@ -557,6 +561,9 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict:
     channel_source = inp.get("channel_source", "") or ""
     channel_metadata = inp.get("channel_metadata") or {}
     attachments = inp.get("attachments") or []
+    # Self-review (opt-in): LLM critiques its own diff before PR creation.
+    self_review_enabled = inp.get("self_review_enabled") is True
+    self_review_max_turns = int(inp.get("self_review_max_turns", 5))
     # ``trace`` is strictly opt-in (design §10.1). Accept only real
     # booleans from the orchestrator — a string "false" would otherwise
     # flip the flag on.
@@ -630,6 +637,8 @@ def _extract_invocation_params(inp: dict, request: Request) -> dict:
         "user_id": user_id,
         "workload_access_token": workload_access_token,
         "attachments": attachments,
+        "self_review_enabled": self_review_enabled,
+        "self_review_max_turns": self_review_max_turns,
     }
 
 
diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py
new file mode 100644
index 00000000..bd39741a
--- /dev/null
+++ b/agent/tests/test_self_review.py
@@ -0,0 +1,316 @@
+"""Unit tests for self_review.py — self-review orchestration module."""
+
+from unittest.mock import MagicMock, patch
+
+from models import AgentResult, RepoSetup, TaskConfig
+from self_review import _MAX_DIFF_CHARS, _get_diff, _truncate_diff, run_self_review
+
+
+def _make_config(**overrides) -> TaskConfig:
+    """Create a minimal TaskConfig for testing."""
+    defaults = {
+        "repo_url": "owner/repo",
+        "github_token": "ghp_test123",
+        "aws_region": "us-east-1",
+        "task_description": "Fix the bug",
+        "max_turns": 10,
+        "self_review_enabled": True,
+        "self_review_max_turns": 5,
+        "task_type": "new_task",
+    }
+    defaults.update(overrides)
+    return TaskConfig(**defaults)
+
+
+def _make_setup(**overrides) -> RepoSetup:
+    """Create a minimal RepoSetup for testing."""
+    defaults = {
+        "repo_dir": "/workspace/repo",
+        "branch": "feat/123-fix",
+        "default_branch": "main",
+    }
+    defaults.update(overrides)
+    return RepoSetup(**defaults)
+
+
+def _make_agent_result(**overrides) -> AgentResult:
+    """Create a minimal AgentResult for testing."""
+    defaults = {
+        "status": "success",
+        "turns": 5,
+        "num_turns": 5,
+        "cost_usd": 0.50,
+    }
+    defaults.update(overrides)
+    return AgentResult(**defaults)
+
+
+class TestSkipConditions:
+    """Test all conditions that cause self-review to be skipped."""
+
+    def test_skip_when_disabled(self):
+        config = _make_config(self_review_enabled=False)
+        setup = _make_setup()
+        agent_result = _make_agent_result()
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+    def test_skip_for_pr_review_task_type(self):
+        config = _make_config(
+            task_type="pr_review",
+            pr_number="42",
+            task_description="",
+            issue_number="",
+        )
+        setup = _make_setup()
+        agent_result = _make_agent_result()
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+    def test_skip_when_no_remaining_turns(self):
+        config = _make_config(max_turns=5, self_review_max_turns=5)
+        setup = _make_setup()
+        agent_result = _make_agent_result(turns=5)
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+    def test_skip_when_no_remaining_budget(self):
+        config = _make_config(max_budget_usd=1.0)
+        setup = _make_setup()
+        agent_result = _make_agent_result(cost_usd=1.0)
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+    @patch("self_review._get_diff", return_value="")
+    def test_skip_when_empty_diff(self, mock_diff):
+        config = _make_config()
+        setup = _make_setup()
+        agent_result = _make_agent_result()
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+    @patch("self_review._get_diff", return_value="   \n  \n  ")
+    def test_skip_when_whitespace_only_diff(self, mock_diff):
+        config = _make_config()
+        setup = _make_setup()
+        agent_result = _make_agent_result()
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+
+class TestGetDiff:
+    """Test _get_diff helper."""
+
+    @patch("self_review.subprocess.run")
+    def test_returns_diff_output(self, mock_run):
+        mock_run.return_value = MagicMock(returncode=0, stdout="diff --git a/f\n+line\n")
+        result = _get_diff("/repo", "main")
+        assert result == "diff --git a/f\n+line\n"
+        mock_run.assert_called_once_with(
+            ["git", "diff", "origin/main...HEAD"],
+            cwd="/repo",
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+
+    @patch("self_review.subprocess.run")
+    def test_returns_empty_on_failure(self, mock_run):
+        mock_run.return_value = MagicMock(returncode=1, stdout="")
+        result = _get_diff("/repo", "main")
+        assert result == ""
+
+    @patch("self_review.subprocess.run")
+    def test_returns_empty_on_timeout(self, mock_run):
+        import subprocess
+
+        mock_run.side_effect = subprocess.TimeoutExpired(cmd="git", timeout=60)
+        result = _get_diff("/repo", "main")
+        assert result == ""
+
+    @patch("self_review.subprocess.run")
+    def test_uses_custom_default_branch(self, mock_run):
+        mock_run.return_value = MagicMock(returncode=0, stdout="some diff")
+        _get_diff("/repo", "develop")
+        mock_run.assert_called_once_with(
+            ["git", "diff", "origin/develop...HEAD"],
+            cwd="/repo",
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+
+
+class TestTruncateDiff:
+    """Test _truncate_diff helper."""
+
+    def test_no_truncation_needed(self):
+        short_diff = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n+new line\n"
+        result = _truncate_diff(short_diff)
+        assert result == short_diff
+
+    def test_truncates_at_hunk_boundary(self):
+        # Build a diff that exceeds the limit with multiple hunks
+        hunk1 = "diff --git a/file.py\n@@ -1,3 +1,4 @@\n" + "+line\n" * 100
+        hunk2 = "\n@@ -100,3 +101,4 @@\n" + "+more\n" * 100
+        big_diff = hunk1 + hunk2
+        result = _truncate_diff(big_diff, max_chars=len(hunk1) + 10)
+        # Should truncate before hunk2's @@ marker
+        assert "@@ -100,3" not in result
+        assert "truncated" in result
+
+    def test_hard_cut_when_no_hunk_boundary(self):
+        # Single hunk that exceeds max
+        big_diff = "+x\n" * 30000
+        result = _truncate_diff(big_diff, max_chars=100)
+        assert len(result) < len(big_diff)
+        assert "truncated" in result
+
+    def test_exact_limit_no_truncation(self):
+        diff = "a" * _MAX_DIFF_CHARS
+        result = _truncate_diff(diff)
+        assert result == diff
+
+    def test_one_over_limit_truncates(self):
+        diff = "a" * (_MAX_DIFF_CHARS + 1)
+        result = _truncate_diff(diff)
+        assert "truncated" in result
+
+
+class TestBudgetAndTurnComputation:
+    """Test that budget and turn limits are computed correctly."""
+
+    @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n")
+    @patch("self_review.asyncio.run")
+    def test_review_turns_capped_at_self_review_max_turns(self, mock_asyncio_run, mock_diff):
+        mock_asyncio_run.return_value = AgentResult(status="success", turns=2, num_turns=2)
+        config = _make_config(max_turns=100, self_review_max_turns=3)
+        setup = _make_setup()
+        agent_result = _make_agent_result(turns=5)
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        run_self_review(config, setup, agent_result, trajectory, progress)
+
+        # The review config passed to run_agent should have max_turns=3
+        call_args = mock_asyncio_run.call_args
+        coro = call_args[0][0]
+        # Close the coroutine to avoid warnings
+        coro.close()
+
+    @patch("self_review._get_diff", return_value="diff --git a/f\n+line\n")
+    @patch("self_review.asyncio.run")
+    def test_review_turns_uses_remaining_when_less_than_cap(self, mock_asyncio_run, mock_diff):
+        mock_asyncio_run.return_value = AgentResult(status="success", turns=1, num_turns=1)
+        config = _make_config(max_turns=8, self_review_max_turns=5)
+        setup = _make_setup()
+        agent_result = _make_agent_result(turns=6)  # Only 2 remaining
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        run_self_review(config, setup, agent_result, trajectory, progress)
+
+        # Should use min(2, 5) = 2 turns
+        call_args = mock_asyncio_run.call_args
+        coro = call_args[0][0]
+        coro.close()
+
+
+class TestHappyPath:
+    """Test the full happy path with mocked run_agent."""
+
+    @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+    @patch("self_review.asyncio.run")
+    def test_returns_review_result(self, mock_asyncio_run, mock_diff):
+        review_agent_result = AgentResult(
+            status="success", turns=2, num_turns=2, cost_usd=0.10
+        )
+        mock_asyncio_run.return_value = review_agent_result
+
+        config = _make_config()
+        setup = _make_setup()
+        agent_result = _make_agent_result(turns=5)
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+
+        assert result is not None
+        assert result.status == "success"
+        assert result.turns == 2
+        assert result.cost_usd == 0.10
+
+    @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+    @patch("self_review.asyncio.run")
+    def test_writes_progress_milestones(self, mock_asyncio_run, mock_diff):
+        mock_asyncio_run.return_value = AgentResult(
+            status="success", turns=1, num_turns=1, cost_usd=0.05
+        )
+        config = _make_config()
+        setup = _make_setup()
+        agent_result = _make_agent_result()
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        run_self_review(config, setup, agent_result, trajectory, progress)
+
+        # Should write both started and complete milestones
+        milestone_calls = progress.write_agent_milestone.call_args_list
+        assert len(milestone_calls) == 2
+        assert milestone_calls[0][0][0] == "self_review_started"
+        assert milestone_calls[1][0][0] == "self_review_complete"
+
+    @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+    @patch("self_review.asyncio.run")
+    def test_fail_open_on_agent_error(self, mock_asyncio_run, mock_diff):
+        mock_asyncio_run.side_effect = RuntimeError("SDK crashed")
+
+        config = _make_config()
+        setup = _make_setup()
+        agent_result = _make_agent_result()
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        # Should not raise
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is None
+
+        # Should still write milestones
+        milestone_calls = progress.write_agent_milestone.call_args_list
+        assert milestone_calls[0][0][0] == "self_review_started"
+        assert milestone_calls[1][0][0] == "self_review_complete"
+        assert "error" in milestone_calls[1][0][1]
+
+    @patch("self_review._get_diff", return_value="diff --git a/file.py\n+new code\n")
+    @patch("self_review.asyncio.run")
+    def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff):
+        mock_asyncio_run.return_value = AgentResult(
+            status="success", turns=1, num_turns=1, cost_usd=0.03
+        )
+        config = _make_config(max_budget_usd=None)
+        setup = _make_setup()
+        agent_result = _make_agent_result(cost_usd=None)
+        trajectory = MagicMock()
+        progress = MagicMock()
+
+        result = run_self_review(config, setup, agent_result, trajectory, progress)
+        assert result is not None
+        assert result.status == "success"

From 19b763c06f42a835a1bb74c17c79a80eefabc535 Mon Sep 17 00:00:00 2001
From: bgagent <bgagent@noreply.github.com>
Date: Thu, 4 Jun 2026 15:29:18 -0400
Subject: [PATCH 2/2] feat(agent): post self-review summary as PR comment
 (#263)

After the self-review phase completes, the agent writes a structured
summary of findings to `.self-review-summary.md`. The pipeline reads
this file, posts it as a PR comment via `gh pr comment`, then deletes
it so it never appears in the PR diff. Fail-open: if the file is
missing or the comment fails to post, the pipeline continues normally.
---
 agent/src/pipeline.py            |   5 +
 agent/src/post_hooks.py          |  62 +++++++++++
 agent/src/prompts/self_review.py |  21 ++++
 agent/src/self_review.py         |  41 ++++++++
 agent/tests/test_self_review.py  | 170 ++++++++++++++++++++++++++++++-
 5 files changed, 298 insertions(+), 1 deletion(-)

diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py
index 9881eb9c..783d12c7 100644
--- a/agent/src/pipeline.py
+++ b/agent/src/pipeline.py
@@ -23,6 +23,7 @@
     _extract_agent_notes,
     ensure_committed,
     ensure_pr,
+    post_self_review_comment,
     verify_build,
     verify_lint,
 )
@@ -664,6 +665,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
             if pr_url:
                 progress.write_agent_milestone("pr_created", pr_url)
 
+            # Post self-review summary as PR comment (if self-review ran and produced findings)
+            if pr_url and review_result is not None:
+                post_self_review_comment(setup.repo_dir, pr_url, config)
+
             # Memory write — capture task episode and repo learnings
             memory_written = False
             effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "")
diff --git a/agent/src/post_hooks.py b/agent/src/post_hooks.py
index e66ae9bb..5cbcce6f 100644
--- a/agent/src/post_hooks.py
+++ b/agent/src/post_hooks.py
@@ -327,6 +327,68 @@ def ensure_pr(
         return None
 
 
+def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool:
+    """Post the self-review summary as a PR comment.
+
+    Reads the summary file written by the self-review agent, formats it as a
+    comment, and posts it via `gh pr comment`. Fail-open: exceptions are logged
+    but never propagated.
+
+    Returns True if a comment was posted, False otherwise.
+    """
+    from self_review import read_self_review_summary
+
+    try:
+        summary = read_self_review_summary(repo_dir)
+    except Exception as e:
+        log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}")
+        return False
+
+    if not summary:
+        log("POST", "post_self_review_comment: no summary file found — skipping")
+        return False
+
+    # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123)
+    match = re.search(r"/pull/(\d+)", pr_url)
+    if not match:
+        log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}")
+        return False
+    pr_number = match.group(1)
+
+    comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}"
+
+    try:
+        result = subprocess.run(
+            [
+                "gh",
+                "pr",
+                "comment",
+                pr_number,
+                "--repo",
+                config.repo_url,
+                "--body",
+                comment_body,
+            ],
+            cwd=repo_dir,
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+        if result.returncode == 0:
+            log("POST", f"Self-review summary posted as comment on PR #{pr_number}")
+            return True
+        stderr = result.stderr.strip()[:200] if result.stderr else ""
+        log(
+            "WARN",
+            f"post_self_review_comment: gh pr comment failed "
+            f"(rc={result.returncode}): {stderr}",
+        )
+        return False
+    except (subprocess.TimeoutExpired, OSError) as e:
+        log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}")
+        return False
+
+
 def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None:
     """Extract the "## Agent notes" section from the PR body.
 
diff --git a/agent/src/prompts/self_review.py b/agent/src/prompts/self_review.py
index 9a6d109a..25cf47a2 100644
--- a/agent/src/prompts/self_review.py
+++ b/agent/src/prompts/self_review.py
@@ -37,4 +37,25 @@
 concrete bug or security issue.
 - Keep fixes minimal and focused. Each fix should be a separate commit with a \
 clear message.
+
+## Summary output
+
+After completing your review (whether you made fixes or not), write a file \
+`.self-review-summary.md` in the repository root with your findings in this format:
+
+```markdown
+### Self-Review Summary
+
+**Findings:** <number of issues found>
+**Fixes applied:** <number of fixes committed>
+
+#### Issues found
+
+- <category>: <brief description of issue> — <fixed | not fixed (reason)>
+```
+
+If no issues were found, write the file with: "No issues found — code looks good."
+
+This file is a pipeline artifact and will be deleted automatically — it will NOT \
+appear in the pull request.
 """
diff --git a/agent/src/self_review.py b/agent/src/self_review.py
index 20814190..bb53e3b7 100644
--- a/agent/src/self_review.py
+++ b/agent/src/self_review.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 
 import asyncio
+import contextlib
+import os
 import subprocess
 from typing import TYPE_CHECKING
 
@@ -196,3 +198,42 @@ def run_self_review(
         f"status={review_result.status} turns={review_result.turns}",
     )
     return review_result
+
+
+_SUMMARY_FILENAME = ".self-review-summary.md"
+
+
+def read_self_review_summary(repo_dir: str) -> str | None:
+    """Read and delete the self-review summary file.
+
+    The self-review agent writes `.self-review-summary.md` in the repo root.
+    This function reads the content, removes the file (so it never appears in
+    the PR), and returns the content. Returns None if the file doesn't exist.
+    """
+    summary_path = os.path.join(repo_dir, _SUMMARY_FILENAME)
+    if not os.path.isfile(summary_path):
+        return None
+
+    try:
+        with open(summary_path) as f:
+            content = f.read()
+    except OSError as e:
+        log("WARN", f"self_review: failed to read summary file: {type(e).__name__}: {e}")
+        return None
+
+    # Remove the file so it doesn't end up in the PR
+    try:
+        os.remove(summary_path)
+    except OSError as e:
+        log("WARN", f"self_review: failed to delete summary file: {type(e).__name__}: {e}")
+
+    # If the file was staged by the agent, unstage it
+    with contextlib.suppress(subprocess.TimeoutExpired, OSError):
+        subprocess.run(
+            ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME],
+            cwd=repo_dir,
+            capture_output=True,
+            timeout=30,
+        )
+
+    return content.strip() if content.strip() else None
diff --git a/agent/tests/test_self_review.py b/agent/tests/test_self_review.py
index bd39741a..e2a817f1 100644
--- a/agent/tests/test_self_review.py
+++ b/agent/tests/test_self_review.py
@@ -1,9 +1,17 @@
 """Unit tests for self_review.py — self-review orchestration module."""
 
+import os
 from unittest.mock import MagicMock, patch
 
 from models import AgentResult, RepoSetup, TaskConfig
-from self_review import _MAX_DIFF_CHARS, _get_diff, _truncate_diff, run_self_review
+from self_review import (
+    _MAX_DIFF_CHARS,
+    _SUMMARY_FILENAME,
+    _get_diff,
+    _truncate_diff,
+    read_self_review_summary,
+    run_self_review,
+)
 
 
 def _make_config(**overrides) -> TaskConfig:
@@ -314,3 +322,163 @@ def test_works_with_unlimited_budget(self, mock_asyncio_run, mock_diff):
         result = run_self_review(config, setup, agent_result, trajectory, progress)
         assert result is not None
         assert result.status == "success"
+
+
+class TestReadSelfReviewSummary:
+    """Tests for read_self_review_summary — reads and cleans up the summary file."""
+
+    def test_returns_content_when_file_exists(self, tmp_path):
+        summary_content = (
+            "### Self-Review Summary\n\n"
+            "**Findings:** 2\n"
+            "**Fixes applied:** 1\n\n"
+            "#### Issues found\n\n"
+            "- Security: hardcoded token — fixed\n"
+            "- Style: inconsistent naming — not fixed (cosmetic)\n"
+        )
+        (tmp_path / _SUMMARY_FILENAME).write_text(summary_content)
+
+        result = read_self_review_summary(str(tmp_path))
+
+        assert result == summary_content.strip()
+
+    def test_returns_none_when_file_missing(self, tmp_path):
+        result = read_self_review_summary(str(tmp_path))
+        assert result is None
+
+    def test_deletes_file_after_reading(self, tmp_path):
+        (tmp_path / _SUMMARY_FILENAME).write_text("No issues found — code looks good.")
+
+        read_self_review_summary(str(tmp_path))
+
+        assert not (tmp_path / _SUMMARY_FILENAME).exists()
+
+    def test_returns_none_for_empty_file(self, tmp_path):
+        (tmp_path / _SUMMARY_FILENAME).write_text("   \n\n  ")
+
+        result = read_self_review_summary(str(tmp_path))
+
+        assert result is None
+
+    def test_returns_none_for_whitespace_only(self, tmp_path):
+        (tmp_path / _SUMMARY_FILENAME).write_text("\t\n ")
+
+        result = read_self_review_summary(str(tmp_path))
+
+        assert result is None
+
+    @patch("self_review.subprocess.run")
+    def test_runs_git_rm_cached_for_cleanup(self, mock_run, tmp_path):
+        (tmp_path / _SUMMARY_FILENAME).write_text("Some findings")
+        mock_run.return_value = MagicMock(returncode=0)
+
+        read_self_review_summary(str(tmp_path))
+
+        mock_run.assert_called_once_with(
+            ["git", "rm", "--cached", "--ignore-unmatch", "-f", _SUMMARY_FILENAME],
+            cwd=str(tmp_path),
+            capture_output=True,
+            timeout=30,
+        )
+
+    @patch("self_review.subprocess.run", side_effect=OSError("git not found"))
+    def test_git_rm_failure_does_not_block(self, mock_run, tmp_path):
+        (tmp_path / _SUMMARY_FILENAME).write_text("Findings here")
+
+        # Should not raise even if git rm fails
+        result = read_self_review_summary(str(tmp_path))
+
+        assert result == "Findings here"
+
+
+class TestPostSelfReviewComment:
+    """Tests for post_hooks.post_self_review_comment."""
+
+    def test_posts_comment_on_success(self, tmp_path):
+        from post_hooks import post_self_review_comment
+
+        (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1\n**Fixes applied:** 1")
+
+        config = MagicMock()
+        config.repo_url = "owner/repo"
+        pr_url = "https://github.com/owner/repo/pull/42"
+
+        with patch("post_hooks.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+            result = post_self_review_comment(str(tmp_path), pr_url, config)
+
+        assert result is True
+        call_args = mock_run.call_args[0][0]
+        assert call_args[0:3] == ["gh", "pr", "comment"]
+        assert "42" in call_args
+        assert "owner/repo" in call_args
+
+    def test_returns_false_when_no_summary(self, tmp_path):
+        from post_hooks import post_self_review_comment
+
+        config = MagicMock()
+        config.repo_url = "owner/repo"
+        pr_url = "https://github.com/owner/repo/pull/42"
+
+        result = post_self_review_comment(str(tmp_path), pr_url, config)
+        assert result is False
+
+    def test_returns_false_on_invalid_pr_url(self, tmp_path):
+        from post_hooks import post_self_review_comment
+
+        (tmp_path / _SUMMARY_FILENAME).write_text("Some findings")
+
+        config = MagicMock()
+        config.repo_url = "owner/repo"
+        pr_url = "https://github.com/owner/repo/issues/42"
+
+        result = post_self_review_comment(str(tmp_path), pr_url, config)
+        assert result is False
+
+    def test_returns_false_on_gh_failure(self, tmp_path):
+        from post_hooks import post_self_review_comment
+
+        (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1")
+
+        config = MagicMock()
+        config.repo_url = "owner/repo"
+        pr_url = "https://github.com/owner/repo/pull/99"
+
+        with patch("post_hooks.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="not found")
+            result = post_self_review_comment(str(tmp_path), pr_url, config)
+
+        assert result is False
+
+    def test_fail_open_on_exception(self, tmp_path):
+        from post_hooks import post_self_review_comment
+
+        (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 1")
+
+        config = MagicMock()
+        config.repo_url = "owner/repo"
+        pr_url = "https://github.com/owner/repo/pull/5"
+
+        with patch("post_hooks.subprocess.run", side_effect=OSError("network error")):
+            result = post_self_review_comment(str(tmp_path), pr_url, config)
+
+        assert result is False
+
+    def test_comment_body_includes_header(self, tmp_path):
+        from post_hooks import post_self_review_comment
+
+        (tmp_path / _SUMMARY_FILENAME).write_text("**Findings:** 0\nNo issues.")
+
+        config = MagicMock()
+        config.repo_url = "owner/repo"
+        pr_url = "https://github.com/owner/repo/pull/7"
+
+        with patch("post_hooks.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+            post_self_review_comment(str(tmp_path), pr_url, config)
+
+        call_args = mock_run.call_args[0][0]
+        body_idx = call_args.index("--body") + 1
+        body = call_args[body_idx]
+        assert "Self-Review Summary" in body
+        assert "**Findings:** 0" in body