diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py index 4f34da65..2135e8d6 100644 --- a/agent/src/pipeline.py +++ b/agent/src/pipeline.py @@ -8,6 +8,7 @@ import subprocess import sys import time +from typing import TYPE_CHECKING from pydantic import ValidationError @@ -38,6 +39,9 @@ upload_trace_to_s3, ) +if TYPE_CHECKING: + from workflow import Workflow + _SDK_NO_RESULT_MESSAGE = ( "Agent SDK stream ended without a ResultMessage (agent_status=unknown). " "Treat as failure: possible SDK bug, network interruption, or protocol mismatch." @@ -366,6 +370,84 @@ def _run_repoless_task( return result_dict +def _apply_post_hook_gates( + workflow: Workflow | None, + *, + read_only: bool, + build_passed: bool, + lint_passed: bool, + build_before: bool, + lint_before: bool, +) -> bool: + """Resolve the coding lane's post-hook verify gates against the workflow (#301). + + Decision (issue #301 acceptance criteria): the inline post-hook path + CONSULTS each declared ``verify_build`` / ``verify_lint`` step's ``gate`` + through the runner's ``gate_status`` — the single place gate semantics live — + rather than routing the post-hooks through the runner's step handlers. + Routing through the runner would also change failure-path side effects (a + gating ``verify_build`` with ``on_failure: fail`` stops the runner *before* + ``ensure_pr``, stranding committed work with no PR), which is the broader + half-migrated-runner unification the issue defers. Here the inline ordering + (verify → ensure_pr always runs) is preserved; only the task verdict honors + the declared gate. + + Per-step semantics: + + - A declared step gates per its ``gate`` (``strict`` | ``regression_only`` | + ``informational``; unset = ``regression_only``), but only when its + ``on_failure`` is ``fail`` — ``continue``/``skip_remaining`` steps are + advisory for the task verdict, matching the runner. + - An undeclared ``verify_build`` keeps the legacy regression-only gating + (identical to ``gate_status`` with ``gate=None``). + - An undeclared ``verify_lint`` never gates (legacy: lint is not used for + terminal status unless a workflow opts in by declaring the step). + - ``workflow is None`` (post-hook reload failed) falls back to the legacy + gating for both, so a corrupt file cannot strand the agent's work. + """ + from workflow import gate_status + + steps = list(workflow.steps) if workflow is not None else [] + gates_ok = True + for kind, passed, was_passing_before in ( + ("verify_build", build_passed, build_before), + ("verify_lint", lint_passed, lint_before), + ): + step = next((s for s in steps if s.kind == kind), None) + if step is None: + if kind == "verify_lint": + continue + gate, gating, on_failure = None, True, "fail" + else: + gate, gating, on_failure = step.gate, step.on_failure == "fail", step.on_failure + status = gate_status( + passed=passed, + gate=gate, + read_only=read_only, + was_passing_before=was_passing_before, + ) + if passed: + continue + label = gate or "regression_only" + if status == "succeeded": + if read_only: + log("INFO", f"read-only workflow: {kind} failed — informational only, not gating") + elif gate == "informational": + log("INFO", f"{kind} failed — gate=informational, not gating") + else: + log( + "WARN", + f"Post-agent {kind} failed, but it was already failing before " + "agent changes — not counting as regression", + ) + elif gating: + log("WARN", f"{kind} failed — gate={label} gates the task") + gates_ok = False + else: + log("INFO", f"{kind} failed — gate={label} but on_failure={on_failure}, not gating") + return gates_ok + + def _resolve_overall_task_status( agent_result: AgentResult, *, @@ -870,22 +952,25 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: "turns_attempted": agent_result.num_turns or agent_result.turns, } - # Resolve the post-hook gating inputs: read_only and the ensure_pr - # strategy (create / push_resolve / resolve) the workflow declares. + # Resolve the post-hook gating inputs: read_only, the ensure_pr + # strategy (create / push_resolve / resolve), and the verify steps' + # declared gates (#301) the workflow declares. # # ``read_only`` comes from ``config`` — build_config already computed # it (with its own fail-soft fallback) and it drove Cedar during the # run, so reusing it keeps the post-hook on the SAME verdict rather - # than re-deriving a possibly-divergent one. The workflow file is only - # reloaded for the ensure_pr STRATEGY, and that reload is wrapped in - # the same WorkflowValidationError fallback build_config uses - # (config.py): this code path runs AFTER run_agent has already mutated - # / committed the tree, so a load failure here must NOT strand the work - # as FAILED with no PR — it falls back to the default "create" strategy - # and still opens the PR (PR review #296 finding #5). + # than re-deriving a possibly-divergent one. The workflow file is + # reloaded for the ensure_pr STRATEGY and the verify-step GATES, and + # that reload is wrapped in the same WorkflowValidationError fallback + # build_config uses (config.py): this code path runs AFTER run_agent + # has already mutated / committed the tree, so a load failure here + # must NOT strand the work as FAILED with no PR — it falls back to + # the default "create" strategy + legacy regression-only gating and + # still opens the PR (PR review #296 finding #5). from workflow import WorkflowValidationError, load_workflow workflow_read_only = config.read_only + _workflow = None try: _workflow = load_workflow( (config.resolved_workflow or {}).get("id", "coding/new-task-v1") @@ -944,23 +1029,19 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None: # Overall status: do not infer success from PR/build when the SDK never # emitted ResultMessage (agent_status=unknown) — that masks protocol gaps. - # NOTE: lint_passed is intentionally NOT used for terminal status. + # Gating honors each verify step's declared ``gate`` via the runner's + # gate_status (#301); an undeclared verify_lint never gates (legacy). agent_status = agent_result.status - # Default True = assume build was green before, so a post-agent - # failure IS counted as a regression (conservative). - build_before = setup.build_before - if workflow_read_only: - build_ok = True # Read-only review — build status is informational only - if not build_passed: - log("INFO", "read-only workflow: build failed — informational only, not gating") - else: - build_ok = build_passed or not build_before - if not build_passed and not build_before and not workflow_read_only: - log( - "WARN", - "Post-agent build failed, but build was already failing before " - "agent changes — not counting as regression", - ) + build_ok = _apply_post_hook_gates( + _workflow, + read_only=workflow_read_only, + build_passed=build_passed, + lint_passed=lint_passed, + # setup defaults assume green-before, so a post-agent failure IS + # counted as a regression (conservative). + build_before=setup.build_before, + lint_before=setup.lint_before, + ) overall_status, result_error = _resolve_overall_task_status( agent_result, build_ok=build_ok, diff --git a/agent/src/workflow/__init__.py b/agent/src/workflow/__init__.py index bff06b8b..9ffcef99 100644 --- a/agent/src/workflow/__init__.py +++ b/agent/src/workflow/__init__.py @@ -39,6 +39,7 @@ StepOutcome, WorkflowCheckpoint, WorkflowResult, + gate_status, run_workflow, ) from .validator import assert_valid, validate_workflow @@ -60,6 +61,7 @@ "WorkflowResult", "WorkflowValidationError", "assert_valid", + "gate_status", "load_workflow", "load_workflow_file", "policy_principal_for", diff --git a/agent/src/workflow/runner.py b/agent/src/workflow/runner.py index 97535c9a..4f818689 100644 --- a/agent/src/workflow/runner.py +++ b/agent/src/workflow/runner.py @@ -483,22 +483,24 @@ def _handle_run_agent(step: Step, ctx: StepContext) -> StepOutcome: ) -def _gate_status( +def gate_status( *, passed: bool, gate: str | None, read_only: bool, was_passing_before: bool ) -> StepStatus: """Map a verify result + the step's ``gate`` to a step status. Single place the verify-gate semantics live, shared by ``verify_build`` and ``verify_lint`` (the two were near-identical twins that drifted on the - ``read_only`` rule — see the code-review finding). Mirrors ``pipeline.py``'s - terminal-status logic so the workflow path and the legacy path agree: + ``read_only`` rule — see the code-review finding). Since #301 it is also the + implementation behind the coding lane's inline post-hook gating + (``pipeline._apply_post_hook_gates``), so both lanes honor a step's + declared ``gate`` through this one function: - ``informational`` (or a ``read_only`` workflow) — never gates. - ``strict`` — any failure gates. - ``regression_only`` **and the unset default** — fail only on a *regression* (was passing before, fails now); a check that was already red before the - agent ran is not a regression and does not gate. This matches pipeline.py's - unconditional ``build_ok = passed or not build_before`` — which is + agent ran is not a regression and does not gate. This matches the legacy + pipeline behavior of ``build_ok = passed or not build_before`` — which was regression-only for *every* task — so an unset gate agrees with the legacy path rather than defaulting to the stricter ``strict``. """ @@ -521,7 +523,7 @@ def _handle_verify_build(step: Step, ctx: StepContext) -> StepOutcome: # was_passing_before defaults True (assume green-before, so a post-agent # failure IS a regression) — the same conservative default pipeline.py uses. was_passing_before = ctx.setup.build_before if ctx.setup else True - status = _gate_status( + status = gate_status( passed=passed, gate=step.gate, read_only=ctx.workflow.read_only, @@ -543,7 +545,7 @@ def _handle_verify_lint(step: Step, ctx: StepContext) -> StepOutcome: repo_dir = ctx.setup.repo_dir if ctx.setup else "" passed = verify_lint(repo_dir) was_passing_before = ctx.setup.lint_before if ctx.setup else True - status = _gate_status( + status = gate_status( passed=passed, gate=step.gate, read_only=ctx.workflow.read_only, diff --git a/agent/tests/test_pipeline_post_hook_gates.py b/agent/tests/test_pipeline_post_hook_gates.py new file mode 100644 index 00000000..32ed74cc --- /dev/null +++ b/agent/tests/test_pipeline_post_hook_gates.py @@ -0,0 +1,372 @@ +"""Coding-lane post-hook gating honors each verify step's declared ``gate`` (#301). + +The inline post-hook path in ``pipeline.run_task`` consults the workflow's +``verify_build`` / ``verify_lint`` steps through the runner's ``gate_status`` — +the single place gate semantics live — instead of hardcoding regression-only +gating. These tests pin: + +- ``gate: strict`` fails the task on a build regression on the coding lane + (and ``informational`` never gates) — end-to-end through ``run_task``. +- The three shipped coding workflows keep their exact pre-#301 effective + gating: inline verdict == runner verdict == legacy verdict for the + ``regression_only`` and ``read_only`` cases. +""" + +from __future__ import annotations + +from typing import ClassVar +from unittest.mock import MagicMock, patch + +import pytest + +from models import AgentResult, RepoSetup +from pipeline import _apply_post_hook_gates +from workflow import Workflow, gate_status, load_workflow + + +def _workflow(steps: list[dict], **over) -> Workflow: + body = { + "id": "coding/new-task-v1", + "version": "1.0.0", + "domain": "coding", + "requires_repo": True, + "read_only": False, + "prompt": {"template": "do the thing"}, + "hydration": {"sources": ["task_description"]}, + "agent_config": { + "tier": "standard", + "allowed_tools": ["Bash", "Read"], + "cedar_policy_modules": ["builtin/hard_deny", "builtin/soft_deny"], + }, + "steps": steps, + "terminal_outcomes": {"primary": "pr_url"}, + "status": "production", + } + body.update(over) + return Workflow.model_validate(body) + + +_BASE_STEPS = [ + {"kind": "clone_repo", "name": "setup"}, + {"kind": "run_agent", "name": "implement"}, +] + + +def _wf_with_build_gate(gate: str | None, **over) -> Workflow: + build_step: dict = {"kind": "verify_build", "name": "build"} + if gate is not None: + build_step["gate"] = gate + steps = [*_BASE_STEPS, build_step, {"kind": "ensure_pr", "strategy": "create"}] + return _workflow(steps, **over) + + +class TestApplyPostHookGates: + """Unit semantics of the inline gate resolution.""" + + def test_strict_gates_on_regression(self): + wf = _wf_with_build_gate("strict") + assert not _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=True, + lint_before=True, + ) + + def test_strict_gates_even_preexisting_failure(self): + # strict means ANY failure gates — including a build that was already + # red before the agent ran (the case regression_only forgives). + wf = _wf_with_build_gate("strict") + assert not _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=False, + lint_before=True, + ) + + def test_informational_never_gates(self): + wf = _wf_with_build_gate("informational") + assert _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=True, + lint_before=True, + ) + + def test_regression_only_gates_a_regression(self): + wf = _wf_with_build_gate("regression_only") + assert not _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=True, + lint_before=True, + ) + + def test_regression_only_ignores_preexisting_failure(self): + wf = _wf_with_build_gate("regression_only") + assert _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=False, + lint_before=True, + ) + + def test_unset_gate_defaults_to_regression_only(self): + wf = _wf_with_build_gate(None) + assert not _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=True, + lint_before=True, + ) + assert _apply_post_hook_gates( + wf, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=False, + lint_before=True, + ) + + def test_read_only_never_gates_even_strict(self): + wf = _wf_with_build_gate("strict", read_only=True) + assert _apply_post_hook_gates( + wf, + read_only=True, + build_passed=False, + lint_passed=True, + build_before=True, + lint_before=True, + ) + + def test_undeclared_verify_lint_never_gates(self): + # Legacy: lint is not used for terminal status unless a workflow + # declares the step (the shipped coding workflows do not). + wf = _wf_with_build_gate("regression_only") + assert _apply_post_hook_gates( + wf, + read_only=False, + build_passed=True, + lint_passed=False, + build_before=True, + lint_before=True, + ) + + def test_declared_strict_verify_lint_gates(self): + steps = [ + *_BASE_STEPS, + {"kind": "verify_build", "name": "build", "gate": "regression_only"}, + {"kind": "verify_lint", "name": "lint", "gate": "strict"}, + {"kind": "ensure_pr", "strategy": "create"}, + ] + wf = _workflow(steps) + assert not _apply_post_hook_gates( + wf, + read_only=False, + build_passed=True, + lint_passed=False, + build_before=True, + lint_before=True, + ) + + def test_advisory_on_failure_continue_does_not_gate(self): + # on_failure: continue marks the step advisory — the runner records the + # failure and proceeds, so the inline verdict must not gate either. + steps = [ + *_BASE_STEPS, + {"kind": "verify_build", "name": "build", "gate": "regression_only"}, + { + "kind": "verify_lint", + "name": "lint", + "gate": "strict", + "on_failure": "continue", + }, + {"kind": "ensure_pr", "strategy": "create"}, + ] + wf = _workflow(steps) + assert _apply_post_hook_gates( + wf, + read_only=False, + build_passed=True, + lint_passed=False, + build_before=True, + lint_before=True, + ) + + def test_workflow_none_falls_back_to_legacy(self): + # Post-hook reload failed: build keeps legacy regression-only gating, + # lint never gates — a corrupt file cannot strand the agent's work. + assert not _apply_post_hook_gates( + None, + read_only=False, + build_passed=False, + lint_passed=True, + build_before=True, + lint_before=True, + ) + assert _apply_post_hook_gates( + None, + read_only=False, + build_passed=False, + lint_passed=False, + build_before=False, + lint_before=True, + ) + + +class TestShippedWorkflowParity: + """Lock: no behavior change for the three shipped coding workflows. + + For every (build_passed, build_before) combination, the inline verdict + (``_apply_post_hook_gates``), the runner verdict (``gate_status`` on the + declared step), and the legacy pre-#301 inline logic must agree. + """ + + SHIPPED: ClassVar[list[str]] = [ + "coding/new-task-v1", + "coding/pr-iteration-v1", + "coding/pr-review-v1", + ] + + @staticmethod + def _legacy_build_ok(*, read_only: bool, build_passed: bool, build_before: bool) -> bool: + # The pre-#301 inline logic, verbatim. + if read_only: + return True + return build_passed or not build_before + + @pytest.mark.parametrize("workflow_id", SHIPPED) + @pytest.mark.parametrize("build_passed", [True, False]) + @pytest.mark.parametrize("build_before", [True, False]) + def test_inline_runner_and_legacy_verdicts_match(self, workflow_id, build_passed, build_before): + wf = load_workflow(workflow_id) + build_step = next(s for s in wf.steps if s.kind == "verify_build") + + inline_ok = _apply_post_hook_gates( + wf, + read_only=wf.read_only, + build_passed=build_passed, + lint_passed=True, + build_before=build_before, + lint_before=True, + ) + runner_ok = ( + gate_status( + passed=build_passed, + gate=build_step.gate, + read_only=wf.read_only, + was_passing_before=build_before, + ) + == "succeeded" + ) + legacy_ok = self._legacy_build_ok( + read_only=wf.read_only, build_passed=build_passed, build_before=build_before + ) + assert inline_ok == runner_ok == legacy_ok + + @pytest.mark.parametrize("workflow_id", SHIPPED) + def test_lint_failure_never_gates_shipped_workflows(self, workflow_id): + # None of the shipped coding workflows declare verify_lint, so a lint + # failure must not gate — exactly the legacy behavior. + wf = load_workflow(workflow_id) + assert all(s.kind != "verify_lint" for s in wf.steps) + assert _apply_post_hook_gates( + wf, + read_only=wf.read_only, + build_passed=True, + lint_passed=False, + build_before=True, + lint_before=True, + ) + + +class TestRunTaskHonorsGate: + """End-to-end through run_task: the declared gate decides the task verdict.""" + + @staticmethod + def _span() -> MagicMock: + span = MagicMock() + span.__enter__ = MagicMock(return_value=span) + span.__exit__ = MagicMock(return_value=False) + return span + + def _run(self, *, gate: str, build_passed: bool, build_before: bool) -> tuple[dict, MagicMock]: + wf = _wf_with_build_gate(gate) + + async def fake_run_agent(_p, _s, _c, cwd=None, trajectory=None): + return AgentResult(status="success", turns=2, cost_usd=0.01, num_turns=2) + + mock_ensure_pr = MagicMock(return_value="https://github.com/o/r/pull/1") + + with ( + patch("runner.run_agent", side_effect=fake_run_agent), + patch("pipeline.build_system_prompt", return_value="sys"), + patch("pipeline.discover_project_config", return_value=None), + patch( + "repo.setup_repo", + return_value=RepoSetup( + repo_dir="/workspace/repo", + branch="bgagent/test/branch", + build_before=build_before, + ), + ), + patch("pipeline.task_span", return_value=self._span()), + patch("pipeline.task_state"), + patch("pipeline.ensure_committed", return_value=False), + patch("pipeline.verify_build", return_value=build_passed), + patch("pipeline.verify_lint", return_value=True), + patch("pipeline.ensure_pr", mock_ensure_pr), + patch("pipeline.get_disk_usage", return_value=0), + patch("pipeline.print_metrics"), + patch("workflow.load_workflow", return_value=wf), + ): + from pipeline import run_task + + result = run_task( + repo_url="o/r", + task_description="x", + github_token="ghp_test", + aws_region="us-east-1", + task_id="t-gate", + resolved_workflow={"id": "coding/new-task-v1", "version": "1.0.0"}, + ) + return result, mock_ensure_pr + + def test_strict_gate_fails_task_on_build_regression(self, monkeypatch): + monkeypatch.setenv("GITHUB_TOKEN", "ghp_test") + monkeypatch.setenv("AWS_REGION", "us-east-1") + result, mock_ensure_pr = self._run(gate="strict", build_passed=False, build_before=True) + assert result["status"] == "error" + # The inline ordering is preserved: ensure_pr still ran (the PR is the + # reviewable artifact even when the gate fails the task). + mock_ensure_pr.assert_called_once() + + def test_strict_gate_fails_task_even_when_broken_before(self, monkeypatch): + # The case the legacy regression-only inline logic would have passed — + # strict must gate it (this is exactly what #301 makes effective). + monkeypatch.setenv("GITHUB_TOKEN", "ghp_test") + monkeypatch.setenv("AWS_REGION", "us-east-1") + result, _ = self._run(gate="strict", build_passed=False, build_before=False) + assert result["status"] == "error" + + def test_informational_gate_never_fails_task(self, monkeypatch): + # A build regression that regression_only/strict would gate — the + # author opted out via gate: informational, so the task succeeds. + monkeypatch.setenv("GITHUB_TOKEN", "ghp_test") + monkeypatch.setenv("AWS_REGION", "us-east-1") + result, mock_ensure_pr = self._run( + gate="informational", build_passed=False, build_before=True + ) + assert result["status"] == "success" + mock_ensure_pr.assert_called_once() diff --git a/agent/tests/test_workflow_runner.py b/agent/tests/test_workflow_runner.py index 726f821c..6ca4d0d8 100644 --- a/agent/tests/test_workflow_runner.py +++ b/agent/tests/test_workflow_runner.py @@ -399,75 +399,75 @@ class TestGateStatus: """The shared verify-gate semantics (verify_build / verify_lint).""" def test_passing_always_succeeds(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status assert ( - _gate_status(passed=True, gate="strict", read_only=False, was_passing_before=True) + gate_status(passed=True, gate="strict", read_only=False, was_passing_before=True) == "succeeded" ) def test_strict_failure_gates(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status assert ( - _gate_status(passed=False, gate="strict", read_only=False, was_passing_before=True) + gate_status(passed=False, gate="strict", read_only=False, was_passing_before=True) == "failed" ) def test_informational_never_gates(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status assert ( - _gate_status( + gate_status( passed=False, gate="informational", read_only=False, was_passing_before=True ) == "succeeded" ) def test_read_only_never_gates(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status # read_only workflows treat verify results as informational (matches # pipeline.py: pr_review build status is informational only). assert ( - _gate_status(passed=False, gate="strict", read_only=True, was_passing_before=True) + gate_status(passed=False, gate="strict", read_only=True, was_passing_before=True) == "succeeded" ) def test_regression_only_gates_a_regression(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status # was passing before, fails now → regression → gates. assert ( - _gate_status( + gate_status( passed=False, gate="regression_only", read_only=False, was_passing_before=True ) == "failed" ) def test_regression_only_ignores_preexisting_failure(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status # already broken before the agent ran → not a regression → does NOT gate # (mirrors pipeline.py build_ok = passed or not build_before). assert ( - _gate_status( + gate_status( passed=False, gate="regression_only", read_only=False, was_passing_before=False ) == "succeeded" ) def test_unset_gate_defaults_to_regression_only(self): - from workflow.runner import _gate_status + from workflow.runner import gate_status # An unset gate must mirror pipeline.py (which is always regression-only), # NOT default to strict: a regression gates, a pre-existing failure does not. assert ( - _gate_status(passed=False, gate=None, read_only=False, was_passing_before=True) + gate_status(passed=False, gate=None, read_only=False, was_passing_before=True) == "failed" ) assert ( - _gate_status(passed=False, gate=None, read_only=False, was_passing_before=False) + gate_status(passed=False, gate=None, read_only=False, was_passing_before=False) == "succeeded" ) diff --git a/docs/design/WORKFLOWS.md b/docs/design/WORKFLOWS.md index 4a992958..9f281868 100644 --- a/docs/design/WORKFLOWS.md +++ b/docs/design/WORKFLOWS.md @@ -116,14 +116,16 @@ Steps are the unit the runner interprets. Each has a `kind`, an optional `name`, | `clone_repo` | deterministic | Clone + `mise trust`/`install` + initial build/lint; select branch | Forbidden when `requires_repo:false`. Replaces `setup_repo`. | | `hydrate_context` | deterministic | Assemble prompt from declared `hydration` sources | Mostly done orchestrator-side; this step consumes the `HydratedContext` payload. | | `run_agent` | agentic | The Claude Agent SDK loop with the workflow's prompt + `agent_config` | Exactly one per workflow. Today only one `run_agent` is ever called (`pipeline.py`), so this is an emergent property the schema now *promotes to an enforced constraint*; multi-agent loops are out of scope (#99). | -| `verify_build` | deterministic | Run `mise run build`; gate or inform | `read_only` workflows treat result as informational. Forbidden when `requires_repo:false`. | -| `verify_lint` | deterministic | Run `mise run lint` | Optional gate. | +| `verify_build` | deterministic | Run `mise run build`; gate or inform | Gating declared per step via `gate` (see below). `read_only` workflows treat result as informational. Forbidden when `requires_repo:false`. | +| `verify_lint` | deterministic | Run `mise run lint` | Optional gate (`gate` field, see below); advisory unless declared. | | `ensure_pr` | deterministic | Create / push+resolve / resolve-only a PR | Strategy chosen by step config (`create` \| `push_resolve` \| `resolve`), replacing the `post_hooks.ensure_pr` task_type branch. | | `post_review` | deterministic | Post a GitHub review (Reviews API) | For review workflows (e.g. `coding/pr-review-v1`). | | `deliver_artifact` | deterministic | Upload a produced artifact (S3) / post a comment | Repo-less terminal delivery for knowledge work. | Each step declares `on_failure: fail | continue | skip_remaining` (default `fail`) so the runner's error behavior is explicit and matches today's fail-closed default. +**The `gate` field (`verify_build` / `verify_lint`).** A verify step declares how its result affects the task verdict: `strict` (any failure gates), `regression_only` (gates only when the check was passing before the agent ran and fails after — the default when unset, matching the legacy pipeline behavior), or `informational` (never gates). A `read_only` workflow never gates regardless of `gate`. The semantics live in exactly one place — `gate_status` in `agent/src/workflow/runner.py` — used by both lanes (#301): the repo-less lane through the runner's `verify_*` step handlers, and the coding lane through the inline post-hook resolution (`pipeline._apply_post_hook_gates`), which consults each declared step's `gate` and `on_failure` (`continue`/`skip_remaining` steps are advisory for the verdict, matching the runner). On the coding lane an *undeclared* `verify_lint` never gates (the legacy behavior — lint is advisory unless a workflow opts in by declaring the step), and the inline ordering is preserved: `ensure_pr` still runs after a gating verify failure so the agent's work surfaces as a reviewable PR even when the task is marked failed. Routing the coding post-hooks bodily through the runner's step handlers (which would stop *before* `ensure_pr` on a gating failure) is the broader runner unification deferred out of #301's scope. + ### Example: shipped coding workflow (`new_task`) ```yaml diff --git a/docs/src/content/docs/architecture/Workflows.md b/docs/src/content/docs/architecture/Workflows.md index 6c66aa76..cb5a4b20 100644 --- a/docs/src/content/docs/architecture/Workflows.md +++ b/docs/src/content/docs/architecture/Workflows.md @@ -120,14 +120,16 @@ Steps are the unit the runner interprets. Each has a `kind`, an optional `name`, | `clone_repo` | deterministic | Clone + `mise trust`/`install` + initial build/lint; select branch | Forbidden when `requires_repo:false`. Replaces `setup_repo`. | | `hydrate_context` | deterministic | Assemble prompt from declared `hydration` sources | Mostly done orchestrator-side; this step consumes the `HydratedContext` payload. | | `run_agent` | agentic | The Claude Agent SDK loop with the workflow's prompt + `agent_config` | Exactly one per workflow. Today only one `run_agent` is ever called (`pipeline.py`), so this is an emergent property the schema now *promotes to an enforced constraint*; multi-agent loops are out of scope (#99). | -| `verify_build` | deterministic | Run `mise run build`; gate or inform | `read_only` workflows treat result as informational. Forbidden when `requires_repo:false`. | -| `verify_lint` | deterministic | Run `mise run lint` | Optional gate. | +| `verify_build` | deterministic | Run `mise run build`; gate or inform | Gating declared per step via `gate` (see below). `read_only` workflows treat result as informational. Forbidden when `requires_repo:false`. | +| `verify_lint` | deterministic | Run `mise run lint` | Optional gate (`gate` field, see below); advisory unless declared. | | `ensure_pr` | deterministic | Create / push+resolve / resolve-only a PR | Strategy chosen by step config (`create` \| `push_resolve` \| `resolve`), replacing the `post_hooks.ensure_pr` task_type branch. | | `post_review` | deterministic | Post a GitHub review (Reviews API) | For review workflows (e.g. `coding/pr-review-v1`). | | `deliver_artifact` | deterministic | Upload a produced artifact (S3) / post a comment | Repo-less terminal delivery for knowledge work. | Each step declares `on_failure: fail | continue | skip_remaining` (default `fail`) so the runner's error behavior is explicit and matches today's fail-closed default. +**The `gate` field (`verify_build` / `verify_lint`).** A verify step declares how its result affects the task verdict: `strict` (any failure gates), `regression_only` (gates only when the check was passing before the agent ran and fails after — the default when unset, matching the legacy pipeline behavior), or `informational` (never gates). A `read_only` workflow never gates regardless of `gate`. The semantics live in exactly one place — `gate_status` in `agent/src/workflow/runner.py` — used by both lanes (#301): the repo-less lane through the runner's `verify_*` step handlers, and the coding lane through the inline post-hook resolution (`pipeline._apply_post_hook_gates`), which consults each declared step's `gate` and `on_failure` (`continue`/`skip_remaining` steps are advisory for the verdict, matching the runner). On the coding lane an *undeclared* `verify_lint` never gates (the legacy behavior — lint is advisory unless a workflow opts in by declaring the step), and the inline ordering is preserved: `ensure_pr` still runs after a gating verify failure so the agent's work surfaces as a reviewable PR even when the task is marked failed. Routing the coding post-hooks bodily through the runner's step handlers (which would stop *before* `ensure_pr` on a gating failure) is the broader runner unification deferred out of #301's scope. + ### Example: shipped coding workflow (`new_task`) ```yaml