Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 106 additions & 25 deletions agent/src/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import subprocess
import sys
import time
from typing import TYPE_CHECKING

from pydantic import ValidationError

Expand Down Expand Up @@ -38,6 +39,9 @@
upload_trace_to_s3,
)

if TYPE_CHECKING:
from workflow import Workflow

_SDK_NO_RESULT_MESSAGE = (
"Agent SDK stream ended without a ResultMessage (agent_status=unknown). "
"Treat as failure: possible SDK bug, network interruption, or protocol mismatch."
Expand Down Expand Up @@ -366,6 +370,84 @@ def _run_repoless_task(
return result_dict


def _apply_post_hook_gates(
workflow: Workflow | None,
*,
read_only: bool,
build_passed: bool,
lint_passed: bool,
build_before: bool,
lint_before: bool,
) -> bool:
"""Resolve the coding lane's post-hook verify gates against the workflow (#301).

Decision (issue #301 acceptance criteria): the inline post-hook path
CONSULTS each declared ``verify_build`` / ``verify_lint`` step's ``gate``
through the runner's ``gate_status`` — the single place gate semantics live —
rather than routing the post-hooks through the runner's step handlers.
Routing through the runner would also change failure-path side effects (a
gating ``verify_build`` with ``on_failure: fail`` stops the runner *before*
``ensure_pr``, stranding committed work with no PR), which is the broader
half-migrated-runner unification the issue defers. Here the inline ordering
(verify → ensure_pr always runs) is preserved; only the task verdict honors
the declared gate.

Per-step semantics:

- A declared step gates per its ``gate`` (``strict`` | ``regression_only`` |
``informational``; unset = ``regression_only``), but only when its
``on_failure`` is ``fail`` — ``continue``/``skip_remaining`` steps are
advisory for the task verdict, matching the runner.
- An undeclared ``verify_build`` keeps the legacy regression-only gating
(identical to ``gate_status`` with ``gate=None``).
- An undeclared ``verify_lint`` never gates (legacy: lint is not used for
terminal status unless a workflow opts in by declaring the step).
- ``workflow is None`` (post-hook reload failed) falls back to the legacy
gating for both, so a corrupt file cannot strand the agent's work.
"""
from workflow import gate_status

steps = list(workflow.steps) if workflow is not None else []
gates_ok = True
for kind, passed, was_passing_before in (
("verify_build", build_passed, build_before),
("verify_lint", lint_passed, lint_before),
):
step = next((s for s in steps if s.kind == kind), None)
if step is None:
if kind == "verify_lint":
continue
gate, gating, on_failure = None, True, "fail"
else:
gate, gating, on_failure = step.gate, step.on_failure == "fail", step.on_failure
status = gate_status(
passed=passed,
gate=gate,
read_only=read_only,
was_passing_before=was_passing_before,
)
if passed:
continue
label = gate or "regression_only"
if status == "succeeded":
if read_only:
log("INFO", f"read-only workflow: {kind} failed — informational only, not gating")
elif gate == "informational":
log("INFO", f"{kind} failed — gate=informational, not gating")
else:
log(
"WARN",
f"Post-agent {kind} failed, but it was already failing before "
"agent changes — not counting as regression",
)
elif gating:
log("WARN", f"{kind} failed — gate={label} gates the task")
gates_ok = False
else:
log("INFO", f"{kind} failed — gate={label} but on_failure={on_failure}, not gating")
return gates_ok


def _resolve_overall_task_status(
agent_result: AgentResult,
*,
Expand Down Expand Up @@ -870,22 +952,25 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
"turns_attempted": agent_result.num_turns or agent_result.turns,
}

# Resolve the post-hook gating inputs: read_only and the ensure_pr
# strategy (create / push_resolve / resolve) the workflow declares.
# Resolve the post-hook gating inputs: read_only, the ensure_pr
# strategy (create / push_resolve / resolve), and the verify steps'
# declared gates (#301) the workflow declares.
#
# ``read_only`` comes from ``config`` — build_config already computed
# it (with its own fail-soft fallback) and it drove Cedar during the
# run, so reusing it keeps the post-hook on the SAME verdict rather
# than re-deriving a possibly-divergent one. The workflow file is only
# reloaded for the ensure_pr STRATEGY, and that reload is wrapped in
# the same WorkflowValidationError fallback build_config uses
# (config.py): this code path runs AFTER run_agent has already mutated
# / committed the tree, so a load failure here must NOT strand the work
# as FAILED with no PR — it falls back to the default "create" strategy
# and still opens the PR (PR review #296 finding #5).
# than re-deriving a possibly-divergent one. The workflow file is
# reloaded for the ensure_pr STRATEGY and the verify-step GATES, and
# that reload is wrapped in the same WorkflowValidationError fallback
# build_config uses (config.py): this code path runs AFTER run_agent
# has already mutated / committed the tree, so a load failure here
# must NOT strand the work as FAILED with no PR — it falls back to
# the default "create" strategy + legacy regression-only gating and
# still opens the PR (PR review #296 finding #5).
from workflow import WorkflowValidationError, load_workflow

workflow_read_only = config.read_only
_workflow = None
try:
_workflow = load_workflow(
(config.resolved_workflow or {}).get("id", "coding/new-task-v1")
Expand Down Expand Up @@ -944,23 +1029,19 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:

# Overall status: do not infer success from PR/build when the SDK never
# emitted ResultMessage (agent_status=unknown) — that masks protocol gaps.
# NOTE: lint_passed is intentionally NOT used for terminal status.
# Gating honors each verify step's declared ``gate`` via the runner's
# gate_status (#301); an undeclared verify_lint never gates (legacy).
agent_status = agent_result.status
# Default True = assume build was green before, so a post-agent
# failure IS counted as a regression (conservative).
build_before = setup.build_before
if workflow_read_only:
build_ok = True # Read-only review — build status is informational only
if not build_passed:
log("INFO", "read-only workflow: build failed — informational only, not gating")
else:
build_ok = build_passed or not build_before
if not build_passed and not build_before and not workflow_read_only:
log(
"WARN",
"Post-agent build failed, but build was already failing before "
"agent changes — not counting as regression",
)
build_ok = _apply_post_hook_gates(
_workflow,
read_only=workflow_read_only,
build_passed=build_passed,
lint_passed=lint_passed,
# setup defaults assume green-before, so a post-agent failure IS
# counted as a regression (conservative).
build_before=setup.build_before,
lint_before=setup.lint_before,
)
overall_status, result_error = _resolve_overall_task_status(
agent_result,
build_ok=build_ok,
Expand Down
2 changes: 2 additions & 0 deletions agent/src/workflow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
StepOutcome,
WorkflowCheckpoint,
WorkflowResult,
gate_status,
run_workflow,
)
from .validator import assert_valid, validate_workflow
Expand All @@ -60,6 +61,7 @@
"WorkflowResult",
"WorkflowValidationError",
"assert_valid",
"gate_status",
"load_workflow",
"load_workflow_file",
"policy_principal_for",
Expand Down
16 changes: 9 additions & 7 deletions agent/src/workflow/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,22 +483,24 @@ def _handle_run_agent(step: Step, ctx: StepContext) -> StepOutcome:
)


def _gate_status(
def gate_status(
*, passed: bool, gate: str | None, read_only: bool, was_passing_before: bool
) -> StepStatus:
"""Map a verify result + the step's ``gate`` to a step status.

Single place the verify-gate semantics live, shared by ``verify_build`` and
``verify_lint`` (the two were near-identical twins that drifted on the
``read_only`` rule — see the code-review finding). Mirrors ``pipeline.py``'s
terminal-status logic so the workflow path and the legacy path agree:
``read_only`` rule — see the code-review finding). Since #301 it is also the
implementation behind the coding lane's inline post-hook gating
(``pipeline._apply_post_hook_gates``), so both lanes honor a step's
declared ``gate`` through this one function:

- ``informational`` (or a ``read_only`` workflow) — never gates.
- ``strict`` — any failure gates.
- ``regression_only`` **and the unset default** — fail only on a *regression*
(was passing before, fails now); a check that was already red before the
agent ran is not a regression and does not gate. This matches pipeline.py's
unconditional ``build_ok = passed or not build_before`` — which is
agent ran is not a regression and does not gate. This matches the legacy
pipeline behavior of ``build_ok = passed or not build_before`` — which was
regression-only for *every* task — so an unset gate agrees with the legacy
path rather than defaulting to the stricter ``strict``.
"""
Expand All @@ -521,7 +523,7 @@ def _handle_verify_build(step: Step, ctx: StepContext) -> StepOutcome:
# was_passing_before defaults True (assume green-before, so a post-agent
# failure IS a regression) — the same conservative default pipeline.py uses.
was_passing_before = ctx.setup.build_before if ctx.setup else True
status = _gate_status(
status = gate_status(
passed=passed,
gate=step.gate,
read_only=ctx.workflow.read_only,
Expand All @@ -543,7 +545,7 @@ def _handle_verify_lint(step: Step, ctx: StepContext) -> StepOutcome:
repo_dir = ctx.setup.repo_dir if ctx.setup else ""
passed = verify_lint(repo_dir)
was_passing_before = ctx.setup.lint_before if ctx.setup else True
status = _gate_status(
status = gate_status(
passed=passed,
gate=step.gate,
read_only=ctx.workflow.read_only,
Expand Down
Loading
Loading