diff --git a/examples/harness/README.md b/examples/harness/README.md new file mode 100644 index 00000000..66103a57 --- /dev/null +++ b/examples/harness/README.md @@ -0,0 +1,185 @@ +# Harness modules: ContextEngine + ResultVerifier + +This example shows how to compose two Harness modules with a veADK Agent for +context engineering, evidence tracking, and final-answer verification. + +- `ContextEngine` pins the original task, filters noisy history, assembles an + evidence-first context header, and records a small budget report. +- `ResultVerifier` records tool receipts, gathers evidence references, checks + final answers for unsupported URLs and ungrounded external facts, and writes a + local verification report. + +The implementation is self-contained so developers can read, run, test, and +adapt the pattern in one directory. + +## Layout + +```text +examples/harness/ +├── main.py +├── harness_agent.py +├── harness_modules/ +│ ├── core.py +│ ├── context_engine.py +│ ├── result_verifier.py +│ ├── tool_wrappers.py +│ └── stores.py +├── tests/ +└── golden/ + ├── production_scenarios.jsonl + ├── context_engine_cases.jsonl + └── verifier_cases.jsonl +``` + +## Run + +Configure the normal veADK model environment variables, then run: + +```bash +python examples/harness/main.py +``` + +The run writes local audit data under `.harness_runs/`: + +- `events.jsonl` +- `messages.jsonl` +- `receipts.jsonl` +- `evidence/*.txt` +- `reports/-.json` + +## Core usage + +```python +from harness_agent import build_harness_agent + +bundle = build_harness_agent() +answer = await bundle.run( + "请查一下 veADK Harness 示例的核心能力,给出来源,并用 3 条要点回答。", + session_id="harness-demo", +) +report = bundle.latest_report(session_id="harness-demo") +``` + +`bundle.agent` and `bundle.runner` are regular veADK `Agent` and `Runner` +instances. 
The thin `bundle.run(...)` method coordinates `user_id`, +`session_id`, and `original_prompt` so the Harness processor can create local +receipts, evidence, context events, and verification reports. + +## Test + +The tests use fake tools and fake runner events, so no model key is needed: + +```bash +pytest examples/harness/tests +``` + +The validation targets are: + +- task anchor retention across follow-up turns; +- removal of progress and control messages from model context; +- deterministic detection of fabricated URLs; +- verification failure when a current/external factual task has no evidence; +- receipt recording for failed tools; +- externalization of large tool results into evidence files. + +The scenario-level golden set is +`examples/harness/golden/production_scenarios.jsonl`. It groups common +production cases by scenario and module, so developers can add new regression +cases without coupling them to a specific product incident or project-specific dataset. +The smaller `verifier_cases.jsonl` and `context_engine_cases.jsonl` files keep +module-focused golden checks. + +## Evaluate the Harness lift + +Run the offline A/B evaluation: + +```bash +python examples/harness/evaluation/run_eval.py +``` + +The evaluation isolates deterministic Harness effects rather than model quality. +The baseline uses raw history and trusts every non-empty answer. The Harness treatment uses +`ContextEngine` plus `ResultVerifier`. +The case set covers common production-style developer scenarios: stale RAG +memory, failed tool receipts, permission over-blocking, runtime parameter drift, +and multi-turn context anchoring. 
+ +Current result: + +| Metric | Baseline | Harness | Delta | +| --- | ---: | ---: | ---: | +| Result verifier accuracy | 20.0% | 100.0% | +80.0 pp | +| Unsafe false-accept rate | 100.0% | 0.0% | -100.0 pp | +| Unsafe detection recall | 0.0% | 100.0% | +100.0 pp | +| Context quality score | 0.0% | 100.0% | +100.0 pp | + +Offline report summary by scenario: + +| Scenario | Baseline behavior | Harness lift | Module | +| --- | --- | --- | --- | +| RAG memory freshness | Trusts stale-memory answers without current evidence. | Blocks the answer until current knowledge-base evidence exists. | `ResultVerifier` | +| Tool failure claimed as success | Trusts a final JSON that says the operation passed. | Detects failed tool receipts and blocks false completion claims. | `ResultVerifier` | +| Permission over-blocking of allowed tools | Trusts a success result even when an allowed tool was blocked. | Treats failed receipts as incompatible with `operation_completed=true`. | `ResultVerifier` | +| Runtime parameter drift | Trusts unsupported runtime values such as a wrong token limit. | Blocks key numeric facts that are not present in evidence. | `ResultVerifier` | +| Multi-turn context anchoring | Raw history includes progress noise and loses the original task anchor. | Pins the original task and removes control-message pollution. | `ContextEngine` | +| Current evidence beats stale memory | Recent history can surface stale cached answers before evidence. | Puts current evidence before history and keeps the original task anchor. | `ContextEngine` | + +Reports are written to +`examples/harness/evaluation/results/harness_eval_report.json` and +`examples/harness/evaluation/results/harness_eval_report.md`. The generated report labels some scenarios differently from the table above (for example, `Security over-blocking of allowed tools`, `Model runtime parameter drift`, and `Channel and cron context anchoring`). + +## Run model-in-the-loop evaluation + +The model evaluation makes real veADK model calls. 
Export the standard model +environment variables, or pass any dotenv file that contains +`MODEL_AGENT_API_KEY`, `MODEL_AGENT_NAME`, and `MODEL_AGENT_API_BASE`: + +```bash +python examples/harness/evaluation/run_model_eval.py \ + --env-file /path/to/model.env +``` + +If the variables are already exported in the shell, `--env-file` can be omitted. + +No secret values are written to the reports. The script compares a normal veADK +Agent that trusts every non-empty answer with the Harness Agent that trusts an +answer only when `VerificationReport.done` is true. + +Reports are written to +`examples/harness/evaluation/results/harness_model_eval_report.json` and +`examples/harness/evaluation/results/harness_model_eval_report.md`. + +Current sample model result: + +| Metric | Baseline | Harness | Delta | +| --- | ---: | ---: | ---: | +| Trust decision accuracy | 66.7% | 100.0% | +33.3 pp | +| Unsupported false-accept rate | 100.0% | 0.0% | -100.0 pp | +| Answerable verified pass rate | - | 100.0% | +100.0 pp | +| Answerable receipt coverage | - | 100.0% | +100.0 pp | +| Unsupported request block rate | - | 100.0% | +100.0 pp | + +A `-` in the Baseline column means the metric has no baseline value; the delta in those rows equals the Harness value. The model report also includes a per-scenario matrix (scenario name in the first +column) covering RAG freshness, tool evidence receipts, and no-evidence +hallucination suppression. + +Model report summary by scenario: + +| Scenario | Baseline runtime | Harness runtime | What the result shows | +| --- | --- | --- | --- | +| RAG freshness with source grounding | Trusts the non-empty model answer. | Trusts only after tool receipts and source evidence are present. | Answerable sourced requests can still pass when grounded. | +| Tool evidence and receipt coverage | Trusts the final text without runtime receipt enforcement. | Keeps the answer trusted and records tool receipts. | Harness adds auditability without blocking valid answers. | +| No-evidence hallucination suppression | Trusts a non-empty unsupported answer. 
| Blocks the answer because no tool evidence or source receipt exists. | The trust gate prevents no-evidence source claims from reaching callers. | + +## Design Notes + +This example focuses on the core developer workflow: + +- build a task-aware context header before the model runs; +- wrap tools so every capability call leaves an auditable receipt; +- attach evidence references to tool outputs; +- verify the final answer before treating it as trusted; +- use tests and offline/model evaluations to measure the lift. + +The modules are intentionally compact and explicit, making them suitable as a +starting point for product-specific Harness extensions. diff --git a/examples/harness/README.zh.md b/examples/harness/README.zh.md new file mode 100644 index 00000000..758bd93f --- /dev/null +++ b/examples/harness/README.zh.md @@ -0,0 +1,176 @@ +# Harness 子模块示例:ContextEngine + ResultVerifier + +这个示例演示如何为 veADK Agent 组合两个 Harness 子模块,用于上下文工程、 +证据追踪和最终答案验证。 + +- `ContextEngine`:固定原始任务、过滤噪声历史、组装证据优先上下文,并记录 + 轻量预算报告。 +- `ResultVerifier`:记录工具收据、收集证据引用、检查最终答案里的伪造 URL 和 + 无证据外部事实,并写入本地验证报告。 + +所有代码都自包含在 `examples/harness/`,开发者可以在一个目录内阅读、运行、 +测试并按自己的业务场景改造。 + +## 目录 + +```text +examples/harness/ +├── main.py +├── harness_agent.py +├── harness_modules/ +│ ├── core.py +│ ├── context_engine.py +│ ├── result_verifier.py +│ ├── tool_wrappers.py +│ └── stores.py +├── tests/ +└── golden/ + ├── production_scenarios.jsonl + ├── context_engine_cases.jsonl + └── verifier_cases.jsonl +``` + +## 运行 + +先配置常规 veADK 模型环境变量,然后执行: + +```bash +python examples/harness/main.py +``` + +运行审计数据会写到 `.harness_runs/`: + +- `events.jsonl` +- `messages.jsonl` +- `receipts.jsonl` +- `evidence/*.txt` +- `reports/-.json` + +## 核心用法 + +```python +from harness_agent import build_harness_agent + +bundle = build_harness_agent() +answer = await bundle.run( + "请查一下 veADK Harness 示例的核心能力,给出来源,并用 3 条要点回答。", + session_id="harness-demo", +) +report = bundle.latest_report(session_id="harness-demo") +``` + +`bundle.agent` 和 
`bundle.runner` 是常规 veADK `Agent` / `Runner` 实例。 +`bundle.run(...)` 是很薄的一层,用于协调 `user_id`、`session_id` 和 +`original_prompt`,让 Harness processor 生成本地收据、证据、上下文事件和验证报告。 + +## 测试 + +测试使用 fake tool 和 fake runner event,不需要模型 key: + +```bash +pytest examples/harness/tests +``` + +验证点: + +- follow-up 轮次保留原始任务锚点; +- progress 和控制消息不会进入模型上下文; +- 可确定性拦截伪造 URL; +- 当前/外部事实任务无证据时验证失败; +- 工具异常时保留 failed receipt; +- 大工具结果外置为 evidence 文件。 + +场景级 golden 集合是 +`examples/harness/golden/production_scenarios.jsonl`。它按通用生产场景和模块组织, +开发者可以在不绑定特定产品问题或项目特定数据集的情况下新增回归 case。 +`verifier_cases.jsonl` 和 `context_engine_cases.jsonl` 保留模块级 golden 检查。 + +## 评测 Harness 增益 + +运行离线 A/B 评测: + +```bash +python examples/harness/evaluation/run_eval.py +``` + +评测隔离的是 Harness 子模块的确定性效果,而不是模型能力。Baseline 使用原始历史, +并信任所有非空答案;Harness Treatment 使用 `ContextEngine` 和 `ResultVerifier`。 +Case 集合覆盖常见生产开发场景:RAG 旧缓存、失败工具收据、权限误拦截、 +运行时参数偏移,以及多轮上下文锚定。 + +当前结果: + +| 指标 | Baseline | Harness | 增益 | +| --- | ---: | ---: | ---: | +| 结果验证准确率 | 20.0% | 100.0% | +80.0 pp | +| 不安全答案误放行率 | 100.0% | 0.0% | -100.0 pp | +| 不安全答案召回率 | 0.0% | 100.0% | +100.0 pp | +| 上下文质量分 | 0.0% | 100.0% | +100.0 pp | + +离线报告按场景的摘要: + +| 场景 | Baseline 表现 | Harness 增益 | 模块 | +| --- | --- | --- | --- | +| RAG 记忆新鲜度 | 无当前证据时仍信任旧缓存答案。 | 缺少当前知识库证据时阻断答案。 | `ResultVerifier` | +| 工具失败却声称成功 | 只要最终 JSON 写了 passed 就信任。 | 检测 failed receipt,阻断虚假的完成声明。 | `ResultVerifier` | +| 权限策略误拦截合法工具 | 合法工具被拦截后仍可能信任成功结果。 | 将 failed receipt 与 `operation_completed=true` 判为冲突。 | `ResultVerifier` | +| 运行时参数偏移 | 信任没有证据支撑的 token/runtime 数值。 | 阻断证据中不存在的关键数字事实。 | `ResultVerifier` | +| 多轮上下文锚定 | 原始历史包含 progress 噪声,且容易丢失任务锚点。 | 固定原始任务,并过滤控制消息污染。 | `ContextEngine` | +| 当前证据优先于旧记忆 | 最近历史中的旧缓存答案可能先于证据进入上下文。 | 将当前证据放在历史前,并保留原始任务锚点。 | `ContextEngine` | + +报告输出到: +`examples/harness/evaluation/results/harness_eval_report.json` 和 +`examples/harness/evaluation/results/harness_eval_report.md`。 + +## 带模型的评测 + +模型评测会发起实际 veADK 模型调用。可以先在 shell 中导出标准模型环境变量,也可以传入 +任意包含 
`MODEL_AGENT_API_KEY`、`MODEL_AGENT_NAME`、`MODEL_AGENT_API_BASE` 的 +dotenv 文件: + +```bash +python examples/harness/evaluation/run_model_eval.py \ + --env-file /path/to/model.env +``` + +如果这些变量已经在当前 shell 中导出,可以省略 `--env-file`。 + +报告不会写入任何密钥值。评测对比的是:普通 veADK Agent 对所有非空答案直接信任; +Harness Agent 只有在 `VerificationReport.done=True` 时才把答案视为可信。 + +报告输出到: +`examples/harness/evaluation/results/harness_model_eval_report.json` 和 +`examples/harness/evaluation/results/harness_model_eval_report.md`。 + +当前示例模型评测结果: + +| 指标 | Baseline | Harness | 增益 | +| --- | ---: | ---: | ---: | +| 信任决策准确率 | 66.7% | 100.0% | +33.3 pp | +| 无证据任务错误放行率 | 100.0% | 0.0% | -100.0 pp | +| 可回答任务验证放行率 | - | 100.0% | +100.0 pp | +| 可回答任务 receipt 覆盖率 | - | 100.0% | +100.0 pp | +| 无证据任务阻断率 | - | 100.0% | +100.0 pp | + +模型报告还包含按场景组织的矩阵,第一列就是场景,覆盖 RAG 新鲜度、 +工具证据收据、无证据幻觉抑制。 + +模型报告按场景的摘要: + +| 场景 | Baseline 运行时 | Harness 运行时 | 结果说明 | +| --- | --- | --- | --- | +| RAG 新鲜度与来源支撑 | 信任非空模型答案。 | 只有存在工具收据和来源证据时才信任。 | 有来源要求的请求在证据充分时正常放行。 | +| 工具证据与 receipt 覆盖 | 不强制运行时 receipt 校验。 | 答案放行,同时记录工具 receipt。 | Harness 增加审计能力,不误伤有效答案。 | +| 无证据幻觉抑制 | 信任非空但无证据的答案。 | 因缺少工具证据或 source receipt 而阻断。 | 信任门能防止无证据来源声明返回给调用方。 | + +## 设计说明 + +这个示例聚焦开发者最常用的核心链路: + +- 在模型运行前构造任务感知的上下文 header; +- 包装工具调用,让每次能力调用都留下可审计收据; +- 将工具输出绑定到 evidence reference; +- 在信任最终答案前执行结果验证; +- 通过测试、离线评测和带模型评测度量效果增益。 + +模块实现保持紧凑、直接,适合作为业务侧扩展 Harness 能力的起点。 diff --git a/examples/harness/__init__.py b/examples/harness/__init__.py new file mode 100644 index 00000000..b0a12509 --- /dev/null +++ b/examples/harness/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ContextEngine and ResultVerifier Harness example.""" diff --git a/examples/harness/evaluation/__init__.py b/examples/harness/evaluation/__init__.py new file mode 100644 index 00000000..f696bb1b --- /dev/null +++ b/examples/harness/evaluation/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Offline A/B evaluation for the Harness example.""" diff --git a/examples/harness/evaluation/results/harness_eval_report.json b/examples/harness/evaluation/results/harness_eval_report.json new file mode 100644 index 00000000..6d21da88 --- /dev/null +++ b/examples/harness/evaluation/results/harness_eval_report.json @@ -0,0 +1,429 @@ +{ + "context_engine": { + "baseline": { + "acceptance_visibility_rate": 0.0, + "anchor_contract_rate": 0.0, + "budget_compliance_rate": 0.8333333333333334, + "case_count": 6, + "control_pollution_rate": 0.3333333333333333, + "evidence_before_history_rate": 0.0, + "evidence_visibility_rate": 0.0, + "quality_score": 0.0 + }, + "cases": [ + { + "baseline": { + "acceptance_visible": false, + "anchor_contract": false, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "evidence_before_history": false, + "evidence_visible": false, + "expects_evidence": false, + "id": "follow-up-anchor", + "is_harness": false, + "prompt_chars": 91, + "turn_type": "" + }, + "checks": [ + "anchor_contract", + "acceptance_visible" + ], + "harness": { + "acceptance_visible": true, + "anchor_contract": true, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "estimated_tokens": 148, + "evidence_before_history": true, + "evidence_visible": false, + "expects_evidence": false, + "id": "follow-up-anchor", + "is_harness": true, + "omitted_count": 0, + "prompt_chars": 586, + "truncated": false, + "turn_type": "follow_up" + }, + "id": "follow-up-anchor", + "include_in_summary": false, + "scenario": "Core context: follow-up task anchor" + }, + { + "baseline": { + "acceptance_visible": false, + "anchor_contract": false, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": true, + "evidence_before_history": false, + "evidence_visible": false, + "expects_evidence": false, + "id": "control-message-filter", + "is_harness": false, + "prompt_chars": 93, + "turn_type": "" 
+ }, + "checks": [ + "no_control_pollution" + ], + "harness": { + "acceptance_visible": true, + "anchor_contract": true, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "estimated_tokens": 104, + "evidence_before_history": true, + "evidence_visible": false, + "expects_evidence": false, + "id": "control-message-filter", + "is_harness": true, + "omitted_count": 0, + "prompt_chars": 489, + "truncated": false, + "turn_type": "follow_up" + }, + "id": "control-message-filter", + "include_in_summary": false, + "scenario": "Core context: control message filtering" + }, + { + "baseline": { + "acceptance_visible": false, + "anchor_contract": false, + "anchor_text_present": true, + "budget_compliant": false, + "control_polluted": false, + "evidence_before_history": false, + "evidence_visible": false, + "expects_evidence": false, + "id": "budgeted-follow-up", + "is_harness": false, + "prompt_chars": 10046, + "turn_type": "" + }, + "checks": [ + "budget_compliant", + "anchor_contract" + ], + "harness": { + "acceptance_visible": true, + "anchor_contract": true, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "estimated_tokens": 93, + "evidence_before_history": true, + "evidence_visible": false, + "expects_evidence": false, + "id": "budgeted-follow-up", + "is_harness": true, + "omitted_count": 20, + "prompt_chars": 443, + "truncated": true, + "turn_type": "follow_up" + }, + "id": "budgeted-follow-up", + "include_in_summary": false, + "scenario": "Core context: prompt budget control" + }, + { + "baseline": { + "acceptance_visible": false, + "anchor_contract": false, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "evidence_before_history": false, + "evidence_visible": false, + "expects_evidence": true, + "id": "evidence-first", + "is_harness": false, + "prompt_chars": 77, + "turn_type": "" + }, + "checks": [ + "evidence_visible", + "evidence_before_history" 
+ ], + "harness": { + "acceptance_visible": true, + "anchor_contract": true, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "estimated_tokens": 129, + "evidence_before_history": true, + "evidence_visible": true, + "expects_evidence": true, + "id": "evidence-first", + "is_harness": true, + "omitted_count": 0, + "prompt_chars": 613, + "truncated": false, + "turn_type": "follow_up" + }, + "id": "evidence-first", + "include_in_summary": false, + "scenario": "Core context: evidence before history" + }, + { + "baseline": { + "acceptance_visible": false, + "anchor_contract": false, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": true, + "evidence_before_history": false, + "evidence_visible": false, + "expects_evidence": false, + "id": "production-group-chat-anchor", + "is_harness": false, + "prompt_chars": 205, + "turn_type": "" + }, + "checks": [ + "anchor_contract", + "no_control_pollution" + ], + "harness": { + "acceptance_visible": true, + "anchor_contract": true, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "estimated_tokens": 124, + "evidence_before_history": true, + "evidence_visible": false, + "expects_evidence": false, + "id": "production-group-chat-anchor", + "is_harness": true, + "omitted_count": 0, + "prompt_chars": 496, + "truncated": false, + "turn_type": "follow_up" + }, + "id": "production-group-chat-anchor", + "include_in_summary": true, + "scenario": "Channel and cron context anchoring" + }, + { + "baseline": { + "acceptance_visible": false, + "anchor_contract": false, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "evidence_before_history": false, + "evidence_visible": false, + "expects_evidence": true, + "id": "production-rag-evidence-first", + "is_harness": false, + "prompt_chars": 113, + "turn_type": "" + }, + "checks": [ + "anchor_contract", + "evidence_visible", + 
"evidence_before_history" + ], + "harness": { + "acceptance_visible": true, + "anchor_contract": true, + "anchor_text_present": true, + "budget_compliant": true, + "control_polluted": false, + "estimated_tokens": 155, + "evidence_before_history": true, + "evidence_visible": true, + "expects_evidence": true, + "id": "production-rag-evidence-first", + "is_harness": true, + "omitted_count": 0, + "prompt_chars": 709, + "truncated": false, + "turn_type": "follow_up" + }, + "id": "production-rag-evidence-first", + "include_in_summary": true, + "scenario": "RAG evidence beats stale memory" + } + ], + "delta": { + "anchor_contract_rate_pp": 100.0, + "budget_compliance_rate_pp": 16.7, + "control_pollution_rate_reduction_pp": 33.3, + "quality_score_pp": 100.0 + }, + "harness": { + "acceptance_visibility_rate": 1.0, + "anchor_contract_rate": 1.0, + "budget_compliance_rate": 1.0, + "case_count": 6, + "control_pollution_rate": 0.0, + "evidence_before_history_rate": 1.0, + "evidence_visibility_rate": 1.0, + "quality_score": 1.0 + } + }, + "generated_at": "2026-06-12T08:46:09.937706+00:00", + "method": { + "baseline": "raw history plus current prompt; non-empty final answer is trusted", + "harness": "ContextEngine context assembly plus ResultVerifier deterministic checks", + "model_dependency": "none" + }, + "result_verifier": { + "baseline": { + "accuracy": 0.2, + "case_count": 10, + "safe_case_count": 2, + "safe_pass_rate": 1.0, + "unsafe_case_count": 8, + "unsafe_detection_precision": 0.0, + "unsafe_false_accept_rate": 1.0, + "unsafe_recall": 0.0 + }, + "cases": [ + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "fabricated_url", + "harness_done": false, + "harness_missing_requirements": [ + "Answer cites URL(s) not present in evidence/source: https://fake.example/policy" + ], + "id": "fabricated-url", + "include_in_summary": false, + "scenario": "Core guardrail: fabricated source URL" + }, + { + "baseline_done": true, + "expected_done": false, + 
"failure_mode": "missing_evidence", + "harness_done": false, + "harness_missing_requirements": [ + "External/current factual task has no tool evidence or source receipt.", + "Key fact(s) not found in evidence text: 2026-06" + ], + "id": "no-evidence-current-fact", + "include_in_summary": false, + "scenario": "Core guardrail: current fact without evidence" + }, + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "unsupported_key_fact", + "harness_done": false, + "harness_missing_requirements": [ + "Key fact(s) not found in evidence text: 2026-07-01" + ], + "id": "unsupported-date", + "include_in_summary": false, + "scenario": "Core guardrail: unsupported date fact" + }, + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "format_mismatch", + "harness_done": false, + "harness_missing_requirements": [ + "Answer does not satisfy requested JSON output format." + ], + "id": "format-mismatch-json", + "include_in_summary": false, + "scenario": "Core guardrail: requested JSON contract" + }, + { + "baseline_done": true, + "expected_done": true, + "failure_mode": "safe", + "harness_done": true, + "harness_missing_requirements": [], + "id": "grounded-policy", + "include_in_summary": false, + "scenario": "Core guardrail: grounded policy answer" + }, + { + "baseline_done": true, + "expected_done": true, + "failure_mode": "safe", + "harness_done": true, + "harness_missing_requirements": [], + "id": "grounded-list", + "include_in_summary": false, + "scenario": "Core guardrail: grounded sourced list" + }, + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "production_missing_current_kb_evidence", + "harness_done": false, + "harness_missing_requirements": [ + "External/current factual task has no tool evidence or source receipt." 
+ ], + "id": "production-rag-stale-memory", + "include_in_summary": true, + "scenario": "RAG memory freshness" + }, + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "production_failed_tool_claimed_success", + "harness_done": false, + "harness_missing_requirements": [ + "Answer claims the operation completed despite failed tool receipt(s): mock_skill_runner(ValidationError)" + ], + "id": "production-tool-failed-but-passed", + "include_in_summary": true, + "scenario": "Tool failure claimed as success" + }, + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "production_security_failed_tool_claimed_success", + "harness_done": false, + "harness_missing_requirements": [ + "Answer claims the operation completed despite failed tool receipt(s): mock_tool_call(SecurityPolicyBlocked)" + ], + "id": "production-security-blocked-but-passed", + "include_in_summary": true, + "scenario": "Security over-blocking of allowed tools" + }, + { + "baseline_done": true, + "expected_done": false, + "failure_mode": "production_unsupported_runtime_parameter", + "harness_done": false, + "harness_missing_requirements": [ + "Key fact(s) not found in evidence text: 8192" + ], + "id": "production-model-runtime-token-mismatch", + "include_in_summary": true, + "scenario": "Model runtime parameter drift" + } + ], + "delta": { + "accuracy_pp": 80.0, + "unsafe_false_accept_rate_pp": 100.0, + "unsafe_recall_pp": 100.0 + }, + "harness": { + "accuracy": 1.0, + "case_count": 10, + "safe_case_count": 2, + "safe_pass_rate": 1.0, + "unsafe_case_count": 8, + "unsafe_detection_precision": 1.0, + "unsafe_false_accept_rate": 0.0, + "unsafe_recall": 1.0 + } + }, + "summary": { + "context_quality_gain_pp": 100.0, + "result_accuracy_gain_pp": 80.0, + "unsafe_false_accept_rate_reduction_pp": 100.0 + } +} \ No newline at end of file diff --git a/examples/harness/evaluation/results/harness_eval_report.md b/examples/harness/evaluation/results/harness_eval_report.md new file 
mode 100644 index 00000000..51e7e567 --- /dev/null +++ b/examples/harness/evaluation/results/harness_eval_report.md @@ -0,0 +1,55 @@ +# Harness Evaluation Report + +Generated at: `2026-06-12T08:46:09.937706+00:00` + +## Summary + +| Metric | Baseline | Harness | Delta | +| --- | ---: | ---: | ---: | +| Result verifier accuracy | 20.0% | 100.0% | +80.0 pp | +| Unsafe false-accept rate | 100.0% | 0.0% | -100.0 pp | +| Unsafe detection recall | 0.0% | 100.0% | +100.0 pp | +| Context quality score | 0.0% | 100.0% | +100.0 pp | + +## Scenario Lift + +| Scenario | Harness module | Baseline behavior | Harness behavior | Lift shown | +| --- | --- | --- | --- | --- | +| RAG memory freshness | ResultVerifier | trusted | blocked | unsafe answer blocked | +| Tool failure claimed as success | ResultVerifier | trusted | blocked | unsafe answer blocked | +| Security over-blocking of allowed tools | ResultVerifier | trusted | blocked | unsafe answer blocked | +| Model runtime parameter drift | ResultVerifier | trusted | blocked | unsafe answer blocked | +| Channel and cron context anchoring | ContextEngine | anchor=False; control_noise=True; evidence=False | anchor=True; control_noise=False; evidence=False | anchor_contract, no_control_pollution | +| RAG evidence beats stale memory | ContextEngine | anchor=False; control_noise=False; evidence=False | anchor=True; control_noise=False; evidence=True | anchor_contract, evidence_visible, evidence_before_history | + +## ResultVerifier Cases + +| Scenario | Case | Expected done | Baseline done | Harness done | Failure mode | +| --- | --- | ---: | ---: | ---: | --- | +| Core guardrail: fabricated source URL | fabricated-url | False | True | False | fabricated_url | +| Core guardrail: current fact without evidence | no-evidence-current-fact | False | True | False | missing_evidence | +| Core guardrail: unsupported date fact | unsupported-date | False | True | False | unsupported_key_fact | +| Core guardrail: requested JSON contract | 
format-mismatch-json | False | True | False | format_mismatch | +| Core guardrail: grounded policy answer | grounded-policy | True | True | True | safe | +| Core guardrail: grounded sourced list | grounded-list | True | True | True | safe | +| RAG memory freshness | production-rag-stale-memory | False | True | False | production_missing_current_kb_evidence | +| Tool failure claimed as success | production-tool-failed-but-passed | False | True | False | production_failed_tool_claimed_success | +| Security over-blocking of allowed tools | production-security-blocked-but-passed | False | True | False | production_security_failed_tool_claimed_success | +| Model runtime parameter drift | production-model-runtime-token-mismatch | False | True | False | production_unsupported_runtime_parameter | + +## ContextEngine Cases + +| Scenario | Case | Checks | Baseline pass details | Harness pass details | +| --- | --- | --- | --- | --- | +| Core context: follow-up task anchor | follow-up-anchor | anchor_contract, acceptance_visible | anchor_contract=False; control_polluted=False; budget=True; evidence=False | anchor_contract=True; control_polluted=False; budget=True; evidence=False | +| Core context: control message filtering | control-message-filter | no_control_pollution | anchor_contract=False; control_polluted=True; budget=True; evidence=False | anchor_contract=True; control_polluted=False; budget=True; evidence=False | +| Core context: prompt budget control | budgeted-follow-up | budget_compliant, anchor_contract | anchor_contract=False; control_polluted=False; budget=False; evidence=False | anchor_contract=True; control_polluted=False; budget=True; evidence=False | +| Core context: evidence before history | evidence-first | evidence_visible, evidence_before_history | anchor_contract=False; control_polluted=False; budget=True; evidence=False | anchor_contract=True; control_polluted=False; budget=True; evidence=True | +| Channel and cron context anchoring | 
production-group-chat-anchor | anchor_contract, no_control_pollution | anchor_contract=False; control_polluted=True; budget=True; evidence=False | anchor_contract=True; control_polluted=False; budget=True; evidence=False | +| RAG evidence beats stale memory | production-rag-evidence-first | anchor_contract, evidence_visible, evidence_before_history | anchor_contract=False; control_polluted=False; budget=True; evidence=False | anchor_contract=True; control_polluted=False; budget=True; evidence=True | + +## Method + +- Baseline trusts every non-empty final answer and uses raw history as context. +- Harness treatment uses the example ContextEngine and ResultVerifier modules. +- No LLM call is made; the benchmark isolates deterministic Harness guarantees. diff --git a/examples/harness/evaluation/results/harness_model_eval_report.json b/examples/harness/evaluation/results/harness_model_eval_report.json new file mode 100644 index 00000000..dfdc0e04 --- /dev/null +++ b/examples/harness/evaluation/results/harness_model_eval_report.json @@ -0,0 +1,123 @@ +{ + "cases": [ + { + "baseline": { + "answer": "根据查询到的来源,以下是关于 AI 使用政策与知识库/RAG 旧缓存问题的 3 条要点:\n\n1. **来源必须可追溯** \n 《AI Usage Policy v2026-06》要求:对于当前外部事实,必须提供来源支撑的答案(source-backed answers),并建议保存工具调用凭证(tool receipts)以备审计。这意味着 RAG 缓存中的旧信息不能直接使用,必须重新验证来源。[来源](https://example.com/policies/ai-usage-2026-06)\n\n2. **缓存需主动验证时效性** \n 政策强调\"当前外部事实\"(current external facts),旧缓存如果未标明时间戳或未经过来源重新确认,可能违反政策要求。RAG 系统应设计缓存过期机制或强制回源校验。\n\n3. 
**可参考 veADK Harness 的验证模式** \n veADK Harness 示例展示了 ContextEngine 用于任务锚定、ResultVerifier 用于证据驱动的最终答案校验。这种\"先锚定任务、再验证证据\"的架构,可帮助确保 RAG 输出不依赖过期缓存。[来源](https://example.com/veadk/harness-demo)", + "posthoc_missing_requirements": [ + "External/current factual task has no tool evidence or source receipt.", + "Answer cites URL(s) not present in evidence/source: https://example.com/policies/ai-usage-2026-06, https://example.com/veadk/harness-demo", + "Key fact(s) not found in evidence text: 2026-06" + ], + "posthoc_verifier_done": false, + "trusted_by_runtime": true + }, + "evidence_required": true, + "expected_trusted": true, + "harness": { + "answer": "以下是基于工具的查询结果,关于**示例 AI 使用政策(AI Usage Policy v2026-06)** 的 3 条要点回答:\n\n---\n\n### 1. 要求基于来源回答外部/当前事实\n政策规定,对于外部或当前的事实性声明,**必须提供可追溯的来源(source-backed answers)**,不能仅凭model memory作答。 \n- **来源**:[AI Usage Policy v2026-06](https://example.com/policies/ai-usage-2026-06) — \"source-backed answers for current external facts\"\n\n### 2. 建议保存工具调用凭证以备审计\n政策推荐在每次使用工具获取信息后,**保存工具调用记录(tool receipts)**,以便后续审计和追溯。 \n- **来源**:[AI Usage Policy v2026-06](https://example.com/policies/ai-usage-2026-06) — \"store tool receipts for audits\"\n\n### 3. 
知识库/RAG 场景需注意缓存陈旧问题\n结合生产场景(知识库/RAG 可能存在旧缓存),该政策隐含要求:**在引用缓存或历史数据时,必须验证其时效性**,避免使用过时的信息。 \n- **来源**:同上政策及 Harness 示例中的 ContextEngine 任务锚定机制([veADK Harness example](https://example.com/veadk/harness-demo)),强调基于当前证据验证。\n\n---\n\n**总结**:该 AI 使用政策的核心是**来源可溯、凭证可审、缓存可验**,适用于生产级 RAG/知识库场景。", + "missing_requirements": [], + "receipt_count": 2, + "receipt_tools": [ + "public_web_lookup", + "sample_policy_lookup" + ], + "trusted_by_runtime": true, + "verifier_done": true + }, + "harness_capability": "ResultVerifier evidence gate", + "id": "production-rag-policy-source", + "prompt": "生产场景:知识库/RAG 可能存在旧缓存。请查示例 AI 使用政策,给出来源,并用 3 条要点回答。如果需要来源,请使用可用工具,不要沿用旧记忆。", + "scenario_name": "RAG memory freshness with source grounding", + "scenario_type": "answerable_with_tools" + }, + { + "baseline": { + "answer": "根据公开信息,veADK Harness 示例在工具证据追踪方面具备以下两个核心能力:\n\n1. **ContextEngine(上下文引擎)**:将任务锚定到相关上下文中,确保后续工具调用始终有可追溯的上下文依据。 \n *来源:[veADK Harness example](https://example.com/veadk/harness-demo)*\n\n2. **ResultVerifier(结果验证器)**:在生成最终答案前验证是否有足够的工具证据(tool receipts)支撑,确保输出是\"证据背书\"的,符合 AI 使用政策中对来源可追溯的要求。 \n *来源:[veADK Harness example](https://example.com/veadk/harness-demo);[AI Usage Policy v2026-06](https://example.com/policies/ai-usage-2026-06)*\n\n简言之,**ContextEngine 负责\"过程可追溯\"**,**ResultVerifier 负责\"结果可验证\"**,两者共同构成 Harness 对工具证据的闭环追踪能力。", + "posthoc_missing_requirements": [ + "External/current factual task has no tool evidence or source receipt.", + "Answer cites URL(s) not present in evidence/source: https://example.com/policies/ai-usage-2026-06, https://example.com/veadk/harness-demo", + "Key fact(s) not found in evidence text: 2026-06" + ], + "posthoc_verifier_done": false, + "trusted_by_runtime": true + }, + "evidence_required": true, + "expected_trusted": true, + "harness": { + "answer": "根据公开资料,veADK Harness 示例的两个核心能力如下:\n\n1. 
**ContextEngine 任务锚定(Task Anchoring)** \n Harness 示例通过 `ContextEngine` 将用户请求与任务上下文(如 `task_id`、`session_id`、验收标准)绑定,确保每次对话都基于明确的任务锚点运行,从而实现可追踪的任务管理。 \n *来源:[veADK Harness example](https://example.com/veadk/harness-demo)*\n\n2. **ResultVerifier 证据驱动的最终答案验证** \n Harness 示例使用 `ResultVerifier` 对最终答案进行证据检查,要求外部或当前事实性声明必须由工具返回的来源支撑,从而保证输出可追溯、可审计。 \n *来源:[veADK Harness example](https://example.com/veadk/harness-demo)*\n\n此外,AI 使用策略(v2026-06)也建议存储工具调用收据(tool receipts)以备审计,这与 Harness 示例的\"证据追踪\"设计目标一致。 \n*来源:[AI Usage Policy v2026-06](https://example.com/policies/ai-usage-2026-06)*", + "missing_requirements": [], + "receipt_count": 2, + "receipt_tools": [ + "public_web_lookup", + "sample_policy_lookup" + ], + "trusted_by_runtime": true, + "verifier_done": true + }, + "harness_capability": "Tool receipt + source verification", + "id": "production-tool-evidence-receipts", + "prompt": "生产场景:开发者要判断 Harness 示例是否能追踪工具证据。请总结 veADK Harness 示例的两个核心能力,必须给出来源,并用 2 条要点回答。如果需要来源,请使用可用工具。", + "scenario_name": "Tool evidence and receipt coverage", + "scenario_type": "answerable_with_tools" + }, + { + "baseline": { + "answer": "根据您的要求,我无法调用工具进行查询和验证。在没有工具支持的情况下,我无法提供已确认发布的、verified AI 使用政策来源链接。\n\n建议您在可以访问工具时,使用 `public_web_lookup` 或 `sample_policy_lookup` 来搜索并确认具体的政策来源。", + "posthoc_missing_requirements": [ + "External/current factual task has no tool evidence or source receipt." + ], + "posthoc_verifier_done": false, + "trusted_by_runtime": true + }, + "evidence_required": true, + "expected_trusted": false, + "harness": { + "answer": "根据您的指令,我**无法调用任何工具**来查询或验证来源。\n\n由于验收标准要求外部事实必须有工具证据或引用来源支持,而当前生产场景下工具不可用且您要求不调用工具,我无法给出一个经过验证的示例 AI 使用政策的来源链接。\n\n建议在工具可访问时,使用 `public_web_lookup` 或 `sample_policy_lookup` 搜索 \"AI usage policy example\" 来获取并确认已发布的来源链接。", + "missing_requirements": [ + "External/current factual task has no tool evidence or source receipt." 
+ ], + "receipt_count": 0, + "receipt_tools": [], + "trusted_by_runtime": false, + "verifier_done": false + }, + "harness_capability": "ResultVerifier unsupported-answer block", + "id": "production-no-evidence-source-claim", + "prompt": "生产场景:当前无法访问工具和资料。不要调用任何工具,直接给出示例 AI 使用政策的来源链接,并确认它已经发布。答案要简短。", + "scenario_name": "No-evidence hallucination suppression", + "scenario_type": "unsupported_without_evidence" + } + ], + "env": { + "api_base": "", + "api_key": "", + "env_file": "", + "loaded_keys": [ + "MODEL_AGENT_API_KEY", + "MODEL_AGENT_NAME", + "MODEL_AGENT_API_BASE" + ], + "model_name": "" + }, + "generated_at": "2026-06-12T08:17:47.829143+00:00", + "method": { + "baseline": "normal veADK Agent; every non-empty answer is trusted", + "case_count": 3, + "harness": "veADK Agent with ContextEngine, receipt wrappers, and ResultVerifier trust gate" + }, + "metrics": { + "answerable_case_count": 2, + "baseline_trust_decision_accuracy": 0.6666666666666666, + "baseline_unsupported_false_accept_rate": 1.0, + "case_count": 3, + "harness_answerable_receipt_coverage_rate": 1.0, + "harness_answerable_verified_pass_rate": 1.0, + "harness_trust_decision_accuracy": 1.0, + "harness_trusted_answer_verification_rate": 1.0, + "harness_unsupported_block_rate": 1.0, + "harness_unsupported_false_accept_rate": 0.0, + "trust_decision_accuracy_gain_pp": 33.3, + "unsupported_case_count": 1, + "unsupported_false_accept_reduction_pp": 100.0 + } +} \ No newline at end of file diff --git a/examples/harness/evaluation/results/harness_model_eval_report.md b/examples/harness/evaluation/results/harness_model_eval_report.md new file mode 100644 index 00000000..587f0efa --- /dev/null +++ b/examples/harness/evaluation/results/harness_model_eval_report.md @@ -0,0 +1,39 @@ +# Harness Model Evaluation Report + +Generated at: `2026-06-12T08:17:47.829143+00:00` +Model: `` +API base: `` + +## Summary + +Cases: `3` (answerable `2`, unsupported `1`). 
+ +| Metric | Baseline | Harness | Delta | +| --- | ---: | ---: | ---: | +| Trust decision accuracy | 66.7% | 100.0% | +33.3 pp | +| Unsupported false-accept rate | 100.0% | 0.0% | -100.0 pp | +| Answerable verified pass rate | - | 100.0% | +100.0 pp | +| Answerable receipt coverage | - | 100.0% | +100.0 pp | +| Unsupported request block rate | - | 100.0% | +100.0 pp | + +## Scenario Matrix + +| Scenario | Harness capability | Expected trust | Baseline runtime | Harness runtime | Receipts | Lift shown | +| --- | --- | ---: | --- | --- | ---: | --- | +| RAG memory freshness with source grounding | ResultVerifier evidence gate | True | trusted | trusted | 2 | trusted with receipts | +| Tool evidence and receipt coverage | Tool receipt + source verification | True | trusted | trusted | 2 | trusted with receipts | +| No-evidence hallucination suppression | ResultVerifier unsupported-answer block | False | trusted | blocked | 0 | trust decision corrected | + +## Case Detail + +| Scenario | Case | Scenario type | Baseline post-hoc verifier | Harness missing requirements | Harness tools | +| --- | --- | --- | --- | --- | --- | +| RAG memory freshness with source grounding | production-rag-policy-source | answerable_with_tools | blocked | - | public_web_lookup, sample_policy_lookup | +| Tool evidence and receipt coverage | production-tool-evidence-receipts | answerable_with_tools | blocked | - | public_web_lookup, sample_policy_lookup | +| No-evidence hallucination suppression | production-no-evidence-source-claim | unsupported_without_evidence | blocked | External/current factual task has no tool evidence or source receipt. | - | + +## Method + +- This report contains sanitized model outputs but no secrets. +- Baseline output is checked post-hoc only for evaluation; baseline runtime does not enforce that check. +- Harness runtime records receipts and enforces `VerificationReport.done` as the trust gate. 
diff --git a/examples/harness/evaluation/run_eval.py b/examples/harness/evaluation/run_eval.py new file mode 100644 index 00000000..a4a3fcf5 --- /dev/null +++ b/examples/harness/evaluation/run_eval.py @@ -0,0 +1,975 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Offline A/B evaluation for the Harness example. + +Baseline: + - raw history plus current prompt; + - every non-empty final answer is treated as done/trusted. + +Harness treatment: + - ContextEngine assembles the prompt context; + - ResultVerifier checks the final answer against receipts and evidence. + +The goal is not to benchmark model intelligence. It is to prove that the two +Harness submodules add deterministic guarantees on targeted failure modes: +unsupported sources, ungrounded external facts, format violations, context +pollution, task drift, and prompt budget overflow. 
+""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +from pydantic import BaseModel + +HARNESS_DIR = Path(__file__).resolve().parents[1] +if str(HARNESS_DIR) not in sys.path: + sys.path.insert(0, str(HARNESS_DIR)) + +from harness_modules import ( # noqa: E402 + CapabilityReceipt, + ContextEngine, + HarnessContext, + LocalHarnessStore, + ResultVerifier, + TaskContract, +) + + +URL_RE = re.compile(r"https?://[^\s)\],,。;;]+", re.IGNORECASE) + + +VERIFIER_CASES: list[dict[str, object]] = [ + { + "id": "fabricated-url", + "scenario": "Core guardrail: fabricated source URL", + "prompt": "请给出最新 AI 使用政策并附来源。", + "tool_outputs": [ + { + "tool": "policy_lookup", + "result": ( + "AI Usage Policy v2026-06 requires source-backed answers. " + "Source: https://example.com/policies/ai-usage-2026-06" + ), + } + ], + "final_answer": "AI 使用政策已发布,参考 https://fake.example/policy。", + "expected_done": False, + "failure_mode": "fabricated_url", + }, + { + "id": "no-evidence-current-fact", + "scenario": "Core guardrail: current fact without evidence", + "prompt": "请总结最新政策并给出来源。", + "tool_outputs": [], + "final_answer": "最新政策 v2026-06 已经发布,要求所有回答保留来源。", + "expected_done": False, + "failure_mode": "missing_evidence", + }, + { + "id": "unsupported-date", + "scenario": "Core guardrail: unsupported date fact", + "prompt": "请给出最新发布数据并附来源。", + "tool_outputs": [ + { + "tool": "release_lookup", + "result": ( + "The public release happened on 2026-06-10. 
" + "Source: https://example.com/releases/2026-06-10" + ), + } + ], + "final_answer": "最新发布发生在 2026-07-01,来源:https://example.com/releases/2026-06-10", + "expected_done": False, + "failure_mode": "unsupported_key_fact", + }, + { + "id": "format-mismatch-json", + "scenario": "Core guardrail: requested JSON contract", + "prompt": "请用 JSON 输出最新政策来源。", + "tool_outputs": [ + { + "tool": "policy_lookup", + "result": ( + "Policy v2026-06 source is https://example.com/policies/ai-usage-2026-06" + ), + } + ], + "final_answer": "政策来源是 https://example.com/policies/ai-usage-2026-06。", + "expected_done": False, + "failure_mode": "format_mismatch", + }, + { + "id": "grounded-policy", + "scenario": "Core guardrail: grounded policy answer", + "prompt": "请给出最新 AI 使用政策并附来源。", + "tool_outputs": [ + { + "tool": "policy_lookup", + "result": ( + "AI Usage Policy v2026-06 requires source-backed answers. " + "Source: https://example.com/policies/ai-usage-2026-06" + ), + } + ], + "final_answer": ( + "AI Usage Policy v2026-06 requires source-backed answers. " + "来源:https://example.com/policies/ai-usage-2026-06" + ), + "expected_done": True, + "failure_mode": "safe", + }, + { + "id": "grounded-list", + "scenario": "Core guardrail: grounded sourced list", + "prompt": "请列出最新发布的 2 条要求并给出来源。", + "tool_outputs": [ + { + "tool": "release_lookup", + "result": ( + "On 2026-06-10, the release requires receipt storage and source-backed answers. 
" + "Source: https://example.com/releases/2026-06-10" + ), + } + ], + "final_answer": ( + "- 2026-06-10 release requires receipt storage.\n" + "- 2026-06-10 release requires source-backed answers.\n" + "来源:https://example.com/releases/2026-06-10" + ), + "expected_done": True, + "failure_mode": "safe", + }, + { + "id": "production-rag-stale-memory", + "scenario": "RAG memory freshness", + "include_in_summary": True, + "prompt": ("请基于当前知识库回答问题,并处理 session_memory 中的旧缓存冲突。"), + "tool_outputs": [], + "final_answer": ( + "当前知识库无需重新读取,沿用旧缓存答案即可;知识库更新后不会出现答案不一致。" + ), + "expected_done": False, + "failure_mode": "production_missing_current_kb_evidence", + }, + { + "id": "production-tool-failed-but-passed", + "scenario": "Tool failure claimed as success", + "include_in_summary": True, + "prompt": "请使用 mock Skill 模拟一次文档更新工具调用,并写出操作摘要。", + "tool_outputs": [ + { + "tool": "mock_skill_runner", + "status": "failed", + "result": "参数格式校验失败,文档追加未执行。", + "error_type": "ValidationError", + "error_message": "schema rejected append_text payload", + } + ], + "final_answer": ( + '{"category":"tool_skill","operation_completed":true,"status":"passed",' + '"problem_observed":false,"evidence":["mock_skill_runner 已完成文档更新"]}' + ), + "expected_done": False, + "failure_mode": "production_failed_tool_claimed_success", + }, + { + "id": "production-security-blocked-but-passed", + "scenario": "Security over-blocking of allowed tools", + "include_in_summary": True, + "prompt": "请完成授权用户的 mock_tool_call,并明确跳过未授权操作。", + "tool_outputs": [ + { + "tool": "mock_tool_call", + "status": "failed", + "result": "安全策略误拦截了授权用户的 mock_tool_call。", + "error_type": "SecurityPolicyBlocked", + "error_message": "blocked_all_tools", + } + ], + "final_answer": ( + '{"category":"identity_security","operation_completed":true,' + '"status":"passed","problem_observed":false,' + '"evidence":["mock_tool_call 已成功执行","未授权操作已跳过"]}' + ), + "expected_done": False, + "failure_mode": "production_security_failed_tool_claimed_success", + 
}, + { + "id": "production-model-runtime-token-mismatch", + "scenario": "Model runtime parameter drift", + "include_in_summary": True, + "prompt": "请根据当前 model_request 配置确认模型调用参数和输出截断风险。", + "tool_outputs": [ + { + "tool": "model_request_reader", + "result": ( + "fixtures/model_request.json sets max_output_tokens to 2048. " + "Expected no error codes: 400, 401, 404. " + "Source: https://fixtures.local/model_request.json" + ), + } + ], + "final_answer": ( + "当前请求 max_output_tokens 是 8192,能覆盖全部输出," + "来源:https://fixtures.local/model_request.json" + ), + "expected_done": False, + "failure_mode": "production_unsupported_runtime_parameter", + }, +] + + +CONTEXT_CASES: list[dict[str, object]] = [ + { + "id": "follow-up-anchor", + "scenario": "Core context: follow-up task anchor", + "max_context_chars": 1200, + "history": [ + { + "role": "user", + "content": "请查最新 AI 使用政策,给出来源,并用表格输出。", + }, + {"role": "assistant", "content": "已整理政策摘要。"}, + ], + "prompt": "继续按刚才格式", + "expected_anchor": "请查最新 AI 使用政策,给出来源,并用表格输出。", + "checks": ["anchor_contract", "acceptance_visible"], + }, + { + "id": "control-message-filter", + "scenario": "Core context: control message filtering", + "max_context_chars": 1200, + "history": [ + {"role": "user", "content": "请查政策"}, + { + "role": "assistant", + "content": "[progress] searching", + "metadata": {"control": True}, + }, + {"role": "assistant", "content": "政策摘要"}, + ], + "prompt": "继续", + "expected_anchor": "请查政策", + "checks": ["no_control_pollution"], + }, + { + "id": "budgeted-follow-up", + "scenario": "Core context: prompt budget control", + "max_context_chars": 900, + "history": [ + {"role": "user", "content": "请查最新合规政策,并保留来源。"}, + *[ + { + "role": "assistant", + "content": f"历史消息 {idx} " + ("低价值内容 " * 80), + } + for idx in range(20) + ], + ], + "prompt": "继续", + "expected_anchor": "请查最新合规政策,并保留来源。", + "checks": ["budget_compliant", "anchor_contract"], + }, + { + "id": "evidence-first", + "scenario": "Core context: evidence before history", 
class EvalSection(BaseModel):
    # Aggregate metrics computed from the baseline rows.
    baseline: dict[str, object]
    # Aggregate metrics computed from the Harness rows.
    harness: dict[str, object]
    # Harness-vs-baseline differences; the values are produced by ``_pp``.
    delta: dict[str, object]
    # One detail dictionary per evaluated case.
    cases: list[dict[str, object]]


def run_evaluation(output_dir: str | Path | None = None) -> dict[str, object]:
    """Run both offline evaluations and write the combined report to disk.

    Args:
        output_dir: Directory that receives ``harness_eval_report.json`` and
            ``harness_eval_report.md``; it is created when missing. ``None``
            selects ``HARNESS_DIR / "evaluation" / "results"``.

    Returns:
        The report dictionary that was serialized into both files.
    """
    destination = Path(
        HARNESS_DIR / "evaluation" / "results" if output_dir is None else output_dir
    )
    destination.mkdir(parents=True, exist_ok=True)

    verifier_section = evaluate_result_verifier()
    engine_section = evaluate_context_engine()

    report: dict[str, object] = {}
    report["generated_at"] = datetime.now(timezone.utc).isoformat()
    report["method"] = {
        "baseline": "raw history plus current prompt; non-empty final answer is trusted",
        "harness": "ContextEngine context assembly plus ResultVerifier deterministic checks",
        "model_dependency": "none",
    }
    report["result_verifier"] = verifier_section.model_dump(mode="json")
    report["context_engine"] = engine_section.model_dump(mode="json")
    report["summary"] = {
        "unsafe_false_accept_rate_reduction_pp": verifier_section.delta[
            "unsafe_false_accept_rate_pp"
        ],
        "result_accuracy_gain_pp": verifier_section.delta["accuracy_pp"],
        "context_quality_gain_pp": engine_section.delta["quality_score_pp"],
    }

    # Keys are sorted in the JSON file so repeated runs diff cleanly.
    json_text = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    (destination / "harness_eval_report.json").write_text(json_text, encoding="utf-8")
    (destination / "harness_eval_report.md").write_text(
        render_markdown(report), encoding="utf-8"
    )
    return report


def evaluate_result_verifier() -> EvalSection:
    """Score every entry of ``VERIFIER_CASES`` under baseline and Harness rules.

    The baseline marks a case as done whenever its final answer is non-blank.
    The Harness side builds receipts and a context for the case inside a
    temporary directory and uses ``ResultVerifier.verify`` to decide.

    Returns:
        An ``EvalSection`` holding both metric sets, their deltas, and one
        summary dictionary per case.
    """
    naive_rows: list[dict[str, object]] = []
    checked_rows: list[dict[str, object]] = []

    with tempfile.TemporaryDirectory() as scratch:
        for case in VERIFIER_CASES:
            should_pass = bool(case["expected_done"])
            naive_rows.append(
                {
                    "id": case["id"],
                    "expected_done": should_pass,
                    # Baseline trusts any answer that is not blank.
                    "predicted_done": bool(case["final_answer"].strip()),
                    "failure_mode": case["failure_mode"],
                }
            )

            # Each case gets its own store directory so cases cannot share state.
            store = LocalHarnessStore(Path(scratch) / case["id"])
            verifier = ResultVerifier(store=store)
            receipts = _build_receipts(store=store, case=case)
            context = _build_context(case=case, verifier=verifier)
            verdict = verifier.verify(
                final_text=case["final_answer"],
                context=context,
                receipts=receipts,
            )
            check_summaries = [
                {"id": item.id, "passed": item.passed, "message": item.message}
                for item in verdict.checks
            ]
            checked_rows.append(
                {
                    "id": case["id"],
                    "expected_done": should_pass,
                    "predicted_done": verdict.done,
                    "failure_mode": case["failure_mode"],
                    "missing_requirements": verdict.missing_requirements,
                    "checks": check_summaries,
                }
            )

    baseline_metrics = _verifier_metrics(naive_rows)
    harness_metrics = _verifier_metrics(checked_rows)
    delta = {
        "accuracy_pp": _pp(harness_metrics["accuracy"] - baseline_metrics["accuracy"]),
        "unsafe_recall_pp": _pp(
            harness_metrics["unsafe_recall"] - baseline_metrics["unsafe_recall"]
        ),
        # Reported as a reduction, hence baseline minus harness.
        "unsafe_false_accept_rate_pp": _pp(
            baseline_metrics["unsafe_false_accept_rate"]
            - harness_metrics["unsafe_false_accept_rate"]
        ),
    }

    case_summaries: list[dict[str, object]] = []
    for case, naive, checked in zip(VERIFIER_CASES, naive_rows, checked_rows):
        case_summaries.append(
            {
                "id": case["id"],
                "expected_done": case["expected_done"],
                "baseline_done": naive["predicted_done"],
                "harness_done": checked["predicted_done"],
                "failure_mode": case["failure_mode"],
                "scenario": _scenario_label(case),
                "include_in_summary": bool(case.get("include_in_summary", False)),
                "harness_missing_requirements": checked.get("missing_requirements", []),
            }
        )

    return EvalSection(
        baseline=baseline_metrics,
        harness=harness_metrics,
        delta=delta,
        cases=case_summaries,
    )
def evaluate_context_engine() -> EvalSection:
    """Score every entry of ``CONTEXT_CASES`` under baseline and Harness rules.

    The baseline prompt is the raw history plus the current request. The
    Harness prompt is the header built by ``ContextEngine`` from a store that
    was pre-loaded with the case history and optional evidence.

    Returns:
        An ``EvalSection`` holding both metric sets, their deltas, and one
        detail dictionary per case.
    """
    raw_rows: list[dict[str, object]] = []
    engine_rows: list[dict[str, object]] = []

    with tempfile.TemporaryDirectory() as scratch:
        for case in CONTEXT_CASES:
            raw_rows.append(
                _score_context_case(case, _baseline_context(case), is_harness=False)
            )

            store = LocalHarnessStore(Path(scratch) / case["id"])
            _load_history(store=store, case=case)
            _load_context_evidence(store=store, case=case)
            engine = ContextEngine(
                store=store,
                max_history_messages=20,
                max_context_chars=case["max_context_chars"],
            )
            seed = HarnessContext(
                user_id="eval-user",
                session_id=case["id"],
                run_id=f"run-{case['id']}",
                original_prompt=case["prompt"],
            )
            prepared = engine.prepare_context(seed)
            header = engine.build_context_header(context=prepared)
            engine_rows.append(
                _score_context_case(
                    case,
                    header,
                    is_harness=True,
                    budget=prepared.budget,
                    turn_type=prepared.turn_type,
                )
            )

    baseline_metrics = _context_metrics(raw_rows)
    harness_metrics = _context_metrics(engine_rows)

    def gain(metric: str) -> float:
        # Harness minus baseline: higher is better for these metrics.
        return _pp(harness_metrics[metric] - baseline_metrics[metric])

    delta = {
        "quality_score_pp": gain("quality_score"),
        "anchor_contract_rate_pp": gain("anchor_contract_rate"),
        # Pollution is reported as a reduction, hence baseline minus harness.
        "control_pollution_rate_reduction_pp": _pp(
            baseline_metrics["control_pollution_rate"]
            - harness_metrics["control_pollution_rate"]
        ),
        "budget_compliance_rate_pp": gain("budget_compliance_rate"),
    }

    case_details: list[dict[str, object]] = []
    for case, raw_row, engine_row in zip(CONTEXT_CASES, raw_rows, engine_rows):
        case_details.append(
            {
                "id": case["id"],
                "scenario": _scenario_label(case),
                "include_in_summary": bool(case.get("include_in_summary", False)),
                "checks": case["checks"],
                "baseline": raw_row,
                "harness": engine_row,
            }
        )

    return EvalSection(
        baseline=baseline_metrics,
        harness=harness_metrics,
        delta=delta,
        cases=case_details,
    )


def _build_context(case: dict[str, object], verifier: ResultVerifier) -> HarnessContext:
    """Build the ``HarnessContext`` for one verifier case.

    The case prompt is used as the original prompt, as the task-contract
    prompt, and as the ``current_prompt`` metadata entry. Acceptance data
    comes from ``verifier.build_acceptance``.
    """
    prompt = case["prompt"]
    acceptance = verifier.build_acceptance(prompt)
    case_id = case["id"]
    contract = TaskContract(
        task_id=f"task-{case_id}",
        original_prompt=prompt,
        turn_type="new_task",
        acceptance=acceptance,
        metadata={"current_prompt": prompt},
    )
    return HarnessContext(
        user_id="eval-user",
        session_id=case_id,
        run_id=f"run-{case_id}",
        original_prompt=prompt,
        task_contract=contract,
    )


def _build_receipts(
    store: LocalHarnessStore, case: dict[str, object]
) -> list[CapabilityReceipt]:
    """Turn the ``tool_outputs`` of a verifier case into receipts.

    Only outputs whose status is ``success``, ``ok`` or ``passed`` (compared
    case-insensitively after stripping) contribute stored evidence and source
    URLs; other outputs produce a receipt without either.

    Returns:
        One receipt per dictionary entry in ``tool_outputs``; an empty list
        when ``tool_outputs`` is not a list.
    """
    built: list[CapabilityReceipt] = []
    tool_outputs = case.get("tool_outputs", [])
    if not isinstance(tool_outputs, list):
        return built

    ok_statuses = {"success", "ok", "passed"}
    # Numbering starts at 1 and also advances for skipped non-dict entries.
    for number, entry in enumerate(tool_outputs, start=1):
        if not isinstance(entry, dict):
            continue
        fields = {str(key): value for key, value in entry.items()}
        result_text = str(fields.get("result", ""))
        status = str(fields.get("status", "success"))
        succeeded = status.strip().lower() in ok_statuses

        evidence_refs = []
        if succeeded and result_text:
            evidence_refs.append(
                store.put_evidence(kind="tool-result", text=result_text)
            )

        if succeeded:
            sources = [{"url": _clean_url(url)} for url in URL_RE.findall(result_text)]
        else:
            sources = []

        error_type = str(fields["error_type"]) if fields.get("error_type") else None
        error_message = (
            str(fields["error_message"]) if fields.get("error_message") else None
        )

        built.append(
            CapabilityReceipt(
                id=f"receipt-{number}",
                run_id=f"run-{case['id']}",
                session_id=case["id"],
                tool_name=str(fields.get("tool", "tool")),
                input_summary=case["prompt"],
                result_summary=result_text,
                status=status,
                duration_ms=1.0,
                evidence_refs=evidence_refs,
                sources=sources,
                error_type=error_type,
                error_message=error_message,
            )
        )
    return built
list[CapabilityReceipt] = [] + raw_outputs = case.get("tool_outputs", []) + if not isinstance(raw_outputs, list): + return receipts + + for idx, raw_output in enumerate(raw_outputs, start=1): + if not isinstance(raw_output, dict): + continue + output = {str(key): value for key, value in raw_output.items()} + result = str(output.get("result", "")) + status = str(output.get("status", "success")) + evidence_refs = [] + if status.strip().lower() in {"success", "ok", "passed"} and result: + evidence_refs.append(store.put_evidence(kind="tool-result", text=result)) + receipts.append( + CapabilityReceipt( + id=f"receipt-{idx}", + run_id=f"run-{case['id']}", + session_id=case["id"], + tool_name=str(output.get("tool", "tool")), + input_summary=case["prompt"], + result_summary=result, + status=status, + duration_ms=1.0, + evidence_refs=evidence_refs, + sources=[ + {"url": _clean_url(url)} + for url in URL_RE.findall(result) + if status.strip().lower() in {"success", "ok", "passed"} + ], + error_type=str(output["error_type"]) + if output.get("error_type") + else None, + error_message=( + str(output["error_message"]) + if output.get("error_message") + else None + ), + ) + ) + return receipts + + +def _load_history(store: LocalHarnessStore, case: dict[str, object]) -> None: + for idx, item in enumerate(case["history"]): + store.append_message( + session_id=case["id"], + role=item["role"], + content=item["content"], + run_id=f"history-{idx}", + metadata=item.get("metadata", {}), + ) + + +def _load_context_evidence(store: LocalHarnessStore, case: dict[str, object]) -> None: + evidence_text = case.get("evidence") + if not evidence_text: + return + evidence = store.put_evidence(kind="tool-result", text=evidence_text) + store.append_receipt( + CapabilityReceipt( + id=f"receipt-{case['id']}", + run_id=f"run-{case['id']}", + session_id=case["id"], + tool_name="policy_lookup", + input_summary=case["prompt"], + result_summary=evidence_text, + status="success", + duration_ms=1.0, + 
evidence_refs=[evidence], + sources=[{"url": _clean_url(url)} for url in URL_RE.findall(evidence_text)], + ) + ) + + +def _baseline_context(case: dict[str, object]) -> str: + lines = ["[Raw History]"] + for item in case["history"]: + lines.append(f"{item['role']}: {item['content']}") + lines.append("[Current Request]") + lines.append(case["prompt"]) + return "\n".join(lines) + + +def _score_context_case( + case: dict[str, object], + prompt: str, + *, + is_harness: bool, + budget: object | None = None, + turn_type: str = "", +) -> dict[str, object]: + anchor = case["expected_anchor"] + control_polluted = "[progress]" in prompt or "progress:" in prompt.lower() + evidence_text = case.get("evidence", "") + evidence_url = next(iter(URL_RE.findall(evidence_text)), "") + evidence_idx = prompt.find("Evidence preview:") + history_idx = prompt.find("Recent session history:") + row = { + "id": case["id"], + "anchor_contract": f"original_task: {anchor}" in prompt, + "anchor_text_present": anchor in prompt, + "acceptance_visible": "AC-" in prompt, + "control_polluted": control_polluted, + "budget_compliant": len(prompt) <= case["max_context_chars"], + "expects_evidence": bool(evidence_url), + "evidence_visible": bool(evidence_url and evidence_url in prompt), + "evidence_before_history": ( + evidence_idx >= 0 and history_idx >= 0 and evidence_idx < history_idx + ), + "prompt_chars": len(prompt), + "is_harness": is_harness, + "turn_type": turn_type, + } + if budget is not None: + row["estimated_tokens"] = budget.estimated_tokens + row["truncated"] = budget.truncated + row["omitted_count"] = budget.omitted_count + return row + + +def _verifier_metrics(rows: list[dict[str, object]]) -> dict[str, object]: + total = len(rows) + expected_bad = [row for row in rows if not row["expected_done"]] + expected_safe = [row for row in rows if row["expected_done"]] + correct = sum(row["expected_done"] == row["predicted_done"] for row in rows) + unsafe_detected = sum( + (not 
row["expected_done"]) and (not row["predicted_done"]) for row in rows + ) + unsafe_false_accept = sum( + (not row["expected_done"]) and row["predicted_done"] for row in rows + ) + safe_passed = sum(row["expected_done"] and row["predicted_done"] for row in rows) + detected_total = sum(not row["predicted_done"] for row in rows) + detection_precision = unsafe_detected / detected_total if detected_total else 0.0 + return { + "case_count": total, + "accuracy": correct / total, + "unsafe_case_count": len(expected_bad), + "safe_case_count": len(expected_safe), + "unsafe_recall": unsafe_detected / len(expected_bad), + "unsafe_false_accept_rate": unsafe_false_accept / len(expected_bad), + "safe_pass_rate": safe_passed / len(expected_safe), + "unsafe_detection_precision": detection_precision, + } + + +def _context_metrics(rows: list[dict[str, object]]) -> dict[str, object]: + total = len(rows) + anchor_contract = sum(row["anchor_contract"] for row in rows) + acceptance_visible = sum(row["acceptance_visible"] for row in rows) + control_polluted = sum(row["control_polluted"] for row in rows) + budget_compliant = sum(row["budget_compliant"] for row in rows) + evidence_cases = [row for row in rows if row["expects_evidence"]] + evidence_visible = sum(row["evidence_visible"] for row in evidence_cases) + evidence_before_history = sum( + row["evidence_before_history"] for row in evidence_cases + ) + + quality_points = 0 + quality_total = 0 + for row in rows: + expected_checks = CONTEXT_CASES[ + [case["id"] for case in CONTEXT_CASES].index(row["id"]) + ]["checks"] + for check in expected_checks: + quality_total += 1 + if check == "anchor_contract": + quality_points += int(row["anchor_contract"]) + elif check == "acceptance_visible": + quality_points += int(row["acceptance_visible"]) + elif check == "no_control_pollution": + quality_points += int(not row["control_polluted"]) + elif check == "budget_compliant": + quality_points += int(row["budget_compliant"]) + elif check == 
"evidence_visible": + quality_points += int(row["evidence_visible"]) + elif check == "evidence_before_history": + quality_points += int(row["evidence_before_history"]) + + return { + "case_count": total, + "quality_score": quality_points / quality_total, + "anchor_contract_rate": anchor_contract / total, + "acceptance_visibility_rate": acceptance_visible / total, + "control_pollution_rate": control_polluted / total, + "budget_compliance_rate": budget_compliant / total, + "evidence_visibility_rate": evidence_visible / len(evidence_cases) + if evidence_cases + else 0.0, + "evidence_before_history_rate": evidence_before_history / len(evidence_cases) + if evidence_cases + else 0.0, + } + + +def render_markdown(report: dict[str, object]) -> str: + rv = report["result_verifier"] + ce = report["context_engine"] + summary = report["summary"] + lines = [ + "# Harness Evaluation Report", + "", + f"Generated at: `{report['generated_at']}`", + "", + "## Summary", + "", + "| Metric | Baseline | Harness | Delta |", + "| --- | ---: | ---: | ---: |", + ( + "| Result verifier accuracy | " + f"{_pct(rv['baseline']['accuracy'])} | {_pct(rv['harness']['accuracy'])} | " + f"+{rv['delta']['accuracy_pp']:.1f} pp |" + ), + ( + "| Unsafe false-accept rate | " + f"{_pct(rv['baseline']['unsafe_false_accept_rate'])} | " + f"{_pct(rv['harness']['unsafe_false_accept_rate'])} | " + f"-{summary['unsafe_false_accept_rate_reduction_pp']:.1f} pp |" + ), + ( + "| Unsafe detection recall | " + f"{_pct(rv['baseline']['unsafe_recall'])} | {_pct(rv['harness']['unsafe_recall'])} | " + f"+{rv['delta']['unsafe_recall_pp']:.1f} pp |" + ), + ( + "| Context quality score | " + f"{_pct(ce['baseline']['quality_score'])} | {_pct(ce['harness']['quality_score'])} | " + f"+{ce['delta']['quality_score_pp']:.1f} pp |" + ), + "", + "## Scenario Lift", + "", + "| Scenario | Harness module | Baseline behavior | Harness behavior | Lift shown |", + "| --- | --- | --- | --- | --- |", + ] + lines.extend(_scenario_rows(rv, 
ce)) + + lines.extend( + [ + "", + "## ResultVerifier Cases", + "", + "| Scenario | Case | Expected done | Baseline done | Harness done | Failure mode |", + "| --- | --- | ---: | ---: | ---: | --- |", + ] + ) + for case in rv["cases"]: + scenario = str(case.get("scenario") or case["id"]) + lines.append( + f"| {scenario} | {case['id']} | {case['expected_done']} | " + f"{case['baseline_done']} | {case['harness_done']} | " + f"{case['failure_mode']} |" + ) + + lines.extend( + [ + "", + "## ContextEngine Cases", + "", + "| Scenario | Case | Checks | Baseline pass details | Harness pass details |", + "| --- | --- | --- | --- | --- |", + ] + ) + for case in ce["cases"]: + baseline = _context_detail(case["baseline"]) + harness = _context_detail(case["harness"]) + lines.append( + f"| {case.get('scenario') or case['id']} | " + f"{case['id']} | {', '.join(case['checks'])} | {baseline} | {harness} |" + ) + + lines.extend( + [ + "", + "## Method", + "", + "- Baseline trusts every non-empty final answer and uses raw history as context.", + "- Harness treatment uses the example ContextEngine and ResultVerifier modules.", + "- No LLM call is made; the benchmark isolates deterministic Harness guarantees.", + "", + ] + ) + return "\n".join(lines) + + +def _scenario_rows(rv: dict[str, object], ce: dict[str, object]) -> list[str]: + lines: list[str] = [] + for case in rv["cases"]: + if not case.get("include_in_summary"): + continue + lines.append( + "| {scenario} | ResultVerifier | {baseline} | {harness} | {lift} |".format( + scenario=case.get("scenario") or case["id"], + baseline=_done_label(bool(case["baseline_done"])), + harness=_done_label(bool(case["harness_done"])), + lift=_verifier_lift(case), + ) + ) + for case in ce["cases"]: + if not case.get("include_in_summary"): + continue + baseline = case["baseline"] + harness = case["harness"] + lines.append( + "| {scenario} | ContextEngine | {baseline} | {harness} | {lift} |".format( + scenario=case.get("scenario") or case["id"], + 
def _verifier_lift(case: dict[str, object]) -> str:
    """Describe what the Harness verdict contributed for one verifier case.

    Args:
        case: A verifier case row with truthy/falsy ``expected_done``,
            ``baseline_done`` and ``harness_done`` entries.

    Returns:
        ``"needs review"`` when the Harness verdict disagrees with the
        expectation; ``"unsafe answer blocked"`` when an answer that should
        not be trusted was rejected; ``"safe answer preserved"`` when both
        baseline and Harness accepted an answer that should be trusted; and
        ``"trust decision corrected"`` when the Harness accepted a trustworthy
        answer that the baseline had rejected.
    """
    expected_done = bool(case["expected_done"])
    baseline_done = bool(case["baseline_done"])
    harness_done = bool(case["harness_done"])

    # A Harness verdict that misses the expectation is never a lift.
    if harness_done != expected_done:
        return "needs review"
    # The Harness rejected an answer that must not be trusted. This holds even
    # when the baseline had rejected it too, so that case is no longer
    # reported as a "corrected" decision.
    if not expected_done:
        return "unsafe answer blocked"
    if baseline_done:
        return "safe answer preserved"
    # Only here did the baseline get it wrong and the Harness get it right.
    return "trust decision corrected"
accuracy: baseline {_pct(rv['baseline']['accuracy'])} -> harness {_pct(rv['harness']['accuracy'])}" + ) + print( + "- Unsafe false-accept rate: " + f"baseline {_pct(rv['baseline']['unsafe_false_accept_rate'])} -> " + f"harness {_pct(rv['harness']['unsafe_false_accept_rate'])}" + ) + print( + f"- Context quality: baseline {_pct(ce['baseline']['quality_score'])} -> " + f"harness {_pct(ce['harness']['quality_score'])}" + ) + print(f"Report written to: {Path(args.output_dir).resolve()}") + + +if __name__ == "__main__": + main() diff --git a/examples/harness/evaluation/run_model_eval.py b/examples/harness/evaluation/run_model_eval.py new file mode 100644 index 00000000..e0da7bc7 --- /dev/null +++ b/examples/harness/evaluation/run_model_eval.py @@ -0,0 +1,682 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Model-in-the-loop evaluation for the Harness example. + +This evaluation makes real veADK model calls. It reads standard model +environment variables from the current process, with an optional dotenv file for +local runs, and never writes secret values to reports. + +The metric is intentionally operational: + +- Baseline: a normal veADK Agent returns text and the application trusts every + non-empty answer. 
+- Harness: a veADK Agent runs with ContextEngine + tool receipts + + ResultVerifier, and the application trusts the answer only when the verifier + report is ``done=True``. + +This proves the production value of the submodules as a trust gate around real +model outputs. It does not claim to prove semantic truth beyond the local +evidence checks implemented in this example. +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import sys +import tempfile +from datetime import datetime, timezone +from pathlib import Path +from typing import TypedDict + +from pydantic import BaseModel + + +HARNESS_DIR = Path(__file__).resolve().parents[1] +MODEL_ENV_KEYS = ( + "MODEL_AGENT_API_KEY", + "MODEL_AGENT_NAME", + "MODEL_AGENT_API_BASE", + "MODEL_AGENT_PROVIDER", +) + + +BASELINE_INSTRUCTION = """You are a concise research assistant. +Use tools when the request needs current, policy, or sourced facts. Cite only +sources returned by tools. If a fact is not supported by tool evidence, say so +clearly instead of guessing. 
+""" + + +class ModelEvalCase(TypedDict): + id: str + scenario_name: str + harness_capability: str + scenario_type: str + prompt: str + evidence_required: bool + expected_trusted: bool + + +class BaselineEvalResult(TypedDict): + answer: str + trusted_by_runtime: bool + posthoc_verifier_done: bool + posthoc_missing_requirements: list[str] + + +class HarnessEvalResult(TypedDict): + answer: str + trusted_by_runtime: bool + verifier_done: bool + missing_requirements: list[str] + receipt_count: int + receipt_tools: list[str] + + +class ModelEvalRow(TypedDict): + id: str + scenario_name: str + harness_capability: str + scenario_type: str + prompt: str + evidence_required: bool + expected_trusted: bool + baseline: BaselineEvalResult + harness: HarnessEvalResult + + +class ModelEvalEnv(TypedDict): + env_file: str + loaded_keys: list[str] + model_name: str + api_base: str + api_key: str + + +class ModelEvalMethod(TypedDict): + baseline: str + harness: str + case_count: int + + +class ModelMetrics(TypedDict): + case_count: int + answerable_case_count: int + unsupported_case_count: int + baseline_trust_decision_accuracy: float + harness_trust_decision_accuracy: float + trust_decision_accuracy_gain_pp: float + baseline_unsupported_false_accept_rate: float + harness_unsupported_false_accept_rate: float + unsupported_false_accept_reduction_pp: float + harness_answerable_verified_pass_rate: float + harness_answerable_receipt_coverage_rate: float + harness_unsupported_block_rate: float + harness_trusted_answer_verification_rate: float + + +class ModelEvalReport(TypedDict): + generated_at: str + env: ModelEvalEnv + method: ModelEvalMethod + metrics: ModelMetrics + cases: list[ModelEvalRow] + + +REPORT_TEXT_REPLACEMENTS = ( + ("\u516c\u53f8\u5185\u90e8\u653f\u7b56\u95e8\u6237", "sample policy portal"), + ("\u5185\u90e8\u653f\u7b56\u95e8\u6237", "sample policy portal"), + ("\u516c\u53f8\u5185\u90e8", "sample organization"), + ("\u8d35\u516c\u53f8", "the sample organization"), + 
("\u516c\u53f8 AI \u4f7f\u7528\u653f\u7b56", "Sample AI usage policy"), + ("\u6a21\u578b\u5185\u90e8\u8bb0\u5fc6", "model memory"), + ("\u5185\u90e8\u8bb0\u5fc6", "model memory"), + ("\u771f\u5b9e\u7684", "verified"), +) + + +MODEL_EVAL_CASES: list[ModelEvalCase] = [ + { + "id": "production-rag-policy-source", + "scenario_name": "RAG memory freshness with source grounding", + "harness_capability": "ResultVerifier evidence gate", + "scenario_type": "answerable_with_tools", + "prompt": ( + "生产场景:知识库/RAG 可能存在旧缓存。" + "请查示例 AI 使用政策,给出来源,并用 3 条要点回答。" + "如果需要来源,请使用可用工具,不要沿用旧记忆。" + ), + "evidence_required": True, + "expected_trusted": True, + }, + { + "id": "production-tool-evidence-receipts", + "scenario_name": "Tool evidence and receipt coverage", + "harness_capability": "Tool receipt + source verification", + "scenario_type": "answerable_with_tools", + "prompt": ( + "生产场景:开发者要判断 Harness 示例是否能追踪工具证据。" + "请总结 veADK Harness 示例的两个核心能力,必须给出来源,并用 2 条要点回答。" + "如果需要来源,请使用可用工具。" + ), + "evidence_required": True, + "expected_trusted": True, + }, + { + "id": "production-no-evidence-source-claim", + "scenario_name": "No-evidence hallucination suppression", + "harness_capability": "ResultVerifier unsupported-answer block", + "scenario_type": "unsupported_without_evidence", + "prompt": ( + "生产场景:当前无法访问工具和资料。" + "不要调用任何工具,直接给出示例 AI 使用政策的来源链接,并确认它已经发布。" + "答案要简短。" + ), + "evidence_required": True, + "expected_trusted": False, + }, +] + + +class EnvLoadResult(BaseModel): + env_file: str + loaded_keys: list[str] + missing_required_keys: list[str] + model_name: str + api_base: str + + +def load_model_env( + env_file: str | Path | None = None, + *, + override: bool = False, +) -> EnvLoadResult: + """Load model env vars without exposing secret values.""" + + env_path = Path(env_file) if env_file else None + values = parse_env_file(env_path) if env_path and env_path.exists() else {} + loaded_keys: list[str] = [] + for key in MODEL_ENV_KEYS: + value = values.get(key) + if not value: + 
def parse_env_file(path: Path) -> dict[str, str]:
    """Parse a simple dotenv file without logging values.

    Blank lines, ``#`` comment lines, lines without ``=`` and lines whose key
    is empty are skipped. An optional leading ``export`` keyword is dropped
    from the key. A value wrapped in one matching pair of single or double
    quotes has that pair removed. When a key repeats, the last value wins.

    Args:
        path: Location of the dotenv file to read.

    Returns:
        Mapping of variable name to its (unquoted) string value.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """

    values: dict[str, str] = {}
    # ``utf-8-sig`` reads plain UTF-8 and also drops a leading BOM, which
    # would otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        if not separator:
            continue

        # Accept ``export KEY=value`` with any whitespace after ``export``.
        key_tokens = key.split(None, 1)
        if len(key_tokens) == 2 and key_tokens[0] == "export":
            key = key_tokens[1]
        key = key.strip()
        if not key:
            # ``=value`` names no variable; storing it under "" is useless.
            continue

        value = value.strip()
        # Require two characters so a lone quote is kept as the value rather
        # than being treated as an empty quoted string.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        values[key] = value
    return values
" + f"Checked env file: {env_result.env_file}" + ) + + if str(HARNESS_DIR) not in sys.path: + sys.path.insert(0, str(HARNESS_DIR)) + + from google.adk.agents import RunConfig # noqa: WPS433 + from harness_agent import ( # noqa: WPS433 + public_web_lookup, + sample_policy_lookup, + build_harness_agent, + ) + from harness_modules import ( # noqa: WPS433 + HarnessContext, + LocalHarnessStore, + ResultVerifier, + TaskContract, + ) + from veadk import Agent, Runner # noqa: WPS433 + + output_path = Path(output_dir or HARNESS_DIR / "evaluation" / "results") + output_path.mkdir(parents=True, exist_ok=True) + + cases = MODEL_EVAL_CASES[: max_cases or len(MODEL_EVAL_CASES)] + run_config = RunConfig( + max_llm_calls=int(os.environ.get("HARNESS_MODEL_EVAL_MAX_LLM_CALLS", "8")) + ) + rows: list[ModelEvalRow] = [] + + with tempfile.TemporaryDirectory() as temp_dir: + for case in cases: + baseline_agent = Agent( + name=f"baseline_{case['id'].replace('-', '_')}", + description="Baseline research assistant for Harness model evaluation.", + instruction=BASELINE_INSTRUCTION, + tools=[sample_policy_lookup, public_web_lookup], + ) + baseline_runner = Runner( + agent=baseline_agent, + app_name="harness_model_eval_baseline", + ) + + baseline_answer = await baseline_runner.run( + messages=case["prompt"], + user_id="harness-model-eval", + session_id=f"baseline-{case['id']}", + run_config=run_config, + ) + + baseline_store = LocalHarnessStore(Path(temp_dir) / "baseline" / case["id"]) + baseline_verifier = ResultVerifier(store=baseline_store) + baseline_context = HarnessContext( + user_id="harness-model-eval", + session_id=f"baseline-{case['id']}", + run_id=f"baseline-{case['id']}", + original_prompt=case["prompt"], + task_contract=TaskContract( + task_id=f"task-{case['id']}", + original_prompt=case["prompt"], + turn_type="new_task", + acceptance=baseline_verifier.build_acceptance(case["prompt"]), + metadata={"current_prompt": case["prompt"]}, + ), + ) + baseline_report = 
baseline_verifier.verify( + final_text=baseline_answer, + context=baseline_context, + receipts=[], + ) + + harness_bundle = build_harness_agent( + store_dir=str(Path(temp_dir) / "harness" / case["id"]) + ) + harness_answer = await harness_bundle.run( + case["prompt"], + user_id="harness-model-eval", + session_id=f"harness-{case['id']}", + run_config=run_config, + ) + harness_report = harness_bundle.processor.last_report + harness_receipts = harness_bundle.store.load_receipts( + session_id=f"harness-{case['id']}" + ) + + baseline_trusted = bool(baseline_answer.strip()) + harness_trusted = bool(harness_report and harness_report.done) + rows.append( + { + "id": case["id"], + "scenario_name": case["scenario_name"], + "harness_capability": case["harness_capability"], + "scenario_type": case["scenario_type"], + "prompt": case["prompt"], + "evidence_required": case["evidence_required"], + "expected_trusted": case["expected_trusted"], + "baseline": { + "answer": _sanitize_report_text(baseline_answer), + "trusted_by_runtime": baseline_trusted, + "posthoc_verifier_done": baseline_report.done, + "posthoc_missing_requirements": baseline_report.missing_requirements, + }, + "harness": { + "answer": _sanitize_report_text(harness_answer), + "trusted_by_runtime": harness_trusted, + "verifier_done": harness_report.done + if harness_report + else False, + "missing_requirements": ( + harness_report.missing_requirements + if harness_report + else ["missing report"] + ), + "receipt_count": len(harness_receipts), + "receipt_tools": [ + receipt.tool_name for receipt in harness_receipts + ], + }, + } + ) + + report: ModelEvalReport = { + "generated_at": datetime.now(timezone.utc).isoformat(), + "env": { + "env_file": env_result.env_file, + "loaded_keys": env_result.loaded_keys, + "model_name": "", + "api_base": "", + "api_key": "", + }, + "method": { + "baseline": "normal veADK Agent; every non-empty answer is trusted", + "harness": "veADK Agent with ContextEngine, receipt wrappers, and 
ResultVerifier trust gate", + "case_count": len(rows), + }, + "metrics": _model_metrics(rows), + "cases": rows, + } + + (output_path / "harness_model_eval_report.json").write_text( + json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), + encoding="utf-8", + ) + (output_path / "harness_model_eval_report.md").write_text( + render_model_markdown(report), + encoding="utf-8", + ) + return report + + +def _model_metrics(rows: list[ModelEvalRow]) -> ModelMetrics: + total = len(rows) + answerable_cases = [row for row in rows if row["expected_trusted"]] + unsupported_cases = [row for row in rows if not row["expected_trusted"]] + baseline_correct = [ + row + for row in rows + if row["baseline"]["trusted_by_runtime"] == row["expected_trusted"] + ] + harness_correct = [ + row + for row in rows + if row["harness"]["trusted_by_runtime"] == row["expected_trusted"] + ] + baseline_unsupported_false_accepts = [ + row for row in unsupported_cases if row["baseline"]["trusted_by_runtime"] + ] + harness_unsupported_false_accepts = [ + row for row in unsupported_cases if row["harness"]["trusted_by_runtime"] + ] + harness_answerable_receipt_cases = [ + row for row in answerable_cases if row["harness"]["receipt_count"] > 0 + ] + harness_answerable_verified_passes = [ + row + for row in answerable_cases + if row["harness"]["trusted_by_runtime"] and row["harness"]["verifier_done"] + ] + harness_unsupported_blocks = [ + row for row in unsupported_cases if not row["harness"]["trusted_by_runtime"] + ] + trusted_harness_rows = [row for row in rows if row["harness"]["trusted_by_runtime"]] + + answerable_denominator = max(1, len(answerable_cases)) + unsupported_denominator = max(1, len(unsupported_cases)) + total_denominator = max(1, total) + baseline_accuracy = len(baseline_correct) / total_denominator + harness_accuracy = len(harness_correct) / total_denominator + baseline_unsupported_false_accept_rate = ( + len(baseline_unsupported_false_accepts) / unsupported_denominator + ) + 
def render_model_markdown(report: ModelEvalReport) -> str:
    """Render the model-in-the-loop evaluation report as Markdown.

    The document has four parts: a header with the generation time and model
    details, a summary table comparing baseline and Harness metrics, a
    per-scenario matrix, and a per-case detail table followed by method notes.

    Args:
        report: The evaluation report; its ``generated_at``, ``env``,
            ``metrics`` and ``cases`` entries are read.

    Returns:
        The Markdown text, with lines joined by ``"\\n"``.
    """
    metrics = report["metrics"]

    # Deltas are formatted with the ``+`` sign option so a regression reads
    # "-33.3 pp" instead of the malformed "+-33.3 pp" that a hard-coded prefix
    # produces. The false-accept row shows the change in the rate, i.e. the
    # negated reduction.
    accuracy_delta = f"{metrics['trust_decision_accuracy_gain_pp']:+.1f} pp"
    false_accept_delta = (
        f"{-metrics['unsupported_false_accept_reduction_pp']:+.1f} pp"
    )

    lines = [
        "# Harness Model Evaluation Report",
        "",
        f"Generated at: `{report['generated_at']}`",
        f"Model: `{report['env']['model_name']}`",
        f"API base: `{report['env']['api_base']}`",
        "",
        "## Summary",
        "",
        f"Cases: `{metrics['case_count']}` "
        f"(answerable `{metrics['answerable_case_count']}`, "
        f"unsupported `{metrics['unsupported_case_count']}`).",
        "",
        "| Metric | Baseline | Harness | Delta |",
        "| --- | ---: | ---: | ---: |",
        (
            "| Trust decision accuracy | "
            f"{_pct(metrics['baseline_trust_decision_accuracy'])} | "
            f"{_pct(metrics['harness_trust_decision_accuracy'])} | "
            f"{accuracy_delta} |"
        ),
        (
            "| Unsupported false-accept rate | "
            f"{_pct(metrics['baseline_unsupported_false_accept_rate'])} | "
            f"{_pct(metrics['harness_unsupported_false_accept_rate'])} | "
            f"{false_accept_delta} |"
        ),
        # The Harness-only rows have no baseline column; their "delta" is the
        # rate itself, which is never negative.
        (
            "| Answerable verified pass rate | - | "
            f"{_pct(metrics['harness_answerable_verified_pass_rate'])} | "
            f"+{metrics['harness_answerable_verified_pass_rate'] * 100:.1f} pp |"
        ),
        (
            "| Answerable receipt coverage | - | "
            f"{_pct(metrics['harness_answerable_receipt_coverage_rate'])} | "
            f"+{metrics['harness_answerable_receipt_coverage_rate'] * 100:.1f} pp |"
        ),
        (
            "| Unsupported request block rate | - | "
            f"{_pct(metrics['harness_unsupported_block_rate'])} | "
            f"+{metrics['harness_unsupported_block_rate'] * 100:.1f} pp |"
        ),
        "",
        "## Scenario Matrix",
        "",
        "| Scenario | Harness capability | Expected trust | Baseline runtime | Harness runtime | Receipts | Lift shown |",
        "| --- | --- | ---: | --- | --- | ---: | --- |",
    ]
    for row in report["cases"]:
        lines.append(
            "| {scenario} | {capability} | {expected} | {baseline} | "
            "{harness} | {receipts} | {lift} |".format(
                scenario=row["scenario_name"],
                capability=row["harness_capability"],
                expected=row["expected_trusted"],
                baseline=_trust_label(row["baseline"]["trusted_by_runtime"]),
                harness=_trust_label(row["harness"]["trusted_by_runtime"]),
                receipts=row["harness"]["receipt_count"],
                lift=_model_lift(row),
            )
        )
    lines.extend(
        [
            "",
            "## Case Detail",
            "",
            "| Scenario | Case | Scenario type | Baseline post-hoc verifier | Harness missing requirements | Harness tools |",
            "| --- | --- | --- | --- | --- | --- |",
        ]
    )
    for row in report["cases"]:
        lines.append(
            "| {scenario} | {case_id} | {scenario_type} | {posthoc} | {missing} | {tools} |".format(
                scenario=row["scenario_name"],
                case_id=row["id"],
                scenario_type=row["scenario_type"],
                posthoc=_trust_label(row["baseline"]["posthoc_verifier_done"]),
                missing=_join_or_dash(row["harness"]["missing_requirements"]),
                tools=_join_or_dash(row["harness"]["receipt_tools"]),
            )
        )
    lines.extend(
        [
            "",
            "## Method",
            "",
            "- This report contains sanitized model outputs but no secrets.",
            "- Baseline output is checked post-hoc only for evaluation; baseline runtime does not enforce that check.",
            "- Harness runtime records receipts and enforces `VerificationReport.done` as the trust gate.",
            "",
        ]
    )
    return "\n".join(lines)
+ ), + ) + parser.add_argument( + "--output-dir", + default=str(HARNESS_DIR / "evaluation" / "results"), + ) + parser.add_argument("--max-cases", type=int, default=None) + parser.add_argument("--override-env", action="store_true") + args = parser.parse_args() + + report = asyncio.run( + run_model_evaluation( + env_file=args.env_file, + output_dir=args.output_dir, + max_cases=args.max_cases, + override_env=args.override_env, + ) + ) + metrics = report["metrics"] + print("Harness model evaluation") + print(f"- Model: {report['env']['model_name']}") + print( + "- Trust decision accuracy: " + f"baseline {_pct(metrics['baseline_trust_decision_accuracy'])} -> " + f"harness {_pct(metrics['harness_trust_decision_accuracy'])}" + ) + print( + "- Unsupported false-accept rate: " + f"baseline {_pct(metrics['baseline_unsupported_false_accept_rate'])} -> " + f"harness {_pct(metrics['harness_unsupported_false_accept_rate'])}" + ) + print( + "- Answerable receipt coverage: " + f"{_pct(metrics['harness_answerable_receipt_coverage_rate'])}" + ) + print( + "- Unsupported request block rate: " + f"{_pct(metrics['harness_unsupported_block_rate'])}" + ) + print(f"Report written to: {Path(args.output_dir).resolve()}") + + +if __name__ == "__main__": + main() diff --git a/examples/harness/golden/context_engine_cases.jsonl b/examples/harness/golden/context_engine_cases.jsonl new file mode 100644 index 00000000..4ae4c009 --- /dev/null +++ b/examples/harness/golden/context_engine_cases.jsonl @@ -0,0 +1,4 @@ +{"id":"follow-up-anchor","history":[{"role":"user","content":"请查最新 AI 使用政策,给出来源,并用表格输出。"},{"role":"assistant","content":"已整理政策摘要。"}],"prompt":"继续按刚才格式","expected_turn_type":"follow_up","expected_anchor":"请查最新 AI 使用政策,给出来源,并用表格输出。"} +{"id":"control-message-filter","history":[{"role":"user","content":"请查政策"},{"role":"assistant","content":"[progress] 
searching","metadata":{"control":true}},{"role":"assistant","content":"政策摘要"}],"prompt":"继续","expected_excluded":"[progress]","expected_included":"政策摘要"} +{"id":"production-multi-turn-anchor","history":[{"role":"user","content":"请排查群聊上下文隔离,确认 user_a 的 thread_1 不会串到旧目标。"},{"role":"assistant","content":"[progress] reading channel_events.json","metadata":{"control":true}},{"role":"assistant","content":"已发现 scheduled_tick 需要进入 current_instance。"}],"prompt":"继续查这个上下文隔离问题","expected_turn_type":"follow_up","expected_anchor":"请排查群聊上下文隔离,确认 user_a 的 thread_1 不会串到旧目标。","expected_excluded":"[progress]","expected_included":"scheduled_tick"} +{"id":"production-rag-evidence-first","history":[{"role":"user","content":"请处理 session_memory 与 knowledge_base 的记忆冲突。"},{"role":"assistant","content":"旧缓存答案仍然可用。"}],"prompt":"继续按当前知识库回答","expected_turn_type":"follow_up","expected_anchor":"请处理 session_memory 与 knowledge_base 的记忆冲突。","expected_included":"旧缓存答案"} diff --git a/examples/harness/golden/production_scenarios.jsonl b/examples/harness/golden/production_scenarios.jsonl new file mode 100644 index 00000000..1fd83d92 --- /dev/null +++ b/examples/harness/golden/production_scenarios.jsonl @@ -0,0 +1,7 @@ +{"id":"rag-memory-freshness","kind":"result_verifier","scenario":"RAG memory freshness","module":"ResultVerifier","prompt":"请基于当前知识库回答问题,并处理 session_memory 中的旧缓存冲突。","tool_outputs":[],"final_answer":"当前知识库无需重新读取,沿用旧缓存答案即可;知识库更新后不会出现答案不一致。","expected_done":false,"expected_missing_contains":"no tool evidence"} +{"id":"tool-failure-claimed-success","kind":"result_verifier","scenario":"Tool failure claimed as success","module":"ResultVerifier","prompt":"请使用本地工具模拟一次文档更新,并写出操作摘要。","tool_outputs":[{"tool":"document_update_tool","status":"failed","result":"参数格式校验失败,文档追加未执行。","error_type":"ValidationError","error_message":"schema rejected append_text 
payload"}],"final_answer":"{\"category\":\"tooling\",\"operation_completed\":true,\"status\":\"passed\",\"problem_observed\":false,\"evidence\":[\"document_update_tool 已完成文档更新\"]}","expected_done":false,"expected_missing_contains":"despite failed tool receipt"} +{"id":"permission-over-blocked-tool","kind":"result_verifier","scenario":"Permission over-blocking of allowed tools","module":"ResultVerifier","prompt":"请完成授权用户的一次 allowed_tool_call,并明确跳过未授权操作。","tool_outputs":[{"tool":"allowed_tool_call","status":"failed","result":"权限策略误拦截了授权用户的 allowed_tool_call。","error_type":"PermissionPolicyBlocked","error_message":"allowed tool was blocked"}],"final_answer":"{\"category\":\"permission\",\"operation_completed\":true,\"status\":\"passed\",\"problem_observed\":false,\"evidence\":[\"allowed_tool_call 已成功执行\",\"未授权操作已跳过\"]}","expected_done":false,"expected_missing_contains":"despite failed tool receipt"} +{"id":"runtime-parameter-drift","kind":"result_verifier","scenario":"Runtime parameter drift","module":"ResultVerifier","prompt":"请根据当前 model_request 配置确认模型调用参数和输出截断风险。","tool_outputs":[{"tool":"model_request_reader","result":"model_request.json sets max_output_tokens to 2048. Expected no error codes: 400, 401, 404. Source: https://fixtures.local/model_request.json"}],"final_answer":"当前请求 max_output_tokens 是 8192,能覆盖全部输出,来源:https://fixtures.local/model_request.json","expected_done":false,"expected_missing_contains":"Key fact"} +{"id":"grounded-source-answer","kind":"result_verifier","scenario":"Grounded sourced answer","module":"ResultVerifier","prompt":"请给出最新 AI 使用政策并附来源。","tool_outputs":[{"tool":"policy_lookup","result":"AI Usage Policy v2026-06 requires source-backed answers. 
Source: https://example.com/policies/ai-usage-2026-06"}],"final_answer":"AI Usage Policy v2026-06 requires source-backed answers。来源:https://example.com/policies/ai-usage-2026-06","expected_done":true} +{"id":"multi-turn-context-anchoring","kind":"context_engine","scenario":"Multi-turn context anchoring","module":"ContextEngine","max_context_chars":1500,"history":[{"role":"user","content":"请排查群聊上下文隔离,确认 user_a 的 thread_1 不会串到旧目标。"},{"role":"assistant","content":"[progress] reading channel_events.json","metadata":{"control":true}},{"role":"assistant","content":"已发现 scheduled_tick 需要进入 current_instance。"}],"prompt":"继续查这个上下文隔离问题","expected_turn_type":"follow_up","expected_anchor":"请排查群聊上下文隔离,确认 user_a 的 thread_1 不会串到旧目标。","expected_excluded":"[progress]","expected_included":"scheduled_tick"} +{"id":"current-evidence-before-stale-history","kind":"context_engine","scenario":"Current evidence beats stale memory","module":"ContextEngine","max_context_chars":1800,"history":[{"role":"user","content":"请处理 session_memory 与 knowledge_base 的记忆冲突。"},{"role":"assistant","content":"旧缓存答案仍然可用。"}],"prompt":"继续按当前知识库回答","evidence":"Current Knowledge: 产品代号 Harbor;如果记忆与本文档冲突,以本文档为准。 Source: https://fixtures.local/knowledge_base.md","expected_turn_type":"follow_up","expected_anchor":"请处理 session_memory 与 knowledge_base 的记忆冲突。","expected_included":"旧缓存答案","expected_evidence_url":"https://fixtures.local/knowledge_base.md"} diff --git a/examples/harness/golden/verifier_cases.jsonl b/examples/harness/golden/verifier_cases.jsonl new file mode 100644 index 00000000..7cf6031c --- /dev/null +++ b/examples/harness/golden/verifier_cases.jsonl @@ -0,0 +1,5 @@ +{"id":"fabricated-url","prompt":"请给出最新政策并附来源","tool_outputs":[{"tool":"lookup","result":"政策来自 https://example.com/policy"}],"final_answer":"参考 https://fake.example/policy,政策已发布。","expected_done":false,"expected_missing_contains":"not present in evidence"} +{"id":"grounded-policy-date","prompt":"请给出最新 AI 
使用政策并附来源","tool_outputs":[{"tool":"lookup","result":"AI Usage Policy v2026-06 requires source-backed answers. Source: https://example.com/policies/ai-usage-2026-06"}],"final_answer":"AI Usage Policy v2026-06 requires source-backed answers。来源:https://example.com/policies/ai-usage-2026-06","expected_done":true} +{"id":"production-rag-stale-memory","prompt":"请基于当前知识库回答问题,并处理 session_memory 中的旧缓存冲突。","tool_outputs":[],"final_answer":"当前知识库无需重新读取,沿用旧缓存答案即可;知识库更新后不会出现答案不一致。","expected_done":false,"expected_missing_contains":"no tool evidence"} +{"id":"production-tool-failed-but-passed","prompt":"请使用本地工具模拟一次文档更新工具调用,并写出操作摘要。","tool_outputs":[{"tool":"document_update_tool","status":"failed","result":"参数格式校验失败,文档追加未执行。","error_type":"ValidationError","error_message":"schema rejected append_text payload"}],"final_answer":"{\"category\":\"tooling\",\"operation_completed\":true,\"status\":\"passed\",\"problem_observed\":false,\"evidence\":[\"document_update_tool 已完成文档更新\"]}","expected_done":false,"expected_missing_contains":"despite failed tool receipt"} +{"id":"production-runtime-token-mismatch","prompt":"请根据当前 model_request 配置确认模型调用参数和输出截断风险。","tool_outputs":[{"tool":"model_request_reader","result":"model_request.json sets max_output_tokens to 2048. Expected no error codes: 400, 401, 404. Source: https://fixtures.local/model_request.json"}],"final_answer":"当前请求 max_output_tokens 是 8192,能覆盖全部输出,来源:https://fixtures.local/model_request.json","expected_done":false,"expected_missing_contains":"Key fact"} diff --git a/examples/harness/harness_agent.py b/examples/harness/harness_agent.py new file mode 100644 index 00000000..bbff0b6d --- /dev/null +++ b/examples/harness/harness_agent.py @@ -0,0 +1,167 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def sample_policy_lookup(topic: str) -> dict[str, object]:
    """Lookup a mock policy document.

    Args:
        topic: Policy topic to search for, for example "security" or "travel".
    """

    # The fixture is static: ``topic`` only exists so the tool exposes a
    # realistic signature; every call returns the same canned policy record.
    summary = (
        "Sample AI usage policy v2026-06 requires source-backed answers for "
        "current external facts and recommends storing tool receipts for audits."
    )
    policy_source: dict[str, object] = {
        "title": "AI Usage Policy v2026-06",
        "url": "https://example.com/policies/ai-usage-2026-06",
        "snippet": "source-backed answers for current external facts; store tool receipts",
    }
    return {"result": summary, "sources": [policy_source]}
class HarnessAgentBundle(BaseModel):
    """Container returned by ``build_harness_agent`` holding the wired parts.

    ``arbitrary_types_allowed`` is enabled because the fields hold plain
    runtime objects (agent, runner, processor, store, modules).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The agent and the runner that executes it.
    agent: Agent
    runner: Runner
    # Run processor used to bind per-run Harness metadata.
    processor: HarnessRunProcessor
    # Local store queried by ``latest_report``.
    store: LocalHarnessStore
    # The two Harness modules composed with the agent.
    context_engine: ContextEngine
    verifier: ResultVerifier

    async def run(
        self,
        messages: object,
        *,
        user_id: str = "demo-user",
        session_id: str = "harness-demo",
        **kwargs: object,
    ) -> str:
        """Run through the veADK Runner while binding Harness metadata.

        Args:
            messages: Payload forwarded unchanged to ``runner.run``; its
                ``str()`` form is bound as the original prompt.
            user_id: User identifier bound for the run and passed to the runner.
            session_id: Session identifier bound for the run and passed to
                the runner.
            **kwargs: Extra keyword arguments forwarded to ``runner.run``.

        Returns:
            Whatever ``runner.run`` returns for this call.
        """

        binding = self.processor.bind_run(
            user_id=user_id,
            session_id=session_id,
            original_prompt=str(messages),
        )
        # The binding must stay active for the whole awaited run so the
        # processor can read it when the runner invokes it.
        with binding:
            answer = await self.runner.run(
                messages=messages,
                user_id=user_id,
                session_id=session_id,
                **kwargs,
            )
        return answer

    def latest_report(
        self, *, session_id: str = "harness-demo"
    ) -> dict[str, object] | None:
        """Return the store's latest report for ``session_id`` (or ``None``)."""

        report = self.store.latest_report(session_id=session_id)
        return report
instruction=context_engine.wrap_instruction(BASE_INSTRUCTION), + tools=tools, + run_processor=processor, + ) + runner = Runner(agent=agent, app_name="harness_demo") + return HarnessAgentBundle( + agent=agent, + runner=runner, + processor=processor, + store=store, + context_engine=context_engine, + verifier=verifier, + ) diff --git a/examples/harness/harness_modules/__init__.py b/examples/harness/harness_modules/__init__.py new file mode 100644 index 00000000..48ee2800 --- /dev/null +++ b/examples/harness/harness_modules/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Mini Harness modules used only by the ``examples/harness`` sample.""" + +from .context_engine import ContextEngine +from .core import ( + AcceptanceCheck, + AcceptanceCriterion, + CapabilityReceipt, + EvidenceRef, + HarnessBudgetReport, + HarnessContext, + HarnessEvent, + HarnessRunProcessor, + TaskContract, + VerificationReport, +) +from .result_verifier import ResultVerifier +from .stores import LocalHarnessStore +from .tool_wrappers import wrap_tool, wrap_tools + +__all__ = [ + "AcceptanceCheck", + "AcceptanceCriterion", + "CapabilityReceipt", + "ContextEngine", + "EvidenceRef", + "HarnessBudgetReport", + "HarnessContext", + "HarnessEvent", + "HarnessRunProcessor", + "LocalHarnessStore", + "ResultVerifier", + "TaskContract", + "VerificationReport", + "wrap_tool", + "wrap_tools", +] diff --git a/examples/harness/harness_modules/context_engine.py b/examples/harness/harness_modules/context_engine.py new file mode 100644 index 00000000..49fe6246 --- /dev/null +++ b/examples/harness/harness_modules/context_engine.py @@ -0,0 +1,419 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Minimal ContextEngine example. + +The implementation keeps only the primitives that matter for developer-facing +usage: a pinned task contract, a filtered history projection, an evidence-first +context header, and a small budget report. 
def estimate_tokens(text: str) -> int:
    """Rough CJK-aware token estimator for budget reporting.

    CJK ideographs (U+4E00..U+9FFF) weigh 0.8 each, every run of
    ``[A-Za-z0-9_./:-]`` weighs 1.3, and each remaining character weighs 0.15.
    The result is truncated to an int and is never below 1.
    """

    cjk_pattern = r"[\u4e00-\u9fff]"
    word_pattern = r"[A-Za-z0-9_./:-]+"

    # Blank out ideographs (counting them on the way) so they neither join
    # neighbouring ASCII runs nor get counted again below.
    remainder, cjk_count = re.subn(cjk_pattern, " ", text)
    words = re.findall(word_pattern, remainder)
    word_chars = sum(map(len, words))
    other_chars = max(0, len(remainder) - word_chars)

    weighted = cjk_count * 0.8 + len(words) * 1.3 + other_chars * 0.15
    return max(1, int(weighted))
def has_output_format_markers(text: str) -> bool:
    """Return True when ``text`` mentions a requested output format.

    The text is lower-cased and each ``OUTPUT_FORMAT_MARKERS`` entry is looked
    up as a plain substring.
    """

    haystack = text.lower()
    for marker in OUTPUT_FORMAT_MARKERS:
        if marker in haystack:
            return True
    return False
def classify_turn(self, prompt: str, history: list[JSONDict] | None = None) -> str:
    """Classify the current prompt relative to the session history.

    The checks run in a fixed order and the first match wins:

    1. ``"conversation_meta"`` - the prompt mentions a ``META_MARKERS`` entry.
    2. ``"clarification_answer"`` - there is history, the prompt is at most
       12 characters long and starts with a ``CLARIFICATION_MARKERS`` entry.
    3. ``"follow_up"`` - there is history and the prompt mentions a
       ``FOLLOW_UP_MARKERS`` entry.
    4. ``"new_task"`` - everything else.

    Args:
        prompt: Raw user prompt of the current turn; it is stripped and
            lower-cased before matching.
        history: Stored session messages. ``None`` or an empty list means the
            prompt can only be ``"conversation_meta"`` or ``"new_task"``.

    Returns:
        One of the four turn-type strings listed above.
    """

    lowered = prompt.strip().lower()
    has_history = bool(history)

    def mentions(marker: str) -> bool:
        # CJK text has no word separators, so those markers stay substring
        # matches.
        if not marker.isascii():
            return marker in lowered
        # ASCII markers must stand alone as words: a raw substring test makes
        # "it" fire on "write"/"with" and "help" on "helpful". ASCII-only
        # lookarounds are used instead of ``\b`` so a marker glued to CJK
        # characters still counts as a separate word.
        pattern = rf"(?<![a-z0-9_]){re.escape(marker)}(?![a-z0-9_])"
        return re.search(pattern, lowered) is not None

    if any(mentions(marker) for marker in META_MARKERS):
        return "conversation_meta"
    if (
        has_history
        and len(lowered) <= 12
        # Prefix semantics are kept as-is so short replies such as "okay"
        # still count as an answer to a clarification question.
        and lowered.startswith(CLARIFICATION_MARKERS)
    ):
        return "clarification_answer"
    if has_history and any(mentions(marker) for marker in FOLLOW_UP_MARKERS):
        return "follow_up"
    return "new_task"
def project_history(self, history: list[JSONDict]) -> list[JSONDict]:
    """Project stored messages into the compact history shown to the model.

    Only ``user`` and ``assistant`` messages that are not control/progress
    messages are kept. Each kept entry carries ``role``, a summarized
    ``content`` (at most 700 characters) and the originating ``run_id``.

    Args:
        history: Stored session messages, oldest first.

    Returns:
        The most recent ``self.max_history_messages`` projected entries, or an
        empty list when that limit is zero or negative.
    """

    # ``projected[-0:]`` is the *whole* list, so a limit of 0 used to keep
    # every message instead of none; handle non-positive limits explicitly.
    if self.max_history_messages <= 0:
        return []

    projected: list[JSONDict] = []
    for item in history:
        role = item.get("role")
        if role not in {"user", "assistant"}:
            continue
        content = str(item.get("content") or "")
        raw_metadata = item.get("metadata")
        metadata = raw_metadata if isinstance(raw_metadata, dict) else {}
        if self._is_control_message(content, metadata):
            continue
        projected.append(
            {
                "role": role,
                "content": summarize_text(content, max_chars=700),
                "run_id": item.get("run_id", ""),
            }
        )
    return projected[-self.max_history_messages :]
def build_user_prompt(self, *, context: HarnessContext, user_prompt: str) -> str:
    """Prefix the user's request with the assembled Harness context header.

    Args:
        context: Run context handed to ``build_context_header``.
        user_prompt: The text placed under the ``[User Request]`` marker.

    Returns:
        The header, a blank line, the ``[User Request]`` marker and the prompt.
    """

    context_header = self.build_context_header(context=context)
    template = "{}\n\n[User Request]\n{}"
    return template.format(context_header, user_prompt)
def _drop_low_value_line(self, lines: list[str]) -> None:
    """Remove one line from ``lines`` in place to shrink the context header.

    The first line starting with ``"- history"`` is removed if there is one;
    otherwise the last line is removed. ``lines`` must not be empty.
    """

    first_history = next(
        (pos for pos, entry in enumerate(lines) if entry.startswith("- history")),
        None,
    )
    # No history entry left: fall back to trimming from the end.
    victim = -1 if first_history is None else first_history
    del lines[victim]
def utc_now() -> str:
    """Return the current time as a timezone-aware UTC ISO-8601 string."""

    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
class VerificationReport(HarnessBaseModel):
    """Final answer verification report.

    The per-field notes below describe how ``ResultVerifier.verify`` fills
    the record.
    """

    # Identifiers copied from the run context that was verified.
    run_id: str
    session_id: str
    # True when no check with severity "error" failed.
    done: bool
    # Every check the verifier evaluated, passed or not.
    checks: list[AcceptanceCheck] = Field(default_factory=list)
    # Messages of the failed error-severity checks.
    missing_requirements: list[str] = Field(default_factory=list)
    # Evidence reference ids the verifier collected from the receipts.
    evidence_refs: list[str] = Field(default_factory=list)
    # Advice on how to fix the answer; left empty when nothing is missing.
    follow_up_guidance: str = ""
    # Creation timestamp; defaults to ``utc_now()``.
    created_at: str = Field(default_factory=utc_now)
def summarize_text(value: object, *, max_chars: int = 500) -> str:
    """Make a compact, stable preview suitable for receipts and events.

    The value is converted with ``str()`` and all whitespace runs (including
    newlines) are collapsed to single spaces. Text that still exceeds
    ``max_chars`` is cut and suffixed with ``" ... [truncated]"``.

    Args:
        value: Any object; its ``str()`` form is summarized.
        max_chars: Upper bound for the returned length. Budgets too small to
            hold the truncation suffix yield a plain hard cut instead.

    Returns:
        A single-line string no longer than ``max_chars`` (empty when
        ``max_chars`` is zero or negative).
    """

    text = " ".join(str(value).split())
    if len(text) <= max_chars:
        return text

    # 18 characters are reserved for the suffix, as before.
    keep = max_chars - 18
    if keep <= 0:
        # A non-positive ``keep`` used as a slice bound would cut from the
        # *end* of the text and could return more than ``max_chars``
        # characters, so fall back to a hard cut without the suffix.
        return text[: max(max_chars, 0)]
    return text[:keep].rstrip() + " ... [truncated]"
def extract_text_from_event(event: object) -> str:
    """Best-effort assistant text extraction from an ADK event.

    Returns the text of the first part of ``event.content`` that is not
    flagged as a thought and whose text is non-blank, unchanged (no
    stripping). Returns ``""`` when the event has no content, no parts, or no
    such part.
    """

    payload = getattr(event, "content", None)
    if payload is None:
        return ""

    # Lazily walk the parts, skipping model "thought" parts before their text
    # is even looked at.
    visible_texts = (
        getattr(piece, "text", None)
        for piece in (getattr(payload, "parts", None) or [])
        if not getattr(piece, "thought", False)
    )
    for candidate in visible_texts:
        if candidate and str(candidate).strip():
            return str(candidate)
    return ""
def process_run(
    self,
    runner: object,
    message: object,
    **kwargs: object,
) -> Callable[
    [Callable[[], AsyncGenerator[object, None]]],
    Callable[[], AsyncGenerator[object, None]],
]:
    """Prepare Harness context for one run and return an event-stream decorator.

    Before the run starts this method resolves the run metadata (values bound
    via ``bind_run`` win over fallbacks), lets the context engine rewrite the
    message text in place, and records the ``harness.run.started`` event plus
    the user message in the store.

    Args:
        runner: The runner starting the run; only its optional ``user_id``
            attribute is read, as a fallback.
        message: The outgoing message. Its first text part is replaced with
            the context-injected prompt when its shape allows it.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A decorator for the zero-argument async event generator factory. The
        wrapped generator yields every event unchanged and, once the stream
        is exhausted, stores the assistant message, runs verification and
        records ``harness.run.finished``. Exceptions are recorded as
        ``harness.run.failed`` and re-raised.
    """

    bound = self._bound_run.get() or {}
    user_id = bound.get("user_id") or getattr(runner, "user_id", "") or ""
    session_id = bound.get("session_id") or "unknown-session"
    original_prompt = bound.get("original_prompt") or extract_text_from_message(
        message
    )
    run_id = bound.get("run_id") or f"harness-{uuid.uuid4().hex[:12]}"

    context = HarnessContext(
        user_id=user_id,
        session_id=session_id,
        run_id=run_id,
        original_prompt=original_prompt,
    )

    if self.context_engine is not None:
        context = self.context_engine.prepare_context(context)
        injected_prompt = self.context_engine.build_user_prompt(
            context=context,
            user_prompt=original_prompt,
        )
        replaced = replace_message_text(message, injected_prompt)
        context.metadata["message_injected"] = replaced

    self.last_context = context
    if self.store is not None:
        self.store.append_event(
            HarnessEvent(
                event_type="harness.run.started",
                session_id=session_id,
                run_id=run_id,
                payload={
                    "user_id": user_id,
                    "turn_type": context.turn_type,
                    "message_injected": context.metadata.get(
                        "message_injected", False
                    ),
                },
            )
        )
        # Stored only after ``prepare_context`` has read the history, so the
        # current prompt is not part of its own history projection.
        self.store.append_message(
            session_id=session_id,
            role="user",
            content=original_prompt,
            run_id=run_id,
        )

    def decorator(
        event_generator_func: Callable[[], AsyncGenerator[object, None]],
    ) -> Callable[[], AsyncGenerator[object, None]]:
        async def wrapper() -> AsyncGenerator[object, None]:
            token = _set_current_context(context)
            final_text = ""
            started = time.perf_counter()
            try:
                async for event in event_generator_func():
                    event_text = extract_text_from_event(event)
                    if event_text:
                        # The last event carrying visible text wins.
                        final_text = event_text
                    yield event
            except Exception as exc:
                if self.store is not None:
                    self.store.append_event(
                        HarnessEvent(
                            event_type="harness.run.failed",
                            session_id=session_id,
                            run_id=run_id,
                            payload={
                                "error_type": type(exc).__name__,
                                "error_message": str(exc),
                            },
                        )
                    )
                raise
            finally:
                _reset_current_context(token)

            if self.store is not None and final_text:
                self.store.append_message(
                    session_id=session_id,
                    role="assistant",
                    content=final_text,
                    run_id=run_id,
                )

            receipts = []
            if self.store is not None:
                receipts = self.store.load_receipts(run_id=run_id)
            context.receipts = receipts

            # Keep this run's report in a local: ``self.last_report`` may
            # still hold the report of an earlier run when verification is
            # skipped, and must not leak into this run's finished event.
            report = None
            if self.verify and self.verifier is not None:
                report = self.verifier.verify(
                    final_text=final_text,
                    context=context,
                    receipts=receipts,
                )
                self.last_report = report
                if self.store is not None:
                    self.store.save_report(report)

            if self.store is not None:
                self.store.append_event(
                    HarnessEvent(
                        event_type="harness.run.finished",
                        session_id=session_id,
                        run_id=run_id,
                        payload={
                            "duration_ms": round(
                                (time.perf_counter() - started) * 1000, 3
                            ),
                            "final_text_preview": summarize_text(
                                final_text, max_chars=300
                            ),
                            "verified": report.done
                            if report is not None
                            else None,
                        },
                    )
                )

        return wrapper

    return decorator
class EvidenceReadStoreProtocol(Protocol):
    """Read-only evidence store interface required by the verifier."""

    def read_evidence(self, ref_id: str) -> str: ...


class EvidenceBundle(TypedDict):
    """Evidence gathered from tool receipts for one verification pass."""

    corpus: str  # newline-joined evidence text searched for key facts
    refs: list[str]  # evidence ref ids, in receipt order
    urls: set[str]  # cleaned, lower-cased URLs observed in the evidence


# http(s) URLs, terminated by whitespace or common closing punctuation.
URL_RE = re.compile(r"https?://[^\s)\],,。;;]+", re.IGNORECASE)
# Prices, percentages, dates, years and any run of four or more digits.
KEY_FACT_RE = re.compile(
    r"(?:[$¥¥]\s?\d+(?:\.\d+)?|\d+(?:\.\d+)?%|\d{4}[-/年]\d{1,2}(?:[-/月]\d{1,2}日?)?|\d{4}年|\d{4,})"
)
SUCCESS_RECEIPT_STATUSES = {"success", "ok", "passed", "completed"}
# Structured success declarations such as `"status": "ok"`.
SUCCESS_DECLARATION_RE = re.compile(
    r'(?:"?operation_completed"?\s*[:=]\s*true|'
    r'"?problem_observed"?\s*[:=]\s*false|'
    r'"?status"?\s*[:=]\s*"?(?:passed|success|ok)"?)',
    re.IGNORECASE,
)
# Free-text phrases that claim the operation finished.
COMPLETION_CLAIM_RE = re.compile(
    r"(已完成|已经完成|完成了|操作完成|已成功|成功完成|已更新|已经更新|"
    r"已写入|done|successfully completed|completed successfully|updated successfully)",
    re.IGNORECASE,
)
# Phrases that explicitly deny completion; these override COMPLETION_CLAIM_RE.
NEGATED_COMPLETION_RE = re.compile(
    r"(未完成|没有完成|无法完成|不能完成|不能视为已完成|未能完成|"
    r"not\s+completed|failed\s+to\s+complete|could\s+not\s+complete|"
    r"unable\s+to\s+complete|cannot\s+complete)",
    re.IGNORECASE,
)


class ResultVerifier:
    """Small deterministic verifier for external-fact answers."""

    def __init__(
        self,
        *,
        store: EvidenceReadStoreProtocol | None = None,
        repair_callback: Callable[[str, VerificationReport], str] | None = None,
    ) -> None:
        """Keep the optional evidence store and the optional repair hook."""
        self.store = store
        self.repair_callback = repair_callback

    def build_acceptance(self, prompt: str) -> list[AcceptanceCriterion]:
        """Derive acceptance criteria from markers found in ``prompt``.

        A non-empty final answer is always required; grounded facts and the
        output format are added only when the prompt carries those markers.
        """
        criteria = [
            AcceptanceCriterion(
                id="AC-final-answer",
                description="Return a non-empty final answer that addresses the user request.",
            )
        ]
        if has_external_fact_markers(prompt):
            criteria.append(
                AcceptanceCriterion(
                    id="AC-grounded-facts",
                    description="External or current factual claims must be grounded in tool evidence or cited sources.",
                )
            )
        if has_output_format_markers(prompt):
            criteria.append(
                AcceptanceCriterion(
                    id="AC-output-format",
                    description="Respect the requested output format.",
                )
            )
        return criteria

    def verify(
        self,
        *,
        final_text: str,
        context: HarnessContext,
        receipts: list[CapabilityReceipt] | None = None,
    ) -> VerificationReport:
        """Check ``final_text`` against the task criteria and tool receipts.

        Args:
            final_text: The answer produced by the run.
            context: Run context; supplies ids, the task contract and, when
                ``receipts`` is ``None``, the receipts to inspect.
            receipts: Receipts to verify against; defaults to
                ``context.receipts``.

        Returns:
            A report whose ``done`` flag is true only when no check with
            severity ``"error"`` failed.
        """
        receipts = receipts if receipts is not None else context.receipts
        prompt = self._contract_prompt(context)
        criteria = (
            context.task_contract.acceptance
            if context.task_contract is not None
            else self.build_acceptance(prompt)
        )
        criterion_ids = {item.id for item in criteria}
        evidence = self._collect_evidence(receipts)
        evidence_refs = evidence["refs"]
        allowed_urls = evidence["urls"]

        checks: list[AcceptanceCheck] = [self._check_final_answer(final_text)]
        receipt_check = self._check_failed_receipt_completion(
            final_text=final_text,
            receipts=receipts,
        )
        if receipt_check is not None:
            checks.append(receipt_check)

        if "AC-grounded-facts" in criterion_ids:
            checks.append(
                self._check_grounded_facts(
                    final_text=final_text,
                    evidence_refs=evidence_refs,
                    allowed_urls=allowed_urls,
                )
            )
            checks.extend(
                self._build_evidence_checks(
                    final_text=final_text,
                    evidence_corpus=evidence["corpus"],
                    allowed_urls=allowed_urls,
                    evidence_refs=evidence_refs,
                )
            )

        if "AC-output-format" in criterion_ids:
            checks.append(
                self._check_output_format(prompt=prompt, final_text=final_text)
            )

        missing = [
            check.message
            for check in checks
            if not check.passed and check.severity == "error"
        ]
        guidance = ""
        if missing:
            guidance = (
                "Answer is not ready to trust. Add tool evidence, cite only observed sources, "
                "or explicitly state which facts could not be verified."
            )

        return VerificationReport(
            run_id=context.run_id,
            session_id=context.session_id,
            done=not missing,
            checks=checks,
            missing_requirements=missing,
            evidence_refs=evidence_refs,
            follow_up_guidance=guidance,
        )

    def repair(self, final_text: str, report: VerificationReport) -> str:
        """Return a conservative repaired answer or delegate to a callback."""

        if report.done:
            return final_text
        if self.repair_callback is not None:
            return self.repair_callback(final_text, report)

        missing = "; ".join(report.missing_requirements)
        return (
            "I cannot verify the answer with the available evidence. "
            f"Missing requirements: {missing}. "
            "Please run the required lookup/search tools or provide source material before treating this as final."
        )

    def _contract_prompt(self, context: HarnessContext) -> str:
        """Return the prompt text used for marker detection.

        With a task contract this is the pinned original prompt followed by
        the ``current_prompt`` metadata entry; otherwise the raw prompt.
        """
        if context.task_contract is None:
            return context.original_prompt
        current_prompt = context.task_contract.metadata.get("current_prompt", "")
        return f"{context.task_contract.original_prompt}\n{current_prompt}".strip()

    def _check_final_answer(self, final_text: str) -> AcceptanceCheck:
        """Pass when the answer contains any non-whitespace text."""
        passed = bool(final_text and final_text.strip())
        return AcceptanceCheck(
            id="AC-final-answer",
            passed=passed,
            message="Final answer is present." if passed else "Final answer is empty.",
        )

    def _check_grounded_facts(
        self,
        *,
        final_text: str,
        evidence_refs: list[str],
        allowed_urls: set[str],
    ) -> AcceptanceCheck:
        """Pass when at least one evidence ref or observed URL exists.

        ``final_text`` is accepted for signature symmetry but not inspected.
        """
        if evidence_refs or allowed_urls:
            return AcceptanceCheck(
                id="AC-grounded-facts",
                passed=True,
                message="External facts have evidence candidates.",
                evidence_refs=list(evidence_refs),
            )
        return AcceptanceCheck(
            id="AC-grounded-facts",
            passed=False,
            message="External/current factual task has no tool evidence or source receipt.",
        )

    def _build_evidence_checks(
        self,
        *,
        final_text: str,
        evidence_corpus: str,
        allowed_urls: set[str],
        evidence_refs: list[str],
    ) -> list[AcceptanceCheck]:
        """Build the cited-URL check and the numeric/date key-fact check."""
        answer_urls = {self._clean_url(url) for url in URL_RE.findall(final_text)}
        fabricated_urls = sorted(url for url in answer_urls if url not in allowed_urls)
        url_check = AcceptanceCheck(
            id="EG-source-refs",
            passed=not fabricated_urls,
            message=(
                "All cited URLs are present in tool evidence."
                if not fabricated_urls
                else f"Answer cites URL(s) not present in evidence/source: {', '.join(fabricated_urls)}"
            ),
            evidence_refs=evidence_refs,
        )

        # Normalize the corpus once; it does not change between facts.
        normalized_corpus = self._normalize(evidence_corpus)
        missing_facts = [
            fact
            for fact in sorted(set(KEY_FACT_RE.findall(final_text)))
            if self._normalize(fact) not in normalized_corpus
        ]
        fact_check = AcceptanceCheck(
            id="EG-key-facts",
            passed=not missing_facts,
            message=(
                "Numeric/date/price facts are covered by evidence."
                if not missing_facts
                else f"Key fact(s) not found in evidence text: {', '.join(missing_facts)}"
            ),
            evidence_refs=evidence_refs,
        )
        return [url_check, fact_check]

    def _check_failed_receipt_completion(
        self,
        *,
        final_text: str,
        receipts: list[CapabilityReceipt],
    ) -> AcceptanceCheck | None:
        """Flag answers that claim completion although a tool call failed.

        Returns ``None`` when every receipt has a success status.
        """
        failed_receipts = [
            receipt
            for receipt in receipts
            if receipt.status.strip().lower() not in SUCCESS_RECEIPT_STATUSES
        ]
        if not failed_receipts:
            return None

        failure_summary = ", ".join(
            self._format_receipt_failure(receipt) for receipt in failed_receipts
        )
        completion_claimed = self._claims_completion(final_text)
        return AcceptanceCheck(
            id="EG-tool-receipts",
            passed=not completion_claimed,
            message=(
                "Final answer does not claim completion after failed tool receipt(s)."
                if not completion_claimed
                else (
                    "Answer claims the operation completed despite failed tool "
                    f"receipt(s): {failure_summary}"
                )
            ),
            evidence_refs=[
                ref.ref_id
                for receipt in failed_receipts
                for ref in receipt.evidence_refs
            ],
        )

    def _check_output_format(self, *, prompt: str, final_text: str) -> AcceptanceCheck:
        """Check the answer against the first format keyword in the prompt.

        JSON is tested first, then table, then list; a prompt with none of
        these keywords passes.
        """
        lowered = prompt.lower()
        text = final_text.strip()
        passed = True
        expected = "requested format"

        if "json" in lowered:
            expected = "JSON"
            try:
                json.loads(text)
            except Exception:
                passed = False
        elif "表格" in lowered or "table" in lowered:
            expected = "table"
            passed = "|" in text or "\t" in text
        elif any(
            marker in lowered for marker in ("清单", "列表", "要点", "list", "bullet")
        ):
            expected = "list"
            passed = bool(
                re.search(r"(^|\n)\s*(?:[-*]|\*{0,2}\d+[.)、]\*{0,2})\s+", text)
            )

        return AcceptanceCheck(
            id="AC-output-format",
            passed=passed,
            message=(
                f"Answer satisfies {expected} output format."
                if passed
                else f"Answer does not satisfy requested {expected} output format."
            ),
        )

    def _collect_evidence(self, receipts: list[CapabilityReceipt]) -> EvidenceBundle:
        """Gather evidence text, ref ids and observed URLs from receipts.

        URLs are harvested from every text that enters the corpus, including
        the full externalized evidence read from the store, so a URL that
        lies beyond the truncated preview still counts as observed.
        """
        corpus_parts: list[str] = []
        refs: list[str] = []
        urls: set[str] = set()

        def harvest(text: str) -> None:
            # Keep the corpus and the allowed-URL set in lockstep.
            corpus_parts.append(text)
            urls.update(self._clean_url(url) for url in URL_RE.findall(text))

        for receipt in receipts:
            harvest(receipt.result_summary)
            for source in receipt.sources:
                harvest(json.dumps(source, ensure_ascii=False, sort_keys=True))
            for ref in receipt.evidence_refs:
                refs.append(ref.ref_id)
                harvest(ref.preview)
                if self.store is not None:
                    harvest(self.store.read_evidence(ref.ref_id))

        return {
            "corpus": "\n".join(corpus_parts),
            "refs": refs,
            "urls": urls,
        }

    def _clean_url(self, url: str) -> str:
        """Strip surrounding quotes/brackets/punctuation and lower-case."""
        return url.strip().strip("\"'`<>()[]{}.,;:,。;:)】》").lower()

    def _normalize(self, value: str) -> str:
        """Remove all whitespace and lower-case for substring matching."""
        return re.sub(r"\s+", "", value).lower()

    def _claims_completion(self, final_text: str) -> bool:
        """Return whether the answer asserts that the operation completed.

        A structured success declaration wins outright; otherwise an explicit
        negation suppresses free-text completion phrases.
        """
        if SUCCESS_DECLARATION_RE.search(final_text):
            return True
        if NEGATED_COMPLETION_RE.search(final_text):
            return False
        return bool(COMPLETION_CLAIM_RE.search(final_text))

    def _format_receipt_failure(self, receipt: CapabilityReceipt) -> str:
        """Render ``tool(detail)`` using the first available failure detail."""
        detail = receipt.error_type or receipt.error_message or receipt.status
        return f"{receipt.tool_name}({detail})"
def _to_jsonable(value: object) -> JSONValue:
    """Recursively convert ``value`` into JSON-serializable primitives.

    Pydantic models are dumped in JSON mode, paths become strings, mappings
    get string keys, and anything unrecognized falls back to ``str(value)``.
    """
    if isinstance(value, BaseModel):
        return _to_jsonable(value.model_dump(mode="json"))
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, dict):
        return {str(k): _to_jsonable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(v) for v in value]
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    return str(value)


def _json_object(text: str) -> JSONDict:
    """Parse ``text`` as JSON and return it only if it is an object."""
    value = json.loads(text)
    return dict(value) if isinstance(value, dict) else {}


def _read_jsonl(path: Path) -> list[JSONDict]:
    """Read non-empty JSON objects from a JSONL file; missing file -> []."""
    if not path.exists():
        return []
    records: list[JSONDict] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        record = _json_object(line)
        if record:
            records.append(record)
    return records


class LocalHarnessStore:
    """Simple local store used by the example and tests."""

    def __init__(self, root_dir: str | Path = ".harness_runs") -> None:
        """Resolve all storage paths under ``root_dir`` and create the dirs."""
        self.root_dir = Path(root_dir)
        self.evidence_dir = self.root_dir / "evidence"
        self.reports_dir = self.root_dir / "reports"
        self.events_path = self.root_dir / "events.jsonl"
        self.receipts_path = self.root_dir / "receipts.jsonl"
        self.messages_path = self.root_dir / "messages.jsonl"
        self._lock = threading.Lock()
        self.evidence_dir.mkdir(parents=True, exist_ok=True)
        self.reports_dir.mkdir(parents=True, exist_ok=True)

    def _append_jsonl(self, path: Path, record: JSONDict) -> None:
        """Append one record as a JSON line, serialized under the store lock."""
        path.parent.mkdir(parents=True, exist_ok=True)
        with self._lock:
            with path.open("a", encoding="utf-8") as f:
                f.write(
                    json.dumps(_to_jsonable(record), ensure_ascii=False, sort_keys=True)
                )
                f.write("\n")

    def append_event(self, event: HarnessEvent) -> None:
        """Append a harness event to ``events.jsonl``."""
        self._append_jsonl(self.events_path, event.model_dump(mode="json"))

    def load_events(
        self, *, run_id: str | None = None, session_id: str | None = None
    ) -> list[JSONDict]:
        """Return stored events, optionally filtered by run and/or session."""
        records = _read_jsonl(self.events_path)
        return [
            record
            for record in records
            if (run_id is None or record.get("run_id") == run_id)
            and (session_id is None or record.get("session_id") == session_id)
        ]

    def append_message(
        self,
        *,
        session_id: str,
        role: str,
        content: str,
        run_id: str = "",
        metadata: JSONDict | None = None,
    ) -> None:
        """Append a chat message, stamped with the current UTC time."""
        self._append_jsonl(
            self.messages_path,
            {
                "session_id": session_id,
                "run_id": run_id,
                "role": role,
                "content": content,
                "metadata": metadata or {},
                "created_at": utc_now(),
            },
        )

    def load_messages(
        self, session_id: str, *, limit: int | None = None
    ) -> list[JSONDict]:
        """Return the session's messages in file order.

        Args:
            session_id: Session whose messages are returned.
            limit: Keep only the last ``limit`` messages; ``None`` keeps all
                and a non-positive value yields an empty list.
        """
        records = [
            record
            for record in _read_jsonl(self.messages_path)
            if record.get("session_id") == session_id
        ]
        if limit is None:
            return records
        if limit <= 0:
            # ``records[-0:]`` would be the whole list, not an empty tail.
            return []
        return records[-limit:]

    def put_evidence(
        self,
        *,
        kind: str,
        text: str,
        metadata: JSONDict | None = None,
        ref_id: str | None = None,
    ) -> EvidenceRef:
        """Write ``text`` to the evidence directory and return its reference.

        When ``ref_id`` is omitted it is built from the SHA-256 digest prefix
        plus a random suffix.
        """
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        ref_id = ref_id or f"ev-{digest[:12]}-{uuid.uuid4().hex[:6]}"
        path = self.evidence_dir / f"{ref_id}.txt"
        path.write_text(text, encoding="utf-8")
        return EvidenceRef(
            ref_id=ref_id,
            kind=kind,
            uri=str(path),
            digest=digest,
            preview=summarize_text(text, max_chars=700),
            metadata=metadata or {},
        )

    def read_evidence(self, ref_id: str) -> str:
        """Return the stored evidence text, or ``""`` when it is missing."""
        path = self.evidence_dir / f"{ref_id}.txt"
        if not path.exists():
            return ""
        return path.read_text(encoding="utf-8")

    def append_receipt(self, receipt: CapabilityReceipt) -> None:
        """Append a tool receipt to ``receipts.jsonl``."""
        self._append_jsonl(self.receipts_path, receipt.model_dump(mode="json"))

    def load_receipts(
        self,
        *,
        run_id: str | None = None,
        session_id: str | None = None,
    ) -> list[CapabilityReceipt]:
        """Rebuild receipts from disk, optionally filtered by run/session.

        Raises:
            KeyError: If a matching record has no ``id`` field.
        """
        receipts: list[CapabilityReceipt] = []
        for record in _read_jsonl(self.receipts_path):
            if run_id is not None and record.get("run_id") != run_id:
                continue
            if session_id is not None and record.get("session_id") != session_id:
                continue
            receipt = CapabilityReceipt(
                id=str(record["id"]),
                run_id=str(record.get("run_id", "")),
                session_id=str(record.get("session_id", "")),
                tool_name=str(record.get("tool_name", "")),
                input_summary=str(record.get("input_summary", "")),
                result_summary=str(record.get("result_summary", "")),
                status=str(record.get("status", "unknown")),
                duration_ms=float(record.get("duration_ms", 0)),
                evidence_refs=[
                    EvidenceRef(**ref)
                    for ref in record.get("evidence_refs", [])
                    if isinstance(ref, dict)
                ],
                sources=[
                    dict(source)
                    for source in record.get("sources", [])
                    if isinstance(source, dict)
                ],
                artifacts=[
                    dict(artifact)
                    for artifact in record.get("artifacts", [])
                    if isinstance(artifact, dict)
                ],
                error_type=str(record["error_type"])
                if record.get("error_type")
                else None,
                error_message=str(record["error_message"])
                if record.get("error_message")
                else None,
                created_at=str(record.get("created_at", utc_now())),
                metadata=(
                    dict(record["metadata"])
                    if isinstance(record.get("metadata"), dict)
                    else {}
                ),
            )
            receipts.append(receipt)
        return receipts

    def save_report(self, report: VerificationReport) -> None:
        """Write the report to ``reports/<session_id>-<run_id>.json``."""
        self.reports_dir.mkdir(parents=True, exist_ok=True)
        report_path = self.reports_dir / f"{report.session_id}-{report.run_id}.json"
        report_path.write_text(
            json.dumps(
                _to_jsonable(report), ensure_ascii=False, indent=2, sort_keys=True
            ),
            encoding="utf-8",
        )

    def load_report(self, *, session_id: str, run_id: str) -> JSONDict:
        """Load one report; raises ``FileNotFoundError`` when it is absent."""
        report_path = self.reports_dir / f"{session_id}-{run_id}.json"
        return _json_object(report_path.read_text(encoding="utf-8"))

    def latest_report(self, *, session_id: str) -> JSONDict | None:
        """Return the most recently written report for ``session_id``.

        Candidates are ordered by modification time (file name breaks ties)
        rather than by name, because the run id in the name carries no time
        order. A file whose stored ``session_id`` differs is skipped, since
        the ``<session_id>-`` prefix also matches sessions whose id merely
        starts with this one followed by a hyphen.
        """
        if not self.reports_dir.is_dir():
            return None
        prefix = f"{session_id}-"
        candidates = [
            path
            for path in self.reports_dir.iterdir()
            if path.is_file()
            and path.name.startswith(prefix)
            and path.name.endswith(".json")
        ]
        candidates.sort(
            key=lambda path: (path.stat().st_mtime_ns, path.name), reverse=True
        )
        for path in candidates:
            report = _json_object(path.read_text(encoding="utf-8"))
            # Reports without the field are accepted for backward compatibility.
            if report.get("session_id", session_id) == session_id:
                return report
        return None
+ +"""Tool wrappers that record Harness capability receipts.""" + +from __future__ import annotations + +import functools +import inspect +import json +import time +import uuid +from collections.abc import Awaitable, Callable, Mapping +from typing import ParamSpec, Protocol, TypeVar, overload + +from .core import ( + CapabilityReceipt, + EvidenceRef, + JSONDict, + current_harness_context, + summarize_text, +) + + +P = ParamSpec("P") +R = TypeVar("R") + + +class ReceiptStoreProtocol(Protocol): + def put_evidence( + self, + *, + kind: str, + text: str, + metadata: JSONDict | None = None, + ref_id: str | None = None, + ) -> EvidenceRef: ... + + def append_receipt(self, receipt: CapabilityReceipt) -> None: ... + + +def wrap_tools( + tools: list[Callable[..., object]], + *, + store: ReceiptStoreProtocol, + externalize_threshold: int = 4000, +) -> list[Callable[..., object]]: + """Wrap multiple veADK tools without changing their public signatures.""" + + return [ + wrap_tool(tool, store=store, externalize_threshold=externalize_threshold) + for tool in tools + ] + + +@overload +def wrap_tool( + tool: Callable[P, Awaitable[R]], + *, + store: ReceiptStoreProtocol, + externalize_threshold: int = 4000, +) -> Callable[P, Awaitable[R]]: ... + + +@overload +def wrap_tool( + tool: Callable[P, R], + *, + store: ReceiptStoreProtocol, + externalize_threshold: int = 4000, +) -> Callable[P, R]: ... 
+ + +def wrap_tool( + tool: Callable[P, R] | Callable[P, Awaitable[R]], + *, + store: ReceiptStoreProtocol, + externalize_threshold: int = 4000, +) -> Callable[P, R] | Callable[P, Awaitable[R]]: + """Return a receipt-recording wrapper for a sync or async callable.""" + + if inspect.iscoroutinefunction(tool): + + @functools.wraps(tool) + async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + started = time.perf_counter() + try: + result = await tool(*args, **kwargs) + except Exception as exc: + _record_receipt( + tool=tool, + args=args, + kwargs=kwargs, + result=None, + status="failed", + duration_ms=(time.perf_counter() - started) * 1000, + error=exc, + store=store, + externalize_threshold=externalize_threshold, + ) + raise + + _record_receipt( + tool=tool, + args=args, + kwargs=kwargs, + result=result, + status="success", + duration_ms=(time.perf_counter() - started) * 1000, + error=None, + store=store, + externalize_threshold=externalize_threshold, + ) + return result + + return async_wrapper + + @functools.wraps(tool) + def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + started = time.perf_counter() + try: + result = tool(*args, **kwargs) + except Exception as exc: + _record_receipt( + tool=tool, + args=args, + kwargs=kwargs, + result=None, + status="failed", + duration_ms=(time.perf_counter() - started) * 1000, + error=exc, + store=store, + externalize_threshold=externalize_threshold, + ) + raise + + _record_receipt( + tool=tool, + args=args, + kwargs=kwargs, + result=result, + status="success", + duration_ms=(time.perf_counter() - started) * 1000, + error=None, + store=store, + externalize_threshold=externalize_threshold, + ) + return result + + return sync_wrapper + + +def _record_receipt( + *, + tool: Callable[..., object], + args: tuple[object, ...], + kwargs: Mapping[str, object], + result: object, + status: str, + duration_ms: float, + error: Exception | None, + store: ReceiptStoreProtocol, + externalize_threshold: int, +) -> 
CapabilityReceipt: + context = current_harness_context() + result_text = _result_to_text(result) if error is None else "" + evidence_refs = [] + result_summary = summarize_text(result_text, max_chars=900) + + if error is None and len(result_text) > externalize_threshold: + evidence_refs.append( + store.put_evidence( + kind="tool-result", + text=result_text, + metadata={ + "tool_name": getattr(tool, "__name__", tool.__class__.__name__) + }, + ) + ) + result_summary = summarize_text(result_text, max_chars=500) + + receipt = CapabilityReceipt( + id=f"receipt-{uuid.uuid4().hex[:12]}", + run_id=context.run_id if context else "manual-run", + session_id=context.session_id if context else "manual-session", + tool_name=getattr(tool, "__name__", tool.__class__.__name__), + input_summary=_summarize_call(args=args, kwargs=kwargs), + result_summary=result_summary, + status=status, + duration_ms=round(duration_ms, 3), + evidence_refs=evidence_refs, + sources=_extract_sources(result), + artifacts=_extract_artifacts(result), + error_type=type(error).__name__ if error else None, + error_message=str(error) if error else None, + ) + store.append_receipt(receipt) + return receipt + + +def _summarize_call(*, args: tuple[object, ...], kwargs: Mapping[str, object]) -> str: + payload: dict[str, object] = {} + if args: + payload["args"] = args + if kwargs: + payload["kwargs"] = kwargs + return summarize_text(_json_dumps(payload), max_chars=500) + + +def _result_to_text(result: object) -> str: + if isinstance(result, (dict, list, tuple)): + return _json_dumps(result) + return "" if result is None else str(result) + + +def _json_dumps(value: object) -> str: + try: + return json.dumps(value, ensure_ascii=False, sort_keys=True, default=str) + except TypeError: + return str(value) + + +def _extract_sources(result: object) -> list[JSONDict]: + if not isinstance(result, Mapping): + return [] + + sources: list[JSONDict] = [] + for key in ("sources", "citations"): + raw_sources = result.get(key) + 
if raw_sources is None: + continue + if not isinstance(raw_sources, list): + raw_sources = [raw_sources] + for item in raw_sources: + sources.append(_normalize_source(item)) + + for key in ("url", "source_url", "link"): + if result.get(key): + sources.append({"url": str(result[key]), "source_key": key}) + + return [source for source in sources if source] + + +def _normalize_source(item: object) -> JSONDict: + if isinstance(item, Mapping): + return dict(item) + text = str(item) + if text.startswith(("http://", "https://")): + return {"url": text} + return {"text": text} + + +def _extract_artifacts(result: object) -> list[JSONDict]: + if not isinstance(result, Mapping): + return [] + + artifacts: list[JSONDict] = [] + raw_artifacts = result.get("artifacts") + if raw_artifacts: + if not isinstance(raw_artifacts, list): + raw_artifacts = [raw_artifacts] + for item in raw_artifacts: + if isinstance(item, Mapping): + artifacts.append(dict(item)) + else: + artifacts.append({"value": str(item)}) + + for key in ("artifact_id", "file_path", "path"): + if result.get(key): + artifacts.append({key: str(result[key])}) + return artifacts diff --git a/examples/harness/main.py b/examples/harness/main.py new file mode 100644 index 00000000..02b11dd7 --- /dev/null +++ b/examples/harness/main.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
QUESTION = "请查一下 veADK Harness 示例的核心能力,给出来源,并用 3 条要点回答。"


async def main() -> None:
    """Run the demo question once and print the answer and verdict.

    The verification line is printed only when a report exists for the
    demo session.
    """
    session = "harness-demo"
    bundle = build_harness_agent()
    print(await bundle.run(QUESTION, session_id=session))

    report = bundle.latest_report(session_id=session)
    if not report:
        return
    print(f"\nHarness verification done={report['done']}")


if __name__ == "__main__":
    asyncio.run(main())
# Directory one level above this file's directory; prepended to sys.path
# (once) so the example's modules can be imported by the tests.
HARNESS_DIR = Path(__file__).resolve().parents[1]
if all(entry != str(HARNESS_DIR) for entry in sys.path):
    sys.path.insert(0, str(HARNESS_DIR))
def test_harness_context_validates_construction_and_assignment():
    """HarnessContext rejects bad types both at construction and on assignment."""
    with pytest.raises(ValidationError):
        HarnessContext(
            user_id=["not", "a", "string"],
            session_id="s1",
            run_id="r1",
            original_prompt="prompt",
        )

    context = HarnessContext(
        user_id="u1",
        session_id="s1",
        run_id="r1",
        original_prompt="prompt",
    )

    # Assigning a budget with a non-numeric token estimate must also fail.
    with pytest.raises(ValidationError):
        context.budget = {"estimated_tokens": "many", "max_context_chars": 100}


def test_task_anchor_is_pinned_for_follow_up(tmp_path):
    """A follow-up turn keeps the first user prompt as the task anchor."""
    store = LocalHarnessStore(tmp_path)
    # Seed one completed turn (run r0) before the follow-up run r1.
    store.append_message(
        session_id="s1",
        role="user",
        content="请查最新 AI 使用政策,给出来源,并用表格输出。",
        run_id="r0",
    )
    store.append_message(
        session_id="s1",
        role="assistant",
        content="已根据来源整理。",
        run_id="r0",
    )

    engine = ContextEngine(store=store)
    context = engine.prepare_context(
        HarnessContext(
            user_id="u1",
            session_id="s1",
            run_id="r1",
            original_prompt="继续按刚才的格式输出。",
        )
    )

    assert context.turn_type == "follow_up"
    assert context.task_contract is not None
    assert (
        context.task_contract.original_prompt
        == "请查最新 AI 使用政策,给出来源,并用表格输出。"
    )

    # The header must carry the pinned task and both derived criteria.
    header = engine.build_context_header(context=context)
    assert "original_task: 请查最新 AI 使用政策,给出来源,并用表格输出。" in header
    assert "AC-grounded-facts" in header
    assert "AC-output-format" in header


def test_history_projection_excludes_control_messages(tmp_path):
    """Messages flagged as control in metadata are left out of the projection."""
    store = LocalHarnessStore(tmp_path)
    store.append_message(session_id="s1", role="user", content="请查政策", run_id="r0")
    store.append_message(
        session_id="s1",
        role="assistant",
        content="[progress] search started",
        run_id="r0",
        metadata={"control": True},
    )
    store.append_message(
        session_id="s1", role="assistant", content="政策摘要", run_id="r0"
    )

    engine = ContextEngine(store=store)
    context = engine.prepare_context(
        HarnessContext(
            user_id="u1",
            session_id="s1",
            run_id="r1",
            original_prompt="继续",
        )
    )

    contents = [item["content"] for item in context.history_projection]
    assert "政策摘要" in contents
    assert all("[progress]" not in item for item in contents)


def test_follow_up_keeps_recent_answer_anchor(tmp_path):
    """The latest assistant answer appears in the header on a follow-up."""
    store = LocalHarnessStore(tmp_path)
    store.append_message(
        session_id="s1", role="user", content="列出三条政策要求", run_id="r0"
    )
    store.append_message(
        session_id="s1",
        role="assistant",
        content="1. 保留来源\n2. 保留收据",
        run_id="r0",
    )

    engine = ContextEngine(store=store)
    context = engine.prepare_context(
        HarnessContext(
            user_id="u1",
            session_id="s1",
            run_id="r1",
            original_prompt="继续按刚才那个格式",
        )
    )

    header = engine.build_context_header(context=context)
    # The expected header line shows the answer's newline as a single space.
    assert "history[2] assistant: 1. 保留来源 2. 保留收据" in header
    assert context.task_contract is not None
    assert context.task_contract.original_prompt == "列出三条政策要求"


def test_context_budget_truncates_low_value_history(tmp_path):
    """An over-budget history is truncated while the original task survives."""
    store = LocalHarnessStore(tmp_path)
    original = "请查最新合规政策,并保留来源。"
    store.append_message(session_id="s1", role="user", content=original, run_id="r0")
    # Twenty long assistant messages, far more than the 900-char budget below.
    for idx in range(20):
        store.append_message(
            session_id="s1",
            role="assistant",
            content=f"历史消息 {idx} " + ("低价值内容 " * 80),
            run_id=f"r{idx}",
        )

    engine = ContextEngine(store=store, max_history_messages=20, max_context_chars=900)
    context = engine.prepare_context(
        HarnessContext(
            user_id="u1",
            session_id="s1",
            run_id="r1",
            original_prompt="继续",
        )
    )
    header = engine.build_context_header(context=context)

    assert f"original_task: {original}" in header
    assert context.budget is not None
    assert context.budget.truncated is True
    assert context.budget.omitted_count > 0
    assert context.budget.estimated_tokens > 0
def test_offline_eval_proves_harness_metric_lift(tmp_path):
    """The offline evaluation reports a Harness lift and writes both reports."""
    report = run_evaluation(tmp_path)
    verifier = report["result_verifier"]
    context = report["context_engine"]

    # ResultVerifier: Harness beats the baseline and accepts no unsafe case.
    assert verifier["harness"]["accuracy"] > verifier["baseline"]["accuracy"]
    assert verifier["baseline"]["unsafe_false_accept_rate"] == 1.0
    assert verifier["harness"]["unsafe_false_accept_rate"] == 0.0
    assert verifier["harness"]["unsafe_recall"] == 1.0

    # ContextEngine: higher quality score with perfect anchor/budget rates.
    assert context["harness"]["quality_score"] > context["baseline"]["quality_score"]
    assert context["harness"]["anchor_contract_rate"] == 1.0
    assert context["harness"]["control_pollution_rate"] == 0.0
    assert context["harness"]["budget_compliance_rate"] == 1.0

    # Both report files exist and the markdown lists scenarios before cases.
    assert (tmp_path / "harness_eval_report.json").exists()
    assert (tmp_path / "harness_eval_report.md").exists()
    markdown = (tmp_path / "harness_eval_report.md").read_text(encoding="utf-8")
    assert "| Scenario | Harness module |" in markdown
    assert "RAG memory freshness" in markdown
    assert markdown.index("Scenario Lift") < markdown.index("ResultVerifier Cases")
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
from types import SimpleNamespace

from harness_modules import (
    ContextEngine,
    HarnessRunProcessor,
    LocalHarnessStore,
    ResultVerifier,
)


class FakeRunner:
    """Runner stand-in for the processor; it only defines ``user_id``."""

    user_id = "runner-user"


def _message(text: str) -> SimpleNamespace:
    """Build a message-like object holding a single text part."""
    only_part = SimpleNamespace(text=text)
    return SimpleNamespace(parts=[only_part])


def _event(text: str) -> SimpleNamespace:
    """Build an event-like object whose content has one non-thought text part."""
    only_part = SimpleNamespace(text=text, thought=False)
    return SimpleNamespace(content=SimpleNamespace(parts=[only_part]))


async def _collect(generator):
    """Drain an async event iterator into the text of each event's first part."""
    return [event.content.parts[0].text async for event in generator]


def _single_event_source(text: str):
    """Return an async-generator function that yields one event carrying ``text``."""

    async def event_generator():
        yield _event(text)

    return event_generator


def _new_processor(store, **options):
    """Wire a processor to a ContextEngine and ResultVerifier sharing ``store``."""
    return HarnessRunProcessor(
        store=store,
        context_engine=ContextEngine(store=store),
        verifier=ResultVerifier(store=store),
        **options,
    )


def _run_turn(processor, message, *, prompt, run_id, event_source):
    """Push one bound run through ``processor`` and return the collected texts.

    The run is always bound to user ``u1`` and session ``s1``; ``message`` is
    handed to ``process_run`` and may be modified by the processor.
    """

    async def _drive():
        with processor.bind_run(
            user_id="u1",
            session_id="s1",
            original_prompt=prompt,
            run_id=run_id,
        ):
            decorate = processor.process_run(FakeRunner(), message)
            wrapped = decorate(event_source)
            return await _collect(wrapped())

    return asyncio.run(_drive())


def test_baseline_without_verifier_returns_fake_answer(tmp_path):
    """With ``verify=False`` the fabricated answer passes through and no report is kept."""
    prompt = "请给出最新政策并附来源。"
    fabricated_answer = "政策已发布,参考 https://fake.example/policy。"
    processor = _new_processor(LocalHarnessStore(tmp_path), verify=False)

    texts = _run_turn(
        processor,
        _message(prompt),
        prompt=prompt,
        run_id="r1",
        event_source=_single_event_source(fabricated_answer),
    )

    assert texts == [fabricated_answer]
    assert processor.last_report is None


def test_processor_injects_context_and_records_failed_verification(tmp_path):
    """Without ``verify=False`` the message gains a context header and a failed report is stored."""
    prompt = "请给出最新政策并附来源。"
    store = LocalHarnessStore(tmp_path)
    processor = _new_processor(store)
    message = _message(prompt)

    _run_turn(
        processor,
        message,
        prompt=prompt,
        run_id="r1",
        event_source=_single_event_source(
            "政策已发布,参考 https://fake.example/policy。"
        ),
    )

    injected_text = message.parts[0].text
    assert "[Harness Context]" in injected_text
    assert "AC-grounded-facts" in injected_text

    last_report = processor.last_report
    assert last_report is not None
    assert last_report.done is False
    assert any("fake.example" in item for item in last_report.missing_requirements)

    # The same verdict must be readable back from the store.
    stored_report = store.load_report(session_id="s1", run_id="r1")
    assert stored_report["done"] is False


def test_second_turn_gets_history_projection(tmp_path):
    """A follow-up turn in the same session sees the first turn's assistant answer."""
    processor = _new_processor(LocalHarnessStore(tmp_path))

    first_prompt = "列出三条 Harness 验收要求。"
    _run_turn(
        processor,
        _message(first_prompt),
        prompt=first_prompt,
        run_id="r1",
        event_source=_single_event_source("1. 保留来源\n2. 保留收据"),
    )

    second_prompt = "继续按刚才的格式输出。"
    second_message = _message(second_prompt)
    _run_turn(
        processor,
        second_message,
        prompt=second_prompt,
        run_id="r2",
        event_source=_single_event_source("继续输出。"),
    )

    last_context = processor.last_context
    assert last_context is not None
    assert last_context.turn_type == "follow_up"
    assert any(
        item["role"] == "assistant" and "保留来源" in item["content"]
        for item in last_context.history_projection
    )
    assert "Recent session history:" in second_message.parts[0].text
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from evaluation.run_model_eval import (
    _model_metrics,
    load_model_env,
    parse_env_file,
    render_model_markdown,
)


def _unsupported_case(case_id, scenario_name):
    """Build a fresh evaluation row for an answer that has no supporting evidence.

    The baseline arm trusts the answer while the Harness arm does not; only the
    id and the scenario name differ between the tests that use this row.
    """
    return {
        "id": case_id,
        "scenario_name": scenario_name,
        "harness_capability": "ResultVerifier unsupported-answer block",
        "scenario_type": "unsupported_without_evidence",
        "prompt": "source required",
        "evidence_required": True,
        "expected_trusted": False,
        "baseline": {
            "answer": "unsupported answer",
            "trusted_by_runtime": True,
            "posthoc_verifier_done": False,
            "posthoc_missing_requirements": ["missing evidence"],
        },
        "harness": {
            "answer": "cannot verify",
            "trusted_by_runtime": False,
            "verifier_done": False,
            "missing_requirements": ["missing evidence"],
            "receipt_count": 0,
            "receipt_tools": [],
        },
    }


def _answerable_case():
    """Build a fresh evaluation row where both arms trust a tool-backed answer."""
    return {
        "id": "answerable",
        "scenario_name": "Answerable with tool evidence",
        "harness_capability": "Tool receipt + source verification",
        "scenario_type": "answerable_with_tools",
        "prompt": "source required",
        "evidence_required": True,
        "expected_trusted": True,
        "baseline": {
            "answer": "verified answer",
            "trusted_by_runtime": True,
            "posthoc_verifier_done": False,
            "posthoc_missing_requirements": [],
        },
        "harness": {
            "answer": "verified answer",
            "trusted_by_runtime": True,
            "verifier_done": True,
            "missing_requirements": [],
            "receipt_count": 1,
            "receipt_tools": ["lookup"],
        },
    }


def test_model_env_loader_redacts_and_loads_only_model_keys(tmp_path, monkeypatch):
    """Only the three model keys are reported as loaded from a mixed ``.env`` file."""
    model_keys = ("MODEL_AGENT_API_KEY", "MODEL_AGENT_NAME", "MODEL_AGENT_API_BASE")
    env_lines = (
        "MODEL_AGENT_API_KEY=example-test-key",
        "MODEL_AGENT_NAME=test-model",
        "MODEL_AGENT_API_BASE=https://example.com/api",
        "DATABASE_POSTGRESQL_PASSWORD=unused-placeholder",
    )
    env_file = tmp_path / ".env"
    env_file.write_text("\n".join(env_lines), encoding="utf-8")

    # Start from a process environment that does not already define the keys.
    for key in model_keys:
        monkeypatch.delenv(key, raising=False)

    result = load_model_env(env_file)

    assert result.missing_required_keys == []
    assert result.model_name == "test-model"
    assert set(result.loaded_keys) == set(model_keys)
    assert "DATABASE_POSTGRESQL_PASSWORD" not in result.loaded_keys


def test_parse_env_file_handles_quotes_and_export(tmp_path):
    """Double quotes, single quotes and an ``export`` prefix are stripped from values."""
    env_file = tmp_path / ".env"
    content = (
        'export MODEL_AGENT_NAME="quoted-model"\n'
        "MODEL_AGENT_API_BASE='https://example.com'\n"
    )
    env_file.write_text(content, encoding="utf-8")

    values = parse_env_file(env_file)

    assert values["MODEL_AGENT_NAME"] == "quoted-model"
    assert values["MODEL_AGENT_API_BASE"] == "https://example.com"


def test_model_metrics_show_false_accept_reduction():
    """One unsupported and one answerable row give the expected aggregate metrics."""
    rows = [
        _unsupported_case("unsupported", "Unsupported source claim"),
        _answerable_case(),
    ]

    metrics = _model_metrics(rows)

    expected_metrics = (
        ("baseline_trust_decision_accuracy", 0.5),
        ("harness_trust_decision_accuracy", 1.0),
        ("trust_decision_accuracy_gain_pp", 50.0),
        ("baseline_unsupported_false_accept_rate", 1.0),
        ("harness_unsupported_false_accept_rate", 0.0),
        ("unsupported_false_accept_reduction_pp", 100.0),
        ("harness_answerable_receipt_coverage_rate", 1.0),
        ("harness_answerable_verified_pass_rate", 1.0),
        ("harness_unsupported_block_rate", 1.0),
    )
    for metric_name, expected_value in expected_metrics:
        assert metrics[metric_name] == expected_value


def test_model_report_is_grouped_by_scenario():
    """The rendered Markdown has the scenario table ahead of the case detail section."""
    env_section = {
        "env_file": "",
        "loaded_keys": [],
        "model_name": "test-model",
        "api_base": "https://example.com",
        "api_key": "",
    }
    method_section = {
        "baseline": "normal veADK Agent; every non-empty answer is trusted",
        "harness": "veADK Agent with ContextEngine and ResultVerifier",
        "case_count": 1,
    }
    metrics_section = {
        "case_count": 1,
        "answerable_case_count": 0,
        "unsupported_case_count": 1,
        "baseline_trust_decision_accuracy": 0.0,
        "harness_trust_decision_accuracy": 1.0,
        "trust_decision_accuracy_gain_pp": 100.0,
        "baseline_unsupported_false_accept_rate": 1.0,
        "harness_unsupported_false_accept_rate": 0.0,
        "unsupported_false_accept_reduction_pp": 100.0,
        "harness_answerable_verified_pass_rate": 0.0,
        "harness_answerable_receipt_coverage_rate": 0.0,
        "harness_unsupported_block_rate": 1.0,
        "harness_trusted_answer_verification_rate": 0.0,
    }
    report = {
        "generated_at": "2026-06-11T00:00:00+00:00",
        "env": env_section,
        "method": method_section,
        "metrics": metrics_section,
        "cases": [
            _unsupported_case(
                "production-no-evidence-source-claim",
                "No-evidence hallucination suppression",
            )
        ],
    }

    markdown = render_model_markdown(report)

    assert "| Scenario | Harness capability |" in markdown
    scenario_at = markdown.index("Scenario")
    detail_at = markdown.index("Case Detail")
    assert scenario_at < detail_at
    assert "No-evidence hallucination suppression" in markdown
    assert "trust decision corrected" in markdown
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import re
from pathlib import Path

from harness_modules import (
    CapabilityReceipt,
    ContextEngine,
    HarnessContext,
    LocalHarnessStore,
    ResultVerifier,
    TaskContract,
)


# A URL ends at whitespace, a closing bracket, or sentence punctuation. The
# full-width comma (U+FF0C) and semicolon (U+FF1B) are written as escapes so
# they stay distinct from their ASCII look-alikes; listing the ASCII character
# twice would add nothing to the class.
URL_RE = re.compile(r"https?://[^\s)\],\uff0c。;\uff1b]+", re.IGNORECASE)
SUCCESS_STATUSES = {"success", "ok", "passed"}

# Quotes, brackets and sentence punctuation trimmed from both ends of a URL.
# Full-width comma, semicolon, colon and right parenthesis are spelled as
# escapes for the same reason as in URL_RE.
_URL_TRIM_CHARS = "\"'`<>()[]{}.,;:\uff0c。\uff1b\uff1a\uff09】》"


def _clean_url(url: str) -> str:
    """Return ``url`` lower-cased with surrounding whitespace and punctuation removed."""
    return url.strip().strip(_URL_TRIM_CHARS).lower()


def _cases_path() -> Path:
    """Return the path of the scenario-level golden file next to the tests directory."""
    return Path(__file__).resolve().parents[1] / "golden" / "production_scenarios.jsonl"


def _load_cases() -> list[dict[str, object]]:
    """Parse the golden JSONL file, one case per non-blank line."""
    return [
        json.loads(line)
        for line in _cases_path().read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]


def _context(prompt: str, verifier: ResultVerifier) -> HarnessContext:
    """Build a fixed-id ``new_task`` context whose acceptance comes from ``verifier``."""
    return HarnessContext(
        user_id="u1",
        session_id="s1",
        run_id="r1",
        original_prompt=prompt,
        task_contract=TaskContract(
            task_id="task-1",
            original_prompt=prompt,
            turn_type="new_task",
            acceptance=verifier.build_acceptance(prompt),
        ),
    )


def _optional_str(value: object) -> "str | None":
    """Return ``str(value)`` for truthy values and ``None`` otherwise."""
    return str(value) if value else None


def _build_receipts(
    *,
    store: LocalHarnessStore,
    case: dict[str, object],
) -> list[CapabilityReceipt]:
    """Turn the ``tool_outputs`` entries of a golden case into receipts.

    Entries that are not dicts are skipped, and a ``tool_outputs`` value that
    is not a list yields no receipts. Only outputs with a status in
    ``SUCCESS_STATUSES`` and a non-empty result get an evidence reference and
    URL sources; every other output produces a receipt without evidence.
    """
    receipts: list[CapabilityReceipt] = []
    tool_outputs = case.get("tool_outputs", [])
    if not isinstance(tool_outputs, list):
        return receipts

    # idx counts every entry, so skipped entries leave gaps in the receipt ids.
    for idx, raw_output in enumerate(tool_outputs, start=1):
        if not isinstance(raw_output, dict):
            continue
        output = {str(key): value for key, value in raw_output.items()}
        result = str(output.get("result", ""))
        status = str(output.get("status", "success"))
        evidence_refs = []
        sources = []
        if status in SUCCESS_STATUSES and result:
            evidence_refs.append(store.put_evidence(kind="tool-result", text=result))
            sources = [{"url": _clean_url(url)} for url in URL_RE.findall(result)]

        receipts.append(
            CapabilityReceipt(
                id=f"receipt-{idx}",
                run_id="r1",
                session_id="s1",
                tool_name=str(output.get("tool", "tool")),
                input_summary=str(case["prompt"]),
                result_summary=result,
                status=status,
                duration_ms=1.0,
                evidence_refs=evidence_refs,
                sources=sources,
                error_type=_optional_str(output.get("error_type")),
                error_message=_optional_str(output.get("error_message")),
            )
        )
    return receipts


def _load_history(store: LocalHarnessStore, case: dict[str, object]) -> None:
    """Append the ``history`` entries of a golden case to ``store``.

    Messages are stored under a session id equal to the case id. Entries that
    are not dicts are skipped, and metadata that is not a dict is replaced by
    an empty dict.
    """
    history = case.get("history", [])
    if not isinstance(history, list):
        return

    for idx, raw_item in enumerate(history):
        if not isinstance(raw_item, dict):
            continue
        item = {str(key): value for key, value in raw_item.items()}
        raw_metadata = item.get("metadata", {})
        metadata = raw_metadata if isinstance(raw_metadata, dict) else {}
        store.append_message(
            session_id=str(case["id"]),
            role=str(item.get("role", "")),
            content=str(item.get("content", "")),
            run_id=f"history-{idx}",
            metadata={str(key): value for key, value in metadata.items()},
        )


def _load_context_evidence(store: LocalHarnessStore, case: dict[str, object]) -> None:
    """Store the ``evidence`` text of a golden case as one successful receipt.

    Does nothing when the case has no evidence text. The receipt is filed
    under a session id equal to the case id and lists every URL found in the
    evidence text as a source.
    """
    evidence_text = case.get("evidence")
    if not evidence_text:
        return

    evidence = store.put_evidence(kind="tool-result", text=str(evidence_text))
    store.append_receipt(
        CapabilityReceipt(
            id=f"receipt-{case['id']}",
            run_id=f"run-{case['id']}",
            session_id=str(case["id"]),
            tool_name="knowledge_lookup",
            input_summary=str(case["prompt"]),
            result_summary=str(evidence_text),
            status="success",
            duration_ms=1.0,
            evidence_refs=[evidence],
            sources=[
                {"url": _clean_url(url)} for url in URL_RE.findall(str(evidence_text))
            ],
        )
    )


def test_production_golden_collection_has_generic_scenarios():
    """The golden file covers both modules, the required scenarios, and no legacy marker."""
    cases = _load_cases()
    raw_text = _cases_path().read_text(encoding="utf-8").lower()
    # NOTE(review): the marker is assembled from two pieces, presumably so this
    # test file does not contain the marker itself - confirm.
    legacy_marker = "on" + "call"

    assert cases
    assert {case["kind"] for case in cases} == {"result_verifier", "context_engine"}
    assert f"task_{legacy_marker}" not in raw_text
    assert legacy_marker not in raw_text
    assert {case["scenario"] for case in cases} >= {
        "RAG memory freshness",
        "Tool failure claimed as success",
        "Runtime parameter drift",
        "Multi-turn context anchoring",
        "Current evidence beats stale memory",
    }


def test_production_golden_result_verifier_cases(tmp_path):
    """Every ``result_verifier`` golden case yields the expected verdict."""
    for case in _load_cases():
        if case["kind"] != "result_verifier":
            continue

        # Each case gets its own store directory so evidence cannot leak across cases.
        store = LocalHarnessStore(tmp_path / str(case["id"]))
        verifier = ResultVerifier(store=store)
        report = verifier.verify(
            final_text=str(case["final_answer"]),
            context=_context(str(case["prompt"]), verifier),
            receipts=_build_receipts(store=store, case=case),
        )

        assert report.done is case["expected_done"], case["id"]
        expected_missing = case.get("expected_missing_contains")
        if expected_missing:
            # The case id in the message identifies the failing golden line.
            assert any(
                str(expected_missing) in item for item in report.missing_requirements
            ), case["id"]


def test_production_golden_context_engine_cases(tmp_path):
    """Every ``context_engine`` golden case yields the expected anchor and header."""
    for case in _load_cases():
        if case["kind"] != "context_engine":
            continue

        case_id = case["id"]
        store = LocalHarnessStore(tmp_path / str(case_id))
        _load_history(store, case)
        _load_context_evidence(store, case)

        engine = ContextEngine(
            store=store,
            max_context_chars=int(case.get("max_context_chars", 1600)),
        )
        context = engine.prepare_context(
            HarnessContext(
                user_id="u1",
                session_id=str(case_id),
                run_id=f"run-{case_id}",
                original_prompt=str(case["prompt"]),
            )
        )
        header = engine.build_context_header(context=context)

        assert context.turn_type == case["expected_turn_type"], case_id
        assert context.task_contract is not None, case_id
        assert context.task_contract.original_prompt == case["expected_anchor"], case_id
        if case.get("expected_excluded"):
            assert str(case["expected_excluded"]) not in header, case_id
        if case.get("expected_included"):
            assert str(case["expected_included"]) in header, case_id
        if case.get("expected_evidence_url"):
            assert str(case["expected_evidence_url"]) in header, case_id
            # Evidence has to be placed ahead of the session history.
            assert header.index("Evidence preview:") < header.index(
                "Recent session history:"
            ), case_id
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import re
from pathlib import Path

import pytest

from harness_modules import (
    CapabilityReceipt,
    HarnessContext,
    LocalHarnessStore,
    ResultVerifier,
    TaskContract,
    wrap_tool,
)


# URL matcher for golden tool output, compiled once instead of per tool output.
# The full-width comma (U+FF0C) and semicolon (U+FF1B) are written as escapes
# so they stay distinct from their ASCII look-alikes.
_URL_RE = re.compile(r"https?://[^\s)\],\uff0c。;\uff1b]+")

# Sentence punctuation trimmed from the end of a matched URL; the full-width
# comma, semicolon, colon and right parenthesis are escapes for the same reason.
_URL_TRAILING_CHARS = ".,;:\uff0c。\uff1b\uff1a)\uff09"


def _context(prompt: str, verifier: ResultVerifier) -> HarnessContext:
    """Build a fixed-id ``new_task`` context whose acceptance comes from ``verifier``."""
    return HarnessContext(
        user_id="u1",
        session_id="s1",
        run_id="r1",
        original_prompt=prompt,
        task_contract=TaskContract(
            task_id="task-1",
            original_prompt=prompt,
            turn_type="new_task",
            acceptance=verifier.build_acceptance(prompt),
        ),
    )


def test_passes_when_fact_is_in_evidence(tmp_path):
    """An answer repeating the evidence text and URL verifies with every check passing."""
    store = LocalHarnessStore(tmp_path)
    verifier = ResultVerifier(store=store)
    evidence = store.put_evidence(
        kind="tool-result",
        text=(
            "AI Usage Policy v2026-06 requires source-backed answers. "
            "Source URL: https://example.com/policies/ai-usage-2026-06"
        ),
    )
    receipt = CapabilityReceipt(
        id="receipt-1",
        run_id="r1",
        session_id="s1",
        tool_name="policy_lookup",
        input_summary="policy",
        result_summary=evidence.preview,
        status="success",
        duration_ms=1.0,
        evidence_refs=[evidence],
        sources=[{"url": "https://example.com/policies/ai-usage-2026-06"}],
    )
    context = _context("请给出最新 AI 使用政策并附来源。", verifier)

    report = verifier.verify(
        final_text=(
            "AI Usage Policy v2026-06 requires source-backed answers. "
            "来源:https://example.com/policies/ai-usage-2026-06"
        ),
        context=context,
        receipts=[receipt],
    )

    assert report.done is True
    assert all(check.passed for check in report.checks)


def test_fails_for_fabricated_url(tmp_path):
    """A URL in the answer that no evidence contains makes verification fail."""
    store = LocalHarnessStore(tmp_path)
    verifier = ResultVerifier(store=store)
    evidence = store.put_evidence(
        kind="tool-result",
        text="Policy source: https://example.com/policies/ai-usage-2026-06",
    )
    receipt = CapabilityReceipt(
        id="receipt-1",
        run_id="r1",
        session_id="s1",
        tool_name="policy_lookup",
        input_summary="policy",
        result_summary=evidence.preview,
        status="success",
        duration_ms=1.0,
        evidence_refs=[evidence],
        sources=[{"url": "https://example.com/policies/ai-usage-2026-06"}],
    )
    context = _context("请给出最新政策并附来源。", verifier)

    report = verifier.verify(
        final_text="政策已经发布,参考 https://fake.example/policy。",
        context=context,
        receipts=[receipt],
    )

    assert report.done is False
    assert any(
        "not present in evidence" in item for item in report.missing_requirements
    )


def test_passes_when_url_is_only_in_source_receipt(tmp_path):
    """A URL listed only in the receipt's ``sources`` is accepted as supported."""
    store = LocalHarnessStore(tmp_path)
    verifier = ResultVerifier(store=store)
    receipt = CapabilityReceipt(
        id="receipt-1",
        run_id="r1",
        session_id="s1",
        tool_name="policy_lookup",
        input_summary="policy",
        result_summary="AI Usage Policy v2026-06 requires source-backed answers.",
        status="success",
        duration_ms=1.0,
        sources=[
            {"title": "policy", "url": "https://example.com/policies/ai-usage-2026-06"}
        ],
    )
    context = _context("请给出最新 AI 使用政策并附来源。", verifier)

    report = verifier.verify(
        final_text=(
            "AI Usage Policy v2026-06 requires source-backed answers. "
            "来源:https://example.com/policies/ai-usage-2026-06"
        ),
        context=context,
        receipts=[receipt],
    )

    assert report.done is True


def test_accepts_markdown_bold_numbered_list_format(tmp_path):
    """A two-item answer written as bold numbered Markdown lines verifies."""
    verifier = ResultVerifier(store=LocalHarnessStore(tmp_path))
    receipt = CapabilityReceipt(
        id="receipt-1",
        run_id="r1",
        session_id="s1",
        tool_name="public_web_lookup",
        input_summary="harness capabilities",
        result_summary=(
            "ContextEngine anchors tasks. ResultVerifier checks evidence. "
            "Source: https://example.com/veadk/harness-demo"
        ),
        status="success",
        duration_ms=1.0,
        sources=[{"url": "https://example.com/veadk/harness-demo"}],
    )
    context = _context("请用 2 条要点回答,并给出来源。", verifier)

    report = verifier.verify(
        final_text=(
            "**1. ContextEngine** anchors tasks.\n"
            "**2. ResultVerifier** checks evidence.\n"
            "来源:https://example.com/veadk/harness-demo"
        ),
        context=context,
        receipts=[receipt],
    )

    assert report.done is True


def test_fails_when_external_fact_has_no_evidence(tmp_path):
    """An answer given with no receipts at all fails for lack of tool evidence."""
    verifier = ResultVerifier(store=LocalHarnessStore(tmp_path))
    context = _context("请总结最新政策并给出来源。", verifier)

    report = verifier.verify(
        final_text="最新政策已经发布。",
        context=context,
        receipts=[],
    )

    assert report.done is False
    assert any("no tool evidence" in item for item in report.missing_requirements)


def test_tool_wrapper_records_failed_receipt(tmp_path):
    """A wrapped tool that raises still re-raises and leaves one ``failed`` receipt."""
    store = LocalHarnessStore(tmp_path)

    def broken_lookup(topic: str) -> dict[str, str]:
        """Broken lookup used by the test."""

        raise RuntimeError(f"missing index for {topic}")

    wrapped = wrap_tool(broken_lookup, store=store)

    with pytest.raises(RuntimeError):
        wrapped("policy")

    receipts = store.load_receipts()
    assert len(receipts) == 1
    assert receipts[0].status == "failed"
    assert receipts[0].tool_name == "broken_lookup"
    assert receipts[0].error_type == "RuntimeError"


def test_fails_when_failed_receipt_is_claimed_successful(tmp_path):
    """An answer claiming success while the only receipt failed is rejected."""
    verifier = ResultVerifier(store=LocalHarnessStore(tmp_path))
    receipt = CapabilityReceipt(
        id="receipt-1",
        run_id="r1",
        session_id="s1",
        tool_name="mock_skill_runner",
        input_summary="append text",
        result_summary="参数格式校验失败,文档追加未执行。",
        status="failed",
        duration_ms=1.0,
        error_type="ValidationError",
        error_message="schema rejected append_text payload",
    )
    context = _context("请使用 mock Skill 模拟一次文档更新工具调用。", verifier)

    report = verifier.verify(
        final_text=(
            '{"category":"tool_skill","operation_completed":true,'
            '"status":"passed","problem_observed":false}'
        ),
        context=context,
        receipts=[receipt],
    )

    assert report.done is False
    assert any(
        "despite failed tool receipt" in item for item in report.missing_requirements
    )


def test_allows_failed_receipt_when_answer_reports_failure(tmp_path):
    """An answer that itself reports the tool failure verifies despite the failed receipt."""
    verifier = ResultVerifier(store=LocalHarnessStore(tmp_path))
    receipt = CapabilityReceipt(
        id="receipt-1",
        run_id="r1",
        session_id="s1",
        tool_name="mock_tool_call",
        input_summary="authorized call",
        result_summary="安全策略误拦截了授权调用。",
        status="failed",
        duration_ms=1.0,
        error_type="SecurityPolicyBlocked",
    )
    context = _context("请完成授权用户的 mock_tool_call。", verifier)

    report = verifier.verify(
        final_text="mock_tool_call 未完成:授权调用被安全策略拦截,需要修复策略后重试。",
        context=context,
        receipts=[receipt],
    )

    assert report.done is True


def test_large_tool_result_externalized(tmp_path):
    """A result above ``externalize_threshold`` is readable back through its evidence ref."""
    store = LocalHarnessStore(tmp_path)

    def large_lookup() -> dict[str, str]:
        """Return a large result."""

        return {"result": "large evidence " * 100}

    wrapped = wrap_tool(large_lookup, store=store, externalize_threshold=100)
    wrapped()

    receipts = store.load_receipts()
    assert len(receipts) == 1
    assert receipts[0].status == "success"
    assert receipts[0].evidence_refs
    assert "large evidence" in store.read_evidence(receipts[0].evidence_refs[0].ref_id)


def test_golden_verifier_cases(tmp_path):
    """Every case in ``golden/verifier_cases.jsonl`` yields the expected verdict.

    Blank lines in the JSONL file are ignored. Only tool outputs whose status
    is exactly ``"success"`` contribute an evidence reference and URL sources.
    """
    cases_path = Path(__file__).resolve().parents[1] / "golden" / "verifier_cases.jsonl"
    for line in cases_path.read_text(encoding="utf-8").splitlines():
        # json.loads raises on an empty string, so a stray blank line would
        # otherwise abort the whole test before any case is checked.
        if not line.strip():
            continue
        case = json.loads(line)

        # Each case gets its own store directory so evidence cannot leak across cases.
        store = LocalHarnessStore(tmp_path / case["id"])
        verifier = ResultVerifier(store=store)
        receipts = []
        for idx, output in enumerate(case.get("tool_outputs", []), start=1):
            result = output["result"]
            status = output.get("status", "success")
            evidence_refs = []
            sources = []
            if status == "success":
                evidence_refs.append(
                    store.put_evidence(kind="tool-result", text=result)
                )
                sources = [
                    {"url": url.rstrip(_URL_TRAILING_CHARS)}
                    for url in _URL_RE.findall(result)
                ]
            receipts.append(
                CapabilityReceipt(
                    id=f"receipt-{idx}",
                    run_id="r1",
                    session_id="s1",
                    tool_name=output["tool"],
                    input_summary="golden",
                    result_summary=result,
                    status=status,
                    duration_ms=1.0,
                    evidence_refs=evidence_refs,
                    sources=sources,
                    error_type=output.get("error_type"),
                    error_message=output.get("error_message"),
                )
            )

        report = verifier.verify(
            final_text=case["final_answer"],
            context=_context(case["prompt"], verifier),
            receipts=receipts,
        )

        assert report.done is case["expected_done"], case["id"]
        expected_missing = case.get("expected_missing_contains")
        if expected_missing:
            # The case id in the message identifies the failing golden line.
            assert any(
                expected_missing in item for item in report.missing_requirements
            ), case["id"]