diff --git a/Cargo.lock b/Cargo.lock index b0f5d4e..f956303 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -360,7 +360,7 @@ dependencies = [ [[package]] name = "fastaguard" -version = "0.2.0" +version = "0.3.0" dependencies = [ "anyhow", "assert_cmd", diff --git a/Cargo.toml b/Cargo.toml index 1e5c05b..b1a11a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fastaguard" -version = "0.2.0" +version = "0.3.0" edition = "2021" license = "MIT" description = "FASTA preflight QC for assembly pipelines" diff --git a/README.md b/README.md index 160ae19..dac5db0 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ FastaGuard is a fast, explainable FASTA QC tool for validating assembly FASTA files before expensive downstream analysis. +The assembly FASTA gate before expensive QC. + It is not intended to compete with QUAST, BUSCO, BlobToolKit, FastQC, or MultiQC. FastaGuard is the earlier preflight and triage layer: the first command that answers whether a FASTA file is valid, sane, interpretable, and ready for downstream tools. ```text @@ -57,9 +59,15 @@ fastaguard sample.fa \ Pipeline gate example: ```bash -fastaguard sample.fa --fail-on duplicate_ids,invalid_chars,high_n_rate +fastaguard sample.fa --profile assembly --gate pipeline ``` +The `pipeline` gate is the v0.3 assembly preset for workflow stop/go decisions. +It fails on duplicate IDs, invalid characters, invalid FASTA structure, and +high-N content. GC and length outliers remain advisory by default because they +are routing signals, not proof of contamination or misassembly. To make an +advisory finding block a pipeline, add it explicitly with `--fail-on`. 
+ Inspect the machine-readable contract: ```bash @@ -80,7 +88,8 @@ docker run --rm -v "$PWD:/data" fastaguard:local /data/sample.fa \ --multiqc /data/fastaguard_mqc.json ``` -Use the generated BioContainers image in workflow engines: +Published BioContainers currently provides the v0.2 image, which does not +include v0.3 gate behavior yet: ```bash docker pull quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 @@ -116,6 +125,7 @@ FastaGuard is assembly-first. ```bash fastaguard sample.fa \ --profile assembly \ + --gate pipeline \ --out fastaguard_report.html \ --json fastaguard.json \ --tsv fastaguard.tsv \ @@ -147,6 +157,14 @@ v0.2 expands the assembly preflight layer with: - richer provenance, taxonomy context, and routing hints - hardened MultiQC and pipeline adoption material +v0.3 adds the assembly gate contract: + +- `--gate pipeline` for default workflow blocking behavior +- `gate.blocking_findings` for machine stop/go decisions +- checksum provenance with `provenance.input_sha256` +- explicit advisory findings for evidence that should route follow-up QC rather + than stop a pipeline by default + ## Positioning FastaGuard should recommend deeper tools when they are appropriate: @@ -189,7 +207,6 @@ serves v0.2.0 for `linux-64`, `linux-aarch64`, `osx-64`, and `osx-arm64`. BioContainers also publishes the pinned workflow image `quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0`. -The next internal milestone is the -[v0.2 evidence pack](docs/evidence/fastaguard-v0.2-evidence.md): reproducible -local and public FASTA runs that document runtime, verdicts, and top findings -before new biological profiles are added. +The current development milestone is v0.3: evidence, checksum provenance, and +the assembly gate contract. Published Bioconda and BioContainers packages remain +v0.2.0 until a v0.3 release is cut. 
diff --git a/docs/benchmarking.md b/docs/benchmarking.md index d422f22..513c865 100644 --- a/docs/benchmarking.md +++ b/docs/benchmarking.md @@ -19,6 +19,12 @@ python3 scripts/benchmark_large_fasta.py \ This should finish quickly and produce `fastaguard.json`, `fastaguard.tsv`, `fastaguard_report.html`, and `fastaguard_mqc.json` in `target/bench-smoke/`. +For the v0.3 assembly gate contract, add the pipeline gate preset: + +```bash +fastaguard sample.fa --profile assembly --gate pipeline +``` + ## Larger Local Benchmark Build an optimized binary: @@ -84,17 +90,21 @@ Do not use it to claim performance on contaminated assemblies, highly ambiguous ## v0.2 Evidence Targets FastaGuard should prove four preflight categories with small reproducible -fixtures: +fixtures. For v0.3, the same evidence should also show whether each category +blocks the pipeline gate: -| Evidence case | What FastaGuard catches | Why it should run before heavier tools | -| --- | --- | --- | -| duplicate IDs | repeated FASTA identifiers | helps prevent workflow joins, indexes, and annotations from becoming ambiguous | -| invalid characters | non-IUPAC sequence symbols | flags inputs that may trigger downstream parser and aligner failures | -| high-N | ambiguous scaffolds and gap-heavy records | flags low-confidence mapping and annotation inputs before they are treated as clean | -| GC outliers | composition-anomalous records | supports routing suspicious records to BlobToolKit, sourmash, Kraken, or other follow-up tools | +| Evidence case | Gate behavior | What FastaGuard catches | Why it should run before heavier tools | +| --- | --- | --- | --- | +| duplicate IDs | blocking | repeated FASTA identifiers | helps prevent workflow joins, indexes, and annotations from becoming ambiguous | +| invalid characters | blocking | non-IUPAC sequence symbols | flags inputs that may trigger downstream parser and aligner failures | +| high-N | blocking | ambiguous scaffolds and gap-heavy records | flags 
low-confidence mapping and annotation inputs before they are treated as clean | +| GC outliers | advisory by default | composition-anomalous records | supports routing suspicious records to BlobToolKit, sourmash, Kraken, or other follow-up tools | FastaGuard should not replace QUAST, BUSCO, or BlobToolKit. It should make their inputs safer and make obvious FASTA-level problems visible before those tools run. +For automated workflows, record `gate.blocking_findings` and +`provenance.input_sha256` alongside runtime and verdict so the gate decision can +be audited against exact input bytes. ## Evidence To Collect Next @@ -115,6 +125,8 @@ For each run, record: - peak memory if measured externally - verdict and top findings - whether downstream tools would have been blocked or recommended +- gate status and `gate.blocking_findings` when run with `--gate pipeline` +- `provenance.input_sha256` This evidence matters more than synthetic speed alone because it shows the wedge: cheap FASTA preflight before expensive downstream QC. @@ -122,6 +134,8 @@ This evidence matters more than synthetic speed alone because it shows the wedge The v0.2 evidence workflow is documented in `docs/evidence/fastaguard-v0.2-evidence.md`. +The published evidence document remains v0.2-focused; v0.3 gate evidence should +extend that workflow after the gate contract is released. CI-safe local run: diff --git a/docs/evidence/fastaguard-v0.3-evidence.md b/docs/evidence/fastaguard-v0.3-evidence.md new file mode 100644 index 0000000..d48356b --- /dev/null +++ b/docs/evidence/fastaguard-v0.3-evidence.md @@ -0,0 +1,105 @@ +# FastaGuard v0.3 Evidence Pack + +This page records the evidence workflow for FastaGuard v0.3. The purpose is to +make the assembly gate inspectable before expanding into broader biological +profiles. + +FastaGuard is FASTA preflight QC. It is not biological completeness analysis, +not assembly correctness analysis, and not contamination confirmation. 
Passing +the v0.3 gate means the FASTA-level contract is sane enough to continue into +downstream tools such as QUAST, BUSCO, BlobToolKit, CheckM, seqkit, or +annotation. + +## Local Evidence Run + +Build the release binary: + +```bash +cargo build --release --locked +``` + +Run the CI-safe local evidence path: + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3-local \ + --local-only +``` + +Local-only mode does not require network access or the NCBI Datasets CLI. It +runs: + +- a deterministic synthetic FASTA +- `testdata/problem_assembly.fa` +- a gzipped copy of `testdata/valid_assembly.fa` + +The evidence command runs FastaGuard with `--profile assembly --gate pipeline` +and keeps `--min-contig-length 1` so tiny local fixtures remain useful for +contract testing. + +## Public NCBI Evidence Run + +Install the NCBI Datasets CLI, then run: + +```bash +cargo build --release --locked +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3 +``` + +The public workflow downloads genomic FASTA packages with commands shaped like: + +```bash +datasets download genome accession GCF_000005845.2 --include genome --filename target/evidence/v0.3/ecoli_k12_mg1655/ncbi_dataset.zip +``` + +If `datasets` is not installed, use `--local-only` for offline smoke tests. 
The +default public manifest is: + +```text +docs/evidence/public_assemblies.json +``` + +It currently includes: + +- `GCF_000005845.2`: Escherichia coli K-12 MG1655 +- `GCF_000182925.2`: Neurospora crassa OR74A + +## Outputs + +Each case writes FastaGuard artifacts under the selected output directory, for +example `target/evidence/v0.3-local/<case>/` or `target/evidence/v0.3/<case>/`: + +- `fastaguard.json` +- `fastaguard.tsv` +- `fastaguard_report.html` +- `fastaguard_mqc.json` + +The workflow also writes compact summaries: + +- `evidence_summary.json` +- `evidence_summary.tsv` + +The summaries include verdict, gate status, blocking findings, top findings, +runtime, input size, sequence counts, and `input_sha256`. Commit compact +summaries when useful. Do not commit downloaded FASTA files, NCBI zip archives, +or full generated per-case reports. + +## Interpretation + +The v0.3 gate means the FASTA-level contract is sane enough to continue through +the pipeline. It checks validity, duplicate identifiers, invalid characters, +composition red flags, gap signals, and related FASTA-level evidence. + +The gate does not prove biological completeness, does not prove assembly +correctness, and does not rule out contamination. Use QUAST, BUSCO, +BlobToolKit, CheckM, sourmash, Kraken, or other tools for deeper biological +interpretation after FastaGuard has checked the FASTA-level contract. 
+ +## References + +- [NCBI Datasets genome download reference](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/download/genome/) +- [NCBI Datasets genome download guide](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/how-tos/genomes/download-genome/) +- [Neurospora crassa OR74A BioProject](https://www.ncbi.nlm.nih.gov/bioproject/132) diff --git a/docs/output-contract.md b/docs/output-contract.md index 92910c7..dde1a71 100644 --- a/docs/output-contract.md +++ b/docs/output-contract.md @@ -19,14 +19,14 @@ schema/finding-catalog.json ## JSON Contract -Example v0.2 shape: +Example v0.3 shape: ```json { - "schema_version": "0.2.0", + "schema_version": "0.3.0", "tool": { "name": "FastaGuard", - "version": "0.2.0" + "version": "0.3.0" }, "input": { "path": "sample.fa", @@ -34,11 +34,18 @@ Example v0.2 shape: "compressed": false }, "verdict": { - "status": "WARN", - "reasons": ["high_n_rate", "tiny_contigs"] + "status": "FAIL", + "reasons": ["duplicate_ids", "invalid_chars", "high_n_rate"] + }, + "gate": { + "mode": "pipeline", + "status": "FAIL", + "blocking_findings": ["duplicate_ids", "invalid_chars", "high_n_rate"], + "advisory_findings": ["tiny_contigs", "gc_outliers"], + "fail_on": ["duplicate_ids", "high_n_rate", "invalid_chars", "invalid_fasta_structure"] }, "machine_summary": { - "verdict": "WARN", + "verdict": "FAIL", "safe_for_downstream": false, "top_findings": ["high_n_rate", "tiny_contigs"], "recommended_next_tools": [ @@ -72,12 +79,13 @@ Example v0.2 shape: "provenance": { "profile": "assembly", "threads": 1, - "fail_on": [], - "command": "fastaguard sample.fa --profile assembly", + "fail_on": ["duplicate_ids", "high_n_rate", "invalid_chars", "invalid_fasta_structure"], + "command": "fastaguard sample.fa --profile assembly --gate pipeline", "started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:01Z", "duration_ms": 842, "input_size_bytes": 5120340, + "input_sha256": 
"3f786850e387550fdab836ed7e6dc881de23001b3a28c9f1f4b2d0a4c6e7f8aa", "thresholds": { "high_n_sequence_fraction": 0.2, "high_global_n_fraction": 0.05, @@ -101,9 +109,9 @@ Example v0.2 shape: "at_percent": 44.8, "n_percent": 3.4, "ambiguity_percent": 3.7, - "duplicate_id_count": 0, + "duplicate_id_count": 1, "duplicate_sequence_count": 0, - "invalid_sequence_count": 0, + "invalid_sequence_count": 1, "high_n_sequence_count": 62, "tiny_contig_count": 4, "max_gap_run": 25 @@ -209,7 +217,7 @@ Example v0.2 shape: ## Stability Rules -Stable in the v0.2 contract: +Stable in the v0.3 contract: - `schema_version` - `tool.name` @@ -217,6 +225,11 @@ Stable in the v0.2 contract: - `input.profile` - `verdict.status` - `verdict.reasons` +- `gate.mode` +- `gate.status` +- `gate.blocking_findings` +- `gate.advisory_findings` +- `gate.fail_on` - `machine_summary.verdict` - `machine_summary.safe_for_downstream` - `machine_summary.top_findings` @@ -231,6 +244,7 @@ Stable in the v0.2 contract: - `provenance.completed_at` - `provenance.duration_ms` - `provenance.input_size_bytes` +- `provenance.input_sha256` - `provenance.thresholds` - `summary.sequence_count` - `summary.total_length` @@ -255,6 +269,32 @@ Stable in the v0.2 contract: Fields can be added in later schema versions, but existing meanings should not drift casually. +## Gate Contract + +The v0.3 assembly gate makes workflow stop/go behavior explicit in JSON: + +```json +"gate": { + "mode": "pipeline", + "status": "FAIL", + "blocking_findings": ["duplicate_ids", "invalid_chars", "high_n_rate"], + "advisory_findings": ["tiny_contigs", "gc_outliers"], + "fail_on": ["duplicate_ids", "high_n_rate", "invalid_chars", "invalid_fasta_structure"] +} +``` + +Machines should use `gate.blocking_findings` for workflow stop/go decisions. +This list is the stable explanation of why a gated run blocked downstream work. + +Humans should inspect the HTML evidence before deciding how to repair or route +the assembly. 
Advisory findings such as GC or length outliers can indicate +records worth reviewing, but they are not blocking unless the user explicitly +adds them with `--fail-on`. + +`provenance.input_sha256` identifies the exact input bytes used for the report. +That checksum lets workflow engines, reviewers, and future audit tools connect a +gate decision to one immutable FASTA input. + ## Machine-Actionable Contract The JSON output should become the source of truth for humans, workflow engines, dashboards, and future tool-using LLM agents. @@ -300,9 +340,14 @@ Recommended first rows: ```text metric value -schema_version 0.2.0 +schema_version 0.3.0 profile assembly -verdict WARN +verdict FAIL +gate_mode pipeline +gate_status FAIL +gate_blocking_findings duplicate_ids,invalid_chars,high_n_rate +gate_advisory_findings tiny_contigs,gc_outliers +input_sha256 3f786850e387550fdab836ed7e6dc881de23001b3a28c9f1f4b2d0a4c6e7f8aa sequence_count 481 total_length 5042301 n50 128003 @@ -360,6 +405,6 @@ Report layers: suggested next tools and remediation steps ``` -Outlier findings are part of the v0.2 report contract. They are preflight +Outlier findings are part of the v0.3 report contract. They are preflight triage signals only; GC and composite anomalies do not by themselves prove contamination, cobionts, plasmids, or misassembly. diff --git a/docs/releases/v0.3.0.md b/docs/releases/v0.3.0.md new file mode 100644 index 0000000..7f9530c --- /dev/null +++ b/docs/releases/v0.3.0.md @@ -0,0 +1,81 @@ +# FastaGuard v0.3.0 + +FastaGuard v0.3.0 is the Evidence And Assembly Gate release. + +This release makes FastaGuard more useful as an early, machine-readable +assembly preflight gate before QUAST, BUSCO, BlobToolKit, CheckM, annotation, +or other expensive downstream QC. + +## Highlights + +- Adds `--gate pipeline`, a conservative assembly gate preset for pipeline + automation. 
+- Adds a machine-readable `gate` report object with mode, status, blocking + findings, advisory findings, and the active failure policy. +- Adds `provenance.input_sha256` so workflow engines and agents can tie a + report back to the exact FASTA bytes that were checked. +- Updates JSON, TSV, HTML, and MultiQC-compatible outputs with the v0.3 report + contract. +- Keeps stable finding IDs and structured actions as the source of truth for + automation. +- Supports the v0.3 evidence workflow without requiring databases or external + services for normal runs. + +## Install + +Until a public v0.3.0 source archive is published, the Bioconda recipe remains +on the published v0.2.0 package. Until v0.3.0 binaries are published, use a +local build from this release branch when validating the new gate contract: + +```bash +cargo install --path . +fastaguard --version +``` + +## Pipeline Gate + +Use the assembly gate when FastaGuard should make a pipeline decision: + +```bash +fastaguard sample.fa --gate pipeline \ + --out fastaguard_report.html \ + --json fastaguard.json \ + --tsv fastaguard.tsv \ + --multiqc fastaguard_mqc.json +``` + +`--gate pipeline` fails on conservative FASTA-level blockers such as invalid +FASTA structure, invalid characters, duplicate identifiers, and high global N +rate. Other findings can still be emitted as advisory evidence, so downstream +workflow logic can distinguish "stop and fix the FASTA" from "continue, but +route to deeper assembly QC." + +## Machine-Readable Outputs + +The JSON report remains the source of truth. The v0.3.0 contract adds: + +- `gate.mode` and `gate.status` for direct pipeline decisions. +- `gate.blocking_findings` and `gate.advisory_findings` for routing. +- `gate.fail_on` for the active failure policy after preset expansion. +- `provenance.input_sha256` for input identity checks. +- Pipeline-friendly TSV fields that surface gate status and input checksum + alongside the usual summary metrics. 
+- MultiQC-compatible fields that surface gate mode, gate status, and blocking + finding IDs alongside the usual summary metrics. + +## Evidence Workflow + +The evidence workflow for v0.3.0 is intended to show that the assembly gate, +report schema, provenance checksum, and committed examples are reproducible. +The workflow should use local FASTA metrics only; FastaGuard does not fetch +taxonomy databases or make biological completeness claims. + +## Known Limits + +- FastaGuard remains a FASTA preflight tool, not a replacement for QUAST, + BUSCO, BlobToolKit, CheckM, seqkit, MultiQC, or annotation workflows. +- `--gate pipeline` is intentionally conservative and database-free. +- Composition and outlier findings are triage signals, not contamination or + taxonomic assignments. +- `input_sha256` identifies the checked input bytes; it does not by itself + prove sample identity, biological quality, or reference correctness. diff --git a/docs/roadmap.md b/docs/roadmap.md index f4e1aff..2c7181f 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -57,14 +57,18 @@ Goal: Make FastaGuard credible enough for pipeline authors to add as a default assembly gate. 
``` -Potential additions: +Development scope: - public evidence pack from local fixtures and public assemblies -- Bioconda and BioContainers v0.2 availability documented -- input checksum provenance +- Bioconda and BioContainers v0.2 availability documented without implying a + v0.3 package exists yet +- input checksum provenance with `provenance.input_sha256` - clearer machine-readable threshold metadata -- assembly gate preset for common pipeline behavior -- clearer blocking vs follow-up recommendations +- assembly gate preset with `--gate pipeline` +- explicit `gate.blocking_findings` and `gate.advisory_findings` for workflow + engines and humans +- clearer blocking vs follow-up recommendations; GC and length outliers remain + advisory unless added with `--fail-on` ## v0.4: Compare Mode @@ -116,12 +120,12 @@ Potential additions: Potential additions: -- merge and verify the v0.2 Bioconda update +- publish and verify a v0.3 Bioconda update after the gate contract is released - upstream nf-core module submission - official Snakemake wrapper submission - Galaxy wrapper - upstream MultiQC distribution path -- BioContainers verification for the v0.2 package +- BioContainers verification for each published package - Homebrew formula ## Later Innovation @@ -146,6 +150,7 @@ Completed foundation: - add structured `actions[]` records to findings - add bounded per-record evidence to findings - add provenance for profile, thresholds, fail rules, thread count, command, timestamps, duration, and input size +- add input checksum provenance with `provenance.input_sha256` - add explicit scope fields for what FastaGuard can and cannot conclude - add structured routing hints for workflow engines and tool agents - add `--schema`, `--finding-catalog`, and `--explain-finding ` commands @@ -154,5 +159,5 @@ Completed foundation: Recommended next sequence: - extend evidence tables across future transcriptome, protein, reference, and compare modes -- enrich provenance with input checksums 
+- keep the v0.3 gate contract stable through workflow adoption examples - explore an MCP or tool-server interface after the CLI schema is stable diff --git a/docs/superpowers/plans/2026-05-27-fastaguard-v0.3-assembly-gate.md b/docs/superpowers/plans/2026-05-27-fastaguard-v0.3-assembly-gate.md new file mode 100644 index 0000000..e32a35f --- /dev/null +++ b/docs/superpowers/plans/2026-05-27-fastaguard-v0.3-assembly-gate.md @@ -0,0 +1,1367 @@ +# FastaGuard v0.3 Assembly Gate Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build FastaGuard v0.3 as a pipeline-ready assembly gate with `--gate pipeline`, machine-readable gate decisions, input SHA256 provenance, updated report outputs, and evidence documentation. + +**Architecture:** Keep gate policy separate from finding detection. CLI parsing records a gate mode, a new gate module expands that mode into final failure rules, report assembly derives a compact `gate` object from triggered findings, and provenance streams the input file to compute SHA256 without loading the FASTA into memory. + +**Tech Stack:** Rust 2021, clap, serde, sha2, hex, JSON Schema, assert_cmd, jsonschema, Python unittest, existing FastaGuard report writers, NCBI Datasets CLI for optional public evidence. + +--- + +## File Structure + +- Create `src/gate.rs`: gate mode enum, pipeline preset failure IDs, final failure-set expansion, and gate decision derivation. +- Modify `src/cli.rs`: add `--gate`, store gate mode in `RunConfig`, and union preset rules with explicit `--fail-on`. +- Modify `src/lib.rs`: expose `gate` module and update test config fixtures. +- Modify `src/models.rs`: bump schema version, add `GateDecision`, add `gate` to reports, add `provenance.input_sha256`, and compute streaming checksum. 
+- Modify `src/report/tsv.rs`, `src/report/multiqc.rs`, and `src/report/html.rs`: surface gate and checksum fields. +- Modify `schema/fastaguard.schema.json`, `schema/finding-catalog.json`, golden JSON files, example reports, and docs to reflect schema `0.3.0`. +- Modify `tests/cli.rs`, `tests/schema_contract.rs`, report writer unit tests, and Python adoption/release tests. +- Add `docs/evidence/fastaguard-v0.3-evidence.md` and, after a successful public run, compact summaries under `docs/evidence/v0.3/`. + +## Task 1: Add Gate Policy And CLI Parsing + +**Files:** +- Create: `src/gate.rs` +- Modify: `src/cli.rs` +- Modify: `src/lib.rs` +- Test: `src/cli.rs` +- Test: `tests/cli.rs` + +- [ ] **Step 1: Write failing CLI and gate unit tests** + +Add these imports in `src/cli.rs` tests: + +```rust +use crate::gate::GateMode; +``` + +Add these tests to `src/cli.rs`: + +```rust +#[test] +fn gate_none_preserves_explicit_fail_rules() { + let cli = Cli::parse_from([ + "fastaguard", + "input.fa", + "--gate", + "none", + "--fail-on", + "gc_outliers", + ]); + let config = cli.to_run_config().unwrap(); + + assert_eq!(config.gate_mode, GateMode::None); + assert_eq!( + config.rules.fail_on, + ["gc_outliers"].into_iter().map(str::to_string).collect() + ); +} + +#[test] +fn gate_pipeline_adds_conservative_fail_rules() { + let cli = Cli::parse_from(["fastaguard", "input.fa", "--gate", "pipeline"]); + let config = cli.to_run_config().unwrap(); + + assert_eq!(config.gate_mode, GateMode::Pipeline); + assert_eq!( + config.rules.fail_on, + [ + "duplicate_ids", + "high_n_rate", + "invalid_chars", + "invalid_fasta_structure", + ] + .into_iter() + .map(str::to_string) + .collect() + ); +} + +#[test] +fn gate_pipeline_unions_explicit_fail_rules() { + let cli = Cli::parse_from([ + "fastaguard", + "input.fa", + "--gate", + "pipeline", + "--fail-on", + "gc_outliers", + ]); + let config = cli.to_run_config().unwrap(); + + assert!(config.rules.fail_on.contains("duplicate_ids")); + 
assert!(config.rules.fail_on.contains("invalid_chars")); + assert!(config.rules.fail_on.contains("invalid_fasta_structure")); + assert!(config.rules.fail_on.contains("high_n_rate")); + assert!(config.rules.fail_on.contains("gc_outliers")); +} +``` + +Add this CLI integration test to `tests/cli.rs`: + +```rust +#[test] +fn unknown_gate_value_is_cli_error() { + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args(["testdata/valid_assembly.fa", "--gate", "strict"]) + .assert() + .failure() + .stderr(predicate::str::contains("invalid value 'strict'")); +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +cargo test --locked -- gate_none_preserves_explicit_fail_rules gate_pipeline_adds_conservative_fail_rules gate_pipeline_unions_explicit_fail_rules unknown_gate_value_is_cli_error +``` + +Expected: compile failure because `crate::gate`, `GateMode`, `Cli.gate`, and `RunConfig.gate_mode` do not exist. + +- [ ] **Step 3: Add the gate module** + +Create `src/gate.rs`: + +```rust +use clap::ValueEnum; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeSet; + +pub const PIPELINE_FAIL_ON: [&str; 4] = [ + "duplicate_ids", + "high_n_rate", + "invalid_chars", + "invalid_fasta_structure", +]; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[value(rename_all = "snake_case")] +pub enum GateMode { + None, + Pipeline, +} + +impl GateMode { + pub fn as_str(self) -> &'static str { + match self { + GateMode::None => "none", + GateMode::Pipeline => "pipeline", + } + } +} + +pub fn final_fail_on(mode: GateMode, explicit_rules: &[String]) -> BTreeSet<String> { + let mut fail_on = explicit_rules + .iter() + .flat_map(|value| value.split(',')) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + .collect::<BTreeSet<_>>(); + + if mode == GateMode::Pipeline { + fail_on.extend(PIPELINE_FAIL_ON.into_iter().map(ToOwned::to_owned)); + } + + fail_on +} +``` + +- [ 
] **Step 4: Wire the gate module into lib and CLI** + +In `src/lib.rs`, add: + +```rust +pub mod gate; +``` + +In `src/cli.rs`, add: + +```rust +use crate::gate::{self, GateMode}; +``` + +Add this field to `Cli` after `profile`: + +```rust +/// Gate preset for pipeline-friendly failure behavior. +#[arg(long, value_enum, default_value_t = GateMode::None)] +pub gate: GateMode, +``` + +Add this field to `RunConfig`: + +```rust +pub gate_mode: GateMode, +``` + +In `Cli::to_run_config`, set: + +```rust +gate_mode: self.gate, +rules: RuleConfig { + fail_on: gate::final_fail_on(self.gate, &self.fail_on), +}, +``` + +Remove the old call to `normalize_rules(&self.fail_on)`. Keep `normalize_rules` only if another test or helper still uses it; otherwise delete it. + +Update `cli_with_max_n_rate` test helper: + +```rust +gate: GateMode::None, +``` + +Update `src/lib.rs` test config: + +```rust +gate_mode: crate::gate::GateMode::None, +``` + +- [ ] **Step 5: Run tests and commit** + +Run: + +```bash +cargo test --locked -- gate_none_preserves_explicit_fail_rules gate_pipeline_adds_conservative_fail_rules gate_pipeline_unions_explicit_fail_rules unknown_gate_value_is_cli_error +``` + +Expected: all four tests pass. 
+ +Commit: + +```bash +git add src/gate.rs src/cli.rs src/lib.rs tests/cli.rs +git commit -m "feat: add assembly gate preset" +``` + +## Task 2: Add JSON Gate Decision And Input SHA256 Provenance + +**Files:** +- Modify: `src/gate.rs` +- Modify: `src/models.rs` +- Modify: `schema/fastaguard.schema.json` +- Test: `tests/cli.rs` +- Test: `tests/schema_contract.rs` + +- [ ] **Step 1: Write failing report contract tests** + +Add this test to `tests/cli.rs`: + +```rust +#[test] +fn pipeline_gate_report_lists_blocking_and_advisory_findings() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "pipeline_gate"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args(["testdata/problem_assembly.fa", "--gate", "pipeline", "--out"]) + .arg(&outputs.html) + .arg("--json") + .arg(&outputs.json) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let report = read_json(&outputs.json); + assert_eq!(report["schema_version"], json!("0.3.0")); + assert_eq!(report["gate"]["mode"], json!("pipeline")); + assert_eq!(report["gate"]["status"], json!("FAIL")); + assert!(array_contains_string(&report["gate"]["blocking_findings"], "duplicate_ids")); + assert!(array_contains_string(&report["gate"]["blocking_findings"], "invalid_chars")); + assert!(array_contains_string(&report["gate"]["blocking_findings"], "high_n_rate")); + assert!(array_contains_string(&report["gate"]["advisory_findings"], "gap_runs")); + assert!(array_contains_string(&report["gate"]["fail_on"], "invalid_fasta_structure")); + assert_eq!( + report["provenance"]["input_sha256"], + json!(sha256_file(Path::new("testdata/problem_assembly.fa"))) + ); +} +``` + +Add this test to `tests/cli.rs`: + +```rust +#[test] +fn gate_none_report_preserves_warning_behavior_and_checksum() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "gate_none"); + + let mut cmd = 
Command::cargo_bin("fastaguard").unwrap(); + cmd.args(["testdata/problem_assembly.fa", "--out"]) + .arg(&outputs.html) + .arg("--json") + .arg(&outputs.json) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["mode"], json!("none")); + assert_eq!(report["gate"]["status"], json!("FAIL")); + assert!(array_contains_string(&report["gate"]["blocking_findings"], "duplicate_ids")); + assert!(array_contains_string(&report["gate"]["blocking_findings"], "invalid_chars")); + assert!(array_contains_string(&report["gate"]["advisory_findings"], "high_n_rate")); + assert_eq!( + report["provenance"]["input_sha256"], + json!(sha256_file(Path::new("testdata/problem_assembly.fa"))) + ); +} +``` + +Add this helper to `tests/cli.rs` near the other helpers: + +```rust +fn sha256_file(path: &Path) -> String { + use sha2::{Digest, Sha256}; + + let bytes = std::fs::read(path).unwrap(); + hex::encode(Sha256::digest(bytes)) +} +``` + +Add this schema test to `tests/schema_contract.rs`: + +```rust +#[test] +fn schema_requires_gate_and_input_sha256() { + let schema = read_json(Path::new("schema/fastaguard.schema.json")); + let report_required = schema["required"].as_array().unwrap(); + let provenance_required = schema["properties"]["provenance"]["required"] + .as_array() + .unwrap(); + + assert!(report_required.contains(&serde_json::json!("gate"))); + assert!(provenance_required.contains(&serde_json::json!("input_sha256"))); +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +cargo test --locked -- pipeline_gate_report_lists_blocking_and_advisory_findings gate_none_report_preserves_warning_behavior_and_checksum schema_requires_gate_and_input_sha256 +``` + +Expected: compile or assertion failures because report `gate`, schema `gate`, schema `0.3.0`, and `provenance.input_sha256` do not exist yet. 
+ +- [ ] **Step 3: Add gate decision types and derivation** + +In `src/models.rs`, change: + +```rust +pub const SCHEMA_VERSION: &str = "0.3.0"; +``` + +Add `gate` to `FastaguardReport` after `verdict`: + +```rust +pub gate: GateDecision, +``` + +Add this struct: + +```rust +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GateDecision { + pub mode: String, + pub status: VerdictStatus, + pub blocking_findings: Vec<String>, + pub advisory_findings: Vec<String>, + pub fail_on: Vec<String>, +} +``` + +In `src/gate.rs`, add: + +```rust +use crate::models::{Finding, GateDecision, VerdictStatus}; + +pub fn decision( + mode: GateMode, + status: VerdictStatus, + findings: &[Finding], + fail_on: &BTreeSet<String>, +) -> GateDecision { + let mut blocking_findings = Vec::new(); + let mut advisory_findings = Vec::new(); + + for finding in findings { + if fail_on.contains(&finding.id) || finding.severity == crate::models::Severity::Critical { + blocking_findings.push(finding.id.clone()); + } else { + advisory_findings.push(finding.id.clone()); + } + } + + GateDecision { + mode: mode.as_str().to_string(), + status, + blocking_findings, + advisory_findings, + fail_on: fail_on.iter().cloned().collect(), + } +} +``` + +In `src/models.rs`, import `crate::gate` and set `gate` in both constructors: + +```rust +gate: gate::decision( + config.gate_mode, + analysis.status, + &findings, + &config.rules.fail_on, +), +``` + +For invalid FASTA: + +```rust +gate: gate::decision( + config.gate_mode, + VerdictStatus::Fail, + &findings, + &config.rules.fail_on, +), +``` + +- [ ] **Step 4: Add streaming input SHA256** + +In `src/models.rs`, add imports: + +```rust +use anyhow::{Context, Result}; +use sha2::{Digest, Sha256}; +use std::fs::File; +use std::io::{BufReader, Read}; +``` + +Add `input_sha256` to `Provenance` after `input_size_bytes`: + +```rust +pub input_sha256: String, +``` + +Add helper: + +```rust +fn input_sha256(path: &Path) -> Result<String> { + let file = File::open(path) + .with_context(|| format!("failed 
to open {} for SHA256", path.display()))?; + let mut reader = BufReader::new(file); + let mut hasher = Sha256::new(); + let mut buffer = [0_u8; 64 * 1024]; + + loop { + let bytes_read = reader + .read(&mut buffer) + .with_context(|| format!("failed to read {} for SHA256", path.display()))?; + if bytes_read == 0 { + break; + } + hasher.update(&buffer[..bytes_read]); + } + + Ok(hex::encode(hasher.finalize())) +} +``` + +Change `build_provenance` to compute: + +```rust +let input_sha256 = input_sha256(&config.input).unwrap_or_else(|_| String::new()); +``` + +Set: + +```rust +input_sha256, +``` + +The empty-string fallback should only affect report-only test fixtures that build synthetic reports without readable input paths. CLI runs should already fail before report creation when input is unreadable. + +- [ ] **Step 5: Update schema** + +In `schema/fastaguard.schema.json`: + +- Change `schema_version.const` from `0.2.0` to `0.3.0`. +- Add `"gate"` to top-level `required` after `"verdict"`. +- Add `gate` under top-level `properties`: + +```json +"gate": { + "type": "object", + "required": ["mode", "status", "blocking_findings", "advisory_findings", "fail_on"], + "properties": { + "mode": { + "type": "string", + "enum": ["none", "pipeline"] + }, + "status": { + "type": "string", + "enum": ["PASS", "WARN", "FAIL"] + }, + "blocking_findings": { + "type": "array", + "items": { "type": "string" }, + "uniqueItems": true + }, + "advisory_findings": { + "type": "array", + "items": { "type": "string" }, + "uniqueItems": true + }, + "fail_on": { + "type": "array", + "items": { "type": "string" }, + "uniqueItems": true + } + } +} +``` + +- Add `"input_sha256"` to `provenance.required`. 
+- Add `input_sha256` to `provenance.properties`: + +```json +"input_sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$" +} +``` + +- [ ] **Step 6: Update test fixture builders** + +Update every manual `FastaguardReport` literal in: + +- `src/report/tsv.rs` +- `src/report/multiqc.rs` +- `src/report/html.rs` +- `src/report/mod.rs` + +Add: + +```rust +gate: GateDecision { + mode: "none".to_string(), + status, + blocking_findings: Vec::new(), + advisory_findings: Vec::new(), + fail_on: Vec::new(), +}, +``` + +For literals with a fixed `VerdictStatus::Pass`, use `status: VerdictStatus::Pass`. + +Add provenance: + +```rust +input_sha256: "0".repeat(64), +``` + +- [ ] **Step 7: Run tests and commit** + +Run: + +```bash +cargo test --locked -- pipeline_gate_report_lists_blocking_and_advisory_findings gate_none_report_preserves_warning_behavior_and_checksum schema_requires_gate_and_input_sha256 +cargo test --locked +``` + +Expected: all tests pass, except that golden/schema fixture tests may still fail until Task 4 regenerates fixtures. If only golden/schema fixture mismatches remain, continue to Task 4 before committing. If unit or CLI behavior tests fail, fix them before continuing.
+ +Commit after behavior tests and schema update are passing or after Task 4 if golden fixtures are part of the same change: + +```bash +git add src/gate.rs src/models.rs schema/fastaguard.schema.json tests/cli.rs tests/schema_contract.rs src/report/tsv.rs src/report/multiqc.rs src/report/html.rs src/report/mod.rs +git commit -m "feat: add gate report contract" +``` + +## Task 3: Surface Gate Fields In TSV, MultiQC, And HTML + +**Files:** +- Modify: `src/report/tsv.rs` +- Modify: `src/report/multiqc.rs` +- Modify: `src/report/html.rs` +- Test: report writer unit tests and `tests/cli.rs` + +- [ ] **Step 1: Write failing output tests** + +In `src/report/tsv.rs`, add: + +```rust +#[test] +fn writes_gate_and_checksum_rows() { + let mut report = test_report(VerdictStatus::Fail); + report.gate.mode = "pipeline".to_string(); + report.gate.status = VerdictStatus::Fail; + report.gate.blocking_findings = vec!["duplicate_ids".to_string()]; + report.gate.advisory_findings = vec!["gc_outliers".to_string()]; + report.provenance.input_sha256 = "a".repeat(64); + let file = NamedTempFile::new().unwrap(); + + write(&report, file.path()).unwrap(); + + let output = fs::read_to_string(file.path()).unwrap(); + assert!(output.contains("gate_mode\tpipeline\n"), "{output}"); + assert!(output.contains("gate_status\tFAIL\n"), "{output}"); + assert!(output.contains("gate_blocking_findings\tduplicate_ids\n"), "{output}"); + assert!(output.contains("gate_advisory_findings\tgc_outliers\n"), "{output}"); + assert!(output.contains(&format!("input_sha256\t{}\n", "a".repeat(64))), "{output}"); +} +``` + +In `src/report/multiqc.rs`, extend `writes_multiqc_custom_content_table`: + +```rust +assert_eq!(output["data"]["sample"]["gate_mode"], "none"); +assert_eq!(output["data"]["sample"]["gate_status"], "PASS"); +assert_eq!(output["data"]["sample"]["gate_blocking_findings"], ""); +``` + +In `tests/cli.rs`, add: + +```rust +#[test] +fn html_report_shows_gate_decision() { + let temp_dir = 
TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "html_gate"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args(["testdata/problem_assembly.fa", "--gate", "pipeline", "--out"]) + .arg(&outputs.html) + .arg("--json") + .arg(&outputs.json) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2); + + let html = std::fs::read_to_string(&outputs.html).unwrap(); + assert!(html.contains("Gate Decision"), "{html}"); + assert!(html.contains("Blocking"), "{html}"); + assert!(html.contains("Advisory"), "{html}"); +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +cargo test --locked -- writes_gate_and_checksum_rows writes_multiqc_custom_content_table html_report_shows_gate_decision +``` + +Expected: failures because the output writers do not emit gate fields yet. + +- [ ] **Step 3: Add TSV rows** + +In `src/report/tsv.rs`, after verdict: + +```rust +write_metric(&mut writer, "gate_mode", &report.gate.mode)?; +write_metric( + &mut writer, + "gate_status", + verdict_status(report.gate.status), +)?; +write_metric( + &mut writer, + "gate_blocking_findings", + report.gate.blocking_findings.join(","), +)?; +write_metric( + &mut writer, + "gate_advisory_findings", + report.gate.advisory_findings.join(","), +)?; +write_metric( + &mut writer, + "input_sha256", + &report.provenance.input_sha256, +)?; +``` + +- [ ] **Step 4: Add MultiQC fields** + +In `src/report/multiqc.rs`, add fields to `MultiqcSummaryRow`: + +```rust +gate_mode: String, +gate_status: &'static str, +gate_blocking_findings: String, +``` + +Set them in `summary_row`: + +```rust +gate_mode: report.gate.mode.clone(), +gate_status: verdict_status(report.gate.status), +gate_blocking_findings: report.gate.blocking_findings.join(","), +``` + +- [ ] **Step 5: Add HTML gate panel** + +In `src/report/html.rs`, add a `let gate = render_gate(report);` line in `render`. 
+ +Place this block after the positioning paragraph and before Machine Summary: + +```html +

Gate Decision

+{gate} +``` + +Add helper: + +```rust +fn render_gate(report: &FastaguardReport) -> String { + format!( + r#"
+
+

Gate

+

Mode: {mode}

+

Status: {status}

+
+
+

Blocking

+{blocking} +
+
+

Advisory

+{advisory} +
+
"#, + mode = escape_html(&report.gate.mode), + status = escape_html(verdict_status(report.gate.status)), + blocking = render_string_list_or_none(&report.gate.blocking_findings), + advisory = render_string_list_or_none(&report.gate.advisory_findings), + ) +} + +fn render_string_list_or_none(values: &[String]) -> String { + if values.is_empty() { + "

None

".to_string() + } else { + render_string_list(values) + } +} +``` + +- [ ] **Step 6: Run tests and commit** + +Run: + +```bash +cargo test --locked -- writes_gate_and_checksum_rows writes_multiqc_custom_content_table html_report_shows_gate_decision +``` + +Expected: all pass. + +Commit: + +```bash +git add src/report/tsv.rs src/report/multiqc.rs src/report/html.rs tests/cli.rs +git commit -m "feat: surface assembly gate outputs" +``` + +## Task 4: Bump Version, Schema, Golden Reports, And Examples To v0.3.0 + +**Files:** +- Modify: `Cargo.toml` +- Modify: `Cargo.lock` +- Modify: `src/models.rs` +- Modify: `schema/finding-catalog.json` +- Modify: `tests/golden/*.json` +- Modify: `examples/reports/**` +- Modify: `tests/python/test_release_metadata.py` +- Modify: `tests/schema_contract.rs` +- Create: `docs/releases/v0.3.0.md` + +- [ ] **Step 1: Update metadata tests first** + +In `tests/python/test_release_metadata.py`, change version expectations from `0.2.0` to `0.3.0` for Cargo and release notes existence. Add assertions that v0.3 release notes mention: + +```python +self.assertIn("FastaGuard v0.3.0", text) +self.assertIn("Evidence And Assembly Gate", text) +self.assertIn("--gate pipeline", text) +self.assertIn("input_sha256", text) +``` + +Keep Bioconda source SHA checks scoped to the current recipe until the v0.3 GitHub source archive exists; do not require `packaging/bioconda/meta.yaml` to be v0.3 during feature implementation. + +- [ ] **Step 2: Run Python metadata tests to verify failure** + +Run: + +```bash +python3 -m unittest tests.python.test_release_metadata -v +``` + +Expected: failure because Cargo is still `0.2.0` and `docs/releases/v0.3.0.md` does not exist. + +- [ ] **Step 3: Bump Cargo package version** + +In `Cargo.toml`: + +```toml +version = "0.3.0" +``` + +Run: + +```bash +cargo update --workspace +``` + +Expected: `Cargo.lock` updates the local package version to `0.3.0` without changing any other locked dependency. 
+ +- [ ] **Step 4: Update schema and catalog versions** + +In `schema/finding-catalog.json`, change: + +```json +"schema_version": "0.3.0", +"catalog_version": "0.3.0" +``` + +In tests that assert catalog version, update expected strings to `0.3.0`. + +- [ ] **Step 5: Add v0.3 release notes** + +Create `docs/releases/v0.3.0.md`: + +````markdown +# FastaGuard v0.3.0 + +FastaGuard v0.3.0 is the Evidence And Assembly Gate release. + +## Highlights + +- Adds `--gate pipeline` for conservative assembly preflight gating. +- Adds a machine-readable `gate` object to JSON reports. +- Adds `provenance.input_sha256` so reports identify the exact input bytes. +- Surfaces gate mode, status, blocking findings, advisory findings, and input + checksum in pipeline-friendly outputs. +- Documents the v0.3 evidence workflow for local and public assembly runs. + +## Install + +```bash +mamba install -c conda-forge -c bioconda fastaguard +``` + +Until the v0.3 Bioconda update is merged, Bioconda may still serve the previous +published release. GitHub release binaries and source archives should be used +for immediate v0.3 testing after the tag is published. + +## Pipeline Gate + +```bash +fastaguard sample.fa --profile assembly --gate pipeline +``` + +The pipeline gate fails on duplicate IDs, invalid characters, structurally +invalid FASTA, and high-N content. GC and length outliers remain advisory unless +explicitly added with `--fail-on`. + +## Known Limits + +- FastaGuard remains assembly-focused. +- Gate decisions are FASTA preflight decisions, not biological completeness, + contamination, or assembly correctness claims. +- External taxonomy, coverage, k-mer, and database-backed checks remain + follow-up steps. +```` + +- [ ] **Step 6: Regenerate golden and example reports** + +Run the existing golden update workflow manually by using the commands encoded in `tests/cli.rs`. 
If no helper exists, regenerate with: + +```bash +FASTAGUARD_PROVENANCE_TIMESTAMP=2026-05-23T00:00:00Z \ +FASTAGUARD_PROVENANCE_COMMAND='fastaguard testdata/valid_assembly.fa --min-contig-length 1 --out target/fastaguard-golden-runtime/valid_assembly.html --json target/fastaguard-golden-runtime/valid_assembly.json --tsv target/fastaguard-golden-runtime/valid_assembly.tsv --multiqc target/fastaguard-golden-runtime/valid_assembly_multiqc.json' \ +cargo run -- testdata/valid_assembly.fa --min-contig-length 1 \ + --out target/fastaguard-golden-runtime/valid_assembly.html \ + --json tests/golden/valid_assembly.json \ + --tsv target/fastaguard-golden-runtime/valid_assembly.tsv \ + --multiqc target/fastaguard-golden-runtime/valid_assembly_multiqc.json + +FASTAGUARD_PROVENANCE_TIMESTAMP=2026-05-23T00:00:00Z \ +FASTAGUARD_PROVENANCE_COMMAND='fastaguard testdata/problem_assembly.fa --out target/fastaguard-golden-runtime/problem_assembly.html --json target/fastaguard-golden-runtime/problem_assembly.json --tsv target/fastaguard-golden-runtime/problem_assembly.tsv --multiqc target/fastaguard-golden-runtime/problem_assembly_multiqc.json' \ +cargo run -- testdata/problem_assembly.fa \ + --out target/fastaguard-golden-runtime/problem_assembly.html \ + --json tests/golden/problem_assembly.json \ + --tsv target/fastaguard-golden-runtime/problem_assembly.tsv \ + --multiqc target/fastaguard-golden-runtime/problem_assembly_multiqc.json || test "$?" 
= "2" + +FASTAGUARD_PROVENANCE_TIMESTAMP=2026-05-23T00:00:00Z \ +FASTAGUARD_PROVENANCE_COMMAND='fastaguard testdata/invalid_empty_record.fa --out target/fastaguard-golden-runtime/invalid_empty_record.html --json target/fastaguard-golden-runtime/invalid_empty_record.json --tsv target/fastaguard-golden-runtime/invalid_empty_record.tsv --multiqc target/fastaguard-golden-runtime/invalid_empty_record_multiqc.json' \ +cargo run -- testdata/invalid_empty_record.fa \ + --out target/fastaguard-golden-runtime/invalid_empty_record.html \ + --json tests/golden/invalid_empty_record.json \ + --tsv target/fastaguard-golden-runtime/invalid_empty_record.tsv \ + --multiqc target/fastaguard-golden-runtime/invalid_empty_record_multiqc.json || test "$?" = "2" +``` + +Regenerate committed examples: + +```bash +cargo run -- testdata/valid_assembly.fa \ + --min-contig-length 1 \ + --out examples/reports/assembly_pass/fastaguard_report.html \ + --json examples/reports/assembly_pass/fastaguard.json \ + --tsv examples/reports/assembly_pass/fastaguard.tsv \ + --multiqc examples/reports/assembly_pass/fastaguard_mqc.json + +cargo run -- testdata/problem_assembly.fa \ + --out examples/reports/assembly_fail/fastaguard_report.html \ + --json examples/reports/assembly_fail/fastaguard.json \ + --tsv examples/reports/assembly_fail/fastaguard.tsv \ + --multiqc examples/reports/assembly_fail/fastaguard_mqc.json || test "$?" = "2" +``` + +- [ ] **Step 7: Run tests and commit** + +Run: + +```bash +python3 -m unittest tests.python.test_release_metadata -v +cargo test --locked +``` + +Expected: all pass. 
+ +Commit: + +```bash +git add Cargo.toml Cargo.lock src/models.rs schema/finding-catalog.json schema/fastaguard.schema.json tests/golden examples/reports docs/releases/v0.3.0.md tests/python/test_release_metadata.py tests/schema_contract.rs tests/cli.rs +git commit -m "chore: prepare v0.3 report contract" +``` + +## Task 5: Update User Docs And Workflow Examples + +**Files:** +- Modify: `README.md` +- Modify: `docs/output-contract.md` +- Modify: `docs/roadmap.md` +- Modify: `docs/benchmarking.md` +- Modify: `docs/tool-landscape.md` +- Modify: `examples/nextflow/main.nf` +- Modify: `examples/nf-core/modules/local/fastaguard/main.nf` +- Modify: `examples/nf-core/README.md` +- Modify: `examples/snakemake/Snakefile` +- Modify: `examples/snakemake/wrapper/README.md` +- Modify: `examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py` +- Test: `tests/python/test_adoption_assets.py` + +- [ ] **Step 1: Write failing adoption tests** + +In `tests/python/test_adoption_assets.py`, add: + +```python +def test_v0_3_gate_docs_and_examples_are_present(self): + readme = (ROOT / "README.md").read_text() + output_contract = (ROOT / "docs" / "output-contract.md").read_text() + nfcore_module = ( + ROOT / "examples" / "nf-core" / "modules" / "local" / "fastaguard" / "main.nf" + ).read_text() + snakemake = (ROOT / "examples" / "snakemake" / "Snakefile").read_text() + + self.assertIn("--gate pipeline", readme) + self.assertIn("The assembly FASTA gate before expensive QC.", readme) + self.assertIn('"gate"', output_contract) + self.assertIn("provenance.input_sha256", output_contract) + self.assertIn("--gate pipeline", nfcore_module) + self.assertIn("--gate pipeline", snakemake) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: + +```bash +python3 -m unittest tests.python.test_adoption_assets.AdoptionAssetsTest.test_v0_3_gate_docs_and_examples_are_present -v +``` + +Expected: failure because docs and examples do not mention v0.3 gate yet. 
+ +- [ ] **Step 3: Update README** + +Add this quickstart example near the current pipeline gate example: + +````markdown +Pipeline gate preset: + +```bash +fastaguard sample.fa --profile assembly --gate pipeline +``` + +```text +The assembly FASTA gate before expensive QC. +``` +```` + +Mention that `--gate pipeline` fails on duplicate IDs, invalid characters, invalid FASTA structure, and high-N content while keeping GC and length outliers advisory. + +- [ ] **Step 4: Update output contract docs** + +In `docs/output-contract.md`, add a `Gate Contract` section: + +````markdown +## Gate Contract + +The `gate` object is the machine-readable assembly gate decision. + +```json +"gate": { + "mode": "pipeline", + "status": "FAIL", + "blocking_findings": ["duplicate_ids", "invalid_chars"], + "advisory_findings": ["gc_outliers"], + "fail_on": ["duplicate_ids", "high_n_rate", "invalid_chars", "invalid_fasta_structure"] +} +``` + +Machines should use `gate.blocking_findings` for workflow stop/go decisions. +Humans should use the HTML report to inspect the evidence behind each finding. +`provenance.input_sha256` identifies the exact input bytes used for the report. +```` + +- [ ] **Step 5: Update workflow examples** + +Add `--gate pipeline` to the FastaGuard command blocks in: + +- `examples/nextflow/main.nf` +- `examples/nf-core/modules/local/fastaguard/main.nf` +- `examples/snakemake/Snakefile` +- `examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py` + +Keep outputs unchanged. + +- [ ] **Step 6: Run tests and commit** + +Run: + +```bash +python3 -m unittest tests.python.test_adoption_assets -v +``` + +Expected: all Python adoption tests pass. 
+ +Commit: + +```bash +git add README.md docs/output-contract.md docs/roadmap.md docs/benchmarking.md docs/tool-landscape.md examples/nextflow/main.nf examples/nf-core/README.md examples/nf-core/modules/local/fastaguard/main.nf examples/snakemake/Snakefile examples/snakemake/wrapper/README.md examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py tests/python/test_adoption_assets.py +git commit -m "docs: document v0.3 assembly gate" +``` + +## Task 6: Evidence Pack Updates + +**Files:** +- Modify: `scripts/collect_evidence.py` +- Create: `docs/evidence/fastaguard-v0.3-evidence.md` +- Create when public run succeeds: `docs/evidence/v0.3/evidence_summary.json` +- Create when public run succeeds: `docs/evidence/v0.3/evidence_summary.tsv` +- Test: `tests/python/test_adoption_assets.py` + +- [ ] **Step 1: Write failing evidence documentation tests** + +Add to `tests/python/test_adoption_assets.py`: + +```python +def test_v0_3_evidence_docs_reference_gate_and_checksum(self): + evidence = ROOT / "docs" / "evidence" / "fastaguard-v0.3-evidence.md" + + self.assertTrue(evidence.exists()) + text = evidence.read_text() + self.assertIn("--gate pipeline", text) + self.assertIn("input_sha256", text) + self.assertIn("not biological completeness", text) + self.assertIn("not contamination confirmation", text) + self.assertIn("python3 scripts/collect_evidence.py", text) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: + +```bash +python3 -m unittest tests.python.test_adoption_assets.AdoptionAssetsTest.test_v0_3_evidence_docs_reference_gate_and_checksum -v +``` + +Expected: failure because v0.3 evidence page does not exist. + +- [ ] **Step 3: Update evidence script to run gate mode** + +In `scripts/collect_evidence.py`, add `--gate pipeline` to the command list in `run_case` immediately after `--profile assembly`. 
+ +Expected command shape: + +```python +command = [ + str(binary), + str(case["input_path"]), + "--profile", + "assembly", + "--gate", + "pipeline", + "--out", + str(html_path), + "--json", + str(json_path), + "--tsv", + str(tsv_path), + "--multiqc", + str(multiqc_path), +] +``` + +Add summary fields from the parsed report: + +```python +"gate_mode": report.get("gate", {}).get("mode"), +"gate_status": report.get("gate", {}).get("status"), +"gate_blocking_findings": ",".join(report.get("gate", {}).get("blocking_findings", [])), +"input_sha256": report.get("provenance", {}).get("input_sha256"), +``` + +Add these names to `SUMMARY_COLUMNS`: + +```python +"gate_mode", +"gate_status", +"gate_blocking_findings", +"input_sha256", +``` + +- [ ] **Step 4: Add v0.3 evidence page** + +Create `docs/evidence/fastaguard-v0.3-evidence.md`: + +````markdown +# FastaGuard v0.3 Evidence Pack + +This page records the evidence workflow for the v0.3 assembly gate release. + +FastaGuard is FASTA preflight QC. It is not biological completeness analysis, +not assembly correctness analysis, and not contamination confirmation. + +## Local Gate Evidence + +```bash +cargo build --release --locked +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3-local \ + --local-only +``` + +The evidence command runs FastaGuard with `--gate pipeline`. Summaries include +the verdict, gate status, blocking findings, top findings, runtime, input size, +and `input_sha256`. + +## Public NCBI Evidence + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3 +``` + +The public workflow uses the assembly manifest in +`docs/evidence/public_assemblies.json` and requires NCBI Datasets CLI plus +network access. + +## Interpretation + +Use this evidence to decide whether FastaGuard is useful as the first assembly +gate before QUAST, BUSCO, BlobToolKit, CheckM, annotation, or submission. 
+Passing the gate means the FASTA-level contract is sane enough to continue; it +does not prove biological completeness or rule out contamination. +```` + +- [ ] **Step 5: Run local-only evidence smoke** + +Run: + +```bash +cargo build --release --locked +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3-local \ + --local-only +``` + +Expected: command exits `0`, prints JSON summary, and all cases include `gate_mode`, `gate_status`, and `input_sha256`. + +- [ ] **Step 6: Run public evidence only if NCBI Datasets CLI is available** + +Check: + +```bash +command -v datasets +``` + +If present, run: + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3 +``` + +If the public run succeeds, copy only compact summary files into docs: + +```bash +mkdir -p docs/evidence/v0.3 +cp target/evidence/v0.3/evidence_summary.json docs/evidence/v0.3/evidence_summary.json +cp target/evidence/v0.3/evidence_summary.tsv docs/evidence/v0.3/evidence_summary.tsv +``` + +Do not commit downloaded FASTA files, NCBI zip archives, or generated per-case reports. + +- [ ] **Step 7: Run tests and commit** + +Run: + +```bash +python3 -m unittest tests.python.test_adoption_assets -v +``` + +Expected: all Python adoption tests pass. + +Commit: + +```bash +git add scripts/collect_evidence.py docs/evidence/fastaguard-v0.3-evidence.md docs/evidence/v0.3 tests/python/test_adoption_assets.py +git commit -m "docs: add v0.3 gate evidence workflow" +``` + +If no public evidence run was possible, omit `docs/evidence/v0.3` from `git add` and note the reason in the final summary. + +## Task 7: Full Verification And Release Readiness + +**Files:** +- Modify only files needed to fix verification failures from previous tasks. 
+ +- [ ] **Step 1: Run full local gates** + +Run: + +```bash +python3 -m unittest discover tests/python -v +cargo fmt --check +cargo test --locked +cargo clippy --locked --all-targets --all-features -- -D warnings +git diff --check +git ls-files | xargs perl -ne 'print "$ARGV:$.:$_" if /[ \t]$/' +``` + +Expected: all commands exit `0` and whitespace scan prints no output. + +- [ ] **Step 2: Run CLI smoke for pipeline gate** + +Run: + +```bash +cargo run -- testdata/problem_assembly.fa \ + --gate pipeline \ + --out target/v0.3-smoke/fastaguard_report.html \ + --json target/v0.3-smoke/fastaguard.json \ + --tsv target/v0.3-smoke/fastaguard.tsv \ + --multiqc target/v0.3-smoke/fastaguard_mqc.json || test "$?" = "2" +``` + +Inspect: + +```bash +jq '.schema_version, .gate, .provenance.input_sha256' target/v0.3-smoke/fastaguard.json +``` + +Expected: + +```text +"0.3.0" +``` + +Gate mode is `pipeline`, status is `FAIL`, and `input_sha256` is a 64-character lowercase hex string. + +- [ ] **Step 3: Review final diff** + +Run: + +```bash +git status --short --branch +git diff --stat +``` + +Expected: only v0.3 assembly gate, evidence, docs, tests, schema, and generated example/golden files are changed. + +- [ ] **Step 4: Commit final verification fixes** + +If Step 1 or Step 2 required changes, run `git status --short` and stage the +specific files shown there that belong to v0.3 assembly gate work. Do not stage +unrelated local files. Use this commit message: + +```bash +git commit -m "chore: finalize v0.3 assembly gate" +``` + +If Step 1 and Step 2 required no changes, skip this commit step. 
+ +- [ ] **Step 5: Prepare PR** + +Save the PR body shown below to `/tmp/fastaguard-v0.3-pr.md`, then push the branch and open a draft PR: + +```bash +git push -u origin codex/v0.3-evidence-assembly-gate +gh pr create \ + --repo ehsanestaji/FastaGuard \ + --base main \ + --head codex/v0.3-evidence-assembly-gate \ + --draft \ + --title "[codex] Add v0.3 assembly gate" \ + --body-file /tmp/fastaguard-v0.3-pr.md +``` + +PR body to save as `/tmp/fastaguard-v0.3-pr.md`: + +````markdown +## Summary + +- Add `--gate pipeline` for conservative assembly FASTA preflight gating. +- Add machine-readable `gate` JSON plus TSV, MultiQC, and HTML gate outputs. +- Add `provenance.input_sha256` for exact input-file identity. +- Bump the report contract and package metadata to v0.3.0. +- Update docs, examples, release notes, and evidence workflow. + +## Validation + +- `python3 -m unittest discover tests/python -v` +- `cargo fmt --check` +- `cargo test --locked` +- `cargo clippy --locked --all-targets --all-features -- -D warnings` +- `git diff --check` +- trailing whitespace scan +- local evidence smoke with `scripts/collect_evidence.py --local-only` + +## Notes + +FastaGuard remains FASTA preflight QC. The gate does not replace QUAST, BUSCO, +BlobToolKit, CheckM, annotation, or contamination workflows. +```` diff --git a/docs/superpowers/specs/2026-05-27-fastaguard-v0.3-assembly-gate-design.md b/docs/superpowers/specs/2026-05-27-fastaguard-v0.3-assembly-gate-design.md new file mode 100644 index 0000000..48b16e4 --- /dev/null +++ b/docs/superpowers/specs/2026-05-27-fastaguard-v0.3-assembly-gate-design.md @@ -0,0 +1,351 @@ +# FastaGuard v0.3 Design: Evidence And Assembly Gate + +## Summary + +FastaGuard v0.3 should turn the current assembly preflight checker into a +pipeline-ready gate that workflow authors can add before QUAST, BUSCO, +BlobToolKit, CheckM, annotation, or submission. 
+ +Release theme: + +```text +FastaGuard v0.3: Evidence And Assembly Gate +``` + +Product promise: + +```text +FastaGuard gives assembly pipelines a fast, explainable PASS/WARN/FAIL gate +before expensive QC. +``` + +The release should stay assembly-first. It should not add transcriptome, +protein, or reference-panel profiles yet. v0.3 should make the assembly +contract more credible, easier to enforce, and easier to cite. + +## Goals + +- Add a pipeline gate preset for common assembly preflight behavior. +- Add input checksum provenance so reports can be tied to exact FASTA bytes. +- Make gate decisions machine-readable without requiring log or HTML parsing. +- Improve report language that separates blocking failures from follow-up + recommendations. +- Run and document public assembly evidence using the existing evidence script. +- Keep the default product fast, deterministic, database-free, and easy to run + through Bioconda and BioContainers. + +## Non-Goals + +- Do not add external databases, taxonomy calls, coverage analysis, aligners, or + internet requirements to the CLI. +- Do not claim biological completeness, assembly correctness, or contamination + confirmation. +- Do not add transcriptome, protein, reference-panel, or compare mode in v0.3. +- Do not make an LLM summary feature. +- Do not break the existing `--fail-on` mechanism; the gate preset should be a + convenience layer over explicit behavior. + +## Product Position + +v0.1 proved the basic assembly preflight contract. v0.2 made the reports more +trustworthy and pipeline-friendly. v0.3 should answer the pipeline author's +practical question: + +```text +Can I add this as the first assembly QC gate and trust what it blocks? +``` + +Recommended public message: + +```text +The assembly FASTA gate before expensive QC. 
+``` + +## Feature Scope + +### Assembly Gate Preset + +Add a new CLI option: + +```bash +fastaguard sample.fa --profile assembly --gate pipeline +``` + +Supported values: + +```text +none +pipeline +``` + +Default behavior: + +```text +--gate none +``` + +`--gate pipeline` should encode conservative defaults for workflow engines. It +should fail the run for findings that make downstream assembly QC unreliable: + +```text +duplicate_ids +invalid_chars +invalid_fasta_structure +high_n_rate +``` + +It should not fail on `gc_outliers`, `length_outliers`, or +`composite_anomalies` by default. Those remain follow-up and prioritization +signals unless the user explicitly includes them with `--fail-on`. + +If the user supplies both `--gate pipeline` and `--fail-on`, the final failure +set should be the union of the pipeline preset and the explicit finding IDs. +This keeps the preset easy to understand and avoids surprising overrides. + +The CLI should reject unknown gate values with a clear tool error. + +### Machine-Readable Gate Decision + +Add a compact gate decision to the JSON report: + +```json +"gate": { + "mode": "pipeline", + "status": "FAIL", + "blocking_findings": ["duplicate_ids", "invalid_chars"], + "advisory_findings": ["gc_outliers"], + "fail_on": ["duplicate_ids", "high_n_rate", "invalid_chars", "invalid_fasta_structure"] +} +``` + +Rules: + +- `mode` is `none` or `pipeline`. +- `status` matches the report verdict. +- `blocking_findings` lists triggered finding IDs that are in the active + failure set. +- `advisory_findings` lists triggered finding IDs that are not in the active + failure set. +- `fail_on` records the final active failure set after applying the gate preset + and user-provided `--fail-on`. + +This field is intentionally small. Workflow engines and future tool agents +should be able to route from it without reading human prose. + +### Provenance Checksums + +Add input checksum metadata to provenance: + +```json +"input_sha256": "..." 
+``` + +The checksum should be computed over the exact input bytes on disk, not the +decompressed FASTA stream. For `.fa.gz` inputs, this means the checksum +identifies the compressed file that was passed to FastaGuard. + +The checksum should be enabled by default in v0.3. The implementation should +stream file bytes and must not load the whole FASTA into memory. If the input +cannot be read, normal input error handling should already fail the run before a +report is emitted. + +### Threshold Metadata + +Keep the existing provenance threshold fields, and add enough gate context for +machines to understand why a finding blocked: + +```json +"thresholds": { + "high_n_sequence_fraction": 0.2, + "high_global_n_fraction": 0.05, + "min_contig_length": 200, + "max_gap_run": 100, + "gc_outlier_zscore": 3.0 +} +``` + +No new threshold schema is required for v0.3. The key improvement is that the +active `gate.fail_on` set makes threshold-backed blocking behavior explicit. + +### Report Language + +Update HTML and release-facing docs so users see three classes of outcome: + +```text +Blocking: fix before downstream QC. +Advisory: safe to continue, but inspect. +Routing: run a deeper downstream tool if the question matters. +``` + +Examples: + +- duplicate IDs and invalid characters are blocking for `--gate pipeline` +- GC outliers are advisory and may route to BlobToolKit, sourmash, Kraken, or + related tools +- high N content can be blocking when it exceeds the configured gate threshold + +This wording should preserve the core boundary: FastaGuard is preflight QC, not +biological confirmation. + +### Public Evidence Pack + +Use the existing evidence workflow as the v0.3 proof layer. The default public +manifest should remain small and fast enough to be rerun by maintainers. + +v0.3 should commit compact evidence summaries, not downloaded FASTA files, +archives, or full generated reports. 
+ +Commit these files when a public run is available: + +```text +docs/evidence/v0.3/evidence_summary.json +docs/evidence/v0.3/evidence_summary.tsv +docs/evidence/fastaguard-v0.3-evidence.md +``` + +The evidence page should include: + +- command used +- FastaGuard version and git commit +- platform and date +- public assembly accessions +- input size, sequence count, elapsed seconds +- verdict and top findings +- reminder that FastaGuard is preflight QC, not completeness or contamination + confirmation + +If NCBI Datasets CLI or network access is unavailable during implementation, +the local-only evidence workflow should still be tested and documented. Public +evidence summaries should only be committed after a real public run succeeds. + +## CLI And Contract + +New CLI: + +```text +--gate +``` + +Schema version should become: + +```text +0.3.0 +``` + +Cargo package version should become: + +```text +0.3.0 +``` + +JSON report additions: + +```text +gate +provenance.input_sha256 +``` + +TSV additions: + +```text +gate_mode +gate_status +gate_blocking_findings +gate_advisory_findings +input_sha256 +``` + +MultiQC custom content should include compact `gate_mode`, `gate_status`, and +`gate_blocking_findings` values in the existing custom-content table. + +HTML report should show the gate decision near the verdict. + +## Architecture + +The implementation should keep gate policy separate from finding generation. + +Recommended units: + +- CLI parsing records the requested gate mode. +- A small gate-policy module maps gate mode to default failure IDs. +- Run configuration stores the final `fail_on` set and gate mode. +- Finding generation remains responsible only for detecting findings. +- Report assembly derives the gate decision from triggered findings and active + failure IDs. +- Provenance computes `input_sha256` with streaming file reads. + +This avoids hiding gate behavior inside individual findings and keeps future +gate presets possible. 
+ +## Testing + +Add focused tests for: + +- `--gate pipeline` adds the expected failure IDs. +- `--gate none` preserves existing default behavior. +- `--gate pipeline --fail-on gc_outliers` unions preset and explicit rules. +- unknown gate values are rejected. +- problem fixture reports include a `gate` object with blocking and advisory + findings. +- valid fixture reports include `gate.mode`, `gate.status`, empty blocking + findings, and `provenance.input_sha256`. +- gzipped input checksum is computed from the compressed bytes. +- JSON schema validates updated golden reports. +- TSV includes gate and checksum rows. +- HTML includes gate decision language. +- MultiQC output includes gate mode, gate status, and blocking findings. +- evidence script local-only path continues to pass without network access. + +Run release gates: + +```bash +python3 -m unittest discover tests/python -v +cargo fmt --check +cargo test --locked +cargo clippy --locked --all-targets --all-features -- -D warnings +git diff --check +git ls-files | xargs perl -ne 'print "$ARGV:$.:$_" if /[ \t]$/' +``` + +Optional evidence checks: + +```bash +cargo build --release --locked +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3-local \ + --local-only +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.3 +``` + +The public evidence command requires NCBI Datasets CLI and network access. + +## Release And Adoption + +v0.3 should ship with: + +- GitHub release notes +- updated README quickstart for `--gate pipeline` +- updated Nextflow/nf-core and Snakemake examples +- updated output contract documentation +- updated schema and golden fixtures +- local evidence summary +- public evidence summary when available + +After the GitHub `v0.3.0` release exists, update the Bioconda recipe and let +the Bioconda update path produce the next BioContainers image. 
Do not open a +Bioconda update before the public GitHub source archive exists. + +## Success Criteria + +v0.3 is successful if: + +- a pipeline author can copy one command and get a conservative assembly gate +- the JSON report makes the gate decision obvious to machines +- provenance identifies the exact input file with SHA256 +- the report still routes to downstream tools without claiming to replace them +- evidence summaries show FastaGuard running on local and public FASTA cases +- all existing v0.2 outputs remain understandable with a clear schema version + bump diff --git a/docs/tool-landscape.md b/docs/tool-landscape.md index ad82794..f883eea 100644 --- a/docs/tool-landscape.md +++ b/docs/tool-landscape.md @@ -17,6 +17,12 @@ Long-form positioning: The FASTA preflight QC layer for modern bioinformatics pipelines. ``` +v0.3 positioning: + +```text +The assembly FASTA gate before expensive QC. +``` + ## Where FastaGuard Fits | Tool | Primary role | When it runs | What FastaGuard adds before it | @@ -58,6 +64,8 @@ Current product evidence: `quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0`. - JSON Schema validates committed golden reports. - Reports include bounded evidence records and suggested actions. +- The v0.3 gate contract exposes `gate.blocking_findings`, + `gate.advisory_findings`, and `provenance.input_sha256` for workflow engines. - MultiQC custom-content JSON is emitted as `fastaguard_mqc.json`. - A native MultiQC plugin starter exists under `integrations/multiqc/`. - Bioconda recipe mirror exists under `packaging/bioconda/`. 
@@ -68,6 +76,7 @@ Current product evidence: Evidence still needed: - committed benchmark summaries from public assemblies +- v0.3 assembly gate evidence runs using `--gate pipeline` - user feedback from real pipeline authors - broader public assembly evidence runs - official MultiQC module or packaged plugin diff --git a/examples/nextflow/main.nf b/examples/nextflow/main.nf index 0485086..29a4c0e 100644 --- a/examples/nextflow/main.nf +++ b/examples/nextflow/main.nf @@ -14,8 +14,10 @@ process FASTAGUARD { script: """ + # Gate failures intentionally exit 2 after writing reports. fastaguard ${fasta} \ --profile assembly \ + --gate pipeline \ --out fastaguard_report.html \ --json fastaguard.json \ --tsv fastaguard.tsv \ diff --git a/examples/nf-core/README.md b/examples/nf-core/README.md index 8be540a..4d7f68b 100644 --- a/examples/nf-core/README.md +++ b/examples/nf-core/README.md @@ -23,12 +23,25 @@ container. The recommended install is: mamba install -c conda-forge -c bioconda fastaguard ``` -The local module also includes the pinned BioContainers image: +Published BioContainers currently provides the v0.2 image: ```text quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 ``` +Do not pair that v0.2 image with the v0.3 `--gate pipeline` command below. +Run this starter with a current local v0.3 binary, or add a v0.3 container tag +after the v0.3 BioContainers image is available. + +The command block is written for the v0.3 assembly gate and runs: + +```bash +fastaguard sample.fa --profile assembly --gate pipeline +``` + +That gate contract blocks downstream workflow steps on duplicate IDs, invalid +characters, invalid FASTA structure, and high-N content. Gate failures intentionally exit with code `2` after writing reports, so downstream workflow steps stop while the JSON/HTML evidence remains available. 
+ Example include: ```nextflow diff --git a/examples/nf-core/modules/local/fastaguard/main.nf b/examples/nf-core/modules/local/fastaguard/main.nf index b4726f8..3a69c71 100644 --- a/examples/nf-core/modules/local/fastaguard/main.nf +++ b/examples/nf-core/modules/local/fastaguard/main.nf @@ -1,7 +1,6 @@ process FASTAGUARD { tag "$meta.id" label 'process_low' - container 'quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0' input: tuple val(meta), path(fasta) @@ -18,6 +17,7 @@ process FASTAGUARD { """ fastaguard ${fasta} \ --profile assembly \ + --gate pipeline \ --out ${prefix}.fastaguard.html \ --json ${prefix}.fastaguard.json \ --tsv ${prefix}.fastaguard.tsv \ diff --git a/examples/reports/README.md b/examples/reports/README.md index b1b2f83..d441d74 100644 --- a/examples/reports/README.md +++ b/examples/reports/README.md @@ -1,6 +1,6 @@ # Example Reports -These tiny examples show the complete v0.2 output contract without requiring large datasets. +These tiny examples show the complete v0.3 output contract without requiring large datasets. ## Assembly Pass @@ -41,4 +41,4 @@ cargo run -- testdata/problem_assembly.fa \ --multiqc examples/reports/assembly_fail/fastaguard_mqc.json ``` -The command exits with code `2` because this example is supposed to fail the hard QC gate. +The command exits with code `2` because this example contains critical FASTA-level blockers. In v0.3, inspect the JSON `gate` object to separate blocking findings from advisory findings. 
diff --git a/examples/reports/assembly_fail/fastaguard.json b/examples/reports/assembly_fail/fastaguard.json index 11862e1..2d24e0c 100644 --- a/examples/reports/assembly_fail/fastaguard.json +++ b/examples/reports/assembly_fail/fastaguard.json @@ -1,8 +1,8 @@ { - "schema_version": "0.2.0", + "schema_version": "0.3.0", "tool": { "name": "FastaGuard", - "version": "0.2.0" + "version": "0.3.0" }, "input": { "path": "testdata/problem_assembly.fa", @@ -16,6 +16,22 @@ "invalid_chars" ] }, + "gate": { + "mode": "none", + "status": "FAIL", + "blocking_findings": [ + "duplicate_ids", + "invalid_chars" + ], + "advisory_findings": [ + "high_n_rate", + "tiny_contigs", + "gap_runs", + "length_outliers", + "composite_anomalies" + ], + "fail_on": [] + }, "machine_summary": { "verdict": "FAIL", "safe_for_downstream": false, @@ -106,7 +122,8 @@ "started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 187 + "input_size_bytes": 187, + "input_sha256": "4b8551daeda739b62c8e7aaa1ebf300e4118167ca582c51eeccfa1549c96f0a5" }, "summary": { "sequence_count": 5, diff --git a/examples/reports/assembly_fail/fastaguard.tsv b/examples/reports/assembly_fail/fastaguard.tsv index ab0ca97..9363160 100644 --- a/examples/reports/assembly_fail/fastaguard.tsv +++ b/examples/reports/assembly_fail/fastaguard.tsv @@ -1,7 +1,12 @@ metric value -schema_version 0.2.0 +schema_version 0.3.0 profile assembly verdict FAIL +gate_mode none +gate_status FAIL +gate_blocking_findings duplicate_ids,invalid_chars +gate_advisory_findings high_n_rate,tiny_contigs,gap_runs,length_outliers,composite_anomalies +input_sha256 4b8551daeda739b62c8e7aaa1ebf300e4118167ca582c51eeccfa1549c96f0a5 sequence_count 5 total_length 145 n50 110 diff --git a/examples/reports/assembly_fail/fastaguard_mqc.json b/examples/reports/assembly_fail/fastaguard_mqc.json index 2ca5bbe..33dcf22 100644 --- a/examples/reports/assembly_fail/fastaguard_mqc.json +++ 
b/examples/reports/assembly_fail/fastaguard_mqc.json @@ -10,6 +10,9 @@ "data": { "problem_assembly": { "verdict": "FAIL", + "gate_mode": "none", + "gate_status": "FAIL", + "gate_blocking_findings": "duplicate_ids,invalid_chars", "sequence_count": 5, "total_length": 145, "n50": 110, diff --git a/examples/reports/assembly_fail/fastaguard_report.html b/examples/reports/assembly_fail/fastaguard_report.html index aa530f2..23f5160 100644 --- a/examples/reports/assembly_fail/fastaguard_report.html +++ b/examples/reports/assembly_fail/fastaguard_report.html @@ -42,6 +42,27 @@

FastaGuard Report

Verdict: FAIL

Before QUAST. Before BUSCO. Before BlobToolKit. Run FastaGuard first.

+

Gate Decision

+
+
+

Gate

+

Mode: none

+

Status: FAIL

+
+
+

Blocking

+
  • duplicate_ids
  • +
  • invalid_chars
+
+
+

Advisory

+
  • high_n_rate
  • +
  • tiny_contigs
  • +
  • gap_runs
  • +
  • length_outliers
  • +
  • composite_anomalies
+
+

Machine Summary

@@ -433,10 +454,10 @@

Suggested Actions

JSON

{
-  "schema_version": "0.2.0",
+  "schema_version": "0.3.0",
   "tool": {
     "name": "FastaGuard",
-    "version": "0.2.0"
+    "version": "0.3.0"
   },
   "input": {
     "path": "testdata/problem_assembly.fa",
@@ -450,6 +471,22 @@ 

JSON

"invalid_chars" ] }, + "gate": { + "mode": "none", + "status": "FAIL", + "blocking_findings": [ + "duplicate_ids", + "invalid_chars" + ], + "advisory_findings": [ + "high_n_rate", + "tiny_contigs", + "gap_runs", + "length_outliers", + "composite_anomalies" + ], + "fail_on": [] + }, "machine_summary": { "verdict": "FAIL", "safe_for_downstream": false, @@ -540,7 +577,8 @@

JSON

"started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 187 + "input_size_bytes": 187, + "input_sha256": "4b8551daeda739b62c8e7aaa1ebf300e4118167ca582c51eeccfa1549c96f0a5" }, "summary": { "sequence_count": 5, diff --git a/examples/reports/assembly_pass/fastaguard.json b/examples/reports/assembly_pass/fastaguard.json index 65cca50..4505cc9 100644 --- a/examples/reports/assembly_pass/fastaguard.json +++ b/examples/reports/assembly_pass/fastaguard.json @@ -1,8 +1,8 @@ { - "schema_version": "0.2.0", + "schema_version": "0.3.0", "tool": { "name": "FastaGuard", - "version": "0.2.0" + "version": "0.3.0" }, "input": { "path": "testdata/valid_assembly.fa", @@ -13,6 +13,13 @@ "status": "PASS", "reasons": [] }, + "gate": { + "mode": "none", + "status": "PASS", + "blocking_findings": [], + "advisory_findings": [], + "fail_on": [] + }, "machine_summary": { "verdict": "PASS", "safe_for_downstream": true, @@ -64,7 +71,8 @@ "started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 92 + "input_size_bytes": 92, + "input_sha256": "373699c0422b364607fc6879c46c053c105413c81df9403998d3d26a7122d2e8" }, "summary": { "sequence_count": 3, diff --git a/examples/reports/assembly_pass/fastaguard.tsv b/examples/reports/assembly_pass/fastaguard.tsv index 25e324c..528b29f 100644 --- a/examples/reports/assembly_pass/fastaguard.tsv +++ b/examples/reports/assembly_pass/fastaguard.tsv @@ -1,7 +1,12 @@ metric value -schema_version 0.2.0 +schema_version 0.3.0 profile assembly verdict PASS +gate_mode none +gate_status PASS +gate_blocking_findings +gate_advisory_findings +input_sha256 373699c0422b364607fc6879c46c053c105413c81df9403998d3d26a7122d2e8 sequence_count 3 total_length 47 n50 16 diff --git a/examples/reports/assembly_pass/fastaguard_mqc.json b/examples/reports/assembly_pass/fastaguard_mqc.json index 7c0a486..7594238 100644 --- 
a/examples/reports/assembly_pass/fastaguard_mqc.json +++ b/examples/reports/assembly_pass/fastaguard_mqc.json @@ -10,6 +10,9 @@ "data": { "valid_assembly": { "verdict": "PASS", + "gate_mode": "none", + "gate_status": "PASS", + "gate_blocking_findings": "", "sequence_count": 3, "total_length": 47, "n50": 16, diff --git a/examples/reports/assembly_pass/fastaguard_report.html b/examples/reports/assembly_pass/fastaguard_report.html index 6646947..71de727 100644 --- a/examples/reports/assembly_pass/fastaguard_report.html +++ b/examples/reports/assembly_pass/fastaguard_report.html @@ -42,6 +42,22 @@

FastaGuard Report

Verdict: PASS

Before QUAST. Before BUSCO. Before BlobToolKit. Run FastaGuard first.

+

Gate Decision

+
+
+

Gate

+

Mode: none

+

Status: PASS

+
+
+

Blocking

+

None

+
+
+

Advisory

+

None

+
+

Machine Summary

@@ -120,10 +136,10 @@

Findings

No findings.

JSON

{
-  "schema_version": "0.2.0",
+  "schema_version": "0.3.0",
   "tool": {
     "name": "FastaGuard",
-    "version": "0.2.0"
+    "version": "0.3.0"
   },
   "input": {
     "path": "testdata/valid_assembly.fa",
@@ -134,6 +150,13 @@ 

JSON

"status": "PASS", "reasons": [] }, + "gate": { + "mode": "none", + "status": "PASS", + "blocking_findings": [], + "advisory_findings": [], + "fail_on": [] + }, "machine_summary": { "verdict": "PASS", "safe_for_downstream": true, @@ -185,7 +208,8 @@

JSON

"started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 92 + "input_size_bytes": 92, + "input_sha256": "373699c0422b364607fc6879c46c053c105413c81df9403998d3d26a7122d2e8" }, "summary": { "sequence_count": 3, diff --git a/examples/snakemake/Snakefile b/examples/snakemake/Snakefile index a6d22b8..65f3f50 100644 --- a/examples/snakemake/Snakefile +++ b/examples/snakemake/Snakefile @@ -8,8 +8,10 @@ rule fastaguard: multiqc="fastaguard_mqc.json" shell: """ + # Gate failures intentionally exit 2 after writing reports. fastaguard {input.fasta} \ --profile assembly \ + --gate pipeline \ --out {output.html} \ --json {output.json} \ --tsv {output.tsv} \ diff --git a/examples/snakemake/wrapper/README.md b/examples/snakemake/wrapper/README.md index 3c63780..7c2febd 100644 --- a/examples/snakemake/wrapper/README.md +++ b/examples/snakemake/wrapper/README.md @@ -2,30 +2,48 @@ This is a local wrapper-style starter for FastaGuard. It assumes `fastaguard` is available on `PATH`. -Recommended install: +Published Bioconda currently provides v0.2.0: ```bash mamba install -c conda-forge -c bioconda fastaguard ``` +The v0.3 gate example below needs a current local v0.3 binary until the v0.3 +Bioconda package is published. + Run from this directory with a `sample.fa` input: ```bash snakemake -s Snakefile --cores 1 ``` -The wrapper also includes a Conda environment: +The wrapper command uses the v0.3 assembly gate: + +```bash +fastaguard sample.fa --profile assembly --gate pipeline +``` + +That gate blocks downstream workflow steps on duplicate IDs, invalid characters, +invalid FASTA structure, and high-N content. GC and length outliers remain +advisory unless explicitly added with `--fail-on`. Gate failures intentionally exit with code `2` after writing reports, so downstream workflow steps stop while the JSON/HTML evidence remains available. 
+ +The wrapper also includes a v0.3 Conda environment for use after the v0.3 +Bioconda package is published: ```bash snakemake -s Snakefile --cores 1 --use-conda ``` -For containerized workflow runs, use the pinned BioContainers image: +For current published containerized workflow runs, the latest pinned +BioContainers image is still v0.2: ```text quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 ``` +Do not pair that v0.2 image with `--gate pipeline`; update the container tag +when a v0.3 BioContainers image is available for gate runs. + The wrapper emits: - `fastaguard_report.html` diff --git a/examples/snakemake/wrapper/Snakefile b/examples/snakemake/wrapper/Snakefile index ab11ef8..f8160e1 100644 --- a/examples/snakemake/wrapper/Snakefile +++ b/examples/snakemake/wrapper/Snakefile @@ -8,6 +8,7 @@ rule fastaguard_wrapped: multiqc="fastaguard_mqc.json" params: profile="assembly", + gate="pipeline", extra="" conda: "environment.yaml" wrapper: diff --git a/examples/snakemake/wrapper/environment.yaml b/examples/snakemake/wrapper/environment.yaml index 0c32f64..a77544a 100644 --- a/examples/snakemake/wrapper/environment.yaml +++ b/examples/snakemake/wrapper/environment.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - fastaguard=0.2.0 + - fastaguard=0.3.0 diff --git a/examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py b/examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py index 7f8a07f..17150f7 100644 --- a/examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py +++ b/examples/snakemake/wrapper/wrapper/fastaguard/wrapper.py @@ -1,11 +1,13 @@ from snakemake.shell import shell profile = snakemake.params.get("profile", "assembly") +gate = snakemake.params.get("gate", "pipeline") extra = snakemake.params.get("extra", "") shell( "fastaguard {snakemake.input.fasta} " "--profile {profile} " + "--gate {gate} " "--out {snakemake.output.html} " "--json {snakemake.output.json} " "--tsv {snakemake.output.tsv} " diff --git 
a/integrations/multiqc/src/fastaguard_multiqc/multiqc_module.py b/integrations/multiqc/src/fastaguard_multiqc/multiqc_module.py index 55cc1e6..36c2d42 100644 --- a/integrations/multiqc/src/fastaguard_multiqc/multiqc_module.py +++ b/integrations/multiqc/src/fastaguard_multiqc/multiqc_module.py @@ -184,6 +184,18 @@ def _summary_headers() -> dict: "min": 0, "scale": "OrRd", }, + "gate_mode": { + "title": "Gate mode", + "description": "FastaGuard gate profile used for the report", + }, + "gate_status": { + "title": "Gate status", + "description": "FastaGuard assembly gate status", + }, + "gate_blocking_findings": { + "title": "Gate blockers", + "description": "Finding IDs blocking the FastaGuard gate", + }, "duplicate_id_count": { "title": "Duplicate IDs", "description": "Number of duplicate FASTA record IDs", diff --git a/integrations/multiqc/src/fastaguard_multiqc/parser.py b/integrations/multiqc/src/fastaguard_multiqc/parser.py index 3ec74da..1b3d4d9 100644 --- a/integrations/multiqc/src/fastaguard_multiqc/parser.py +++ b/integrations/multiqc/src/fastaguard_multiqc/parser.py @@ -19,6 +19,9 @@ ) OPTIONAL_SUMMARY_FIELDS = ( + "gate_mode", + "gate_status", + "gate_blocking_findings", "duplicate_id_count", "invalid_sequence_count", "high_n_sequence_count", diff --git a/schema/fastaguard.schema.json b/schema/fastaguard.schema.json index a8378fe..e2c53f6 100644 --- a/schema/fastaguard.schema.json +++ b/schema/fastaguard.schema.json @@ -9,6 +9,7 @@ "tool", "input", "verdict", + "gate", "machine_summary", "scope", "provenance", @@ -19,7 +20,7 @@ ], "properties": { "schema_version": { - "const": "0.2.0", + "const": "0.3.0", "description": "Version of the FastaGuard report schema." 
}, "tool": { @@ -66,6 +67,47 @@ } } }, + "gate": { + "type": "object", + "required": [ + "mode", + "status", + "blocking_findings", + "advisory_findings", + "fail_on" + ], + "properties": { + "mode": { + "type": "string", + "enum": ["none", "pipeline"] + }, + "status": { + "type": "string", + "enum": ["PASS", "WARN", "FAIL"] + }, + "blocking_findings": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "advisory_findings": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "fail_on": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + } + } + }, "machine_summary": { "type": "object", "required": [ @@ -135,7 +177,8 @@ "started_at", "completed_at", "duration_ms", - "input_size_bytes" + "input_size_bytes", + "input_sha256" ], "properties": { "profile": { @@ -204,6 +247,10 @@ "input_size_bytes": { "type": "integer", "minimum": 0 + }, + "input_sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$" } } }, diff --git a/schema/finding-catalog.json b/schema/finding-catalog.json index c81391f..65cf81c 100644 --- a/schema/finding-catalog.json +++ b/schema/finding-catalog.json @@ -1,6 +1,6 @@ { - "schema_version": "0.2.0", - "catalog_version": "0.2.0", + "schema_version": "0.3.0", + "catalog_version": "0.3.0", "tool": { "name": "FastaGuard" }, diff --git a/scripts/collect_evidence.py b/scripts/collect_evidence.py index 5385c3a..12c48ad 100755 --- a/scripts/collect_evidence.py +++ b/scripts/collect_evidence.py @@ -30,6 +30,10 @@ "elapsed_seconds", "exit_code", "verdict", + "gate_mode", + "gate_status", + "gate_blocking_findings", + "input_sha256", "sequence_count", "total_length", "n50", @@ -94,7 +98,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--out-dir", type=Path, - default=Path("target/evidence/v0.2"), + default=Path("target/evidence/v0.3"), help="Directory for evidence outputs and summaries.", ) parser.add_argument( @@ -244,6 +248,8 @@ def 
run_case(binary: Path, case: dict[str, Any]) -> dict[str, Any]: str(case["input_path"]), "--profile", "assembly", + "--gate", + "pipeline", "--min-contig-length", "1", "--out", @@ -272,6 +278,30 @@ def run_case(binary: Path, case: dict[str, Any]) -> dict[str, Any]: summary = report["summary"] findings = report.get("findings", []) top_findings = [finding.get("id", "unknown") for finding in findings[:5]] + gate = required_mapping(report, "gate", case["id"]) + provenance = required_mapping(report, "provenance", case["id"]) + gate_mode = required_value(gate, "mode", "gate.mode", case["id"]) + gate_status = required_value(gate, "status", "gate.status", case["id"]) + blocking_findings = required_list( + gate, "blocking_findings", "gate.blocking_findings", case["id"] + ) + input_sha256 = required_value( + provenance, "input_sha256", "provenance.input_sha256", case["id"] + ) + if not is_sha256(input_sha256): + raise SystemExit( + f"FastaGuard evidence report for {case['id']} has invalid " + "provenance.input_sha256" + ) + if gate_mode != "pipeline": + raise SystemExit( + f"FastaGuard evidence report for {case['id']} expected " + f"gate.mode pipeline, got {gate_mode!r}" + ) + if gate_status not in {"PASS", "WARN", "FAIL"}: + raise SystemExit( + f"FastaGuard evidence report for {case['id']} has invalid gate.status" + ) return { "id": case["id"], @@ -284,6 +314,10 @@ def run_case(binary: Path, case: dict[str, Any]) -> dict[str, Any]: "elapsed_seconds": round(elapsed, 4), "exit_code": completed.returncode, "verdict": report["verdict"]["status"], + "gate_mode": gate_mode, + "gate_status": gate_status, + "gate_blocking_findings": ",".join(blocking_findings), + "input_sha256": input_sha256, "sequence_count": summary["sequence_count"], "total_length": summary["total_length"], "n50": summary["n50"], @@ -300,6 +334,35 @@ def run_case(binary: Path, case: dict[str, Any]) -> dict[str, Any]: } +def required_mapping(report: dict[str, Any], key: str, case_id: str) -> dict[str, Any]: + 
value = report.get(key) + if not isinstance(value, dict): + raise SystemExit(f"FastaGuard evidence report for {case_id} missing {key}") + return value + + +def required_value( + mapping: dict[str, Any], key: str, label: str, case_id: str +) -> str: + value = mapping.get(key) + if not isinstance(value, str) or not value: + raise SystemExit(f"FastaGuard evidence report for {case_id} missing {label}") + return value + + +def required_list( + mapping: dict[str, Any], key: str, label: str, case_id: str +) -> list[str]: + value = mapping.get(key) + if not isinstance(value, list) or not all(isinstance(item, str) for item in value): + raise SystemExit(f"FastaGuard evidence report for {case_id} missing {label}") + return value + + +def is_sha256(value: str) -> bool: + return len(value) == 64 and all(character in "0123456789abcdef" for character in value) + + def write_summary(out_dir: Path, summary: dict[str, Any]) -> None: json_path = out_dir / "evidence_summary.json" tsv_path = out_dir / "evidence_summary.tsv" diff --git a/src/cli.rs b/src/cli.rs index ceca74b..d900790 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -5,6 +5,7 @@ use std::collections::BTreeSet; use std::env::VarError; use std::path::PathBuf; +use crate::gate::{self, GateMode}; use crate::profile::ThresholdOverrides; #[derive(Debug, Parser)] @@ -36,6 +37,10 @@ pub struct Cli { #[arg(long, default_value = "assembly")] pub profile: String, + /// Gate preset for pipeline-friendly failure behavior. + #[arg(long, value_enum, default_value_t = GateMode::None)] + pub gate: GateMode, + /// HTML report path. 
#[arg(long, default_value = "fastaguard_report.html")] pub out: PathBuf, @@ -73,6 +78,7 @@ pub struct Cli { pub struct RunConfig { pub input: PathBuf, pub profile: String, + pub gate_mode: GateMode, pub outputs: OutputPaths, pub rules: RuleConfig, pub thresholds: ThresholdOverrides, @@ -122,6 +128,7 @@ impl Cli { Ok(RunConfig { input, profile: self.profile.clone(), + gate_mode: self.gate, outputs: OutputPaths { html: self.out.clone(), json: self.json.clone(), @@ -129,7 +136,7 @@ impl Cli { multiqc: self.multiqc.clone(), }, rules: RuleConfig { - fail_on: normalize_rules(&self.fail_on), + fail_on: gate::final_fail_on(self.gate, &self.fail_on), }, thresholds: ThresholdOverrides { max_n_rate: self.max_n_rate, @@ -179,19 +186,10 @@ fn current_utc_timestamp() -> String { Utc::now().to_rfc3339_opts(SecondsFormat::Secs, true) } -fn normalize_rules(values: &[String]) -> BTreeSet { - values - .iter() - .flat_map(|value| value.split(',')) - .map(str::trim) - .filter(|value| !value.is_empty()) - .map(ToOwned::to_owned) - .collect() -} - #[cfg(test)] mod tests { use super::*; + use crate::gate::GateMode; use clap::Parser; fn cli_with_max_n_rate(max_n_rate: Option) -> Cli { @@ -201,6 +199,7 @@ mod tests { finding_catalog: false, explain_finding: None, profile: "assembly".to_string(), + gate: GateMode::None, out: PathBuf::from("fastaguard_report.html"), json: PathBuf::from("fastaguard.json"), tsv: PathBuf::from("fastaguard.tsv"), @@ -243,4 +242,62 @@ mod tests { assert_eq!(config.outputs.multiqc, PathBuf::from("fastaguard_mqc.json")); } + + #[test] + fn gate_none_preserves_explicit_fail_rules() { + let cli = Cli::parse_from([ + "fastaguard", + "input.fa", + "--gate", + "none", + "--fail-on", + "gc_outliers", + ]); + let config = cli.to_run_config().unwrap(); + + assert_eq!(config.gate_mode, GateMode::None); + assert_eq!( + config.rules.fail_on, + ["gc_outliers"].into_iter().map(str::to_string).collect() + ); + } + + #[test] + fn gate_pipeline_adds_conservative_fail_rules() { + 
let cli = Cli::parse_from(["fastaguard", "input.fa", "--gate", "pipeline"]); + let config = cli.to_run_config().unwrap(); + + assert_eq!(config.gate_mode, GateMode::Pipeline); + assert_eq!( + config.rules.fail_on, + [ + "duplicate_ids", + "high_n_rate", + "invalid_chars", + "invalid_fasta_structure", + ] + .into_iter() + .map(str::to_string) + .collect() + ); + } + + #[test] + fn gate_pipeline_unions_explicit_fail_rules() { + let cli = Cli::parse_from([ + "fastaguard", + "input.fa", + "--gate", + "pipeline", + "--fail-on", + "gc_outliers", + ]); + let config = cli.to_run_config().unwrap(); + + assert!(config.rules.fail_on.contains("duplicate_ids")); + assert!(config.rules.fail_on.contains("invalid_chars")); + assert!(config.rules.fail_on.contains("invalid_fasta_structure")); + assert!(config.rules.fail_on.contains("high_n_rate")); + assert!(config.rules.fail_on.contains("gc_outliers")); + } } diff --git a/src/gate.rs b/src/gate.rs new file mode 100644 index 0000000..193e303 --- /dev/null +++ b/src/gate.rs @@ -0,0 +1,71 @@ +use clap::ValueEnum; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeSet; + +use crate::models::{Finding, GateDecision, Severity, VerdictStatus}; + +pub const PIPELINE_FAIL_ON: [&str; 4] = [ + "duplicate_ids", + "high_n_rate", + "invalid_chars", + "invalid_fasta_structure", +]; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[value(rename_all = "snake_case")] +pub enum GateMode { + None, + Pipeline, +} + +impl GateMode { + pub fn as_str(self) -> &'static str { + match self { + GateMode::None => "none", + GateMode::Pipeline => "pipeline", + } + } +} + +pub fn final_fail_on(mode: GateMode, explicit_rules: &[String]) -> BTreeSet { + let mut fail_on = explicit_rules + .iter() + .flat_map(|value| value.split(',')) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + .collect::>(); + + if mode == GateMode::Pipeline { + 
fail_on.extend(PIPELINE_FAIL_ON.into_iter().map(ToOwned::to_owned)); + } + + fail_on +} + +pub fn decision( + mode: GateMode, + status: VerdictStatus, + findings: &[Finding], + fail_on: &BTreeSet, +) -> GateDecision { + let mut blocking_findings = Vec::new(); + let mut advisory_findings = Vec::new(); + + for finding in findings { + if fail_on.contains(&finding.id) || finding.severity == Severity::Critical { + blocking_findings.push(finding.id.clone()); + } else { + advisory_findings.push(finding.id.clone()); + } + } + + GateDecision { + mode: mode.as_str().to_string(), + status, + blocking_findings, + advisory_findings, + fail_on: fail_on.iter().cloned().collect(), + } +} diff --git a/src/lib.rs b/src/lib.rs index bc1270b..e3f9a57 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ pub mod cli; pub mod contract; pub mod findings; +pub mod gate; pub mod metrics; pub mod models; pub mod parser; @@ -37,7 +38,7 @@ pub fn run(cli: Cli) -> Result { &profile, error.to_string(), measured_duration_ms(&config, run_started), - ); + )?; report::write_all(&output, &config.outputs)?; return Ok(output.exit_code()); } @@ -51,7 +52,7 @@ pub fn run(cli: Cli) -> Result { metrics, analysis, duration_ms, - ); + )?; report::write_all(&output, &config.outputs)?; Ok(output.exit_code()) } @@ -68,6 +69,7 @@ fn measured_duration_ms(config: &cli::RunConfig, started: Instant) -> u64 { mod tests { use super::*; use crate::cli::{OutputPaths, RuleConfig, RunConfig}; + use crate::gate::GateMode; use crate::profile::ThresholdOverrides; use std::collections::BTreeSet; use std::path::PathBuf; @@ -97,6 +99,7 @@ mod tests { RunConfig { input: PathBuf::from("input.fa"), profile: "assembly".to_string(), + gate_mode: GateMode::None, outputs: OutputPaths { html: PathBuf::from("fastaguard_report.html"), json: PathBuf::from("fastaguard.json"), diff --git a/src/models.rs b/src/models.rs index 3d0dd21..3c1bc7e 100644 --- a/src/models.rs +++ b/src/models.rs @@ -1,14 +1,19 @@ +use anyhow::{Context, Result}; 
+use sha2::{Digest, Sha256}; +use std::fs::File; +use std::io::{BufReader, Read}; use std::path::Path; use serde::{Deserialize, Serialize}; use crate::cli::RunConfig; use crate::findings::Analysis; +use crate::gate; use crate::metrics::AssemblyMetrics; use crate::profile::ProfileConfig; use crate::stats::composition::percent; -pub const SCHEMA_VERSION: &str = "0.2.0"; +pub const SCHEMA_VERSION: &str = "0.3.0"; pub const TOOL_NAME: &str = "FastaGuard"; pub const TOOL_VERSION: &str = env!("CARGO_PKG_VERSION"); const LENGTH_HISTOGRAM_BIN_COUNT: u64 = 10; @@ -20,6 +25,7 @@ pub struct FastaguardReport { pub tool: ToolInfo, pub input: InputInfo, pub verdict: Verdict, + pub gate: GateDecision, pub machine_summary: MachineSummary, pub scope: Scope, pub provenance: Provenance, @@ -48,6 +54,15 @@ pub struct Verdict { pub reasons: Vec, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GateDecision { + pub mode: String, + pub status: VerdictStatus, + pub blocking_findings: Vec, + pub advisory_findings: Vec, + pub fail_on: Vec, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MachineSummary { pub verdict: VerdictStatus, @@ -88,6 +103,7 @@ pub struct Provenance { pub completed_at: String, pub duration_ms: u64, pub input_size_bytes: u64, + pub input_sha256: String, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -250,10 +266,12 @@ impl FastaguardReport { metrics: AssemblyMetrics, analysis: Analysis, duration_ms: u64, - ) -> Self { + ) -> Result { let findings = analysis.findings; let plots = build_plots(&metrics, profile); - Self { + let provenance = build_provenance(&config, profile, duration_ms)?; + + Ok(Self { schema_version: SCHEMA_VERSION.to_string(), tool: ToolInfo { name: TOOL_NAME.to_string(), @@ -268,9 +286,15 @@ impl FastaguardReport { status: analysis.status, reasons: analysis.reasons, }, + gate: gate::decision( + config.gate_mode, + analysis.status, + &findings, + &config.rules.fail_on, + ), machine_summary: 
build_machine_summary(analysis.status, &findings), scope: fasta_preflight_scope(), - provenance: build_provenance(&config, profile, duration_ms), + provenance, summary: Summary { sequence_count: metrics.sequence_count, total_length: metrics.total_length, @@ -300,7 +324,7 @@ impl FastaguardReport { tsv: config.outputs.tsv.display().to_string(), multiqc: config.outputs.multiqc.display().to_string(), }, - } + }) } pub fn from_invalid_fasta( @@ -308,7 +332,7 @@ impl FastaguardReport { profile: &ProfileConfig, message: String, duration_ms: u64, - ) -> Self { + ) -> Result { let findings = vec![Finding { id: "invalid_fasta_structure".to_string(), category: FindingCategory::Validity, @@ -327,8 +351,9 @@ impl FastaguardReport { evidence: empty_evidence(), actions: finding_actions("invalid_fasta_structure"), }]; + let provenance = build_provenance(&config, profile, duration_ms)?; - Self { + Ok(Self { schema_version: SCHEMA_VERSION.to_string(), tool: ToolInfo { name: TOOL_NAME.to_string(), @@ -343,9 +368,15 @@ impl FastaguardReport { status: VerdictStatus::Fail, reasons: vec!["invalid_fasta_structure".to_string()], }, + gate: gate::decision( + config.gate_mode, + VerdictStatus::Fail, + &findings, + &config.rules.fail_on, + ), machine_summary: build_machine_summary(VerdictStatus::Fail, &findings), scope: fasta_preflight_scope(), - provenance: build_provenance(&config, profile, duration_ms), + provenance, summary: Summary { sequence_count: 0, total_length: 0, @@ -375,7 +406,7 @@ impl FastaguardReport { tsv: config.outputs.tsv.display().to_string(), multiqc: config.outputs.multiqc.display().to_string(), }, - } + }) } pub fn exit_code(&self) -> i32 { @@ -647,16 +678,26 @@ fn fasta_preflight_scope() -> Scope { } } -fn build_provenance(config: &RunConfig, profile: &ProfileConfig, duration_ms: u64) -> Provenance { +fn build_provenance( + config: &RunConfig, + profile: &ProfileConfig, + duration_ms: u64, +) -> Result { let completed_at = config .provenance_timestamp_override 
.clone() .unwrap_or_else(|| chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true)); + let input_sha256 = input_sha256(&config.input)?; let input_size_bytes = std::fs::metadata(&config.input) - .map(|metadata| metadata.len()) - .unwrap_or(0); - - Provenance { + .with_context(|| { + format!( + "failed to inspect input size for {}", + config.input.display() + ) + })? + .len(); + + Ok(Provenance { profile: profile.name.clone(), threads: config.threads, fail_on: config.rules.fail_on.iter().cloned().collect(), @@ -672,7 +713,28 @@ fn build_provenance(config: &RunConfig, profile: &ProfileConfig, duration_ms: u6 completed_at, duration_ms, input_size_bytes, + input_sha256, + }) +} + +fn input_sha256(path: &Path) -> Result { + let file = File::open(path) + .with_context(|| format!("failed to open {} for SHA256", path.display()))?; + let mut reader = BufReader::new(file); + let mut hasher = Sha256::new(); + let mut buffer = [0_u8; 64 * 1024]; + + loop { + let read = reader + .read(&mut buffer) + .with_context(|| format!("failed to read {} for SHA256", path.display()))?; + if read == 0 { + break; + } + hasher.update(&buffer[..read]); } + + Ok(hex::encode(hasher.finalize())) } fn build_plots(metrics: &AssemblyMetrics, profile: &ProfileConfig) -> Plots { @@ -771,12 +833,26 @@ fn path_is_gzip(path: &Path) -> bool { #[cfg(test)] mod tests { + use crate::cli::{OutputPaths, RuleConfig, RunConfig}; + use crate::gate::GateMode; use crate::metrics::AssemblyMetrics; use crate::parser::FastaRecord; use crate::profile::{ProfileConfig, ThresholdOverrides}; + use std::collections::BTreeSet; + use std::path::PathBuf; use super::*; + #[test] + fn build_provenance_errors_when_input_checksum_cannot_be_read() { + let profile = profile(); + let config = test_config(PathBuf::from("target/missing-for-sha.fa")); + + let error = build_provenance(&config, &profile, 0).unwrap_err(); + + assert!(error.to_string().contains("SHA256"), "{error:?}"); + } + #[test] fn 
plot_histogram_uses_deterministic_linear_bins() { let profile = profile(); @@ -804,6 +880,31 @@ mod tests { }) } + fn test_config(input: PathBuf) -> RunConfig { + RunConfig { + input, + profile: "assembly".to_string(), + gate_mode: GateMode::None, + outputs: OutputPaths { + html: PathBuf::from("fastaguard_report.html"), + json: PathBuf::from("fastaguard.json"), + tsv: PathBuf::from("fastaguard.tsv"), + multiqc: PathBuf::from("fastaguard_mqc.json"), + }, + rules: RuleConfig { + fail_on: BTreeSet::new(), + }, + thresholds: ThresholdOverrides { + max_n_rate: None, + min_contig_length: Some(1), + }, + threads: 1, + command: "fastaguard input.fa".to_string(), + started_at: "2026-05-23T00:00:00Z".to_string(), + provenance_timestamp_override: Some("2026-05-23T00:00:00Z".to_string()), + } + } + fn record(id: &str, length: usize) -> FastaRecord { FastaRecord { id: id.to_string(), diff --git a/src/report/html.rs b/src/report/html.rs index 673e654..541d4a3 100644 --- a/src/report/html.rs +++ b/src/report/html.rs @@ -11,6 +11,7 @@ pub fn write(report: &FastaguardReport, path: &Path) -> Result<()> { fn render(report: &FastaguardReport) -> Result { let summary = &report.summary; + let gate = render_gate(report); let machine_summary = render_machine_summary(report); let scope = render_scope(report); let plots = render_plots(report); @@ -62,6 +63,8 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }}

FastaGuard Report

Verdict: {verdict}

Before QUAST. Before BUSCO. Before BlobToolKit. Run FastaGuard first.

+

Gate Decision

+{gate}

Machine Summary

{machine_summary}

Summary

@@ -96,6 +99,7 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }} n90 = summary.n90, gc_percent = summary.gc_percent, n_percent = summary.n_percent, + gate = gate, scope = scope, plots = plots, findings = findings, @@ -103,6 +107,30 @@ pre {{ overflow-x: auto; background: #202124; color: #f7f7f4; padding: 16px; }} )) } +fn render_gate(report: &FastaguardReport) -> String { + format!( + r#"
+
+

Gate

+

Mode: {mode}

+

Status: {status}

+
+
+

Blocking

+{blocking} +
+
+

Advisory

+{advisory} +
+
"#, + mode = escape_html(&report.gate.mode), + status = escape_html(verdict_status(report.gate.status)), + blocking = render_string_list_or_none(&report.gate.blocking_findings), + advisory = render_string_list_or_none(&report.gate.advisory_findings), + ) +} + fn render_machine_summary(report: &FastaguardReport) -> String { let summary = &report.machine_summary; let top_findings = if summary.top_findings.is_empty() { @@ -456,6 +484,14 @@ fn render_string_list(values: &[String]) -> String { format!("
    {items}
") } +fn render_string_list_or_none(values: &[String]) -> String { + if values.is_empty() { + "

None

".to_string() + } else { + render_string_list(values) + } +} + fn render_optional_u64(value: Option) -> String { value .map(|number| number.to_string()) @@ -517,9 +553,9 @@ mod tests { use super::*; use crate::models::{ empty_evidence, empty_plots, Artifacts, EvidenceRecord, FastaguardReport, Finding, - FindingAction, FindingCategory, FindingConfidence, FindingEvidence, InputInfo, - MachineSummary, Provenance, ProvenanceThresholds, RecommendedTool, Scope, Severity, - Summary, ToolInfo, Verdict, VerdictStatus, + FindingAction, FindingCategory, FindingConfidence, FindingEvidence, GateDecision, + InputInfo, MachineSummary, Provenance, ProvenanceThresholds, RecommendedTool, Scope, + Severity, Summary, ToolInfo, Verdict, VerdictStatus, }; #[test] @@ -638,6 +674,13 @@ mod tests { status: VerdictStatus::Pass, reasons: Vec::new(), }, + gate: GateDecision { + mode: "none".to_string(), + status: VerdictStatus::Pass, + blocking_findings: Vec::new(), + advisory_findings: Vec::new(), + fail_on: Vec::new(), + }, machine_summary: MachineSummary { verdict: VerdictStatus::Pass, safe_for_downstream: true, @@ -666,6 +709,7 @@ mod tests { completed_at: "2026-05-23T00:00:00Z".to_string(), duration_ms: 0, input_size_bytes: 100, + input_sha256: "0".repeat(64), }, summary: Summary { sequence_count: 2, diff --git a/src/report/mod.rs b/src/report/mod.rs index 85686ba..00a83d7 100644 --- a/src/report/mod.rs +++ b/src/report/mod.rs @@ -110,8 +110,8 @@ mod tests { use super::*; use crate::models::{ - empty_plots, Artifacts, FastaguardReport, InputInfo, MachineSummary, Provenance, - ProvenanceThresholds, Scope, Summary, ToolInfo, Verdict, VerdictStatus, + empty_plots, Artifacts, FastaguardReport, GateDecision, InputInfo, MachineSummary, + Provenance, ProvenanceThresholds, Scope, Summary, ToolInfo, Verdict, VerdictStatus, }; #[test] @@ -272,6 +272,13 @@ mod tests { status: VerdictStatus::Pass, reasons: Vec::new(), }, + gate: GateDecision { + mode: "none".to_string(), + status: 
VerdictStatus::Pass, + blocking_findings: Vec::new(), + advisory_findings: Vec::new(), + fail_on: Vec::new(), + }, machine_summary: MachineSummary { verdict: VerdictStatus::Pass, safe_for_downstream: true, @@ -300,6 +307,7 @@ mod tests { completed_at: "2026-05-23T00:00:00Z".to_string(), duration_ms: 0, input_size_bytes: 100, + input_sha256: "0".repeat(64), }, summary: Summary { sequence_count: 2, diff --git a/src/report/multiqc.rs b/src/report/multiqc.rs index 8f2e873..9c8feae 100644 --- a/src/report/multiqc.rs +++ b/src/report/multiqc.rs @@ -25,6 +25,9 @@ struct MultiqcPlotConfig { #[derive(Serialize)] struct MultiqcSummaryRow { verdict: &'static str, + gate_mode: String, + gate_status: &'static str, + gate_blocking_findings: String, sequence_count: u64, total_length: u64, n50: u64, @@ -80,6 +83,9 @@ fn sample_name(path: &str) -> String { fn summary_row(report: &FastaguardReport) -> MultiqcSummaryRow { MultiqcSummaryRow { verdict: verdict_status(report.verdict.status), + gate_mode: report.gate.mode.clone(), + gate_status: verdict_status(report.gate.status), + gate_blocking_findings: report.gate.blocking_findings.join(","), sequence_count: report.summary.sequence_count, total_length: report.summary.total_length, n50: report.summary.n50, @@ -124,8 +130,8 @@ mod tests { use super::*; use crate::models::{ - empty_plots, Artifacts, FastaguardReport, InputInfo, MachineSummary, Provenance, - ProvenanceThresholds, Scope, Summary, ToolInfo, Verdict, VerdictStatus, + empty_plots, Artifacts, FastaguardReport, GateDecision, InputInfo, MachineSummary, + Provenance, ProvenanceThresholds, Scope, Summary, ToolInfo, Verdict, VerdictStatus, }; #[test] @@ -141,6 +147,9 @@ mod tests { assert_eq!(output["plot_type"], "table"); assert_eq!(output["pconfig"]["id"], "fastaguard_summary"); assert_eq!(output["data"]["sample"]["verdict"], "PASS"); + assert_eq!(output["data"]["sample"]["gate_mode"], "none"); + assert_eq!(output["data"]["sample"]["gate_status"], "PASS"); + 
assert_eq!(output["data"]["sample"]["gate_blocking_findings"], ""); assert_eq!(output["data"]["sample"]["sequence_count"], 2); assert_eq!(output["data"]["sample"]["duplicate_id_count"], 0); assert_eq!(output["data"]["sample"]["invalid_sequence_count"], 0); @@ -175,6 +184,13 @@ mod tests { status: VerdictStatus::Pass, reasons: Vec::new(), }, + gate: GateDecision { + mode: "none".to_string(), + status: VerdictStatus::Pass, + blocking_findings: Vec::new(), + advisory_findings: Vec::new(), + fail_on: Vec::new(), + }, machine_summary: MachineSummary { verdict: VerdictStatus::Pass, safe_for_downstream: true, @@ -203,6 +219,7 @@ mod tests { completed_at: "2026-05-23T00:00:00Z".to_string(), duration_ms: 0, input_size_bytes: 100, + input_sha256: "0".repeat(64), }, summary: Summary { sequence_count: 2, diff --git a/src/report/tsv.rs b/src/report/tsv.rs index eb8277b..6e7963a 100644 --- a/src/report/tsv.rs +++ b/src/report/tsv.rs @@ -18,6 +18,23 @@ pub fn write(report: &FastaguardReport, path: &Path) -> Result<()> { "verdict", verdict_status(report.verdict.status), )?; + write_metric(&mut writer, "gate_mode", &report.gate.mode)?; + write_metric( + &mut writer, + "gate_status", + verdict_status(report.gate.status), + )?; + write_metric( + &mut writer, + "gate_blocking_findings", + report.gate.blocking_findings.join(","), + )?; + write_metric( + &mut writer, + "gate_advisory_findings", + report.gate.advisory_findings.join(","), + )?; + write_metric(&mut writer, "input_sha256", &report.provenance.input_sha256)?; write_metric(&mut writer, "sequence_count", report.summary.sequence_count)?; write_metric(&mut writer, "total_length", report.summary.total_length)?; write_metric(&mut writer, "n50", report.summary.n50)?; @@ -82,8 +99,8 @@ mod tests { use super::*; use crate::models::{ empty_evidence, empty_plots, Artifacts, FastaguardReport, Finding, FindingCategory, - FindingConfidence, InputInfo, MachineSummary, Provenance, ProvenanceThresholds, Scope, - Severity, Summary, ToolInfo, 
Verdict, VerdictStatus, + FindingConfidence, GateDecision, InputInfo, MachineSummary, Provenance, + ProvenanceThresholds, Scope, Severity, Summary, ToolInfo, Verdict, VerdictStatus, }; #[test] @@ -115,6 +132,36 @@ mod tests { assert!(output.contains("composite_anomaly_count\t1\n"), "{output}"); } + #[test] + fn writes_gate_and_checksum_rows() { + let mut report = test_report(VerdictStatus::Fail); + report.gate.mode = "pipeline".to_string(); + report.gate.status = VerdictStatus::Fail; + report.gate.blocking_findings = vec!["duplicate_ids".to_string()]; + report.gate.advisory_findings = vec!["gc_outliers".to_string()]; + report.provenance.input_sha256 = "a".repeat(64); + let file = NamedTempFile::new().unwrap(); + + write(&report, file.path()).unwrap(); + + let checksum = "a".repeat(64); + let output = fs::read_to_string(file.path()).unwrap(); + assert!(output.contains("gate_mode\tpipeline\n"), "{output}"); + assert!(output.contains("gate_status\tFAIL\n"), "{output}"); + assert!( + output.contains("gate_blocking_findings\tduplicate_ids\n"), + "{output}" + ); + assert!( + output.contains("gate_advisory_findings\tgc_outliers\n"), + "{output}" + ); + assert!( + output.contains(&format!("input_sha256\t{checksum}\n")), + "{output}" + ); + } + fn test_report(status: VerdictStatus) -> FastaguardReport { FastaguardReport { schema_version: "0.1.0".to_string(), @@ -131,6 +178,13 @@ mod tests { status, reasons: Vec::new(), }, + gate: GateDecision { + mode: "none".to_string(), + status, + blocking_findings: Vec::new(), + advisory_findings: Vec::new(), + fail_on: Vec::new(), + }, machine_summary: MachineSummary { verdict: status, safe_for_downstream: status == VerdictStatus::Pass, @@ -159,6 +213,7 @@ mod tests { completed_at: "2026-05-23T00:00:00Z".to_string(), duration_ms: 0, input_size_bytes: 100, + input_sha256: "0".repeat(64), }, summary: Summary { sequence_count: 2, diff --git a/tests/cli.rs b/tests/cli.rs index f9d6fd5..90ade8a 100644 --- a/tests/cli.rs +++ b/tests/cli.rs 
@@ -1,6 +1,7 @@ use assert_cmd::Command; use predicates::prelude::*; use serde_json::{json, Value}; +use sha2::{Digest, Sha256}; use std::path::{Path, PathBuf}; use tempfile::TempDir; @@ -46,7 +47,7 @@ fn contract_finding_catalog_can_be_printed_without_input() { cmd.arg("--finding-catalog") .assert() .success() - .stdout(predicate::str::contains(r#""schema_version": "0.2.0""#)) + .stdout(predicate::str::contains(r#""schema_version": "0.3.0""#)) .stdout(predicate::str::contains(r#""duplicate_ids""#)) .stdout(predicate::str::contains(r#""invalid_fasta_structure""#)) .stdout(predicate::str::contains(r#""gc_outliers""#)) @@ -196,7 +197,7 @@ fn valid_report_includes_machine_summary_scope_and_provenance() { } #[test] -fn report_includes_v0_2_provenance_and_routing_hints() { +fn report_includes_v0_3_provenance_and_routing_hints() { let temp_dir = TempDir::new().unwrap(); let outputs = output_paths(&temp_dir, "v02_contract"); @@ -218,7 +219,11 @@ fn report_includes_v0_2_provenance_and_routing_hints() { .success(); let report = read_json(&outputs.json); - assert_eq!(report["schema_version"], json!("0.2.0")); + assert_eq!(report["schema_version"], json!("0.3.0")); + assert_eq!(report["gate"]["mode"], json!("none")); + assert_eq!(report["gate"]["status"], json!("PASS")); + assert_eq!(report["gate"]["blocking_findings"], json!([])); + assert_eq!(report["gate"]["advisory_findings"], json!([])); assert!(report["provenance"]["command"] .as_str() .unwrap() @@ -233,6 +238,10 @@ fn report_includes_v0_2_provenance_and_routing_hints() { .ends_with('Z')); assert!(report["provenance"]["duration_ms"].as_u64().is_some()); assert!(report["provenance"]["input_size_bytes"].as_u64().unwrap() > 0); + assert_eq!( + report["provenance"]["input_sha256"], + json!(sha256_file(Path::new("testdata/valid_assembly.fa"))) + ); assert!(report["machine_summary"]["routing_hints"] .as_array() .unwrap() @@ -491,6 +500,138 @@ fn problem_assembly_returns_failure_for_default_critical_findings() { 
assert_all_outputs_exist(&outputs); } +#[test] +fn pipeline_gate_report_lists_blocking_and_advisory_findings() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "pipeline_gate"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/problem_assembly.fa", + "--gate", + "pipeline", + "--out", + ]) + .arg(&outputs.html) + .arg("--json") + .arg(&outputs.json) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2) + .stderr(predicate::str::contains("fastaguard error:").not()); + + let report = read_json(&outputs.json); + assert_eq!(report["schema_version"], json!("0.3.0")); + assert_eq!(report["gate"]["mode"], json!("pipeline")); + assert_eq!(report["gate"]["status"], json!("FAIL")); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "duplicate_ids" + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "invalid_chars" + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "high_n_rate" + )); + assert!(array_contains_string( + &report["gate"]["advisory_findings"], + "gap_runs" + )); + assert!(array_contains_string( + &report["gate"]["fail_on"], + "invalid_fasta_structure" + )); + assert_eq!( + report["provenance"]["input_sha256"], + json!(sha256_file(Path::new("testdata/problem_assembly.fa"))) + ); +} + +#[test] +fn html_report_shows_gate_decision() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "html_gate"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/problem_assembly.fa", + "--gate", + "pipeline", + "--out", + ]) + .arg(&outputs.html) + .arg("--json") + .arg(&outputs.json) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2) + .stderr(predicate::str::contains("fastaguard error:").not()); + + let html = std::fs::read_to_string(&outputs.html).unwrap(); 
+ assert!(html.contains("Gate Decision"), "{html}"); + assert!(html.contains("Blocking"), "{html}"); + assert!(html.contains("Advisory"), "{html}"); +} + +#[test] +fn gate_none_report_preserves_warning_behavior_and_checksum() { + let temp_dir = TempDir::new().unwrap(); + let outputs = output_paths(&temp_dir, "gate_none"); + + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args([ + "testdata/problem_assembly.fa", + "--gate", + "none", + "--fail-on", + "duplicate_ids,invalid_chars", + "--out", + ]) + .arg(&outputs.html) + .arg("--json") + .arg(&outputs.json) + .arg("--tsv") + .arg(&outputs.tsv) + .arg("--multiqc") + .arg(&outputs.multiqc) + .assert() + .code(2) + .stderr(predicate::str::contains("fastaguard error:").not()); + + let report = read_json(&outputs.json); + assert_eq!(report["gate"]["mode"], json!("none")); + assert_eq!(report["gate"]["status"], json!("FAIL")); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "duplicate_ids" + )); + assert!(array_contains_string( + &report["gate"]["blocking_findings"], + "invalid_chars" + )); + assert!(array_contains_string( + &report["gate"]["advisory_findings"], + "high_n_rate" + )); + assert_eq!( + report["gate"]["fail_on"], + json!(["duplicate_ids", "invalid_chars"]) + ); + assert_eq!( + report["provenance"]["input_sha256"], + json!(sha256_file(Path::new("testdata/problem_assembly.fa"))) + ); +} + #[test] fn problem_assembly_json_matches_golden_contract() { let paths = golden_output_paths("problem_assembly"); @@ -753,6 +894,15 @@ fn unsupported_profile_is_tool_error() { .stderr(predicate::str::contains("unsupported profile")); } +#[test] +fn unknown_gate_value_is_cli_error() { + let mut cmd = Command::cargo_bin("fastaguard").unwrap(); + cmd.args(["testdata/valid_assembly.fa", "--gate", "strict"]) + .assert() + .failure() + .stderr(predicate::str::contains("invalid value 'strict'")); +} + #[test] fn invalid_provenance_timestamp_override_is_tool_error() { let mut cmd = 
Command::cargo_bin("fastaguard").unwrap(); @@ -830,6 +980,13 @@ fn read_json(path: &Path) -> Value { serde_json::from_str(&std::fs::read_to_string(path).unwrap()).unwrap() } +fn sha256_file(path: &Path) -> String { + let mut hasher = Sha256::new(); + let bytes = std::fs::read(path).unwrap(); + hasher.update(bytes); + hex::encode(hasher.finalize()) +} + fn balanced_sequence(length: usize) -> String { "ACGT" .repeat(length.div_ceil(4)) diff --git a/tests/golden/invalid_empty_record.json b/tests/golden/invalid_empty_record.json index 0406fec..c399aa3 100644 --- a/tests/golden/invalid_empty_record.json +++ b/tests/golden/invalid_empty_record.json @@ -1,8 +1,8 @@ { - "schema_version": "0.2.0", + "schema_version": "0.3.0", "tool": { "name": "FastaGuard", - "version": "0.2.0" + "version": "0.3.0" }, "input": { "path": "testdata/invalid_empty_record.fa", @@ -15,6 +15,15 @@ "invalid_fasta_structure" ] }, + "gate": { + "mode": "none", + "status": "FAIL", + "blocking_findings": [ + "invalid_fasta_structure" + ], + "advisory_findings": [], + "fail_on": [] + }, "machine_summary": { "verdict": "FAIL", "safe_for_downstream": false, @@ -66,7 +75,8 @@ "started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 7 + "input_size_bytes": 7, + "input_sha256": "fbb5d90b2f96e8df5e849aed066b203e2205460da3fd971e4abeec471adbdf25" }, "summary": { "sequence_count": 0, diff --git a/tests/golden/problem_assembly.json b/tests/golden/problem_assembly.json index 09b0e05..824e36b 100644 --- a/tests/golden/problem_assembly.json +++ b/tests/golden/problem_assembly.json @@ -1,8 +1,8 @@ { - "schema_version": "0.2.0", + "schema_version": "0.3.0", "tool": { "name": "FastaGuard", - "version": "0.2.0" + "version": "0.3.0" }, "input": { "path": "testdata/problem_assembly.fa", @@ -16,6 +16,22 @@ "invalid_chars" ] }, + "gate": { + "mode": "none", + "status": "FAIL", + "blocking_findings": [ + "duplicate_ids", + "invalid_chars" + ], + 
"advisory_findings": [ + "high_n_rate", + "tiny_contigs", + "gap_runs", + "length_outliers", + "composite_anomalies" + ], + "fail_on": [] + }, "machine_summary": { "verdict": "FAIL", "safe_for_downstream": false, @@ -106,7 +122,8 @@ "started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 187 + "input_size_bytes": 187, + "input_sha256": "4b8551daeda739b62c8e7aaa1ebf300e4118167ca582c51eeccfa1549c96f0a5" }, "summary": { "sequence_count": 5, diff --git a/tests/golden/valid_assembly.json b/tests/golden/valid_assembly.json index b5e198d..7fa6b75 100644 --- a/tests/golden/valid_assembly.json +++ b/tests/golden/valid_assembly.json @@ -1,8 +1,8 @@ { - "schema_version": "0.2.0", + "schema_version": "0.3.0", "tool": { "name": "FastaGuard", - "version": "0.2.0" + "version": "0.3.0" }, "input": { "path": "testdata/valid_assembly.fa", @@ -13,6 +13,13 @@ "status": "PASS", "reasons": [] }, + "gate": { + "mode": "none", + "status": "PASS", + "blocking_findings": [], + "advisory_findings": [], + "fail_on": [] + }, "machine_summary": { "verdict": "PASS", "safe_for_downstream": true, @@ -64,7 +71,8 @@ "started_at": "2026-05-23T00:00:00Z", "completed_at": "2026-05-23T00:00:00Z", "duration_ms": 0, - "input_size_bytes": 92 + "input_size_bytes": 92, + "input_sha256": "373699c0422b364607fc6879c46c053c105413c81df9403998d3d26a7122d2e8" }, "summary": { "sequence_count": 3, diff --git a/tests/python/test_adoption_assets.py b/tests/python/test_adoption_assets.py index c4fd604..48dc7f4 100644 --- a/tests/python/test_adoption_assets.py +++ b/tests/python/test_adoption_assets.py @@ -15,6 +15,70 @@ class AdoptionAssetsTest(unittest.TestCase): + def test_v0_3_gate_docs_and_examples_are_present(self): + readme = (ROOT / "README.md").read_text() + output_contract = (ROOT / "docs" / "output-contract.md").read_text() + nf_core_module = ( + ROOT + / "examples" + / "nf-core" + / "modules" + / "local" + / "fastaguard" + / "main.nf" + 
).read_text() + snakemake = (ROOT / "examples" / "snakemake" / "Snakefile").read_text() + + self.assertIn("--gate pipeline", readme) + self.assertIn("The assembly FASTA gate before expensive QC.", readme) + self.assertIn('"gate"', output_contract) + self.assertIn("provenance.input_sha256", output_contract) + self.assertIn("--gate pipeline", nf_core_module) + self.assertIn("--gate pipeline", snakemake) + self.assertIn( + '"blocking_findings": ["duplicate_ids", "invalid_chars", "high_n_rate"]', + output_contract, + ) + self.assertIn( + '"command": "fastaguard sample.fa --profile assembly --gate pipeline"', + output_contract, + ) + self.assertIn('"duplicate_id_count": 1', output_contract) + self.assertIn('"invalid_sequence_count": 1', output_contract) + + def test_v0_3_gate_examples_do_not_pin_v0_2_runtimes(self): + nf_core_module = ( + ROOT + / "examples" + / "nf-core" + / "modules" + / "local" + / "fastaguard" + / "main.nf" + ).read_text() + wrapper_env = ( + ROOT / "examples" / "snakemake" / "wrapper" / "environment.yaml" + ).read_text() + wrapper_py = ( + ROOT + / "examples" + / "snakemake" + / "wrapper" + / "wrapper" + / "fastaguard" + / "wrapper.py" + ).read_text() + nf_core_readme = (ROOT / "examples" / "nf-core" / "README.md").read_text() + snakemake_readme = ( + ROOT / "examples" / "snakemake" / "wrapper" / "README.md" + ).read_text() + + self.assertNotIn("0.2.0--", nf_core_module) + self.assertNotIn("fastaguard=0.2.0", wrapper_env) + self.assertIn("--gate {gate}", wrapper_py) + self.assertIn("Gate failures intentionally exit with code `2`", nf_core_readme) + self.assertIn("Gate failures intentionally exit with code `2`", snakemake_readme) + def test_multiqc_parser_reads_fastaguard_custom_content(self): fixture = ROOT / "examples" / "reports" / "assembly_pass" / "fastaguard_mqc.json" @@ -34,6 +98,9 @@ def test_multiqc_parser_reads_expanded_fields_from_cli_example(self): summary["problem_assembly"], { "verdict": "FAIL", + "gate_mode": "none", + "gate_status": 
"FAIL", + "gate_blocking_findings": "duplicate_ids,invalid_chars", "sequence_count": 5, "total_length": 145, "n50": 110, @@ -110,6 +177,44 @@ def test_multiqc_parser_reads_expanded_summary_fields(self): }, ) + def test_multiqc_parser_preserves_gate_fields(self): + with TemporaryDirectory() as temp_dir: + fixture = Path(temp_dir) / "fastaguard_mqc.json" + fixture.write_text( + json.dumps( + { + "id": "fastaguard", + "section_name": "FastaGuard", + "description": "FASTA preflight QC summary", + "plot_type": "table", + "pconfig": {"id": "fastaguard_summary", "title": "FastaGuard"}, + "data": { + "sample": { + "verdict": "FAIL", + "sequence_count": 8, + "total_length": 2000, + "n50": 500, + "n90": 100, + "gc_percent": 50.0, + "n_percent": 2.5, + "finding_count": 4, + "gate_mode": "pipeline", + "gate_status": "FAIL", + "gate_blocking_findings": "duplicate_ids,high_n_rate", + } + }, + } + ) + ) + + summary = load_custom_content_summary(fixture) + self.assertEqual(summary["sample"]["gate_mode"], "pipeline") + self.assertEqual(summary["sample"]["gate_status"], "FAIL") + self.assertEqual( + summary["sample"]["gate_blocking_findings"], + "duplicate_ids,high_n_rate", + ) + def test_multiqc_parser_rejects_missing_required_summary_fields(self): with TemporaryDirectory() as temp_dir: fixture = Path(temp_dir) / "fastaguard_mqc.json" @@ -198,6 +303,21 @@ def test_multiqc_plugin_declares_module_entry_point(self): self.assertIn('fastaguard = "fastaguard_multiqc:MultiqcModule"', pyproject) self.assertIn("multiqc", pyproject) + def test_multiqc_plugin_summary_headers_include_gate_fields(self): + module_source = ( + ROOT + / "integrations" + / "multiqc" + / "src" + / "fastaguard_multiqc" + / "multiqc_module.py" + ).read_text() + + self.assertIn('"gate_mode"', module_source) + self.assertIn('"gate_status"', module_source) + self.assertIn('"gate_blocking_findings"', module_source) + self.assertIn("Finding IDs blocking the FastaGuard gate", module_source) + def 
test_multiqc_plugin_registers_filename_first_fastaguard_search_pattern(self): patterns = getattr(multiqc_parser, "FASTAGUARD_SEARCH_PATTERN", {}) fastaguard_patterns = patterns.get("fastaguard", []) @@ -286,10 +406,7 @@ def test_workflow_docs_reference_bioconda_and_container_status(self): "quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", nfcore_readme, ) - self.assertIn( - "quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", - nfcore_module, - ) + self.assertNotIn("0.2.0--", nfcore_module) self.assertIn( "quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", snakemake_readme, @@ -351,6 +468,17 @@ def test_evidence_docs_reference_local_and_public_workflows(self): "docs/evidence/fastaguard-v0.2-evidence.md", path.read_text() ) + def test_v0_3_evidence_docs_reference_gate_and_checksum(self): + evidence = ROOT / "docs" / "evidence" / "fastaguard-v0.3-evidence.md" + + self.assertTrue(evidence.exists()) + evidence_text = evidence.read_text() + self.assertIn("--gate pipeline", evidence_text) + self.assertIn("input_sha256", evidence_text) + self.assertIn("not biological completeness", evidence_text) + self.assertIn("not contamination confirmation", evidence_text) + self.assertIn("python3 scripts/collect_evidence.py", evidence_text) + def test_collect_evidence_local_only_smoke_does_not_require_network(self): with TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) @@ -363,6 +491,8 @@ def test_collect_evidence_local_only_smoke_does_not_require_network(self): args = sys.argv[1:] input_path = Path(args[0]) +if "--gate" not in args or args[args.index("--gate") + 1] != "pipeline": + raise SystemExit("unexpected gate mode") def option_path(flag): try: @@ -383,6 +513,12 @@ def option_path(flag): report = { "tool": {"name": "fastaguard", "version": "test"}, "verdict": {"status": "PASS"}, + "gate": { + "mode": "pipeline", + "status": "PASS", + "blocking_findings": [], + }, + "provenance": {"input_sha256": "0" * 64}, "summary": summary, "findings": [], } @@ -424,9 
+560,144 @@ def option_path(flag): self.assertTrue((out_dir / "evidence_summary.tsv").exists()) for case in summary["cases"]: self.assertEqual(case["verdict"], "PASS") + self.assertEqual(case["gate_mode"], "pipeline") + self.assertEqual(case["gate_status"], "PASS") + self.assertEqual(case["gate_blocking_findings"], "") + self.assertEqual(case["input_sha256"], "0" * 64) + self.assertIn("--gate pipeline", case["command"]) self.assertGreater(case["elapsed_seconds"], 0) self.assertIn("command", case) + def test_collect_evidence_rejects_reports_without_gate_contract(self): + with TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + fake_binary = temp_path / "old_fastaguard.py" + fake_binary.write_text( + """#!/usr/bin/env python3 +import json +import sys +from pathlib import Path + +args = sys.argv[1:] +input_path = Path(args[0]) + +def option_path(flag): + try: + return Path(args[args.index(flag) + 1]) + except ValueError: + return None + +json_path = option_path("--json") +html_path = option_path("--out") +tsv_path = option_path("--tsv") +multiqc_path = option_path("--multiqc") +summary = { + "sequence_count": 1, + "total_length": input_path.stat().st_size, + "n50": input_path.stat().st_size, + "n90": input_path.stat().st_size, +} +report = { + "tool": {"name": "fastaguard", "version": "old"}, + "verdict": {"status": "PASS"}, + "summary": summary, + "findings": [], +} +json_path.write_text(json.dumps(report)) +html_path.write_text("fake") +tsv_path.write_text("metric\\tvalue\\n") +multiqc_path.write_text(json.dumps({"id": "fastaguard", "data": {}})) +""" + ) + fake_binary.chmod(fake_binary.stat().st_mode | 0o111) + + completed = subprocess.run( + [ + sys.executable, + str(ROOT / "scripts" / "collect_evidence.py"), + "--binary", + str(fake_binary), + "--out-dir", + str(temp_path / "evidence"), + "--local-only", + ], + cwd=ROOT, + capture_output=True, + text=True, + check=False, + ) + + self.assertNotEqual(completed.returncode, 0) + self.assertIn("missing 
gate", completed.stderr) + + def test_collect_evidence_rejects_non_pipeline_gate_reports(self): + with TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + fake_binary = temp_path / "wrong_gate_fastaguard.py" + fake_binary.write_text( + """#!/usr/bin/env python3 +import json +import sys +from pathlib import Path + +args = sys.argv[1:] +input_path = Path(args[0]) + +def option_path(flag): + try: + return Path(args[args.index(flag) + 1]) + except ValueError: + return None + +json_path = option_path("--json") +html_path = option_path("--out") +tsv_path = option_path("--tsv") +multiqc_path = option_path("--multiqc") +summary = { + "sequence_count": 1, + "total_length": input_path.stat().st_size, + "n50": input_path.stat().st_size, + "n90": input_path.stat().st_size, +} +report = { + "tool": {"name": "fastaguard", "version": "test"}, + "verdict": {"status": "PASS"}, + "gate": { + "mode": "none", + "status": "PASS", + "blocking_findings": [], + }, + "provenance": {"input_sha256": "0" * 64}, + "summary": summary, + "findings": [], +} +json_path.write_text(json.dumps(report)) +html_path.write_text("fake") +tsv_path.write_text("metric\\tvalue\\n") +multiqc_path.write_text(json.dumps({"id": "fastaguard", "data": {}})) +""" + ) + fake_binary.chmod(fake_binary.stat().st_mode | 0o111) + + completed = subprocess.run( + [ + sys.executable, + str(ROOT / "scripts" / "collect_evidence.py"), + "--binary", + str(fake_binary), + "--out-dir", + str(temp_path / "evidence"), + "--local-only", + ], + cwd=ROOT, + capture_output=True, + text=True, + check=False, + ) + + self.assertNotEqual(completed.returncode, 0) + self.assertIn("expected gate.mode pipeline", completed.stderr) + def test_deep_release_vision_is_documented_and_memorized(self): vision = (ROOT / "docs" / "vision-plan.md").read_text() memory = (ROOT / "AGENTS.md").read_text() @@ -468,7 +739,7 @@ def test_snakemake_wrapper_declares_bioconda_environment(self): " - conda-forge", " - bioconda", "dependencies:", - " - 
fastaguard=0.2.0", + " - fastaguard=0.3.0", ], ) self.assertIn('conda: "environment.yaml"', snakefile.read_text()) diff --git a/tests/python/test_release_metadata.py b/tests/python/test_release_metadata.py index 77ebefd..f67c3b7 100644 --- a/tests/python/test_release_metadata.py +++ b/tests/python/test_release_metadata.py @@ -10,11 +10,14 @@ class ReleaseMetadataTest(unittest.TestCase): - def test_package_and_bioconda_recipe_target_v0_2_0(self): + def test_package_targets_v0_3_0(self): cargo = tomllib.loads((ROOT / "Cargo.toml").read_text()) + + self.assertEqual(cargo["package"]["version"], "0.3.0") + + def test_bioconda_recipe_remains_on_published_v0_2_0_archive(self): recipe = (ROOT / "packaging" / "bioconda" / "meta.yaml").read_text() - self.assertEqual(cargo["package"]["version"], "0.2.0") self.assertIn('{% set version = "0.2.0" %}', recipe) def test_v0_2_0_release_notes_exist(self): @@ -29,13 +32,22 @@ def test_v0_2_0_release_notes_exist(self): self.assertIn("v0.2.0 GitHub release binaries and source archive", text) self.assertIn("quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", text) + def test_v0_3_0_release_notes_exist(self): + notes = ROOT / "docs" / "releases" / "v0.3.0.md" + + self.assertTrue(notes.exists()) + text = notes.read_text() + self.assertIn("FastaGuard v0.3.0", text) + self.assertIn("Evidence And Assembly Gate", text) + self.assertIn("--gate pipeline", text) + self.assertIn("input_sha256", text) + def test_bioconda_recipe_has_publishable_v0_2_0_source_sha(self): - cargo = tomllib.loads((ROOT / "Cargo.toml").read_text()) recipe = (ROOT / "packaging" / "bioconda" / "meta.yaml").read_text() marker = "REPLACE" + "_WITH_" - self.assertEqual(cargo["package"]["version"], "0.2.0") self.assertTrue((ROOT / "docs" / "releases" / "v0.2.0.md").exists()) + self.assertIn('{% set version = "0.2.0" %}', recipe) self.assertNotIn(marker, recipe) match = re.search(r"sha256: ([a-f0-9]{64})", recipe) diff --git a/tests/schema_contract.rs 
b/tests/schema_contract.rs index 02b2c33..135e4b2 100644 --- a/tests/schema_contract.rs +++ b/tests/schema_contract.rs @@ -34,6 +34,29 @@ fn schema_requires_emitted_finding_taxonomy_fields() { assert!(required.contains(&serde_json::json!("requires_followup_tool"))); } +#[test] +fn schema_requires_gate_and_input_sha256() { + let schema = read_json(Path::new("schema/fastaguard.schema.json")); + let report_required = schema["required"].as_array().unwrap(); + let gate_required = schema["properties"]["gate"]["required"].as_array().unwrap(); + let provenance_required = schema["properties"]["provenance"]["required"] + .as_array() + .unwrap(); + + assert_eq!(schema["properties"]["schema_version"]["const"], "0.3.0"); + assert!(report_required.contains(&serde_json::json!("gate"))); + assert!(gate_required.contains(&serde_json::json!("mode"))); + assert!(gate_required.contains(&serde_json::json!("status"))); + assert!(gate_required.contains(&serde_json::json!("blocking_findings"))); + assert!(gate_required.contains(&serde_json::json!("advisory_findings"))); + assert!(gate_required.contains(&serde_json::json!("fail_on"))); + assert!(provenance_required.contains(&serde_json::json!("input_sha256"))); + assert_eq!( + schema["properties"]["provenance"]["properties"]["input_sha256"]["pattern"], + "^[a-f0-9]{64}$" + ); +} + #[test] fn freshly_generated_outlier_report_validates_against_json_schema() { let temp_dir = TempDir::new().unwrap();