diff --git a/.github/conventional-commit-baseline.txt b/.github/conventional-commit-baseline.txt index f288cbe2..d2882890 100644 --- a/.github/conventional-commit-baseline.txt +++ b/.github/conventional-commit-baseline.txt @@ -4,3 +4,4 @@ 035bb71cd9059b571e18bbd69059e6dbd80f2b80 # PR #100 split/profile-branch-foundation c4d4ce4c427988701cde38915f4f3a5b5fa7d52d # PR #101 split/mcp-tool-surface 5b4eeb1b9d279bfe64f289a80292bfd83fd852a1 # PR #102 split/daemon-transport +deb256e4276da2c7be8729a4db767e56ebffd4ca # PR #178 subject 86 chars; published before lint caught it, unrewritable without force-push diff --git a/src/automation/lifecycle.rs b/src/automation/lifecycle.rs index 269332f8..fb989a77 100644 --- a/src/automation/lifecycle.rs +++ b/src/automation/lifecycle.rs @@ -40,6 +40,11 @@ pub(crate) struct AgentTaskRunContext<'a> { config: &'a AutomationConfig, task: AgentTaskKind, started_at: String, + /// Ledger records loaded once by [`Self::gate`] on the scheduler path. + /// Both gate-level and post-gate skips compute their repeat-skip dedup + /// from these cached records, so the append path never re-reads the + /// ledger. + ledger_records: Option>, } impl<'a> AgentTaskRunContext<'a> { @@ -58,6 +63,7 @@ impl<'a> AgentTaskRunContext<'a> { config, task, started_at: current_timestamp().to_string(), + ledger_records: None, } } @@ -65,8 +71,11 @@ impl<'a> AgentTaskRunContext<'a> { &self.started_at } - pub(crate) async fn gate(&self) -> Result { - task_run_gate(self.config, &self.dashboard_root, self.task, self.trigger).await + pub(crate) async fn gate(&mut self) -> Result { + let (gate, records) = + task_run_gate(self.config, &self.dashboard_root, self.task, self.trigger).await?; + self.ledger_records = records; + Ok(gate) } pub(crate) async fn skipped_parts( @@ -85,10 +94,23 @@ impl<'a> AgentTaskRunContext<'a> { reason, self.started_at(), report_task_key, + self.scheduler_skip_is_repeat(reason), ) .await } + /// Computes the repeat-skip dedup decision from the records cached by + /// [`Self::gate`], with no ledger I/O. A scheduler-trigger context whose + /// gate has not run yet has no cached records and conservatively persists + /// the skip. + fn scheduler_skip_is_repeat(&self, reason: &str) -> bool { + self.trigger == AutomationTrigger::Scheduler + && self + .ledger_records + .as_deref() + .is_some_and(|records| is_repeat_scheduler_skip(records, self.task, reason)) + } + pub(crate) fn finalizer(&self, input_hash: Option) -> AgentRunFinalizer<'_> { AgentRunFinalizer::new( &self.dashboard_root, @@ -121,17 +143,21 @@ pub(crate) fn task_skip_reason( None } +/// Evaluates the scheduler gate, returning the ledger records it loaded so +/// callers can reuse them for skip dedup instead of re-reading the ledger. +/// The ledger is read at most once per gate evaluation. pub(crate) async fn scheduler_gate( config: &AutomationConfig, dashboard_root: &Path, task: AgentTaskKind, trigger: AutomationTrigger, -) -> Result { +) -> Result<(SchedulerGate, Option>)> { if trigger != AutomationTrigger::Scheduler { - return Ok(SchedulerGate::Proceed(None)); + return Ok((SchedulerGate::Proceed(None), None)); } let now_secs = current_timestamp(); + let records = load_run_records(dashboard_root, 200).await?; let Some(lock) = AutomationTaskLock::try_acquire( dashboard_root, task, @@ -140,16 +166,15 @@ pub(crate) async fn scheduler_gate( ) .await? else { - return Ok(SchedulerGate::Skip("scheduler_lock_active")); + return Ok((SchedulerGate::Skip("scheduler_lock_active"), Some(records))); }; - let records = load_run_records(dashboard_root, 200).await?; let decision = schedule_decision(config, task, &records, now_secs); if let Some(reason) = scheduler_skip_reason(&decision, task) { - return Ok(SchedulerGate::Skip(reason)); + return Ok((SchedulerGate::Skip(reason), Some(records))); } - Ok(SchedulerGate::Proceed(Some(lock))) + Ok((SchedulerGate::Proceed(Some(lock)), Some(records))) } pub(crate) async fn task_run_gate( @@ -157,16 +182,21 @@ pub(crate) async fn task_run_gate( dashboard_root: &Path, task: AgentTaskKind, trigger: AutomationTrigger, -) -> Result { - match scheduler_gate(config, dashboard_root, task, trigger).await? { - SchedulerGate::Skip(reason) => Ok(SchedulerGate::Skip(reason)), +) -> Result<(SchedulerGate, Option>)> { + let (gate, records) = scheduler_gate(config, dashboard_root, task, trigger).await?; + let gate = match gate { + SchedulerGate::Skip(reason) => SchedulerGate::Skip(reason), SchedulerGate::Proceed(lock) => match task_skip_reason(config, task) { - Some(reason) => Ok(SchedulerGate::Skip(reason)), - None => Ok(SchedulerGate::Proceed(lock)), + Some(reason) => SchedulerGate::Skip(reason), + None => SchedulerGate::Proceed(lock), }, - } + }; + Ok((gate, records)) } +/// Appends a skipped run record unless the caller already determined it is a +/// repeat scheduler skip. Performs no ledger reads: `is_repeat` must be +/// computed from the records the gate evaluation loaded. #[allow(clippy::too_many_arguments)] pub(crate) async fn append_skipped_record( dashboard_root: &Path, @@ -177,6 +207,7 @@ pub(crate) async fn append_skipped_record( evidence_hash: Option, reason: &str, started_at: &str, + is_repeat: bool, ) -> Result { let record = ledger_record( run_id, @@ -191,10 +222,41 @@ pub(crate) async fn append_skipped_record( Some(reason.to_string()), started_at, ); + // Scheduler ticks re-evaluate every task every few seconds, so a standing + // skip condition (interval not elapsed, task disabled, ...) would append + // thousands of identical records and drown real runs out of the ledger. + // Persist only the first record of each consecutive identical skip. + // + // The gate's ledger read and this append are not atomic: two concurrent + // writers can both observe no prior skip and each append the "first" + // record. The duplicate is benign, so no cross-process locking is done. + if trigger == AutomationTrigger::Scheduler && is_repeat { + return Ok(record); + } append_run_record(dashboard_root, &record).await?; Ok(record) } +/// True when the most recent ledger record for `task` is already a scheduler +/// skip with the same reason. +/// +/// The skip reason is read out of `record.error`, inheriting the pre-existing +/// modeling wart that skipped runs store their reason in the error field. +fn is_repeat_scheduler_skip( + records: &[AutomationRunLedgerRecord], + task: AgentTaskKind, + reason: &str, +) -> bool { + records + .iter() + .find(|record| record.task == task) + .is_some_and(|record| { + record.trigger == AutomationTrigger::Scheduler + && record.status == AutomationRunStatus::Skipped + && record.error.as_deref() == Some(reason) + }) +} + #[allow(clippy::too_many_arguments)] pub(crate) async fn skipped_run_parts( dashboard_root: &Path, @@ -206,6 +268,7 @@ pub(crate) async fn skipped_run_parts( reason: &str, started_at: &str, report_task_key: Option<&'static str>, + is_repeat: bool, ) -> Result<(Value, AutomationRunLedgerRecord)> { let mut report = json!({ "status": "skipped", @@ -226,6 +289,7 @@ pub(crate) async fn skipped_run_parts( evidence_hash, reason, started_at, + is_repeat, ) .await?; Ok((report, record)) @@ -605,7 +669,9 @@ fn noop_output_for_task(task: AgentTaskKind) -> Value { } #[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used)] mod tests { + use super::super::config::{AutomationTaskConfig, AutomationTaskSet}; use super::*; #[test] @@ -615,4 +681,229 @@ mod tests { assert_ne!(first, second); } + + /// Runs the production skip path: evaluate the gate (which caches ledger + /// records for scheduler triggers) and then record the skip, exactly as a + /// gate-level skip does in the task runners. + async fn append_skip( + dashboard_root: &Path, + run_id: &str, + trigger: AutomationTrigger, + task: AgentTaskKind, + reason: &str, + ) -> AutomationRunLedgerRecord { + let config = AutomationConfig::default(); + let mut run = AgentTaskRunContext::new( + dashboard_root.to_path_buf(), + Some(run_id.to_string()), + "test", + trigger, + &config, + task, + ); + run.gate().await.expect("gate"); + let (_report, record) = run + .skipped_parts(None, reason, None) + .await + .expect("append skipped record"); + record + } + + #[tokio::test] + async fn consecutive_identical_scheduler_skips_persist_once() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + let task = AgentTaskKind::MemoryCurator; + + append_skip( + root, + "run-1", + AutomationTrigger::Scheduler, + task, + "scheduler_interval_not_elapsed", + ) + .await; + append_skip( + root, + "run-2", + AutomationTrigger::Scheduler, + task, + "scheduler_interval_not_elapsed", + ) + .await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!( + records.len(), + 1, + "repeat scheduler skip must not append a second record" + ); + assert_eq!(records[0].run_id, "run-1"); + } + + #[tokio::test] + async fn scheduler_skips_with_new_reason_or_task_still_persist() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + + append_skip( + root, + "run-1", + AutomationTrigger::Scheduler, + AgentTaskKind::MemoryCurator, + "scheduler_interval_not_elapsed", + ) + .await; + append_skip( + root, + "run-2", + AutomationTrigger::Scheduler, + AgentTaskKind::MemoryCurator, + "scheduler_cooldown_active", + ) + .await; + append_skip( + root, + "run-3", + AutomationTrigger::Scheduler, + AgentTaskKind::SessionReflector, + "scheduler_interval_not_elapsed", + ) + .await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!(records.len(), 3, "distinct skip conditions must persist"); + } + + #[tokio::test] + async fn manual_trigger_skips_always_persist() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + let task = AgentTaskKind::SkillWriter; + + append_skip( + root, + "run-1", + AutomationTrigger::ManualCli, + task, + "skill_writer_disabled", + ) + .await; + append_skip( + root, + "run-2", + AutomationTrigger::ManualCli, + task, + "skill_writer_disabled", + ) + .await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!(records.len(), 2, "manual skips must always be recorded"); + } + + fn scheduler_enabled_config() -> AutomationConfig { + AutomationConfig { + enabled: true, + backend: AutomationBackend::CodexAppServer, + host_mode: AutomationHostMode::Standalone, + tasks: AutomationTaskSet { + memory_curator: AutomationTaskConfig { + enabled: true, + schedule: Some("hourly".to_string()), + ..AutomationTaskConfig::default() + }, + ..AutomationTaskSet::default() + }, + ..AutomationConfig::default() + } + } + + /// Runs the production post-gate skip path: the gate proceeds (caching + /// ledger records), and the task body later decides to skip. + async fn post_gate_scheduler_skip(dashboard_root: &Path, run_id: &str, reason: &str) { + let config = scheduler_enabled_config(); + let mut run = AgentTaskRunContext::new( + dashboard_root.to_path_buf(), + Some(run_id.to_string()), + "test", + AutomationTrigger::Scheduler, + &config, + AgentTaskKind::MemoryCurator, + ); + let SchedulerGate::Proceed(lock) = run.gate().await.expect("gate") else { + panic!("gate must proceed so the skip is decided post-gate"); + }; + run.skipped_parts(None, reason, None) + .await + .expect("append post-gate skip"); + drop(lock); + } + + #[tokio::test] + async fn consecutive_identical_post_gate_scheduler_skips_persist_once() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + + post_gate_scheduler_skip(root, "run-1", "nothing_to_review").await; + post_gate_scheduler_skip(root, "run-2", "nothing_to_review").await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!( + records.len(), + 1, + "repeat post-gate scheduler skip must not append a second record" + ); + assert_eq!(records[0].run_id, "run-1"); + } + + #[tokio::test] + async fn append_path_relies_solely_on_caller_computed_repeat_flag() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + let config = AutomationConfig::default(); + let task = AgentTaskKind::MemoryCurator; + + // Both identical scheduler skips persist when the caller reports + // is_repeat=false, even though the second is a repeat on disk: the + // append path must not perform its own ledger read to second-guess + // the flag computed from the gate's cached records. + for run_id in ["run-1", "run-2"] { + append_skipped_record( + root, + run_id, + AutomationTrigger::Scheduler, + &config, + task, + None, + "scheduler_interval_not_elapsed", + "1000", + false, + ) + .await + .expect("append skipped record"); + } + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!( + records.len(), + 2, + "append path must trust the caller-computed repeat flag" + ); + + append_skipped_record( + root, + "run-3", + AutomationTrigger::Scheduler, + &config, + task, + None, + "scheduler_interval_not_elapsed", + "1000", + true, + ) + .await + .expect("append skipped record"); + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!(records.len(), 2, "is_repeat=true must suppress the append"); + } } diff --git a/src/automation/memory_curator.rs b/src/automation/memory_curator.rs index bd7952a0..339459a8 100644 --- a/src/automation/memory_curator.rs +++ b/src/automation/memory_curator.rs @@ -51,7 +51,7 @@ pub async fn run_memory_curator_with_backend( backend: &dyn AgentTaskBackend, options: MemoryCuratorAutomationOptions, ) -> Result { - let run = AgentTaskRunContext::new( + let mut run = AgentTaskRunContext::new( cg.store_layout().dashboard_root.clone(), options.run_id.clone(), "memory_curator", diff --git a/src/automation/runner.rs b/src/automation/runner.rs index 1906ae5e..477a3f52 100644 --- a/src/automation/runner.rs +++ b/src/automation/runner.rs @@ -173,7 +173,7 @@ pub async fn run_session_reflector_with_backend( backend: &dyn AgentTaskBackend, options: SessionReflectorAutomationOptions, ) -> Result { - let run = AgentTaskRunContext::new( + let mut run = AgentTaskRunContext::new( cg.store_layout().dashboard_root.clone(), options.run_id.clone(), "session_reflector", @@ -429,7 +429,7 @@ pub async fn run_skill_writer_with_backend( backend: &dyn AgentTaskBackend, options: SkillWriterAutomationOptions, ) -> Result { - let run = AgentTaskRunContext::new( + let mut run = AgentTaskRunContext::new( cg.store_layout().dashboard_root.clone(), options.run_id.clone(), "skill_writer", diff --git a/src/automation/session_reflector.rs b/src/automation/session_reflector.rs index 1172d085..e6fc0374 100644 --- a/src/automation/session_reflector.rs +++ b/src/automation/session_reflector.rs @@ -4,6 +4,7 @@ use serde_json::{json, Value}; use crate::errors::Result; use crate::memory::retrieval::FactRetriever; +use crate::memory::trust::{DEFAULT_TRUST, HIGH_TRUST_REPRESENTATIVE, LOW_TRUST_REPRESENTATIVE}; use crate::memory::types::{AddFactRequest, MemoryCategory}; use crate::tracedecay::TraceDecay; @@ -129,12 +130,14 @@ async fn validate_fact_proposal( "entities must be an array of strings", )); }; - let trust = match object.get("trust") { - Some(value) => match value.as_f64() { - Some(trust) if (0.0..=1.0).contains(&trust) => Some(trust), - _ => return Ok(rejected_fact(proposal, "trust must be between 0 and 1")), - }, - None => return Ok(rejected_fact(proposal, "trust is required")), + let Some(trust) = object.get("trust") else { + return Ok(rejected_fact(proposal, "trust is required")); + }; + let Some(trust) = proposal_trust_value(trust) else { + return Ok(rejected_fact( + proposal, + "trust must be a number between 0 and 1, or one of low, medium, high", + )); }; if object.contains_key("confidence") { return Ok(rejected_fact( @@ -204,7 +207,7 @@ async fn validate_fact_proposal( source: Some("session_reflector".to_string()), tags, entities, - trust, + trust: Some(trust), metadata: json!({ "source": "session_reflector", "source_span": source_span, @@ -229,6 +232,28 @@ async fn validate_fact_proposal( }))) } +/// Accepts numeric trust in `[0, 1]` plus the `low`/`medium`/`high` bucket +/// labels models frequently emit despite the numeric prompt instruction. +/// Buckets map to the representative scores defined next to +/// [`crate::memory::trust::trust_bucket`], so they cannot drift out of their +/// documented ranges. +/// +/// Deliberate decision: the prompt forbids string labels, but they are +/// accepted defensively rather than rejecting an otherwise valid fact. Note +/// that the "high" representative can clear auto-apply thresholds when +/// `auto_apply_memory_ops` is enabled — this is intentional. +fn proposal_trust_value(value: &Value) -> Option { + if let Some(trust) = value.as_f64() { + return (0.0..=1.0).contains(&trust).then_some(trust); + } + match value.as_str()?.trim().to_ascii_lowercase().as_str() { + "low" => Some(LOW_TRUST_REPRESENTATIVE), + "medium" => Some(DEFAULT_TRUST), + "high" => Some(HIGH_TRUST_REPRESENTATIVE), + _ => None, + } +} + fn value_as_i64(value: &Value) -> Option { value .as_i64() diff --git a/src/automation_cli.rs b/src/automation_cli.rs index 2c1f3265..e09aa3f2 100644 --- a/src/automation_cli.rs +++ b/src/automation_cli.rs @@ -1,5 +1,6 @@ use crate::cli::*; -use crate::{parse_lcm_scope_arg, resolve_cli_project_root, tracedecay_bin_on_path}; +use crate::update_cmd::tracedecay_bin_on_path; +use crate::{parse_lcm_scope_arg, resolve_cli_project_root}; pub(crate) async fn handle_automation_command( action: AutomationAction, diff --git a/src/branch_meta.rs b/src/branch_meta.rs index 82f47288..08fcb25c 100644 --- a/src/branch_meta.rs +++ b/src/branch_meta.rs @@ -8,7 +8,7 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -const BRANCH_META_FILENAME: &str = "branch-meta.json"; +use crate::storage::BRANCH_META_FILENAME; /// Metadata for a single tracked branch. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -115,6 +115,17 @@ impl BranchMeta { } } +/// Parses `branch-meta.json` content into [`BranchMeta`]. +/// +/// This is the canonical definition of "corrupt branch metadata": anything +/// this rejects — invalid JSON *or* valid JSON with the wrong schema — makes +/// the runtime fall back to single-DB mode. Every consumer (loading at +/// runtime, quarantining in the post-update health pass) must go through this +/// one predicate so they agree on what corrupt means. +pub fn parse(content: &str) -> serde_json::Result { + serde_json::from_str(content) +} + /// Loads branch metadata from `branch-meta.json` in the project data dir. /// /// Returns `None` if the file doesn't exist (single-DB mode / pre-branch projects). @@ -122,7 +133,7 @@ impl BranchMeta { pub fn load_branch_meta(data_dir: &Path) -> Option { let path = data_dir.join(BRANCH_META_FILENAME); let content = std::fs::read_to_string(&path).ok()?; - match serde_json::from_str(&content) { + match parse(&content) { Ok(meta) => Some(meta), Err(e) => { eprintln!( @@ -215,6 +226,14 @@ mod tests { assert!(meta.remove_branch("main").is_none()); } + #[test] + fn parse_rejects_schema_mismatch_as_corrupt() { + assert!(parse(r#"{"default_branch":"main","branches":{}}"#).is_ok()); + assert!(parse("{not valid json").is_err()); + assert!(parse(r#"{"default_branch": 5}"#).is_err()); + assert!(parse("[]").is_err()); + } + #[test] fn roundtrip_json() { let mut meta = BranchMeta::new("main"); diff --git a/src/cli.rs b/src/cli.rs index 9e85ee5d..c274aed0 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -270,10 +270,18 @@ pub enum Commands { /// Download and install the latest version from GitHub Upgrade, /// Refresh the tracedecay binary, generated plugins, and daemon - Update, + Update { + /// Skip the post-update health pass (safe repairs + doctor summary) + #[arg(long)] + no_heal: bool, + }, /// Refresh plugins and daemon after the binary has been updated. #[command(name = "post-update", hide = true)] - PostUpdate, + PostUpdate { + /// Skip the post-update health pass (safe repairs + doctor summary) + #[arg(long)] + no_heal: bool, + }, /// Show or switch the update channel (stable or beta) Channel { /// Target channel: "stable" or "beta" (omit to show current) diff --git a/src/cli/parse_tests.rs b/src/cli/parse_tests.rs index ad6f2076..7cd404de 100644 --- a/src/cli/parse_tests.rs +++ b/src/cli/parse_tests.rs @@ -134,7 +134,10 @@ fn update_upgrade_and_update_plugin_parse_to_distinct_commands() { let update_plugin = Cli::try_parse_from(["tracedecay", "update-plugin"]).expect("update-plugin should parse"); - assert!(matches!(update.command, Some(Commands::Update))); + assert!(matches!( + update.command, + Some(Commands::Update { no_heal: false }) + )); assert!(matches!(upgrade.command, Some(Commands::Upgrade))); assert!(matches!( update_plugin.command, @@ -142,6 +145,29 @@ fn update_upgrade_and_update_plugin_parse_to_distinct_commands() { )); } +#[test] +fn update_and_post_update_parse_no_heal_flag() { + let update = Cli::try_parse_from(["tracedecay", "update", "--no-heal"]) + .expect("update --no-heal should parse"); + let post_update = Cli::try_parse_from(["tracedecay", "post-update", "--no-heal"]) + .expect("post-update --no-heal should parse"); + let post_update_default = Cli::try_parse_from(["tracedecay", "post-update"]) + .expect("post-update should parse without --no-heal"); + + assert!(matches!( + update.command, + Some(Commands::Update { no_heal: true }) + )); + assert!(matches!( + post_update.command, + Some(Commands::PostUpdate { no_heal: true }) + )); + assert!(matches!( + post_update_default.command, + Some(Commands::PostUpdate { no_heal: false }) + )); +} + #[test] fn update_help_describes_refresh_scope() { let help = Cli::command().render_long_help().to_string(); diff --git a/src/commands.rs b/src/commands.rs index 818908b9..6c1cba85 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -352,16 +352,12 @@ pub(crate) async fn handle_migrate_action(action: MigrateAction) -> tracedecay:: message: "could not open global DB for registry cleanup".to_string(), })?; let projects = global_db.list_code_projects(usize::MAX).await; - let prefix_path = prefix.as_deref().map(PathBuf::from); - let stale: Vec<_> = projects - .into_iter() - .filter(|project| { - prefix_path.as_ref().is_none_or(|prefix| { - PathBuf::from(&project.canonical_root).starts_with(prefix) - }) - }) - .filter(|project| !PathBuf::from(&project.canonical_root).exists()) - .collect(); + let prefixes: Vec = prefix.iter().map(PathBuf::from).collect(); + let stale = tracedecay::migrate::registry::stale_code_projects( + &projects, + &prefixes, + tracedecay::migrate::registry::StaleRootScope::CanonicalRootMissing, + ); let deleted = if apply { let project_ids: Vec = stale .iter() diff --git a/src/daemon.rs b/src/daemon.rs index 5f3627a9..0c962a3b 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -29,6 +29,12 @@ pub const SOCKET_ENV: &str = "TRACEDECAY_DAEMON_SOCKET"; pub const HOOK_EVENT_METHOD: &str = "tracedecay/hookEvent"; #[cfg(unix)] const HOOK_EVENT_NOTIFY_TIMEOUT: Duration = Duration::from_millis(750); +/// Upper bound on graceful-shutdown persistence work (per-server token +/// persistence and WAL checkpoints). Must stay comfortably below systemd's +/// stop timeout (90s by default) so the daemon exits cleanly instead of +/// being killed with `SIGKILL` mid-checkpoint. +#[cfg(unix)] +const DAEMON_SHUTDOWN_DEADLINE: Duration = Duration::from_secs(45); mod service; pub use service::{ @@ -987,7 +993,27 @@ async fn run_foreground_unix(socket_path: PathBuf) -> Result<()> { // will never be served. drop(listener); let _ = std::fs::remove_file(&socket_path); - engine.shutdown_all().await; + // Graceful shutdown persists tokens-saved counters and checkpoints WALs + // for every live project server sequentially; with many servers or large + // WALs that can exceed systemd's stop timeout, which then sends `SIGKILL` + // to the daemon. On timeout the shutdown future is dropped and we proceed + // to exit: the remaining persistence is best-effort and the database WAL + // keeps state crash-safe. + let completed = timeout(DAEMON_SHUTDOWN_DEADLINE, engine.shutdown_all()) + .await + .is_ok(); + if !completed { + log_daemon_event( + "daemon_shutdown", + &[ + ("outcome", "timeout".to_string()), + ( + "deadline_secs", + DAEMON_SHUTDOWN_DEADLINE.as_secs().to_string(), + ), + ], + ); + } Ok(()) } diff --git a/src/doctor.rs b/src/doctor.rs index 7285ceda..28b4b20e 100644 --- a/src/doctor.rs +++ b/src/doctor.rs @@ -7,9 +7,11 @@ use std::path::{Component, Path, PathBuf}; use crate::agents::{self, DoctorCounters, HealthcheckContext}; use crate::display::{format_bytes, format_token_count}; +use crate::migrate::registry::code_project_root_exists; use crate::storage::StoreLayout; use crate::tracedecay::{TraceDecay, TraceDecayOpenOptions}; +pub mod heal; mod registry_drift; /// Runs a comprehensive health check of the tracedecay installation. @@ -365,10 +367,6 @@ async fn check_stale_code_projects(dc: &mut DoctorCounters, gdb: &crate::global_ )); } -fn code_project_root_exists(project: &crate::global_db::CodeProjectRecord) -> bool { - Path::new(&project.canonical_root).exists() || Path::new(&project.display_root).exists() -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum DoctorStorageStatus { RepoLocal, @@ -525,17 +523,33 @@ fn check_orphan_store_manifests(dc: &mut DoctorCounters, project_paths: &[String let Some(profile_root) = crate::config::user_data_dir() else { return; }; + let (orphan_count, issues) = orphan_store_manifest_report(&profile_root, project_paths); + for issue in issues.iter().take(10) { + dc.warn(&format!("Store manifest issue: {issue}")); + } + if orphan_count > 0 { + dc.warn(&format!( + "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" + )); + dc.info(" Run `tracedecay migrate reconstruct --profile-root --apply` after review."); + } +} + +/// Counts profile store manifests with no matching registry row, plus any +/// manifest scan issues. Shared between `doctor` and the post-update health +/// pass. +pub(crate) fn orphan_store_manifest_report( + profile_root: &Path, + project_paths: &[String], +) -> (usize, Vec) { let registered: std::collections::HashSet = project_paths .iter() .map(|path| crate::global_db::GlobalDb::canonical_project_key(std::path::Path::new(path))) .collect(); let report = crate::migrate::registry::scan_profile_store_manifests( - &profile_root, + profile_root, crate::tracedecay::current_timestamp(), ); - for issue in report.issues.iter().take(10) { - dc.warn(&format!("Store manifest issue: {issue}")); - } let orphan_count = report .plans .iter() @@ -544,12 +558,7 @@ fn check_orphan_store_manifests(dc: &mut DoctorCounters, project_paths: &[String !registered.contains(&key) }) .count(); - if orphan_count > 0 { - dc.warn(&format!( - "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" - )); - dc.info(" Run `tracedecay migrate reconstruct --profile-root --apply` after review."); - } + (orphan_count, report.issues) } /// Check user config file. diff --git a/src/doctor/heal.rs b/src/doctor/heal.rs new file mode 100644 index 00000000..13490187 --- /dev/null +++ b/src/doctor/heal.rs @@ -0,0 +1,352 @@ +//! Post-update health pass: safe, automatic repairs plus a concise summary +//! of the doctor findings that still need a human decision. +//! +//! Runs at the end of `tracedecay update` / `tracedecay post-update`. Running +//! by default (opt-out via `--no-heal`) is intentional product policy: the +//! hidden `post-update` subcommand fires from the self-update re-exec path, +//! so every successful `tracedecay update` heals the store unless the user +//! explicitly skips it. Every step is failure-tolerant: a failing check +//! prints a warning but never fails the update itself. Only remedies that +//! are safe to automate are applied: +//! +//! - corrupt `branch-meta.json` files (anything [`crate::branch_meta::parse`] +//! rejects) are quarantined — renamed to +//! `branch-meta.json.corrupt-`, never deleted — preserving the +//! evidence while restoring the silent single-DB fallback, +//! - registry rows whose project root no longer exists AND lives under the +//! system temp directory are purged (the automated equivalent of +//! `tracedecay migrate registry-gc --prefix --apply`), and only when +//! BOTH the canonical and display roots are gone. +//! +//! Those auto-applied remedies are safe precisely because quarantine renames +//! instead of deleting and the GC removes only temp-rooted registry metadata +//! whose every known root has vanished — no user data is ever destroyed. +//! +//! Everything else (orphan store manifests, stale rows outside the temp +//! directory, registry/manifest identity drift) is only reported. + +use std::path::{Path, PathBuf}; + +use crate::global_db::{CodeProjectRecord, GlobalDb}; +use crate::migrate::registry::{code_project_root_exists, stale_code_projects, StaleRootScope}; +use crate::storage::{BRANCH_META_FILENAME, BRANCH_META_QUARANTINE_PREFIX}; + +/// A corrupt `branch-meta.json` that was renamed out of the way. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BranchMetaQuarantine { + pub original: PathBuf, + pub quarantined: PathBuf, +} + +/// Outcome of one post-update health pass. +#[derive(Debug, Default)] +pub struct HealthPassReport { + pub quarantined_branch_meta: Vec, + /// `None` when the global DB could not be opened, so the GC never ran. + pub purged_temp_registry_rows: Option, + pub remaining_findings: Vec, + pub warnings: Vec, +} + +/// Runs the full post-update health pass and prints a doctor-style summary. +/// +/// Never fails: every error is collected as a warning so a broken store can +/// never abort `tracedecay update`. +pub async fn run_post_update_health_pass() -> HealthPassReport { + eprintln!("\n\x1b[1mPost-update health pass\x1b[0m (skip with --no-heal)"); + + let Some(profile_root) = crate::config::user_data_dir() else { + let report = HealthPassReport { + warnings: vec!["could not determine the profile data directory".to_string()], + ..HealthPassReport::default() + }; + render_warnings(&report.warnings); + return report; + }; + + let report = compute_health_pass_report(&profile_root).await; + render_health_pass_report(&report); + report +} + +/// Applies the safe remedies and gathers everything the pass has to say into +/// a [`HealthPassReport`], without printing anything. +async fn compute_health_pass_report(profile_root: &Path) -> HealthPassReport { + let mut report = HealthPassReport::default(); + + let (quarantined, warnings) = quarantine_corrupt_branch_meta(profile_root); + report.quarantined_branch_meta = quarantined; + report.warnings.extend(warnings); + + // Opening the global DB applies its idempotent schema migrations — the + // same lazy upgrade every normal open path performs. + let Some(global_db) = GlobalDb::open().await else { + report + .warnings + .push("could not open the global DB for the health pass".to_string()); + return report; + }; + + // One registry snapshot for the whole pass: the GC and the remaining + // findings below both work from this list. + let projects = global_db.list_code_projects(usize::MAX).await; + let (purged, purged_ids) = gc_stale_temp_registry_rows(&global_db, &projects).await; + report.purged_temp_registry_rows = Some(purged); + + let (findings, warnings) = + collect_remaining_findings(&global_db, profile_root, &projects, &purged_ids).await; + report.remaining_findings = findings; + report.warnings.extend(warnings); + report +} + +/// Prints the doctor-style summary for a computed report. +fn render_health_pass_report(report: &HealthPassReport) { + if report.quarantined_branch_meta.is_empty() { + eprintln!(" \x1b[32m✔\x1b[0m No corrupt branch metadata files"); + } else { + eprintln!( + " \x1b[32m✔\x1b[0m Quarantined {} corrupt branch metadata file(s):", + report.quarantined_branch_meta.len() + ); + for quarantine in &report.quarantined_branch_meta { + eprintln!(" • {}", quarantine.quarantined.display()); + } + } + + match report.purged_temp_registry_rows { + Some(0) => eprintln!(" \x1b[32m✔\x1b[0m No stale temp-root registry rows"), + Some(purged) => { + eprintln!(" \x1b[32m✔\x1b[0m Purged {purged} stale temp-root registry row(s)"); + } + None => {} + } + + if report.remaining_findings.is_empty() { + eprintln!(" \x1b[32m✔\x1b[0m No remaining doctor findings"); + } else { + eprintln!(" Remaining findings (not auto-fixed — run `tracedecay doctor` for details):"); + for finding in &report.remaining_findings { + eprintln!(" • {finding}"); + } + } + render_warnings(&report.warnings); +} + +fn render_warnings(warnings: &[String]) { + for warning in warnings { + eprintln!(" \x1b[33mwarning:\x1b[0m health pass: {warning}"); + } +} + +/// Renames every `branch-meta.json` under `/projects/*` that +/// [`crate::branch_meta::parse`] rejects — the runtime's own definition of +/// corrupt, covering both invalid JSON and schema mismatches — to +/// `branch-meta.json.corrupt-`, preserving the corrupt content as +/// evidence while restoring the single-DB fallback. +/// +/// Returns the performed quarantines and any warnings. +fn quarantine_corrupt_branch_meta(profile_root: &Path) -> (Vec, Vec) { + let mut quarantines = Vec::new(); + let mut warnings = Vec::new(); + let projects_root = profile_root.join("projects"); + let Ok(entries) = std::fs::read_dir(&projects_root) else { + return (quarantines, warnings); + }; + let mut meta_paths: Vec = entries + .flatten() + .map(|entry| entry.path().join(BRANCH_META_FILENAME)) + .filter(|path| path.is_file()) + .collect(); + meta_paths.sort(); + + let now = crate::tracedecay::current_timestamp(); + for path in meta_paths { + let content = match std::fs::read_to_string(&path) { + Ok(content) => content, + Err(err) => { + warnings.push(format!("could not read '{}': {err}", path.display())); + continue; + } + }; + if crate::branch_meta::parse(&content).is_ok() { + continue; + } + let quarantined = path.with_file_name(format!("{BRANCH_META_QUARANTINE_PREFIX}{now}")); + match std::fs::rename(&path, &quarantined) { + Ok(()) => quarantines.push(BranchMetaQuarantine { + original: path, + quarantined, + }), + Err(err) => warnings.push(format!( + "could not quarantine corrupt '{}': {err}", + path.display() + )), + } + } + (quarantines, warnings) +} + +/// Purges registry rows in the auto-GC scope: canonical root under the +/// system temp directory AND every known root gone +/// ([`StaleRootScope::AllRootsMissing`]) — the only registry GC scope that is +/// safe to run without review. +/// +/// Returns the purged row count plus the candidate ids, so the remaining +/// findings can exclude them from the shared pre-purge registry snapshot. +async fn gc_stale_temp_registry_rows( + global_db: &GlobalDb, + projects: &[CodeProjectRecord], +) -> (usize, Vec) { + let stale_ids: Vec = stale_code_projects( + projects, + &temp_dir_prefixes(), + StaleRootScope::AllRootsMissing, + ) + .into_iter() + .map(|project| project.project_id.clone()) + .collect(); + if stale_ids.is_empty() { + return (0, stale_ids); + } + let purged = global_db.delete_code_projects(&stale_ids).await; + (purged, stale_ids) +} + +/// The system temp directory in both its literal and canonicalized spellings, +/// so registry rows recorded through a symlinked temp path still match. +fn temp_dir_prefixes() -> Vec { + let temp_dir = std::env::temp_dir(); + let mut prefixes = vec![temp_dir.clone()]; + if let Ok(canonical) = temp_dir.canonicalize() { + if !prefixes.contains(&canonical) { + prefixes.push(canonical); + } + } + prefixes +} + +/// Summarizes the doctor findings that are NOT safe to auto-apply so the user +/// sees them at the end of `tracedecay update` output. `projects` is the +/// pre-purge registry snapshot; rows in `purged_ids` are skipped. +/// +/// Returns the findings and any warnings. +async fn collect_remaining_findings( + global_db: &GlobalDb, + profile_root: &Path, + projects: &[CodeProjectRecord], + purged_ids: &[String], +) -> (Vec, Vec) { + let mut findings = Vec::new(); + let project_paths = global_db.list_project_paths().await; + let (orphan_count, warnings) = + super::orphan_store_manifest_report(profile_root, &project_paths); + if orphan_count > 0 { + findings.push(format!( + "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" + )); + } + + let stale_rows = projects + .iter() + .filter(|project| !purged_ids.contains(&project.project_id)) + .filter(|project| !code_project_root_exists(project)) + .count(); + if stale_rows > 0 { + findings.push(format!( + "{stale_rows} stale code project registry row(s) outside the temp directory" + )); + } + + let drift = super::registry_drift::registry_drift_findings(global_db, profile_root).await; + if !drift.is_empty() { + findings.push(format!( + "{} registry/store manifest identity drift finding(s)", + drift.len() + )); + } + (findings, warnings) +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + + fn write_branch_meta(projects_root: &Path, project_id: &str, content: &str) -> PathBuf { + let shard = projects_root.join(project_id); + std::fs::create_dir_all(&shard).unwrap(); + let path = shard.join(BRANCH_META_FILENAME); + std::fs::write(&path, content).unwrap(); + path + } + + #[test] + fn quarantine_renames_only_corrupt_branch_meta() { + let dir = tempfile::TempDir::new().unwrap(); + let projects_root = dir.path().join("projects"); + let corrupt = write_branch_meta(&projects_root, "proj_corrupt", "{not valid json"); + let valid = write_branch_meta( + &projects_root, + "proj_valid", + r#"{"default_branch":"main","branches":{}}"#, + ); + + let (quarantines, warnings) = quarantine_corrupt_branch_meta(dir.path()); + + assert_eq!(quarantines.len(), 1); + assert!(warnings.is_empty()); + let quarantine = &quarantines[0]; + assert_eq!(quarantine.original, corrupt); + assert!(!corrupt.exists(), "corrupt file should be renamed away"); + assert_eq!( + std::fs::read_to_string(&quarantine.quarantined).unwrap(), + "{not valid json", + "quarantined file must preserve the corrupt content as evidence" + ); + assert!( + quarantine + .quarantined + .file_name() + .unwrap() + .to_string_lossy() + .starts_with(BRANCH_META_QUARANTINE_PREFIX), + "quarantine name should be branch-meta.json.corrupt-: {quarantine:?}" + ); + assert!(valid.exists(), "valid branch-meta must be left untouched"); + } + + #[test] + fn quarantine_treats_schema_mismatch_as_corrupt() { + let dir = tempfile::TempDir::new().unwrap(); + let projects_root = dir.path().join("projects"); + // Valid JSON, but not a valid BranchMeta — the runtime warns + // "corrupt" on every open, so the health pass must agree. + let schema_corrupt = + write_branch_meta(&projects_root, "proj_schema", r#"{"default_branch": 5}"#); + + let (quarantines, warnings) = quarantine_corrupt_branch_meta(dir.path()); + + assert!(warnings.is_empty()); + assert_eq!( + quarantines.len(), + 1, + "schema-corrupt branch-meta must be quarantined: {quarantines:?}" + ); + assert_eq!(quarantines[0].original, schema_corrupt); + assert!(!schema_corrupt.exists()); + assert_eq!( + std::fs::read_to_string(&quarantines[0].quarantined).unwrap(), + r#"{"default_branch": 5}"#, + "quarantined file must preserve the corrupt content as evidence" + ); + } + + #[test] + fn quarantine_is_a_no_op_without_a_projects_dir() { + let dir = tempfile::TempDir::new().unwrap(); + let (quarantines, warnings) = quarantine_corrupt_branch_meta(dir.path()); + assert!(quarantines.is_empty()); + assert!(warnings.is_empty()); + } +} diff --git a/src/hooks.rs b/src/hooks.rs index ae12dcb1..688c7811 100644 --- a/src/hooks.rs +++ b/src/hooks.rs @@ -94,10 +94,7 @@ fn hook_analytics_path(root: Option<&Path>) -> Option { } fn append_private_jsonl(path: &Path, line: &str) { - let mut content = std::fs::read_to_string(path).unwrap_or_default(); - content.push_str(line); - content.push('\n'); - let _ = crate::storage::PrivateStoreIo::write_file(path, content.as_bytes()); + let _ = crate::storage::PrivateStoreIo::append_line(path, line); } fn now_unix_millis() -> u64 { diff --git a/src/main.rs b/src/main.rs index ddf30ec2..92834c35 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,6 +17,7 @@ mod project_cmd; mod sessions_cmd; mod status_cmd; mod tool_command; +mod update_cmd; pub use tracedecay::serve; @@ -352,170 +353,6 @@ fn maybe_run_silent_reinstall(user_config: &mut tracedecay::user_config::UserCon } } -fn refresh_generated_plugins() -> tracedecay::errors::Result<()> { - let home = tracedecay_home_dir()?; - let tracedecay_bin = tracedecay_bin_on_path()?; - eprintln!("Refreshing tracedecay-generated plugin artifacts (agent configs are not touched)"); - - // Detection-driven, not `installed_agents`-driven: each integration - // decides whether generated artifacts exist on this machine, so stale - // tracking state can neither skip a real install nor install anywhere new. - let mut refreshed_any = false; - let mut config_only_installed: Vec<&'static str> = Vec::new(); - let mut failures: Vec = Vec::new(); - for ag in tracedecay::agents::all_integrations() { - let ctx = tracedecay::agents::InstallContext { - home: home.clone(), - tracedecay_bin: tracedecay_bin.clone(), - tool_permissions: tracedecay::agents::expected_tool_perms(), - profile: None, - project_root: None, - dashboard: true, - }; - match ag.update_plugin(&ctx) { - Ok(tracedecay::agents::UpdatePluginOutcome::Refreshed(paths)) => { - refreshed_any = true; - for path in paths { - eprintln!( - " \x1b[32m✔\x1b[0m {}: refreshed {}", - ag.id(), - path.display() - ); - } - } - Ok(tracedecay::agents::UpdatePluginOutcome::NotInstalled) => {} - Ok(tracedecay::agents::UpdatePluginOutcome::ConfigOnly) => { - if ag.has_tracedecay(&home) { - config_only_installed.push(ag.id()); - } - } - Err(e) => failures.push(format!("{}: {e}", ag.id())), - } - } - if !config_only_installed.is_empty() { - eprintln!( - " Config-managed integrations left untouched: {} (run `tracedecay reinstall` to refresh their config entries)", - config_only_installed.join(", ") - ); - } - if !refreshed_any { - eprintln!("No generated plugin installs detected — nothing to update."); - } - if !failures.is_empty() { - return Err(tracedecay::errors::TraceDecayError::Config { - message: format!("update-plugin failed for {}", failures.join("; ")), - }); - } - - Ok(()) -} - -/// Rewrites and restarts the installed daemon service, returning the service -/// path and its socket, or `None` when no service is installed. -fn refresh_daemon_service() -> tracedecay::errors::Result> { - let tracedecay_bin = tracedecay_bin_on_path()?; - let spec = tracedecay::daemon::service_spec(tracedecay_bin, None)?; - let socket_path = tracedecay::daemon::installed_service_socket_path()? - .unwrap_or_else(|| spec.socket_path.clone()); - Ok(tracedecay::daemon::refresh_installed_service(&spec)? - .map(|service_path| (service_path, socket_path))) -} - -fn refresh_daemon_service_after_update() -> tracedecay::errors::Result<()> { - match refresh_daemon_service()? { - Some((service_path, socket_path)) => { - eprintln!( - "\x1b[32m✔\x1b[0m Daemon service refreshed at {}", - service_path.display() - ); - eprintln!("Daemon socket: {}", socket_path.display()); - } - None if tracedecay::daemon::daemon_reachable() => { - eprintln!( - " \x1b[33mwarning:\x1b[0m a TraceDecay daemon is running without an installed service; \ - it keeps serving the previous version until its `tracedecay daemon run` process is restarted." - ); - } - None => { - eprintln!("TraceDecay daemon service is not installed; skipping daemon restart."); - } - } - Ok(()) -} - -fn restart_daemon_service() -> tracedecay::errors::Result<()> { - match refresh_daemon_service()? { - Some((service_path, socket_path)) => { - eprintln!( - "\x1b[32m✔\x1b[0m Daemon service restarted at {}", - service_path.display() - ); - eprintln!("Daemon socket: {}", socket_path.display()); - Ok(()) - } - None => Err(tracedecay::errors::TraceDecayError::Config { - message: "no TraceDecay daemon service is installed — restart your `tracedecay daemon run` \ - process manually, or run `tracedecay daemon install-service` to manage it as a service" - .to_string(), - }), - } -} - -fn tracedecay_home_dir() -> tracedecay::errors::Result { - tracedecay::agents::home_dir().ok_or_else(|| tracedecay::errors::TraceDecayError::Config { - message: "could not determine home directory".to_string(), - }) -} - -pub(crate) fn tracedecay_bin_on_path() -> tracedecay::errors::Result { - tracedecay::agents::which_tracedecay().ok_or_else(|| { - tracedecay::errors::TraceDecayError::Config { - message: "tracedecay not found on PATH".to_string(), - } - }) -} - -fn run_update_steps(mut upgrade: U, mut post_update: P) -> tracedecay::errors::Result<()> -where - U: FnMut() -> tracedecay::errors::Result<()>, - P: FnMut() -> tracedecay::errors::Result<()>, -{ - upgrade()?; - post_update()?; - Ok(()) -} - -fn run_update_command() -> tracedecay::errors::Result<()> { - run_update_steps( - || tracedecay::upgrade::run_upgrade().map(|_| ()), - run_post_update_subcommand, - ) -} - -fn run_post_update_subcommand() -> tracedecay::errors::Result<()> { - let tracedecay_bin = tracedecay_bin_on_path()?; - let status = std::process::Command::new(&tracedecay_bin) - .arg("post-update") - .status() - .map_err(|e| tracedecay::errors::TraceDecayError::Config { - message: format!("failed to run post-update with '{tracedecay_bin}': {e}"), - })?; - if status.success() { - return Ok(()); - } - Err(tracedecay::errors::TraceDecayError::Config { - message: format!("post-update failed with status: {status}"), - }) -} - -fn run_post_update_tasks() -> tracedecay::errors::Result<()> { - refresh_generated_plugins()?; - if let Err(error) = refresh_daemon_service_after_update() { - eprintln!(" \x1b[33mwarning:\x1b[0m daemon service refresh failed: {error}"); - } - Ok(()) -} - async fn resolve_registered_project_root( project_id: Option, project_path: Option, @@ -653,7 +490,7 @@ async fn dispatch_command(command: Commands) -> tracedecay::errors::Result<()> { agent_cmd::handle_reinstall_command().await?; } Commands::UpdatePlugin => { - refresh_generated_plugins()?; + update_cmd::refresh_generated_plugins()?; } Commands::Uninstall { agent, @@ -798,7 +635,7 @@ async fn dispatch_command(command: Commands) -> tracedecay::errors::Result<()> { ); } DaemonAction::Restart => { - restart_daemon_service()?; + update_cmd::restart_daemon_service()?; } DaemonAction::Status => { let socket_path = tracedecay::daemon::socket_path_or_default(None)?; @@ -808,11 +645,11 @@ async fn dispatch_command(command: Commands) -> tracedecay::errors::Result<()> { Commands::Upgrade => { tracedecay::upgrade::run_upgrade()?; } - Commands::Update => { - run_update_command()?; + Commands::Update { no_heal } => { + update_cmd::run_update_command(no_heal)?; } - Commands::PostUpdate => { - run_post_update_tasks()?; + Commands::PostUpdate { no_heal } => { + update_cmd::run_post_update_tasks(no_heal).await?; } Commands::Channel { channel } => match channel { Some(target) => { @@ -945,8 +782,8 @@ fn should_skip_startup_maintenance(command: &Commands) -> bool { Commands::Install { .. } | Commands::Reinstall | Commands::UpdatePlugin - | Commands::Update - | Commands::PostUpdate + | Commands::Update { .. } + | Commands::PostUpdate { .. } | Commands::Uninstall { .. } | Commands::Lsp { .. } | Commands::Doctor { .. } @@ -1010,8 +847,8 @@ fn should_skip_agent_install_maintenance(command: &Commands) -> bool { | Commands::Install { .. } | Commands::Reinstall | Commands::UpdatePlugin - | Commands::Update - | Commands::PostUpdate + | Commands::Update { .. } + | Commands::PostUpdate { .. } | Commands::Uninstall { .. } | Commands::Lsp { .. } | Commands::Doctor { .. } diff --git a/src/memory/trust.rs b/src/memory/trust.rs index 0a9edca7..f5e35730 100644 --- a/src/memory/trust.rs +++ b/src/memory/trust.rs @@ -8,6 +8,14 @@ pub const TRUST_MIN: f64 = 0.0; pub const TRUST_MAX: f64 = 1.0; pub const DEFAULT_TRUST: f64 = 0.5; pub const DEFAULT_MIN_TRUST: f64 = 0.3; +/// Lower bound of the "high" bucket in [`trust_bucket`]; scores in +/// `[DEFAULT_MIN_TRUST, HIGH_TRUST_THRESHOLD)` are "medium". +pub const HIGH_TRUST_THRESHOLD: f64 = 0.75; +/// Representative score for a "low" trust label, inside the low bucket. +pub const LOW_TRUST_REPRESENTATIVE: f64 = 0.15; +/// Representative score for a "high" trust label, inside the high bucket. +/// `DEFAULT_TRUST` is the representative for "medium". +pub const HIGH_TRUST_REPRESENTATIVE: f64 = 0.85; pub fn clamp_trust(score: f64) -> f64 { score.clamp(TRUST_MIN, TRUST_MAX) @@ -26,7 +34,7 @@ pub fn trust_bucket(score: f64) -> &'static str { let clamped = clamp_trust(score); if clamped < DEFAULT_MIN_TRUST { "low" - } else if clamped < 0.75 { + } else if clamped < HIGH_TRUST_THRESHOLD { "medium" } else { "high" @@ -42,3 +50,17 @@ pub fn trust_distribution(scores: &[f64]) -> (usize, usize, usize) { } }) } + +#[cfg(test)] +mod tests { + use super::*; + + /// Guards the label representative scores against bucket-boundary drift: + /// each representative must map back onto its own bucket. + #[test] + fn label_representatives_map_onto_their_buckets() { + assert_eq!(trust_bucket(LOW_TRUST_REPRESENTATIVE), "low"); + assert_eq!(trust_bucket(DEFAULT_TRUST), "medium"); + assert_eq!(trust_bucket(HIGH_TRUST_REPRESENTATIVE), "high"); + } +} diff --git a/src/migrate/registry.rs b/src/migrate/registry.rs index cddfd742..28b7e12c 100644 --- a/src/migrate/registry.rs +++ b/src/migrate/registry.rs @@ -4,7 +4,9 @@ use std::path::{Component, Path, PathBuf}; use serde::Serialize; use crate::branch_meta; -use crate::global_db::{GlobalDb, GraphScopeUpsert, StoreArtifactUpsert, StoreInstanceUpsert}; +use crate::global_db::{ + CodeProjectRecord, GlobalDb, GraphScopeUpsert, StoreArtifactUpsert, StoreInstanceUpsert, +}; use crate::storage::{ read_store_manifest, validate_project_id, StorageMode, StoreKind, STORE_MANIFEST_FILENAME, STORE_MANIFEST_SCHEMA_VERSION, @@ -128,6 +130,49 @@ pub async fn apply_registry_reconstruction_report( } } +/// How dead a registry row's project root must be before the row counts as +/// stale. This is the single definition of both GC scopes, so a reader never +/// has to reassemble the effective condition from scattered half-checks. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StaleRootScope { + /// Manual `tracedecay migrate registry-gc` scope: the canonical root is + /// gone (the user reviews candidates before applying). + CanonicalRootMissing, + /// Post-update auto-GC scope: both the canonical and display roots are + /// gone — stricter, because nobody reviews the deletion. + AllRootsMissing, +} + +/// Returns true if the project's canonical or display root still exists. +pub fn code_project_root_exists(project: &CodeProjectRecord) -> bool { + Path::new(&project.canonical_root).exists() || Path::new(&project.display_root).exists() +} + +/// Filters registry rows that are stale under `scope`, restricted to +/// canonical roots under one of `prefixes` (an empty slice means no +/// restriction). Shared by `tracedecay migrate registry-gc` and the +/// post-update health pass so both agree on what counts as a GC candidate. +pub fn stale_code_projects<'a>( + projects: &'a [CodeProjectRecord], + prefixes: &[PathBuf], + scope: StaleRootScope, +) -> Vec<&'a CodeProjectRecord> { + projects + .iter() + .filter(|project| { + let canonical_root = Path::new(&project.canonical_root); + prefixes.is_empty() + || prefixes + .iter() + .any(|prefix| canonical_root.starts_with(prefix)) + }) + .filter(|project| match scope { + StaleRootScope::CanonicalRootMissing => !Path::new(&project.canonical_root).exists(), + StaleRootScope::AllRootsMissing => !code_project_root_exists(project), + }) + .collect() +} + pub fn scan_profile_store_manifests( profile_root: &Path, verified_at: i64, diff --git a/src/startup_tests.rs b/src/startup_tests.rs index 06bc8e36..556a626c 100644 --- a/src/startup_tests.rs +++ b/src/startup_tests.rs @@ -3,8 +3,10 @@ use super::{ hermes_profile_targets, hermes_selected_profile_targets, validate_hermes_profile_flags, validate_hermes_project_root_flag, }, - is_local_install_command, run_update_steps, should_skip_agent_install_maintenance, - should_skip_startup_maintenance, Commands, + is_local_install_command, should_skip_agent_install_maintenance, + should_skip_startup_maintenance, + update_cmd::run_update_steps, + Commands, }; use std::cell::RefCell; use tempfile::TempDir; @@ -31,8 +33,12 @@ fn explicit_agent_config_commands_skip_startup_maintenance() { })); assert!(should_skip_startup_maintenance(&Commands::Reinstall)); assert!(should_skip_startup_maintenance(&Commands::UpdatePlugin)); - assert!(should_skip_startup_maintenance(&Commands::Update)); - assert!(should_skip_startup_maintenance(&Commands::PostUpdate)); + assert!(should_skip_startup_maintenance(&Commands::Update { + no_heal: false + })); + assert!(should_skip_startup_maintenance(&Commands::PostUpdate { + no_heal: false + })); assert!(should_skip_startup_maintenance(&Commands::Uninstall { agent: Some("kiro".to_string()), profile: None, @@ -78,8 +84,12 @@ fn agent_install_maintenance_is_selective() { assert!(should_skip_agent_install_maintenance( &Commands::UpdatePlugin )); - assert!(should_skip_agent_install_maintenance(&Commands::Update)); - assert!(should_skip_agent_install_maintenance(&Commands::PostUpdate)); + assert!(should_skip_agent_install_maintenance(&Commands::Update { + no_heal: false + })); + assert!(should_skip_agent_install_maintenance( + &Commands::PostUpdate { no_heal: false } + )); assert!(should_skip_agent_install_maintenance(&Commands::Tool { project: None, name: Some("message_search".to_string()), diff --git a/src/storage.rs b/src/storage.rs index fbfe7e68..fdb347ec 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -1,5 +1,5 @@ use std::fs; -use std::io; +use std::io::{self, Write}; use std::path::{Component, Path, PathBuf}; use serde::{Deserialize, Serialize}; @@ -12,6 +12,9 @@ pub const ENROLLMENT_FILENAME: &str = "enrollment.json"; pub const STORE_MANIFEST_FILENAME: &str = "store_manifest.json"; pub const SESSIONS_DB_FILENAME: &str = "sessions.db"; pub const BRANCH_META_FILENAME: &str = "branch-meta.json"; +/// Filename prefix for corrupt `branch-meta.json` files renamed out of the +/// way by the post-update health pass (`branch-meta.json.corrupt-`). +pub const BRANCH_META_QUARANTINE_PREFIX: &str = "branch-meta.json.corrupt-"; pub const STORE_MANIFEST_SCHEMA_VERSION: u32 = 1; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -459,10 +462,35 @@ impl PrivateStoreIo { Self::create_dir_all(parent)?; } reject_symlink_components(path, "private store file")?; - fs::write(path, contents)?; + Self::open_private(path, fs::OpenOptions::new().write(true).truncate(true))? + .write_all(contents)?; set_private_file_permissions(path) } + /// Appends one line via a single `O_APPEND` write so concurrent hook + /// processes never interleave partial lines or lose each other's entries + /// the way read-modify-rewrite appends do. + pub fn append_line(path: &Path, line: &str) -> io::Result<()> { + if let Some(parent) = path.parent() { + Self::create_dir_all(parent)?; + } + reject_symlink_components(path, "private store file")?; + Self::open_private(path, fs::OpenOptions::new().append(true))? + .write_all(format!("{line}\n").as_bytes())?; + set_private_file_permissions(path) + } + + /// Opens `path` for writing, creating it if missing with owner-only + /// permissions applied at create time (Unix), so a fresh file never + /// exists with umask-default permissions before the trailing + /// `set_private_file_permissions` call. Pre-existing files keep their + /// mode here and are tightened by that trailing call. + fn open_private(path: &Path, options: &mut fs::OpenOptions) -> io::Result { + options.create(true); + apply_private_create_mode(options); + options.open(path) + } + pub fn write_file_atomically(path: &Path, temp_path: &Path, contents: &[u8]) -> io::Result<()> { if path_parent(path) != path_parent(temp_path) { return Err(invalid_input( @@ -702,6 +730,16 @@ fn set_private_file_permissions(path: &Path) -> std::io::Result<()> { fs::set_permissions(path, fs::Permissions::from_mode(0o600)) } +#[cfg(unix)] +fn apply_private_create_mode(options: &mut fs::OpenOptions) { + use std::os::unix::fs::OpenOptionsExt; + + options.mode(0o600); +} + +#[cfg(not(unix))] +fn apply_private_create_mode(_options: &mut fs::OpenOptions) {} + #[cfg(not(unix))] fn set_private_file_permissions(_path: &Path) -> std::io::Result<()> { Ok(()) diff --git a/src/update_cmd.rs b/src/update_cmd.rs new file mode 100644 index 00000000..31ee11db --- /dev/null +++ b/src/update_cmd.rs @@ -0,0 +1,181 @@ +//! The `update` / `post-update` / `update-plugin` flow: binary upgrade via +//! subprocess re-exec, generated-plugin refresh, daemon service refresh, and +//! the post-update health pass. + +use std::path::PathBuf; + +pub(crate) fn refresh_generated_plugins() -> tracedecay::errors::Result<()> { + let home = tracedecay_home_dir()?; + let tracedecay_bin = tracedecay_bin_on_path()?; + eprintln!("Refreshing tracedecay-generated plugin artifacts (agent configs are not touched)"); + + // Detection-driven, not `installed_agents`-driven: each integration + // decides whether generated artifacts exist on this machine, so stale + // tracking state can neither skip a real install nor install anywhere new. + let mut refreshed_any = false; + let mut config_only_installed: Vec<&'static str> = Vec::new(); + let mut failures: Vec = Vec::new(); + for ag in tracedecay::agents::all_integrations() { + let ctx = tracedecay::agents::InstallContext { + home: home.clone(), + tracedecay_bin: tracedecay_bin.clone(), + tool_permissions: tracedecay::agents::expected_tool_perms(), + profile: None, + project_root: None, + dashboard: true, + }; + match ag.update_plugin(&ctx) { + Ok(tracedecay::agents::UpdatePluginOutcome::Refreshed(paths)) => { + refreshed_any = true; + for path in paths { + eprintln!( + " \x1b[32m✔\x1b[0m {}: refreshed {}", + ag.id(), + path.display() + ); + } + } + Ok(tracedecay::agents::UpdatePluginOutcome::NotInstalled) => {} + Ok(tracedecay::agents::UpdatePluginOutcome::ConfigOnly) => { + if ag.has_tracedecay(&home) { + config_only_installed.push(ag.id()); + } + } + Err(e) => failures.push(format!("{}: {e}", ag.id())), + } + } + if !config_only_installed.is_empty() { + eprintln!( + " Config-managed integrations left untouched: {} (run `tracedecay reinstall` to refresh their config entries)", + config_only_installed.join(", ") + ); + } + if !refreshed_any { + eprintln!("No generated plugin installs detected — nothing to update."); + } + if !failures.is_empty() { + return Err(tracedecay::errors::TraceDecayError::Config { + message: format!("update-plugin failed for {}", failures.join("; ")), + }); + } + + Ok(()) +} + +/// Rewrites and restarts the installed daemon service, returning the service +/// path and its socket, or `None` when no service is installed. +fn refresh_daemon_service() -> tracedecay::errors::Result> { + let tracedecay_bin = tracedecay_bin_on_path()?; + let spec = tracedecay::daemon::service_spec(tracedecay_bin, None)?; + let socket_path = tracedecay::daemon::installed_service_socket_path()? + .unwrap_or_else(|| spec.socket_path.clone()); + Ok(tracedecay::daemon::refresh_installed_service(&spec)? + .map(|service_path| (service_path, socket_path))) +} + +fn refresh_daemon_service_after_update() -> tracedecay::errors::Result<()> { + match refresh_daemon_service()? { + Some((service_path, socket_path)) => { + eprintln!( + "\x1b[32m✔\x1b[0m Daemon service refreshed at {}", + service_path.display() + ); + eprintln!("Daemon socket: {}", socket_path.display()); + } + None if tracedecay::daemon::daemon_reachable() => { + eprintln!( + " \x1b[33mwarning:\x1b[0m a TraceDecay daemon is running without an installed service; \ + it keeps serving the previous version until its `tracedecay daemon run` process is restarted." + ); + } + None => { + eprintln!("TraceDecay daemon service is not installed; skipping daemon restart."); + } + } + Ok(()) +} + +pub(crate) fn restart_daemon_service() -> tracedecay::errors::Result<()> { + match refresh_daemon_service()? { + Some((service_path, socket_path)) => { + eprintln!( + "\x1b[32m✔\x1b[0m Daemon service restarted at {}", + service_path.display() + ); + eprintln!("Daemon socket: {}", socket_path.display()); + Ok(()) + } + None => Err(tracedecay::errors::TraceDecayError::Config { + message: "no TraceDecay daemon service is installed — restart your `tracedecay daemon run` \ + process manually, or run `tracedecay daemon install-service` to manage it as a service" + .to_string(), + }), + } +} + +fn tracedecay_home_dir() -> tracedecay::errors::Result { + tracedecay::agents::home_dir().ok_or_else(|| tracedecay::errors::TraceDecayError::Config { + message: "could not determine home directory".to_string(), + }) +} + +pub(crate) fn tracedecay_bin_on_path() -> tracedecay::errors::Result { + tracedecay::agents::which_tracedecay().ok_or_else(|| { + tracedecay::errors::TraceDecayError::Config { + message: "tracedecay not found on PATH".to_string(), + } + }) +} + +pub(crate) fn run_update_steps( + mut upgrade: U, + mut post_update: P, +) -> tracedecay::errors::Result<()> +where + U: FnMut() -> tracedecay::errors::Result<()>, + P: FnMut() -> tracedecay::errors::Result<()>, +{ + upgrade()?; + post_update()?; + Ok(()) +} + +pub(crate) fn run_update_command(no_heal: bool) -> tracedecay::errors::Result<()> { + run_update_steps( + || tracedecay::upgrade::run_upgrade().map(|_| ()), + || run_post_update_subcommand(no_heal), + ) +} + +fn run_post_update_subcommand(no_heal: bool) -> tracedecay::errors::Result<()> { + let tracedecay_bin = tracedecay_bin_on_path()?; + let mut command = std::process::Command::new(&tracedecay_bin); + command.arg("post-update"); + if no_heal { + command.arg("--no-heal"); + } + let status = command + .status() + .map_err(|e| tracedecay::errors::TraceDecayError::Config { + message: format!("failed to run post-update with '{tracedecay_bin}': {e}"), + })?; + if status.success() { + return Ok(()); + } + Err(tracedecay::errors::TraceDecayError::Config { + message: format!("post-update failed with status: {status}"), + }) +} + +pub(crate) async fn run_post_update_tasks(no_heal: bool) -> tracedecay::errors::Result<()> { + refresh_generated_plugins()?; + if let Err(error) = refresh_daemon_service_after_update() { + eprintln!(" \x1b[33mwarning:\x1b[0m daemon service refresh failed: {error}"); + } + if no_heal { + eprintln!("Skipping post-update health pass (--no-heal)."); + } else { + tracedecay::doctor::heal::run_post_update_health_pass().await; + } + Ok(()) +} diff --git a/tests/automation_session_reflector_runner_test.rs b/tests/automation_session_reflector_runner_test.rs index 1b73fc09..80e1103a 100644 --- a/tests/automation_session_reflector_runner_test.rs +++ b/tests/automation_session_reflector_runner_test.rs @@ -93,15 +93,6 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { "source_span": {"session_id": "session-reflect-1", "message_id": "session-reflect-1-message-001"}, "reason": "missing trust should be rejected" }, - { - "content": "Session reflection trust must be a numeric score", - "category": "project", - "tags": ["automation"], - "entities": ["TraceDecay"], - "trust": "high", - "source_span": {"session_id": "session-reflect-1", "message_id": "session-reflect-1-message-001"}, - "reason": "string trust labels should be rejected" - }, { "content": "Session reflection facts require a rationale", "category": "project", @@ -123,6 +114,24 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { { "content": "", "category": "project" + }, + { + "content": "Bucket trust labels emitted by backends map onto calibrated numeric scores", + "category": "project", + "tags": ["automation"], + "entities": ["TraceDecay"], + "trust": "high", + "source_span": {"session_id": "session-reflect-1", "message_id": "session-reflect-1-message-001"}, + "reason": "bucket trust labels should be accepted" + }, + { + "content": "Unknown trust labels must not be accepted", + "category": "project", + "tags": ["automation"], + "entities": ["TraceDecay"], + "trust": "sky-high", + "source_span": {"session_id": "session-reflect-1", "message_id": "session-reflect-1-message-001"}, + "reason": "unknown trust label should be rejected" } ] })); @@ -161,7 +170,7 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { assert_eq!(backend.calls(), 1); assert_eq!(run.ledger_record.task, AgentTaskKind::SessionReflector); assert_eq!(run.ledger_record.status, AutomationRunStatus::Succeeded); - assert_eq!(run.ledger_record.accepted_count, 2); + assert_eq!(run.ledger_record.accepted_count, 3); assert_eq!(run.ledger_record.rejected_count, 8); assert_eq!( run.report["accepted_facts"][0]["add_fact_request"]["source"], @@ -198,11 +207,17 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { "source_span must cite a bounded session reflection evidence hit" )); assert!(has_rejection_reason("trust is required")); - assert!(has_rejection_reason("trust must be between 0 and 1")); + assert!(has_rejection_reason( + "trust must be a number between 0 and 1, or one of low, medium, high" + )); assert!(has_rejection_reason("reason is required")); assert!(has_rejection_reason( "confidence is not supported; use trust" )); + assert_eq!( + run.report["accepted_facts"][2]["add_fact_request"]["trust"], + json!(0.85) + ); let proposals = list_fact_proposals( &cg.store_layout().dashboard_root, Some(FactProposalState::PendingApproval), @@ -210,7 +225,7 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { ) .await .unwrap(); - assert_eq!(proposals.len(), 2); + assert_eq!(proposals.len(), 3); assert_eq!(proposals[0].run_id, run.run_id); assert_eq!( proposals[0].add_fact_request.as_ref().unwrap().content, @@ -259,7 +274,7 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { ); let eval_payload = read_artifact(&cg, &run.run_id, &run.ledger_record, "generated_evals").await; assert_eq!(eval_payload["task"], json!("session_reflector")); - assert_eq!(eval_payload["summary"]["eval_count"], json!(10)); + assert_eq!(eval_payload["summary"]["eval_count"], json!(11)); assert!(eval_payload["eval_definitions"] .as_array() .unwrap() @@ -338,7 +353,7 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { .unwrap(); assert_eq!(records.len(), 1); assert_eq!(records[0].run_id, run.run_id); - assert_eq!(records[0].accepted_count, 2); + assert_eq!(records[0].accepted_count, 3); assert_eq!(records[0].rejected_count, 8); assert!(records[0].applied_ops.is_none()); } diff --git a/tests/branch_db_safety_test.rs b/tests/branch_db_safety_test.rs index a8ecad01..abaab130 100644 --- a/tests/branch_db_safety_test.rs +++ b/tests/branch_db_safety_test.rs @@ -1,9 +1,11 @@ +mod common; + use std::fs; use std::path::Path; use std::path::PathBuf; use std::process::Command; -use tempfile::TempDir; +use common::IsolatedEnv; use tracedecay::branch::BranchAddOutcome; use tracedecay::branch_meta::load_branch_meta; use tracedecay::storage::resolve_layout_for_current_profile; @@ -46,9 +48,8 @@ fn project_data_dir(project: &Path) -> PathBuf { .data_root } -async fn open_untracked_project() -> (TempDir, PathBuf, TraceDecay) { - let dir = TempDir::new().unwrap(); - let project = dir.path().to_path_buf(); +async fn open_untracked_project() -> (IsolatedEnv, PathBuf, TraceDecay) { + let (env, project) = IsolatedEnv::acquire().await; git(&project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -71,12 +72,11 @@ async fn open_untracked_project() -> (TempDir, PathBuf, TraceDecay) { assert_eq!(feature.serving_branch(), Some("feature/untracked")); assert!(!feature.is_fallback()); - (dir, project, feature) + (env, project, feature) } -async fn open_detached_fallback_project() -> (TempDir, PathBuf, TraceDecay) { - let dir = TempDir::new().unwrap(); - let project = dir.path().to_path_buf(); +async fn open_detached_fallback_project() -> (IsolatedEnv, PathBuf, TraceDecay) { + let (env, project) = IsolatedEnv::acquire().await; git(&project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -107,7 +107,7 @@ async fn open_detached_fallback_project() -> (TempDir, PathBuf, TraceDecay) { "detached HEAD should explain the fallback branch" ); - (dir, project, fallback) + (env, project, fallback) } async fn assert_main_db_missing_symbol(project: &Path, symbol: &str, message: &str) { @@ -129,7 +129,7 @@ fn assert_fallback_write_refused(operation: &str, err: impl std::fmt::Display) { #[tokio::test] async fn open_auto_tracks_untracked_branch_and_syncs_its_db() { - let (_dir, project, feature) = open_untracked_project().await; + let (_env, project, feature) = open_untracked_project().await; assert!( !feature @@ -158,7 +158,7 @@ async fn open_auto_tracks_untracked_branch_and_syncs_its_db() { #[tokio::test] async fn fallback_writes_are_refused_by_all_sync_entry_points() { - let (_dir, project, fallback) = open_detached_fallback_project().await; + let (_env, project, fallback) = open_detached_fallback_project().await; let err = fallback .sync() @@ -196,8 +196,8 @@ async fn fallback_writes_are_refused_by_all_sync_entry_points() { #[tokio::test] async fn add_branch_tracking_copies_from_nearest_tracked_ancestor() { - let dir = TempDir::new().unwrap(); - let project = dir.path(); + let (_env, project) = IsolatedEnv::acquire().await; + let project = project.as_path(); git(project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -274,8 +274,8 @@ async fn add_branch_tracking_copies_from_nearest_tracked_ancestor() { #[tokio::test] async fn add_branch_tracking_refuses_corrupt_metadata_without_overwriting() { - let dir = TempDir::new().unwrap(); - let project = dir.path(); + let (_env, project) = IsolatedEnv::acquire().await; + let project = project.as_path(); git(project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -315,15 +315,17 @@ async fn add_branch_tracking_refuses_corrupt_metadata_without_overwriting() { #[test] fn cli_branch_add_refuses_corrupt_metadata_without_overwriting() { - let dir = TempDir::new().unwrap(); - let project = dir.path(); + let (env, project) = IsolatedEnv::acquire_blocking(); + let project = project.as_path(); git(project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); fs::write(project.join("src/lib.rs"), "pub fn main_only() {}\n").unwrap(); commit_all(project, "initial commit"); - let init = Command::new(env!("CARGO_BIN_EXE_tracedecay")) + let mut init_command = Command::new(env!("CARGO_BIN_EXE_tracedecay")); + common::apply_tracedecay_home_env(&mut init_command, env.home()); + let init = init_command .arg("init") .arg(project) .output() @@ -347,7 +349,9 @@ fn cli_branch_add_refuses_corrupt_metadata_without_overwriting() { .unwrap(); commit_all(project, "feature commit"); - let output = Command::new(env!("CARGO_BIN_EXE_tracedecay")) + let mut branch_add_command = Command::new(env!("CARGO_BIN_EXE_tracedecay")); + common::apply_tracedecay_home_env(&mut branch_add_command, env.home()); + let output = branch_add_command .args(["branch", "add", "feature/corrupt-meta", "--path"]) .arg(project) .output() diff --git a/tests/common/mod.rs b/tests/common/mod.rs index d3ece00b..42ef34b9 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -59,8 +59,72 @@ impl Drop for EnvVarGuard { pub const GLOBAL_DB_ENV: &str = "TRACEDECAY_GLOBAL_DB"; /// Serializes tests within one binary that mutate process-wide env vars. +/// +/// Prefer [`IsolatedEnv`], which bundles this serialization with a throwaway +/// home and [`TraceDecayStorageEnvGuard`]; reach for this raw lock only when +/// a test needs finer-grained control over which env vars it swaps. pub static GLOBAL_DB_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); +/// Serializes [`IsolatedEnv`] users within one test binary: storage isolation +/// swaps process-wide env vars (`HOME`, `TRACEDECAY_DATA_DIR`, ...), so tests +/// must not overlap. +static ISOLATED_ENV_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(()); + +/// The canonical way to isolate env-mutating tests: serializes tests within +/// one binary and keeps every test's project registration, store manifests, +/// and branch-meta writes inside a throwaway home instead of the developer's +/// real `~/.tracedecay` profile store. +/// +/// Construct via [`IsolatedEnv::acquire`] (async tests) or +/// [`IsolatedEnv::acquire_blocking`] (sync tests); both return the guard plus +/// a ready-made `project` directory inside the temp home. +pub struct IsolatedEnv { + // Field order matters: fields drop in declaration order, so the lock must + // be declared last. Dropping it first would let the next waiting test + // install its own isolated env, only for `storage`'s restore to clobber it. + storage: TraceDecayStorageEnvGuard, + _dir: TempDir, + _env_lock: tokio::sync::MutexGuard<'static, ()>, +} + +impl IsolatedEnv { + fn build(env_lock: tokio::sync::MutexGuard<'static, ()>) -> (Self, PathBuf) { + let dir = tempdir_or_panic(); + let storage = TraceDecayStorageEnvGuard::for_tempdir(&dir); + let project = dir.path().join("project"); + fs::create_dir_all(&project).unwrap_or_else(|err| { + panic!( + "failed to create isolated project directory '{}': {err}", + project.display() + ) + }); + ( + Self { + storage, + _dir: dir, + _env_lock: env_lock, + }, + project, + ) + } + + pub async fn acquire() -> (Self, PathBuf) { + Self::build(ISOLATED_ENV_LOCK.lock().await) + } + + /// Sync counterpart of [`IsolatedEnv::acquire`] for plain `#[test]` fns. + /// + /// Warning: this uses `blocking_lock`, which panics if called from within + /// an async context — use [`IsolatedEnv::acquire`] there instead. + pub fn acquire_blocking() -> (Self, PathBuf) { + Self::build(ISOLATED_ENV_LOCK.blocking_lock()) + } + + pub fn home(&self) -> &Path { + self.storage.home() + } +} + /// Sets [`GLOBAL_DB_ENV`] to a test DB path for the guard's lifetime. pub struct GlobalDbEnvGuard { _env_guard: EnvVarGuard, diff --git a/tests/update_health_pass_test.rs b/tests/update_health_pass_test.rs new file mode 100644 index 00000000..7d3f4723 --- /dev/null +++ b/tests/update_health_pass_test.rs @@ -0,0 +1,332 @@ +//! Integration tests for the post-update health pass that runs at the end of +//! `tracedecay update` / `tracedecay post-update` (skippable via `--no-heal`). +//! +//! All tests spawn the real binary against an isolated home directory via +//! `apply_tracedecay_home_env`, so they never touch the real `~/.tracedecay`. + +mod common; + +#[cfg(unix)] +use std::os::unix::fs::PermissionsExt; +use std::path::{Path, PathBuf}; +use std::process::{Command, Output, Stdio}; +use std::time::{Duration, Instant}; + +use common::apply_tracedecay_home_env; +use tempfile::TempDir; +use tracedecay::global_db::GlobalDb; +use tracedecay::storage::BRANCH_META_QUARANTINE_PREFIX; + +fn canonical_temp_path(path: &Path) -> PathBuf { + #[cfg(windows)] + { + path.to_path_buf() + } + #[cfg(not(windows))] + { + path.canonicalize().unwrap_or_else(|_| path.to_path_buf()) + } +} + +fn cli_timeout() -> Duration { + if cfg!(windows) { + Duration::from_secs(90) + } else { + Duration::from_secs(30) + } +} + +/// `post-update` requires `tracedecay` on PATH (for the plugin refresh), so +/// link the test binary into an isolated bin dir and prepend it to PATH. +fn add_tracedecay_path_shim(command: &mut Command, home: &Path) { + let bin_dir = home.join("bin"); + std::fs::create_dir_all(&bin_dir).unwrap(); + let shim = bin_dir.join(if cfg!(windows) { + "tracedecay.exe" + } else { + "tracedecay" + }); + if std::fs::hard_link(env!("CARGO_BIN_EXE_tracedecay"), &shim).is_err() { + std::fs::copy(env!("CARGO_BIN_EXE_tracedecay"), &shim).unwrap(); + } + #[cfg(unix)] + { + let mut permissions = std::fs::metadata(&shim).unwrap().permissions(); + permissions.set_mode(0o755); + std::fs::set_permissions(&shim, permissions).unwrap(); + } + let path = std::env::var_os("PATH").unwrap_or_default(); + let joined = + std::env::join_paths(std::iter::once(bin_dir).chain(std::env::split_paths(&path))).unwrap(); + command.env("PATH", joined); +} + +fn post_update_command(home: &Path) -> Command { + let mut command = Command::new(env!("CARGO_BIN_EXE_tracedecay")); + apply_tracedecay_home_env(&mut command, home); + add_tracedecay_path_shim(&mut command, home); + command + .current_dir(home) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + command +} + +fn run_with_timeout(mut command: Command, timeout: Duration) -> Output { + let mut child = command + .spawn() + .unwrap_or_else(|e| panic!("failed to spawn tracedecay: {e}")); + let started = Instant::now(); + loop { + if child + .try_wait() + .unwrap_or_else(|e| panic!("failed to poll child: {e}")) + .is_some() + { + return child + .wait_with_output() + .unwrap_or_else(|e| panic!("failed to collect output: {e}")); + } + if started.elapsed() >= timeout { + let _ = child.kill(); + let output = child + .wait_with_output() + .unwrap_or_else(|e| panic!("failed to collect timed-out output: {e}")); + panic!( + "tracedecay post-update hung after {:?}\nstdout:\n{}\nstderr:\n{}", + started.elapsed(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + } + std::thread::sleep(Duration::from_millis(50)); + } +} + +fn assert_success(output: &Output, what: &str) { + assert!( + output.status.success(), + "{what} should succeed\nstdout:\n{}\nstderr:\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); +} + +fn write_branch_meta(profile_root: &Path, project_id: &str, content: &str) -> PathBuf { + let shard = profile_root.join("projects").join(project_id); + std::fs::create_dir_all(&shard).unwrap(); + let path = shard.join("branch-meta.json"); + std::fs::write(&path, content).unwrap(); + path +} + +fn quarantined_branch_meta_files(shard: &Path) -> Vec { + std::fs::read_dir(shard) + .unwrap() + .flatten() + .map(|entry| entry.path()) + .filter(|path| { + path.file_name().is_some_and(|name| { + name.to_string_lossy() + .starts_with(BRANCH_META_QUARANTINE_PREFIX) + }) + }) + .collect() +} + +#[test] +fn post_update_quarantines_corrupt_branch_meta() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + let corrupt = write_branch_meta(&profile_root, "proj_corrupt", "{not valid json"); + let valid = write_branch_meta( + &profile_root, + "proj_valid", + r#"{"default_branch":"main","branches":{}}"#, + ); + + let mut command = post_update_command(&home_root); + command.arg("post-update"); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update"); + assert!( + !corrupt.exists(), + "corrupt branch-meta.json should be quarantined away" + ); + let quarantined = quarantined_branch_meta_files(corrupt.parent().unwrap()); + assert_eq!( + quarantined.len(), + 1, + "exactly one quarantine file expected, got {quarantined:?}" + ); + assert_eq!( + std::fs::read_to_string(&quarantined[0]).unwrap(), + "{not valid json", + "quarantine must preserve the corrupt content as evidence" + ); + assert_eq!( + std::fs::read_to_string(&valid).unwrap(), + r#"{"default_branch":"main","branches":{}}"#, + "valid branch-meta.json must be left untouched" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Quarantined 1 corrupt branch metadata file(s)"), + "stderr should report the quarantine\nstderr:\n{stderr}" + ); +} + +#[test] +fn post_update_quarantines_schema_corrupt_branch_meta() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + // Valid JSON, but not a valid BranchMeta — the runtime treats any schema + // mismatch as corrupt, so the health pass must quarantine it too. + let schema_corrupt = + write_branch_meta(&profile_root, "proj_schema", r#"{"default_branch": 5}"#); + + let mut command = post_update_command(&home_root); + command.arg("post-update"); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update"); + assert!( + !schema_corrupt.exists(), + "schema-corrupt branch-meta.json should be quarantined away" + ); + let quarantined = quarantined_branch_meta_files(schema_corrupt.parent().unwrap()); + assert_eq!( + quarantined.len(), + 1, + "exactly one quarantine file expected, got {quarantined:?}" + ); + assert_eq!( + std::fs::read_to_string(&quarantined[0]).unwrap(), + r#"{"default_branch": 5}"#, + "quarantine must preserve the corrupt content as evidence" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Quarantined 1 corrupt branch metadata file(s)"), + "stderr should report the quarantine\nstderr:\n{stderr}" + ); +} + +#[test] +fn post_update_no_heal_skips_health_pass() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + let corrupt = write_branch_meta(&profile_root, "proj_corrupt", "{not valid json"); + + let mut command = post_update_command(&home_root); + command.args(["post-update", "--no-heal"]); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update --no-heal"); + assert!( + corrupt.exists(), + "--no-heal must leave the corrupt branch-meta.json in place" + ); + assert_eq!( + std::fs::read_to_string(&corrupt).unwrap(), + "{not valid json" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Skipping post-update health pass (--no-heal)"), + "stderr should say the health pass was skipped\nstderr:\n{stderr}" + ); + assert!( + !stderr.contains("Post-update health pass"), + "the health pass must not run with --no-heal\nstderr:\n{stderr}" + ); +} + +#[tokio::test] +async fn post_update_gcs_stale_registry_rows_under_temp_dir_only() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + std::fs::create_dir_all(&profile_root).unwrap(); + + // The child's system temp directory is pinned to a dir inside the test + // home, so the GC scope is fully controlled by the fixture. + let fake_tmp = home_root.join("fake-tmp"); + std::fs::create_dir_all(&fake_tmp).unwrap(); + let fake_tmp = canonical_temp_path(&fake_tmp); + let live_root = fake_tmp.join("live-project"); + std::fs::create_dir_all(&live_root).unwrap(); + + { + let db = GlobalDb::open_at(&profile_root.join("global.db")) + .await + .expect("global db should open"); + db.upsert_code_project( + "proj_tmp_gone", + &fake_tmp.join("gone-project"), + None, + None, + Some("main"), + ) + .await + .expect("stale temp project should upsert"); + db.upsert_code_project( + "proj_elsewhere_gone", + &home_root.join("gone-elsewhere"), + None, + None, + Some("main"), + ) + .await + .expect("stale non-temp project should upsert"); + db.upsert_code_project("proj_tmp_live", &live_root, None, None, Some("main")) + .await + .expect("live temp project should upsert"); + } + + let mut command = post_update_command(&home_root); + command + .arg("post-update") + .env("TMPDIR", &fake_tmp) + .env("TMP", &fake_tmp) + .env("TEMP", &fake_tmp); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update"); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Purged 1 stale temp-root registry row(s)"), + "stderr should report the temp-root registry GC\nstderr:\n{stderr}" + ); + + let db = GlobalDb::open_at(&profile_root.join("global.db")) + .await + .expect("global db should reopen"); + let remaining: Vec = db + .list_code_projects(usize::MAX) + .await + .into_iter() + .map(|project| project.project_id) + .collect(); + assert!( + !remaining.contains(&"proj_tmp_gone".to_string()), + "stale temp-root row must be purged, remaining: {remaining:?}" + ); + assert!( + remaining.contains(&"proj_elsewhere_gone".to_string()), + "stale row outside the temp dir must be surfaced, not auto-purged: {remaining:?}" + ); + assert!( + remaining.contains(&"proj_tmp_live".to_string()), + "temp-root row with an existing project root must survive: {remaining:?}" + ); + assert!( + stderr.contains("stale code project registry row(s) outside the temp directory"), + "remaining stale rows should be surfaced in the findings summary\nstderr:\n{stderr}" + ); +}