From deb256e4276da2c7be8729a4db767e56ebffd4ca Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Wed, 1 Jul 2026 23:45:10 +0000 Subject: [PATCH 01/11] fix: harden automation ledger, fact-proposal trust, hook analytics, and test isolation - accept low/medium/high bucket trust labels in session reflector fact proposals and clarify the numeric-trust prompt instruction - stop persisting consecutive identical scheduler-skip run records that flooded the automation run ledger every tick - append hook_analytics.jsonl lines via a single O_APPEND write so concurrent hook processes no longer corrupt or drop entries - isolate branch_db_safety_test under a throwaway profile home so it stops writing corrupt branch-meta.json and stale registry rows into the real ~/.tracedecay store --- src/automation/lifecycle.rs | 143 ++++++++++++++++++ src/automation/runner.rs | 2 +- src/automation/session_reflector.rs | 27 +++- src/hooks.rs | 5 +- src/storage.rs | 22 +++ ...utomation_session_reflector_runner_test.rs | 37 ++++- tests/branch_db_safety_test.rs | 83 +++++++--- 7 files changed, 287 insertions(+), 32 deletions(-) diff --git a/src/automation/lifecycle.rs b/src/automation/lifecycle.rs index 269332f8..a262f903 100644 --- a/src/automation/lifecycle.rs +++ b/src/automation/lifecycle.rs @@ -191,10 +191,37 @@ pub(crate) async fn append_skipped_record( Some(reason.to_string()), started_at, ); + // Scheduler ticks re-evaluate every task every few seconds, so a standing + // skip condition (interval not elapsed, task disabled, ...) would append + // thousands of identical records and drown real runs out of the ledger. + // Persist only the first record of each consecutive identical skip. + if trigger == AutomationTrigger::Scheduler + && is_repeat_scheduler_skip(dashboard_root, task, reason).await? + { + return Ok(record); + } append_run_record(dashboard_root, &record).await?; Ok(record) } +/// True when the most recent ledger record for `task` is already a scheduler +/// skip with the same reason. +async fn is_repeat_scheduler_skip( + dashboard_root: &Path, + task: AgentTaskKind, + reason: &str, +) -> Result { + let records = load_run_records(dashboard_root, 200).await?; + Ok(records + .iter() + .find(|record| record.task == task) + .is_some_and(|record| { + record.trigger == AutomationTrigger::Scheduler + && record.status == AutomationRunStatus::Skipped + && record.error.as_deref() == Some(reason) + })) +} + #[allow(clippy::too_many_arguments)] pub(crate) async fn skipped_run_parts( dashboard_root: &Path, @@ -605,6 +632,7 @@ fn noop_output_for_task(task: AgentTaskKind) -> Value { } #[cfg(test)] +#[allow(clippy::unwrap_used, clippy::expect_used)] mod tests { use super::*; @@ -615,4 +643,119 @@ mod tests { assert_ne!(first, second); } + + async fn append_skip( + dashboard_root: &Path, + run_id: &str, + trigger: AutomationTrigger, + task: AgentTaskKind, + reason: &str, + ) -> AutomationRunLedgerRecord { + let config = AutomationConfig::default(); + append_skipped_record( + dashboard_root, + run_id, + trigger, + &config, + task, + None, + reason, + "1000", + ) + .await + .expect("append skipped record") + } + + #[tokio::test] + async fn consecutive_identical_scheduler_skips_persist_once() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + let task = AgentTaskKind::MemoryCurator; + + append_skip( + root, + "run-1", + AutomationTrigger::Scheduler, + task, + "scheduler_interval_not_elapsed", + ) + .await; + append_skip( + root, + "run-2", + AutomationTrigger::Scheduler, + task, + "scheduler_interval_not_elapsed", + ) + .await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!( + records.len(), + 1, + "repeat scheduler skip must not append a second record" + ); + assert_eq!(records[0].run_id, "run-1"); + } + + #[tokio::test] + async fn scheduler_skips_with_new_reason_or_task_still_persist() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + + append_skip( + root, + "run-1", + AutomationTrigger::Scheduler, + AgentTaskKind::MemoryCurator, + "scheduler_interval_not_elapsed", + ) + .await; + append_skip( + root, + "run-2", + AutomationTrigger::Scheduler, + AgentTaskKind::MemoryCurator, + "scheduler_cooldown_active", + ) + .await; + append_skip( + root, + "run-3", + AutomationTrigger::Scheduler, + AgentTaskKind::SessionReflector, + "scheduler_interval_not_elapsed", + ) + .await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!(records.len(), 3, "distinct skip conditions must persist"); + } + + #[tokio::test] + async fn manual_trigger_skips_always_persist() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + let task = AgentTaskKind::SkillWriter; + + append_skip( + root, + "run-1", + AutomationTrigger::ManualCli, + task, + "skill_writer_disabled", + ) + .await; + append_skip( + root, + "run-2", + AutomationTrigger::ManualCli, + task, + "skill_writer_disabled", + ) + .await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!(records.len(), 2, "manual skips must always be recorded"); + } } diff --git a/src/automation/runner.rs b/src/automation/runner.rs index 1ed540a3..70a7046b 100644 --- a/src/automation/runner.rs +++ b/src/automation/runner.rs @@ -650,7 +650,7 @@ async fn skipped_skill_writer_run( fn build_session_reflector_prompt(evidence: &Value) -> String { format!( - "Review these bounded TraceDecay session snippets and propose only durable memory facts. Return only JSON with a facts array. Each fact must include content, category, optional tags, optional entities, trust, source_span, and reason. Category must be one of general, user_pref, project, tool, decision, or code_area. Use trust, not confidence. source_span must cite one bounded evidence hit by session_id plus message_id for raw messages, by store_id for raw messages, or by node_id for summaries. Do not include secrets or ephemeral status.\n{}", + "Review these bounded TraceDecay session snippets and propose only durable memory facts. Return only JSON with a facts array. Each fact must include content, category, optional tags, optional entities, trust, source_span, and reason. Category must be one of general, user_pref, project, tool, decision, or code_area. Use trust, not confidence; trust must be a number between 0 and 1. source_span must cite one bounded evidence hit by session_id plus message_id for raw messages, by store_id for raw messages, or by node_id for summaries. Do not include secrets or ephemeral status.\n{}", serde_json::to_string_pretty(evidence).unwrap_or_else(|_| "{}".to_string()) ) } diff --git a/src/automation/session_reflector.rs b/src/automation/session_reflector.rs index 1172d085..8f92791b 100644 --- a/src/automation/session_reflector.rs +++ b/src/automation/session_reflector.rs @@ -130,9 +130,14 @@ async fn validate_fact_proposal( )); }; let trust = match object.get("trust") { - Some(value) => match value.as_f64() { - Some(trust) if (0.0..=1.0).contains(&trust) => Some(trust), - _ => return Ok(rejected_fact(proposal, "trust must be between 0 and 1")), + Some(value) => match proposal_trust_value(value) { + Some(trust) => Some(trust), + None => { + return Ok(rejected_fact( + proposal, + "trust must be a number between 0 and 1, or one of low, medium, high", + )) + } }, None => return Ok(rejected_fact(proposal, "trust is required")), }; @@ -229,6 +234,22 @@ async fn validate_fact_proposal( }))) } +/// Accepts numeric trust in `[0, 1]` plus the `low`/`medium`/`high` bucket +/// labels models frequently emit despite the numeric prompt instruction. +/// Buckets map to representative scores inside the matching +/// [`crate::memory::trust::trust_bucket`] range. +fn proposal_trust_value(value: &Value) -> Option { + if let Some(trust) = value.as_f64() { + return (0.0..=1.0).contains(&trust).then_some(trust); + } + match value.as_str()?.trim().to_ascii_lowercase().as_str() { + "low" => Some(0.15), + "medium" => Some(0.5), + "high" => Some(0.85), + _ => None, + } +} + fn value_as_i64(value: &Value) -> Option { value .as_i64() diff --git a/src/hooks.rs b/src/hooks.rs index 0b16c498..09a825d9 100644 --- a/src/hooks.rs +++ b/src/hooks.rs @@ -94,10 +94,7 @@ fn hook_analytics_path(root: Option<&Path>) -> Option { } fn append_private_jsonl(path: &Path, line: &str) { - let mut content = std::fs::read_to_string(path).unwrap_or_default(); - content.push_str(line); - content.push('\n'); - let _ = crate::storage::PrivateStoreIo::write_file(path, content.as_bytes()); + let _ = crate::storage::PrivateStoreIo::append_line(path, line); } fn now_unix_millis() -> u64 { diff --git a/src/storage.rs b/src/storage.rs index fbfe7e68..caea2bb4 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -463,6 +463,28 @@ impl PrivateStoreIo { set_private_file_permissions(path) } + /// Appends one line via a single `O_APPEND` write so concurrent hook + /// processes never interleave partial lines or lose each other's entries + /// the way read-modify-rewrite appends do. + pub fn append_line(path: &Path, line: &str) -> io::Result<()> { + use io::Write; + + if let Some(parent) = path.parent() { + Self::create_dir_all(parent)?; + } + reject_symlink_components(path, "private store file")?; + let mut payload = String::with_capacity(line.len() + 1); + payload.push_str(line); + payload.push('\n'); + let mut file = fs::OpenOptions::new() + .create(true) + .append(true) + .open(path)?; + file.write_all(payload.as_bytes())?; + drop(file); + set_private_file_permissions(path) + } + pub fn write_file_atomically(path: &Path, temp_path: &Path, contents: &[u8]) -> io::Result<()> { if path_parent(path) != path_parent(temp_path) { return Err(invalid_input( diff --git a/tests/automation_session_reflector_runner_test.rs b/tests/automation_session_reflector_runner_test.rs index bad2ae0f..3781ff5d 100644 --- a/tests/automation_session_reflector_runner_test.rs +++ b/tests/automation_session_reflector_runner_test.rs @@ -114,6 +114,24 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { { "content": "", "category": "project" + }, + { + "content": "Bucket trust labels emitted by backends map onto calibrated numeric scores", + "category": "project", + "tags": ["automation"], + "entities": ["TraceDecay"], + "trust": "high", + "source_span": {"session_id": "session-reflect-1", "message_id": "session-reflect-1-message-001"}, + "reason": "bucket trust labels should be accepted" + }, + { + "content": "Unknown trust labels must not be accepted", + "category": "project", + "tags": ["automation"], + "entities": ["TraceDecay"], + "trust": "sky-high", + "source_span": {"session_id": "session-reflect-1", "message_id": "session-reflect-1-message-001"}, + "reason": "unknown trust label should be rejected" } ] })); @@ -152,8 +170,8 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { assert_eq!(backend.calls(), 1); assert_eq!(run.ledger_record.task, AgentTaskKind::SessionReflector); assert_eq!(run.ledger_record.status, AutomationRunStatus::Succeeded); - assert_eq!(run.ledger_record.accepted_count, 2); - assert_eq!(run.ledger_record.rejected_count, 7); + assert_eq!(run.ledger_record.accepted_count, 3); + assert_eq!(run.ledger_record.rejected_count, 8); assert_eq!( run.report["accepted_facts"][0]["add_fact_request"]["source"], json!("session_reflector") @@ -189,10 +207,17 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { "source_span must cite a bounded session reflection evidence hit" )); assert!(has_rejection_reason("trust is required")); + assert!(has_rejection_reason( + "trust must be a number between 0 and 1, or one of low, medium, high" + )); assert!(has_rejection_reason("reason is required")); assert!(has_rejection_reason( "confidence is not supported; use trust" )); + assert_eq!( + run.report["accepted_facts"][2]["add_fact_request"]["trust"], + json!(0.85) + ); let proposals = list_fact_proposals( &cg.store_layout().dashboard_root, Some(FactProposalState::PendingApproval), @@ -200,7 +225,7 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { ) .await .unwrap(); - assert_eq!(proposals.len(), 2); + assert_eq!(proposals.len(), 3); assert_eq!(proposals[0].run_id, run.run_id); assert_eq!( proposals[0].add_fact_request.as_ref().unwrap().content, @@ -249,7 +274,7 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { ); let eval_payload = read_artifact(&cg, &run.run_id, &run.ledger_record, "generated_evals").await; assert_eq!(eval_payload["task"], json!("session_reflector")); - assert_eq!(eval_payload["summary"]["eval_count"], json!(9)); + assert_eq!(eval_payload["summary"]["eval_count"], json!(11)); assert!(eval_payload["eval_definitions"] .as_array() .unwrap() @@ -328,8 +353,8 @@ async fn session_reflector_runner_validates_fact_proposals_without_applying() { .unwrap(); assert_eq!(records.len(), 1); assert_eq!(records[0].run_id, run.run_id); - assert_eq!(records[0].accepted_count, 2); - assert_eq!(records[0].rejected_count, 7); + assert_eq!(records[0].accepted_count, 3); + assert_eq!(records[0].rejected_count, 8); assert!(records[0].applied_ops.is_none()); } diff --git a/tests/branch_db_safety_test.rs b/tests/branch_db_safety_test.rs index a8ecad01..4ebc7f5b 100644 --- a/tests/branch_db_safety_test.rs +++ b/tests/branch_db_safety_test.rs @@ -1,14 +1,59 @@ +mod common; + use std::fs; use std::path::Path; use std::path::PathBuf; use std::process::Command; +use common::TraceDecayStorageEnvGuard; use tempfile::TempDir; use tracedecay::branch::BranchAddOutcome; use tracedecay::branch_meta::load_branch_meta; use tracedecay::storage::resolve_layout_for_current_profile; use tracedecay::tracedecay::TraceDecay; +/// Serializes the tests in this binary: storage isolation swaps process-wide +/// env vars (`HOME`, `TRACEDECAY_DATA_DIR`, ...), so tests must not overlap. +static ENV_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(()); + +/// Keeps every test's project registration, store manifests, and +/// branch-meta writes inside a throwaway home instead of the developer's +/// real `~/.tracedecay` profile store. +struct IsolatedEnv { + _env_lock: tokio::sync::MutexGuard<'static, ()>, + storage: TraceDecayStorageEnvGuard, + _dir: TempDir, +} + +impl IsolatedEnv { + fn build(env_lock: tokio::sync::MutexGuard<'static, ()>) -> (Self, PathBuf) { + let dir = TempDir::new().unwrap(); + let storage = TraceDecayStorageEnvGuard::for_tempdir(&dir); + let project = dir.path().join("project"); + fs::create_dir_all(&project).unwrap(); + ( + Self { + _env_lock: env_lock, + storage, + _dir: dir, + }, + project, + ) + } + + async fn acquire() -> (Self, PathBuf) { + Self::build(ENV_LOCK.lock().await) + } + + fn acquire_blocking() -> (Self, PathBuf) { + Self::build(ENV_LOCK.blocking_lock()) + } + + fn home(&self) -> &Path { + self.storage.home() + } +} + fn git(project: &Path, args: &[&str]) { let output = Command::new("git") .args(["-c", "core.hooksPath=.git/no-hooks"]) @@ -46,9 +91,8 @@ fn project_data_dir(project: &Path) -> PathBuf { .data_root } -async fn open_untracked_project() -> (TempDir, PathBuf, TraceDecay) { - let dir = TempDir::new().unwrap(); - let project = dir.path().to_path_buf(); +async fn open_untracked_project() -> (IsolatedEnv, PathBuf, TraceDecay) { + let (env, project) = IsolatedEnv::acquire().await; git(&project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -71,12 +115,11 @@ async fn open_untracked_project() -> (TempDir, PathBuf, TraceDecay) { assert_eq!(feature.serving_branch(), Some("feature/untracked")); assert!(!feature.is_fallback()); - (dir, project, feature) + (env, project, feature) } -async fn open_detached_fallback_project() -> (TempDir, PathBuf, TraceDecay) { - let dir = TempDir::new().unwrap(); - let project = dir.path().to_path_buf(); +async fn open_detached_fallback_project() -> (IsolatedEnv, PathBuf, TraceDecay) { + let (env, project) = IsolatedEnv::acquire().await; git(&project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -107,7 +150,7 @@ async fn open_detached_fallback_project() -> (TempDir, PathBuf, TraceDecay) { "detached HEAD should explain the fallback branch" ); - (dir, project, fallback) + (env, project, fallback) } async fn assert_main_db_missing_symbol(project: &Path, symbol: &str, message: &str) { @@ -129,7 +172,7 @@ fn assert_fallback_write_refused(operation: &str, err: impl std::fmt::Display) { #[tokio::test] async fn open_auto_tracks_untracked_branch_and_syncs_its_db() { - let (_dir, project, feature) = open_untracked_project().await; + let (_env, project, feature) = open_untracked_project().await; assert!( !feature @@ -158,7 +201,7 @@ async fn open_auto_tracks_untracked_branch_and_syncs_its_db() { #[tokio::test] async fn fallback_writes_are_refused_by_all_sync_entry_points() { - let (_dir, project, fallback) = open_detached_fallback_project().await; + let (_env, project, fallback) = open_detached_fallback_project().await; let err = fallback .sync() @@ -196,8 +239,8 @@ async fn fallback_writes_are_refused_by_all_sync_entry_points() { #[tokio::test] async fn add_branch_tracking_copies_from_nearest_tracked_ancestor() { - let dir = TempDir::new().unwrap(); - let project = dir.path(); + let (_env, project) = IsolatedEnv::acquire().await; + let project = project.as_path(); git(project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -274,8 +317,8 @@ async fn add_branch_tracking_copies_from_nearest_tracked_ancestor() { #[tokio::test] async fn add_branch_tracking_refuses_corrupt_metadata_without_overwriting() { - let dir = TempDir::new().unwrap(); - let project = dir.path(); + let (_env, project) = IsolatedEnv::acquire().await; + let project = project.as_path(); git(project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); @@ -315,15 +358,17 @@ async fn add_branch_tracking_refuses_corrupt_metadata_without_overwriting() { #[test] fn cli_branch_add_refuses_corrupt_metadata_without_overwriting() { - let dir = TempDir::new().unwrap(); - let project = dir.path(); + let (env, project) = IsolatedEnv::acquire_blocking(); + let project = project.as_path(); git(project, &["init", "-b", "main"]); fs::create_dir_all(project.join("src")).unwrap(); fs::write(project.join("src/lib.rs"), "pub fn main_only() {}\n").unwrap(); commit_all(project, "initial commit"); - let init = Command::new(env!("CARGO_BIN_EXE_tracedecay")) + let mut init_command = Command::new(env!("CARGO_BIN_EXE_tracedecay")); + common::apply_tracedecay_home_env(&mut init_command, env.home()); + let init = init_command .arg("init") .arg(project) .output() @@ -347,7 +392,9 @@ fn cli_branch_add_refuses_corrupt_metadata_without_overwriting() { .unwrap(); commit_all(project, "feature commit"); - let output = Command::new(env!("CARGO_BIN_EXE_tracedecay")) + let mut branch_add_command = Command::new(env!("CARGO_BIN_EXE_tracedecay")); + common::apply_tracedecay_home_env(&mut branch_add_command, env.home()); + let output = branch_add_command .args(["branch", "add", "feature/corrupt-meta", "--path"]) .arg(project) .output() From 7d6de7ec534dffce386a360664ac79fb7dc27c14 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Wed, 1 Jul 2026 23:48:44 +0000 Subject: [PATCH 02/11] fix: bound daemon graceful shutdown so SIGTERM cannot hang into SIGKILL Graceful shutdown persists token counters and checkpoints WALs for every live project server sequentially, which can exceed systemd's stop timeout and end in a SIGKILL mid-checkpoint. Cap shutdown work at 45s, log the timeout outcome, and abort the stalled task; SQLite WAL keeps remaining state crash-safe. --- src/daemon.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/daemon.rs b/src/daemon.rs index d64f0b70..783b3249 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -2,6 +2,8 @@ use std::collections::HashMap; use std::fmt::Write; #[cfg(unix)] +use std::future::Future; +#[cfg(unix)] use std::os::unix::fs::PermissionsExt; #[cfg(unix)] use std::os::unix::net::UnixStream as StdUnixStream; @@ -32,6 +34,12 @@ pub const SOCKET_ENV: &str = "TRACEDECAY_DAEMON_SOCKET"; pub const HOOK_EVENT_METHOD: &str = "tracedecay/hookEvent"; #[cfg(unix)] const HOOK_EVENT_NOTIFY_TIMEOUT: Duration = Duration::from_millis(750); +/// Upper bound on graceful-shutdown persistence work (per-server token +/// persistence and WAL checkpoints). Must stay comfortably below systemd's +/// stop timeout (90s by default) so the daemon exits cleanly instead of +/// being killed with `SIGKILL` mid-checkpoint. +#[cfg(unix)] +const DAEMON_SHUTDOWN_DEADLINE: Duration = Duration::from_secs(45); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct DaemonHookEvent { @@ -1029,11 +1037,50 @@ async fn run_foreground_unix(socket_path: PathBuf) -> Result<()> { "daemon_shutdown", &[("socket", socket_path.display().to_string())], ); - engine.shutdown_all().await; + let completed = await_shutdown_within_deadline( + async move { engine.shutdown_all().await }, + DAEMON_SHUTDOWN_DEADLINE, + ) + .await; + if !completed { + log_daemon_event( + "daemon_shutdown", + &[ + ("outcome", "timeout".to_string()), + ( + "deadline_secs", + DAEMON_SHUTDOWN_DEADLINE.as_secs().to_string(), + ), + ], + ); + } let _ = std::fs::remove_file(&socket_path); Ok(()) } +/// Runs the shutdown future on its own task and waits at most `deadline` for +/// it to finish, returning `true` when it completed in time. +/// +/// Graceful shutdown persists tokens-saved counters and checkpoints WALs for +/// every live project server sequentially; with many servers or large WALs +/// that can exceed systemd's stop timeout, which then sends `SIGKILL` to the +/// daemon. On timeout the task is aborted and we proceed to exit: the +/// remaining persistence is best-effort and the SQLite WAL keeps state +/// crash-safe. +#[cfg(unix)] +async fn await_shutdown_within_deadline(shutdown: F, deadline: Duration) -> bool +where + F: Future + Send + 'static, +{ + let mut handle = tokio::spawn(shutdown); + if timeout(deadline, &mut handle).await.is_ok() { + true + } else { + handle.abort(); + false + } +} + #[cfg(unix)] fn set_owner_only_permissions(path: &Path, mode: u32) -> Result<()> { let permissions = std::fs::Permissions::from_mode(mode); @@ -2867,4 +2914,31 @@ mod tests { .expect("server B task") .expect("server B result"); } + + #[cfg(unix)] + #[tokio::test] + async fn shutdown_deadline_reports_completion_for_fast_shutdown() { + let completed = super::await_shutdown_within_deadline( + std::future::ready(()), + std::time::Duration::from_secs(5), + ) + .await; + assert!(completed, "instant shutdown should finish within deadline"); + } + + #[cfg(unix)] + #[tokio::test] + async fn shutdown_deadline_aborts_stalled_shutdown() { + let started = std::time::Instant::now(); + let completed = super::await_shutdown_within_deadline( + std::future::pending(), + std::time::Duration::from_millis(50), + ) + .await; + assert!(!completed, "stalled shutdown must report a timeout"); + assert!( + started.elapsed() < std::time::Duration::from_secs(5), + "timeout must return promptly instead of waiting on the stalled task" + ); + } } From fec1a0dc399c5355b0b10edabaac1c24f87a9b59 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Wed, 1 Jul 2026 23:50:25 +0000 Subject: [PATCH 03/11] docs: generalize WAL wording in daemon shutdown-deadline comment --- src/daemon.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/daemon.rs b/src/daemon.rs index 783b3249..834eaf24 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -1065,7 +1065,7 @@ async fn run_foreground_unix(socket_path: PathBuf) -> Result<()> { /// every live project server sequentially; with many servers or large WALs /// that can exceed systemd's stop timeout, which then sends `SIGKILL` to the /// daemon. On timeout the task is aborted and we proceed to exit: the -/// remaining persistence is best-effort and the SQLite WAL keeps state +/// remaining persistence is best-effort and the database WAL keeps state /// crash-safe. #[cfg(unix)] async fn await_shutdown_within_deadline(shutdown: F, deadline: Duration) -> bool From 2adc4bab8ae649a5e7541fb58b82addc44cce631 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 00:08:35 +0000 Subject: [PATCH 04/11] feat: run automatic health pass after tracedecay update After refreshing the binary, plugins, and daemon, `tracedecay update` now re-execs a post-update health pass: applies idempotent global-DB schema migrations, quarantines corrupt branch-meta.json files as branch-meta.json.corrupt-, purges stale registry rows under the system temp dir, and summarizes remaining doctor findings. The pass is failure-tolerant (warnings, never update failure) and skippable with --no-heal. --- src/branch_meta.rs | 3 +- src/cli.rs | 12 +- src/cli/parse_tests.rs | 28 ++- src/commands.rs | 13 +- src/doctor.rs | 32 ++-- src/doctor/heal.rs | 286 ++++++++++++++++++++++++++++++ src/main.rs | 39 ++-- src/migrate/registry.rs | 21 ++- src/startup_tests.rs | 16 +- tests/update_health_pass_test.rs | 293 +++++++++++++++++++++++++++++++ 10 files changed, 701 insertions(+), 42 deletions(-) create mode 100644 src/doctor/heal.rs create mode 100644 tests/update_health_pass_test.rs diff --git a/src/branch_meta.rs b/src/branch_meta.rs index 82f47288..1e66c084 100644 --- a/src/branch_meta.rs +++ b/src/branch_meta.rs @@ -8,7 +8,8 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -const BRANCH_META_FILENAME: &str = "branch-meta.json"; +/// Filename of the branch metadata file inside a project data dir. +pub const BRANCH_META_FILENAME: &str = "branch-meta.json"; /// Metadata for a single tracked branch. #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/src/cli.rs b/src/cli.rs index 36b2a899..0268f5f4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -270,10 +270,18 @@ pub enum Commands { /// Download and install the latest version from GitHub Upgrade, /// Refresh the tracedecay binary, generated plugins, and daemon - Update, + Update { + /// Skip the post-update health pass (safe repairs + doctor summary) + #[arg(long)] + no_heal: bool, + }, /// Refresh plugins and daemon after the binary has been updated. #[command(name = "post-update", hide = true)] - PostUpdate, + PostUpdate { + /// Skip the post-update health pass (safe repairs + doctor summary) + #[arg(long)] + no_heal: bool, + }, /// Show or switch the update channel (stable or beta) Channel { /// Target channel: "stable" or "beta" (omit to show current) diff --git a/src/cli/parse_tests.rs b/src/cli/parse_tests.rs index ad6f2076..7cd404de 100644 --- a/src/cli/parse_tests.rs +++ b/src/cli/parse_tests.rs @@ -134,7 +134,10 @@ fn update_upgrade_and_update_plugin_parse_to_distinct_commands() { let update_plugin = Cli::try_parse_from(["tracedecay", "update-plugin"]).expect("update-plugin should parse"); - assert!(matches!(update.command, Some(Commands::Update))); + assert!(matches!( + update.command, + Some(Commands::Update { no_heal: false }) + )); assert!(matches!(upgrade.command, Some(Commands::Upgrade))); assert!(matches!( update_plugin.command, @@ -142,6 +145,29 @@ fn update_upgrade_and_update_plugin_parse_to_distinct_commands() { )); } +#[test] +fn update_and_post_update_parse_no_heal_flag() { + let update = Cli::try_parse_from(["tracedecay", "update", "--no-heal"]) + .expect("update --no-heal should parse"); + let post_update = Cli::try_parse_from(["tracedecay", "post-update", "--no-heal"]) + .expect("post-update --no-heal should parse"); + let post_update_default = Cli::try_parse_from(["tracedecay", "post-update"]) + .expect("post-update should parse without --no-heal"); + + assert!(matches!( + update.command, + Some(Commands::Update { no_heal: true }) + )); + assert!(matches!( + post_update.command, + Some(Commands::PostUpdate { no_heal: true }) + )); + assert!(matches!( + post_update_default.command, + Some(Commands::PostUpdate { no_heal: false }) + )); +} + #[test] fn update_help_describes_refresh_scope() { let help = Cli::command().render_long_help().to_string(); diff --git a/src/commands.rs b/src/commands.rs index 818908b9..fe93a097 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -353,15 +353,10 @@ pub(crate) async fn handle_migrate_action(action: MigrateAction) -> tracedecay:: })?; let projects = global_db.list_code_projects(usize::MAX).await; let prefix_path = prefix.as_deref().map(PathBuf::from); - let stale: Vec<_> = projects - .into_iter() - .filter(|project| { - prefix_path.as_ref().is_none_or(|prefix| { - PathBuf::from(&project.canonical_root).starts_with(prefix) - }) - }) - .filter(|project| !PathBuf::from(&project.canonical_root).exists()) - .collect(); + let stale = tracedecay::migrate::registry::stale_code_projects( + projects, + prefix_path.as_deref(), + ); let deleted = if apply { let project_ids: Vec = stale .iter() diff --git a/src/doctor.rs b/src/doctor.rs index 44b1e435..bac63622 100644 --- a/src/doctor.rs +++ b/src/doctor.rs @@ -9,6 +9,7 @@ use crate::agents::{self, DoctorCounters, HealthcheckContext}; use crate::display::{format_bytes, format_token_count}; use crate::tracedecay::TraceDecay; +pub mod heal; mod registry_drift; /// Runs a comprehensive health check of the tracedecay installation. @@ -470,17 +471,33 @@ fn check_orphan_store_manifests(dc: &mut DoctorCounters, project_paths: &[String let Some(profile_root) = crate::config::user_data_dir() else { return; }; + let (orphan_count, issues) = orphan_store_manifest_report(&profile_root, project_paths); + for issue in issues.iter().take(10) { + dc.warn(&format!("Store manifest issue: {issue}")); + } + if orphan_count > 0 { + dc.warn(&format!( + "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" + )); + dc.info(" Run `tracedecay migrate reconstruct --profile-root --apply` after review."); + } +} + +/// Counts profile store manifests with no matching registry row, plus any +/// manifest scan issues. Shared between `doctor` and the post-update health +/// pass. +pub(crate) fn orphan_store_manifest_report( + profile_root: &Path, + project_paths: &[String], +) -> (usize, Vec) { let registered: std::collections::HashSet = project_paths .iter() .map(|path| crate::global_db::GlobalDb::canonical_project_key(std::path::Path::new(path))) .collect(); let report = crate::migrate::registry::scan_profile_store_manifests( - &profile_root, + profile_root, crate::tracedecay::current_timestamp(), ); - for issue in report.issues.iter().take(10) { - dc.warn(&format!("Store manifest issue: {issue}")); - } let orphan_count = report .plans .iter() @@ -489,12 +506,7 @@ fn check_orphan_store_manifests(dc: &mut DoctorCounters, project_paths: &[String !registered.contains(&key) }) .count(); - if orphan_count > 0 { - dc.warn(&format!( - "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" - )); - dc.info(" Run `tracedecay migrate reconstruct --profile-root --apply` after review."); - } + (orphan_count, report.issues) } /// Check user config file. diff --git a/src/doctor/heal.rs b/src/doctor/heal.rs new file mode 100644 index 00000000..5c2ea838 --- /dev/null +++ b/src/doctor/heal.rs @@ -0,0 +1,286 @@ +//! Post-update health pass: safe, automatic repairs plus a concise summary +//! of the doctor findings that still need a human decision. +//! +//! Runs at the end of `tracedecay update` / `tracedecay post-update` (skip +//! with `--no-heal`). Every step is failure-tolerant: a failing check prints +//! a warning but never fails the update itself. Only remedies that are safe +//! to automate are applied: +//! +//! - corrupt `branch-meta.json` files are quarantined (renamed to +//! `branch-meta.json.corrupt-`), which preserves evidence while +//! restoring the silent single-DB fallback, +//! - registry rows whose project root no longer exists AND lives under the +//! system temp directory are purged (the automated equivalent of +//! `tracedecay migrate registry-gc --prefix --apply`). +//! +//! Everything else (orphan store manifests, stale rows outside the temp +//! directory, registry/manifest identity drift) is only reported. + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +use crate::branch_meta::BRANCH_META_FILENAME; +use crate::global_db::GlobalDb; +use crate::migrate::registry::stale_code_projects; + +/// A corrupt `branch-meta.json` that was renamed out of the way. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BranchMetaQuarantine { + pub original: PathBuf, + pub quarantined: PathBuf, +} + +/// Outcome of one post-update health pass. +#[derive(Debug, Default)] +pub struct HealthPassReport { + pub quarantined_branch_meta: Vec, + pub purged_temp_registry_rows: usize, + pub remaining_findings: Vec, + pub warnings: Vec, +} + +/// Runs the full post-update health pass and prints a doctor-style summary. +/// +/// Never fails: every error is collected as a warning so a broken store can +/// never abort `tracedecay update`. +pub async fn run_post_update_health_pass() -> HealthPassReport { + let mut report = HealthPassReport::default(); + eprintln!("\n\x1b[1mPost-update health pass\x1b[0m (skip with --no-heal)"); + + let Some(profile_root) = crate::config::user_data_dir() else { + report + .warnings + .push("could not determine the profile data directory".to_string()); + print_warnings(&report.warnings); + return report; + }; + + quarantine_corrupt_branch_meta(&profile_root, &mut report); + if report.quarantined_branch_meta.is_empty() { + eprintln!(" \x1b[32m✔\x1b[0m No corrupt branch metadata files"); + } else { + eprintln!( + " \x1b[32m✔\x1b[0m Quarantined {} corrupt branch metadata file(s):", + report.quarantined_branch_meta.len() + ); + for quarantine in &report.quarantined_branch_meta { + eprintln!(" • {}", quarantine.quarantined.display()); + } + } + + // Opening the global DB applies its idempotent schema migrations — the + // same lazy upgrade every normal open path performs. + match GlobalDb::open().await { + Some(global_db) => { + gc_stale_temp_registry_rows(&global_db, &mut report).await; + if report.purged_temp_registry_rows > 0 { + eprintln!( + " \x1b[32m✔\x1b[0m Purged {} stale temp-root registry row(s)", + report.purged_temp_registry_rows + ); + } else { + eprintln!(" \x1b[32m✔\x1b[0m No stale temp-root registry rows"); + } + collect_remaining_findings(&global_db, &profile_root, &mut report).await; + } + None => { + report + .warnings + .push("could not open the global DB for the health pass".to_string()); + } + } + + if report.remaining_findings.is_empty() { + eprintln!(" \x1b[32m✔\x1b[0m No remaining doctor findings"); + } else { + eprintln!(" Remaining findings (not auto-fixed — run `tracedecay doctor` for details):"); + for finding in &report.remaining_findings { + eprintln!(" • {finding}"); + } + } + print_warnings(&report.warnings); + report +} + +fn print_warnings(warnings: &[String]) { + for warning in warnings { + eprintln!(" \x1b[33mwarning:\x1b[0m health pass: {warning}"); + } +} + +/// Renames every `branch-meta.json` under `/projects/*` that is +/// not valid JSON to `branch-meta.json.corrupt-`, preserving the +/// corrupt content as evidence while restoring the single-DB fallback. +pub fn quarantine_corrupt_branch_meta(profile_root: &Path, report: &mut HealthPassReport) { + let projects_root = profile_root.join("projects"); + let Ok(entries) = std::fs::read_dir(&projects_root) else { + return; + }; + let mut meta_paths: Vec = entries + .flatten() + .map(|entry| entry.path().join(BRANCH_META_FILENAME)) + .filter(|path| path.is_file()) + .collect(); + meta_paths.sort(); + + let now = crate::tracedecay::current_timestamp(); + for path in meta_paths { + let content = match std::fs::read_to_string(&path) { + Ok(content) => content, + Err(err) => { + report + .warnings + .push(format!("could not read '{}': {err}", path.display())); + continue; + } + }; + if serde_json::from_str::(&content).is_ok() { + continue; + } + let quarantined = path.with_file_name(format!("{BRANCH_META_FILENAME}.corrupt-{now}")); + match std::fs::rename(&path, &quarantined) { + Ok(()) => report.quarantined_branch_meta.push(BranchMetaQuarantine { + original: path, + quarantined, + }), + Err(err) => report.warnings.push(format!( + "could not quarantine corrupt '{}': {err}", + path.display() + )), + } + } +} + +/// Purges registry rows whose canonical project root is gone AND lives under +/// the system temp directory. This is the only registry GC scope that is safe +/// to run without review. +async fn gc_stale_temp_registry_rows(global_db: &GlobalDb, report: &mut HealthPassReport) { + let projects = global_db.list_code_projects(usize::MAX).await; + let mut seen: HashSet = HashSet::new(); + let mut stale_ids: Vec = Vec::new(); + for prefix in temp_dir_prefixes() { + for project in stale_code_projects(projects.clone(), Some(&prefix)) { + // Extra safety over the manual `migrate registry-gc`: also + // require the display root to be gone before auto-deleting. + if super::code_project_root_exists(&project) { + continue; + } + if seen.insert(project.project_id.clone()) { + stale_ids.push(project.project_id); + } + } + } + if stale_ids.is_empty() { + return; + } + report.purged_temp_registry_rows = global_db.delete_code_projects(&stale_ids).await; +} + +/// The system temp directory in both its literal and canonicalized spellings, +/// so registry rows recorded through a symlinked temp path still match. +fn temp_dir_prefixes() -> Vec { + let temp_dir = std::env::temp_dir(); + let mut prefixes = vec![temp_dir.clone()]; + if let Ok(canonical) = temp_dir.canonicalize() { + if !prefixes.contains(&canonical) { + prefixes.push(canonical); + } + } + prefixes +} + +/// Summarizes the doctor findings that are NOT safe to auto-apply so the user +/// sees them at the end of `tracedecay update` output. +async fn collect_remaining_findings( + global_db: &GlobalDb, + profile_root: &Path, + report: &mut HealthPassReport, +) { + let project_paths = global_db.list_project_paths().await; + let (orphan_count, issues) = super::orphan_store_manifest_report(profile_root, &project_paths); + report.warnings.extend(issues); + if orphan_count > 0 { + report.remaining_findings.push(format!( + "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" + )); + } + + let stale_rows = global_db + .list_code_projects(usize::MAX) + .await + .into_iter() + .filter(|project| !super::code_project_root_exists(project)) + .count(); + if stale_rows > 0 { + report.remaining_findings.push(format!( + "{stale_rows} stale code project registry row(s) outside the temp directory" + )); + } + + let drift = super::registry_drift::registry_drift_findings(global_db, profile_root).await; + if !drift.is_empty() { + report.remaining_findings.push(format!( + "{} registry/store manifest identity drift finding(s)", + drift.len() + )); + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use super::*; + + fn write_branch_meta(projects_root: &Path, project_id: &str, content: &str) -> PathBuf { + let shard = projects_root.join(project_id); + std::fs::create_dir_all(&shard).unwrap(); + let path = shard.join(BRANCH_META_FILENAME); + std::fs::write(&path, content).unwrap(); + path + } + + #[test] + fn quarantine_renames_only_corrupt_branch_meta() { + let dir = tempfile::TempDir::new().unwrap(); + let projects_root = dir.path().join("projects"); + let corrupt = write_branch_meta(&projects_root, "proj_corrupt", "{not valid json"); + let valid = write_branch_meta( + &projects_root, + "proj_valid", + r#"{"default_branch":"main","branches":{}}"#, + ); + + let mut report = HealthPassReport::default(); + quarantine_corrupt_branch_meta(dir.path(), &mut report); + + assert_eq!(report.quarantined_branch_meta.len(), 1); + assert!(report.warnings.is_empty()); + let quarantine = &report.quarantined_branch_meta[0]; + assert_eq!(quarantine.original, corrupt); + assert!(!corrupt.exists(), "corrupt file should be renamed away"); + assert_eq!( + std::fs::read_to_string(&quarantine.quarantined).unwrap(), + "{not valid json", + "quarantined file must preserve the corrupt content as evidence" + ); + assert!( + quarantine + .quarantined + .file_name() + .unwrap() + .to_string_lossy() + .starts_with("branch-meta.json.corrupt-"), + "quarantine name should be branch-meta.json.corrupt-: {quarantine:?}" + ); + assert!(valid.exists(), "valid branch-meta must be left untouched"); + } + + #[test] + fn quarantine_is_a_no_op_without_a_projects_dir() { + let dir = tempfile::TempDir::new().unwrap(); + let mut report = HealthPassReport::default(); + quarantine_corrupt_branch_meta(dir.path(), &mut report); + assert!(report.quarantined_branch_meta.is_empty()); + assert!(report.warnings.is_empty()); + } +} diff --git a/src/main.rs b/src/main.rs index 2b30e007..e057b04b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -454,17 +454,21 @@ where Ok(()) } -fn run_update_command() -> tracedecay::errors::Result<()> { +fn run_update_command(no_heal: bool) -> tracedecay::errors::Result<()> { run_update_steps( || tracedecay::upgrade::run_upgrade().map(|_| ()), - run_post_update_subcommand, + || run_post_update_subcommand(no_heal), ) } -fn run_post_update_subcommand() -> tracedecay::errors::Result<()> { +fn run_post_update_subcommand(no_heal: bool) -> tracedecay::errors::Result<()> { let tracedecay_bin = tracedecay_bin_on_path()?; - let status = std::process::Command::new(&tracedecay_bin) - .arg("post-update") + let mut command = std::process::Command::new(&tracedecay_bin); + command.arg("post-update"); + if no_heal { + command.arg("--no-heal"); + } + let status = command .status() .map_err(|e| tracedecay::errors::TraceDecayError::Config { message: format!("failed to run post-update with '{tracedecay_bin}': {e}"), @@ -477,11 +481,18 @@ fn run_post_update_subcommand() -> tracedecay::errors::Result<()> { }) } -fn run_post_update_tasks() -> tracedecay::errors::Result<()> { +async fn run_post_update_tasks(no_heal: bool) -> tracedecay::errors::Result<()> { refresh_generated_plugins()?; if let Err(error) = refresh_daemon_service() { eprintln!(" \x1b[33mwarning:\x1b[0m daemon service refresh failed: {error}"); } + if no_heal { + eprintln!("Skipping post-update health pass (--no-heal)."); + } else { + // Failure-tolerant by construction: the health pass collects every + // error as a printed warning and never fails the update. + tracedecay::doctor::heal::run_post_update_health_pass().await; + } Ok(()) } @@ -774,11 +785,11 @@ async fn dispatch_command(command: Commands) -> tracedecay::errors::Result<()> { Commands::Upgrade => { tracedecay::upgrade::run_upgrade()?; } - Commands::Update => { - run_update_command()?; + Commands::Update { no_heal } => { + run_update_command(no_heal)?; } - Commands::PostUpdate => { - run_post_update_tasks()?; + Commands::PostUpdate { no_heal } => { + run_post_update_tasks(no_heal).await?; } Commands::Channel { channel } => match channel { Some(target) => { @@ -911,8 +922,8 @@ fn should_skip_startup_maintenance(command: &Commands) -> bool { Commands::Install { .. } | Commands::Reinstall | Commands::UpdatePlugin - | Commands::Update - | Commands::PostUpdate + | Commands::Update { .. } + | Commands::PostUpdate { .. } | Commands::Uninstall { .. } | Commands::Lsp { .. } | Commands::Doctor { .. } @@ -976,8 +987,8 @@ fn should_skip_agent_install_maintenance(command: &Commands) -> bool { | Commands::Install { .. } | Commands::Reinstall | Commands::UpdatePlugin - | Commands::Update - | Commands::PostUpdate + | Commands::Update { .. } + | Commands::PostUpdate { .. } | Commands::Uninstall { .. } | Commands::Lsp { .. } | Commands::Doctor { .. } diff --git a/src/migrate/registry.rs b/src/migrate/registry.rs index cddfd742..2fc907eb 100644 --- a/src/migrate/registry.rs +++ b/src/migrate/registry.rs @@ -4,7 +4,9 @@ use std::path::{Component, Path, PathBuf}; use serde::Serialize; use crate::branch_meta; -use crate::global_db::{GlobalDb, GraphScopeUpsert, StoreArtifactUpsert, StoreInstanceUpsert}; +use crate::global_db::{ + CodeProjectRecord, GlobalDb, GraphScopeUpsert, StoreArtifactUpsert, StoreInstanceUpsert, +}; use crate::storage::{ read_store_manifest, validate_project_id, StorageMode, StoreKind, STORE_MANIFEST_FILENAME, STORE_MANIFEST_SCHEMA_VERSION, @@ -128,6 +130,23 @@ pub async fn apply_registry_reconstruction_report( } } +/// Filters registry rows whose canonical project root no longer exists on +/// disk, optionally scoped to roots under `prefix`. Shared by +/// `tracedecay migrate registry-gc` and the post-update health pass so both +/// agree on what counts as a GC candidate. +pub fn stale_code_projects( + projects: Vec, + prefix: Option<&Path>, +) -> Vec { + projects + .into_iter() + .filter(|project| { + prefix.is_none_or(|prefix| Path::new(&project.canonical_root).starts_with(prefix)) + }) + .filter(|project| !Path::new(&project.canonical_root).exists()) + .collect() +} + pub fn scan_profile_store_manifests( profile_root: &Path, verified_at: i64, diff --git a/src/startup_tests.rs b/src/startup_tests.rs index 06bc8e36..e4b21df4 100644 --- a/src/startup_tests.rs +++ b/src/startup_tests.rs @@ -31,8 +31,12 @@ fn explicit_agent_config_commands_skip_startup_maintenance() { })); assert!(should_skip_startup_maintenance(&Commands::Reinstall)); assert!(should_skip_startup_maintenance(&Commands::UpdatePlugin)); - assert!(should_skip_startup_maintenance(&Commands::Update)); - assert!(should_skip_startup_maintenance(&Commands::PostUpdate)); + assert!(should_skip_startup_maintenance(&Commands::Update { + no_heal: false + })); + assert!(should_skip_startup_maintenance(&Commands::PostUpdate { + no_heal: false + })); assert!(should_skip_startup_maintenance(&Commands::Uninstall { agent: Some("kiro".to_string()), profile: None, @@ -78,8 +82,12 @@ fn agent_install_maintenance_is_selective() { assert!(should_skip_agent_install_maintenance( &Commands::UpdatePlugin )); - assert!(should_skip_agent_install_maintenance(&Commands::Update)); - assert!(should_skip_agent_install_maintenance(&Commands::PostUpdate)); + assert!(should_skip_agent_install_maintenance(&Commands::Update { + no_heal: false + })); + assert!(should_skip_agent_install_maintenance( + &Commands::PostUpdate { no_heal: false } + )); assert!(should_skip_agent_install_maintenance(&Commands::Tool { project: None, name: Some("message_search".to_string()), diff --git a/tests/update_health_pass_test.rs b/tests/update_health_pass_test.rs new file mode 100644 index 00000000..7db00974 --- /dev/null +++ b/tests/update_health_pass_test.rs @@ -0,0 +1,293 @@ +//! Integration tests for the post-update health pass that runs at the end of +//! `tracedecay update` / `tracedecay post-update` (skippable via `--no-heal`). +//! +//! All tests spawn the real binary against an isolated home directory via +//! `apply_tracedecay_home_env`, so they never touch the real `~/.tracedecay`. + +use std::path::{Path, PathBuf}; +use std::process::{Command, Output, Stdio}; +use std::time::{Duration, Instant}; + +mod common; + +use common::apply_tracedecay_home_env; +use tempfile::TempDir; +use tracedecay::global_db::GlobalDb; + +fn canonical_temp_path(path: &Path) -> PathBuf { + #[cfg(windows)] + { + path.to_path_buf() + } + #[cfg(not(windows))] + { + path.canonicalize().unwrap_or_else(|_| path.to_path_buf()) + } +} + +fn cli_timeout() -> Duration { + if cfg!(windows) { + Duration::from_secs(90) + } else { + Duration::from_secs(30) + } +} + +/// `post-update` requires `tracedecay` on PATH (for the plugin refresh), so +/// link the test binary into an isolated bin dir and prepend it to PATH. +fn add_tracedecay_path_shim(command: &mut Command, home: &Path) { + let bin_dir = home.join("bin"); + std::fs::create_dir_all(&bin_dir).unwrap(); + let shim = bin_dir.join(if cfg!(windows) { + "tracedecay.exe" + } else { + "tracedecay" + }); + if std::fs::hard_link(env!("CARGO_BIN_EXE_tracedecay"), &shim).is_err() { + std::fs::copy(env!("CARGO_BIN_EXE_tracedecay"), &shim).unwrap(); + } + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut permissions = std::fs::metadata(&shim).unwrap().permissions(); + permissions.set_mode(0o755); + std::fs::set_permissions(&shim, permissions).unwrap(); + } + let path = std::env::var_os("PATH").unwrap_or_default(); + let joined = + std::env::join_paths(std::iter::once(bin_dir).chain(std::env::split_paths(&path))).unwrap(); + command.env("PATH", joined); +} + +fn post_update_command(home: &Path) -> Command { + let mut command = Command::new(env!("CARGO_BIN_EXE_tracedecay")); + apply_tracedecay_home_env(&mut command, home); + add_tracedecay_path_shim(&mut command, home); + command + .current_dir(home) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + command +} + +fn run_with_timeout(mut command: Command, timeout: Duration) -> Output { + let mut child = command + .spawn() + .unwrap_or_else(|e| panic!("failed to spawn tracedecay: {e}")); + let started = Instant::now(); + loop { + if child + .try_wait() + .unwrap_or_else(|e| panic!("failed to poll child: {e}")) + .is_some() + { + return child + .wait_with_output() + .unwrap_or_else(|e| panic!("failed to collect output: {e}")); + } + if started.elapsed() >= timeout { + let _ = child.kill(); + let output = child + .wait_with_output() + .unwrap_or_else(|e| panic!("failed to collect timed-out output: {e}")); + panic!( + "tracedecay post-update hung after {:?}\nstdout:\n{}\nstderr:\n{}", + started.elapsed(), + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + } + std::thread::sleep(Duration::from_millis(50)); + } +} + +fn assert_success(output: &Output, what: &str) { + assert!( + output.status.success(), + "{what} should succeed\nstdout:\n{}\nstderr:\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); +} + +fn write_branch_meta(profile_root: &Path, project_id: &str, content: &str) -> PathBuf { + let shard = profile_root.join("projects").join(project_id); + std::fs::create_dir_all(&shard).unwrap(); + let path = shard.join("branch-meta.json"); + std::fs::write(&path, content).unwrap(); + path +} + +fn quarantined_branch_meta_files(shard: &Path) -> Vec { + std::fs::read_dir(shard) + .unwrap() + .flatten() + .map(|entry| entry.path()) + .filter(|path| { + path.file_name().is_some_and(|name| { + name.to_string_lossy() + .starts_with("branch-meta.json.corrupt-") + }) + }) + .collect() +} + +#[test] +fn post_update_quarantines_corrupt_branch_meta() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + let corrupt = write_branch_meta(&profile_root, "proj_corrupt", "{not valid json"); + let valid = write_branch_meta( + &profile_root, + "proj_valid", + r#"{"default_branch":"main","branches":{}}"#, + ); + + let mut command = post_update_command(&home_root); + command.arg("post-update"); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update"); + assert!( + !corrupt.exists(), + "corrupt branch-meta.json should be quarantined away" + ); + let quarantined = quarantined_branch_meta_files(corrupt.parent().unwrap()); + assert_eq!( + quarantined.len(), + 1, + "exactly one quarantine file expected, got {quarantined:?}" + ); + assert_eq!( + std::fs::read_to_string(&quarantined[0]).unwrap(), + "{not valid json", + "quarantine must preserve the corrupt content as evidence" + ); + assert_eq!( + std::fs::read_to_string(&valid).unwrap(), + r#"{"default_branch":"main","branches":{}}"#, + "valid branch-meta.json must be left untouched" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Quarantined 1 corrupt branch metadata file(s)"), + "stderr should report the quarantine\nstderr:\n{stderr}" + ); +} + +#[test] +fn post_update_no_heal_skips_health_pass() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + let corrupt = write_branch_meta(&profile_root, "proj_corrupt", "{not valid json"); + + let mut command = post_update_command(&home_root); + command.args(["post-update", "--no-heal"]); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update --no-heal"); + assert!( + corrupt.exists(), + "--no-heal must leave the corrupt branch-meta.json in place" + ); + assert_eq!( + std::fs::read_to_string(&corrupt).unwrap(), + "{not valid json" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Skipping post-update health pass (--no-heal)"), + "stderr should say the health pass was skipped\nstderr:\n{stderr}" + ); + assert!( + !stderr.contains("Post-update health pass"), + "the health pass must not run with --no-heal\nstderr:\n{stderr}" + ); +} + +#[tokio::test] +async fn post_update_gcs_stale_registry_rows_under_temp_dir_only() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + std::fs::create_dir_all(&profile_root).unwrap(); + + // The child's system temp directory is pinned to a dir inside the test + // home, so the GC scope is fully controlled by the fixture. + let fake_tmp = home_root.join("fake-tmp"); + std::fs::create_dir_all(&fake_tmp).unwrap(); + let fake_tmp = canonical_temp_path(&fake_tmp); + let live_root = fake_tmp.join("live-project"); + std::fs::create_dir_all(&live_root).unwrap(); + + { + let db = GlobalDb::open_at(&profile_root.join("global.db")) + .await + .expect("global db should open"); + db.upsert_code_project( + "proj_tmp_gone", + &fake_tmp.join("gone-project"), + None, + None, + Some("main"), + ) + .await + .expect("stale temp project should upsert"); + db.upsert_code_project( + "proj_elsewhere_gone", + &home_root.join("gone-elsewhere"), + None, + None, + Some("main"), + ) + .await + .expect("stale non-temp project should upsert"); + db.upsert_code_project("proj_tmp_live", &live_root, None, None, Some("main")) + .await + .expect("live temp project should upsert"); + } + + let mut command = post_update_command(&home_root); + command + .arg("post-update") + .env("TMPDIR", &fake_tmp) + .env("TMP", &fake_tmp) + .env("TEMP", &fake_tmp); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update"); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Purged 1 stale temp-root registry row(s)"), + "stderr should report the temp-root registry GC\nstderr:\n{stderr}" + ); + + let db = GlobalDb::open_at(&profile_root.join("global.db")) + .await + .expect("global db should reopen"); + let remaining: Vec = db + .list_code_projects(usize::MAX) + .await + .into_iter() + .map(|project| project.project_id) + .collect(); + assert!( + !remaining.contains(&"proj_tmp_gone".to_string()), + "stale temp-root row must be purged, remaining: {remaining:?}" + ); + assert!( + remaining.contains(&"proj_elsewhere_gone".to_string()), + "stale row outside the temp dir must be surfaced, not auto-purged: {remaining:?}" + ); + assert!( + remaining.contains(&"proj_tmp_live".to_string()), + "temp-root row with an existing project root must survive: {remaining:?}" + ); + assert!( + stderr.contains("stale code project registry row(s) outside the temp directory"), + "remaining stale rows should be surfaced in the findings summary\nstderr:\n{stderr}" + ); +} From d7c773bfbf6cdfc302309cf7c45360f4af5fa7a6 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 00:25:12 +0000 Subject: [PATCH 05/11] refactor: deslop and simplify self-improvement branch --- src/automation/session_reflector.rs | 21 +++++++++------------ src/branch_meta.rs | 3 +-- src/doctor/heal.rs | 4 ++-- src/main.rs | 2 -- src/storage.rs | 14 ++++---------- tests/update_health_pass_test.rs | 7 ++++--- 6 files changed, 20 insertions(+), 31 deletions(-) diff --git a/src/automation/session_reflector.rs b/src/automation/session_reflector.rs index 8f92791b..faf6b2a9 100644 --- a/src/automation/session_reflector.rs +++ b/src/automation/session_reflector.rs @@ -129,17 +129,14 @@ async fn validate_fact_proposal( "entities must be an array of strings", )); }; - let trust = match object.get("trust") { - Some(value) => match proposal_trust_value(value) { - Some(trust) => Some(trust), - None => { - return Ok(rejected_fact( - proposal, - "trust must be a number between 0 and 1, or one of low, medium, high", - )) - } - }, - None => return Ok(rejected_fact(proposal, "trust is required")), + let Some(trust) = object.get("trust") else { + return Ok(rejected_fact(proposal, "trust is required")); + }; + let Some(trust) = proposal_trust_value(trust) else { + return Ok(rejected_fact( + proposal, + "trust must be a number between 0 and 1, or one of low, medium, high", + )); }; if object.contains_key("confidence") { return Ok(rejected_fact( @@ -209,7 +206,7 @@ async fn validate_fact_proposal( source: Some("session_reflector".to_string()), tags, entities, - trust, + trust: Some(trust), metadata: json!({ "source": "session_reflector", "source_span": source_span, diff --git a/src/branch_meta.rs b/src/branch_meta.rs index 1e66c084..82f47288 100644 --- a/src/branch_meta.rs +++ b/src/branch_meta.rs @@ -8,8 +8,7 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -/// Filename of the branch metadata file inside a project data dir. -pub const BRANCH_META_FILENAME: &str = "branch-meta.json"; +const BRANCH_META_FILENAME: &str = "branch-meta.json"; /// Metadata for a single tracked branch. #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/src/doctor/heal.rs b/src/doctor/heal.rs index 5c2ea838..0232dae2 100644 --- a/src/doctor/heal.rs +++ b/src/doctor/heal.rs @@ -19,9 +19,9 @@ use std::collections::HashSet; use std::path::{Path, PathBuf}; -use crate::branch_meta::BRANCH_META_FILENAME; use crate::global_db::GlobalDb; use crate::migrate::registry::stale_code_projects; +use crate::storage::BRANCH_META_FILENAME; /// A corrupt `branch-meta.json` that was renamed out of the way. #[derive(Debug, Clone, PartialEq, Eq)] @@ -111,7 +111,7 @@ fn print_warnings(warnings: &[String]) { /// Renames every `branch-meta.json` under `/projects/*` that is /// not valid JSON to `branch-meta.json.corrupt-`, preserving the /// corrupt content as evidence while restoring the single-DB fallback. -pub fn quarantine_corrupt_branch_meta(profile_root: &Path, report: &mut HealthPassReport) { +fn quarantine_corrupt_branch_meta(profile_root: &Path, report: &mut HealthPassReport) { let projects_root = profile_root.join("projects"); let Ok(entries) = std::fs::read_dir(&projects_root) else { return; diff --git a/src/main.rs b/src/main.rs index e057b04b..be98b186 100644 --- a/src/main.rs +++ b/src/main.rs @@ -489,8 +489,6 @@ async fn run_post_update_tasks(no_heal: bool) -> tracedecay::errors::Result<()> if no_heal { eprintln!("Skipping post-update health pass (--no-heal)."); } else { - // Failure-tolerant by construction: the health pass collects every - // error as a printed warning and never fails the update. tracedecay::doctor::heal::run_post_update_health_pass().await; } Ok(()) diff --git a/src/storage.rs b/src/storage.rs index caea2bb4..b08f208b 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -1,5 +1,5 @@ use std::fs; -use std::io; +use std::io::{self, Write}; use std::path::{Component, Path, PathBuf}; use serde::{Deserialize, Serialize}; @@ -467,21 +467,15 @@ impl PrivateStoreIo { /// processes never interleave partial lines or lose each other's entries /// the way read-modify-rewrite appends do. pub fn append_line(path: &Path, line: &str) -> io::Result<()> { - use io::Write; - if let Some(parent) = path.parent() { Self::create_dir_all(parent)?; } reject_symlink_components(path, "private store file")?; - let mut payload = String::with_capacity(line.len() + 1); - payload.push_str(line); - payload.push('\n'); - let mut file = fs::OpenOptions::new() + fs::OpenOptions::new() .create(true) .append(true) - .open(path)?; - file.write_all(payload.as_bytes())?; - drop(file); + .open(path)? + .write_all(format!("{line}\n").as_bytes())?; set_private_file_permissions(path) } diff --git a/tests/update_health_pass_test.rs b/tests/update_health_pass_test.rs index 7db00974..d54308f1 100644 --- a/tests/update_health_pass_test.rs +++ b/tests/update_health_pass_test.rs @@ -4,12 +4,14 @@ //! All tests spawn the real binary against an isolated home directory via //! `apply_tracedecay_home_env`, so they never touch the real `~/.tracedecay`. +mod common; + +#[cfg(unix)] +use std::os::unix::fs::PermissionsExt; use std::path::{Path, PathBuf}; use std::process::{Command, Output, Stdio}; use std::time::{Duration, Instant}; -mod common; - use common::apply_tracedecay_home_env; use tempfile::TempDir; use tracedecay::global_db::GlobalDb; @@ -48,7 +50,6 @@ fn add_tracedecay_path_shim(command: &mut Command, home: &Path) { } #[cfg(unix)] { - use std::os::unix::fs::PermissionsExt; let mut permissions = std::fs::metadata(&shim).unwrap().permissions(); permissions.set_mode(0o755); std::fs::set_permissions(&shim, permissions).unwrap(); From 24f344ff9e4e9c031945239702b150b8e18cf910 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 00:59:14 +0000 Subject: [PATCH 06/11] fix: align heal branch-meta corruption check with loader schema branch_meta now owns the one canonical parse used by both load_branch_meta and the post-update heal quarantine, so schema-corrupt files (valid JSON, wrong shape) are quarantined instead of warning on every open. Restructures the health pass into compute/render, fetches the registry once, makes stale_code_projects borrow with a named StaleRootScope predicate, adds a shared 0o600-at-create private-open helper in PrivateStoreIo, and documents the heal-by-default policy. Adds unit + integration tests for the schema-corrupt quarantine path. --- src/branch_meta.rs | 23 ++- src/commands.rs | 7 +- src/doctor.rs | 5 +- src/doctor/heal.rs | 256 +++++++++++++++++++------------ src/migrate/registry.rs | 48 ++++-- src/storage.rs | 32 +++- tests/update_health_pass_test.rs | 40 ++++- 7 files changed, 290 insertions(+), 121 deletions(-) diff --git a/src/branch_meta.rs b/src/branch_meta.rs index 82f47288..08fcb25c 100644 --- a/src/branch_meta.rs +++ b/src/branch_meta.rs @@ -8,7 +8,7 @@ use std::path::{Path, PathBuf}; use serde::{Deserialize, Serialize}; -const BRANCH_META_FILENAME: &str = "branch-meta.json"; +use crate::storage::BRANCH_META_FILENAME; /// Metadata for a single tracked branch. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -115,6 +115,17 @@ impl BranchMeta { } } +/// Parses `branch-meta.json` content into [`BranchMeta`]. +/// +/// This is the canonical definition of "corrupt branch metadata": anything +/// this rejects — invalid JSON *or* valid JSON with the wrong schema — makes +/// the runtime fall back to single-DB mode. Every consumer (loading at +/// runtime, quarantining in the post-update health pass) must go through this +/// one predicate so they agree on what corrupt means. +pub fn parse(content: &str) -> serde_json::Result { + serde_json::from_str(content) +} + /// Loads branch metadata from `branch-meta.json` in the project data dir. /// /// Returns `None` if the file doesn't exist (single-DB mode / pre-branch projects). @@ -122,7 +133,7 @@ impl BranchMeta { pub fn load_branch_meta(data_dir: &Path) -> Option { let path = data_dir.join(BRANCH_META_FILENAME); let content = std::fs::read_to_string(&path).ok()?; - match serde_json::from_str(&content) { + match parse(&content) { Ok(meta) => Some(meta), Err(e) => { eprintln!( @@ -215,6 +226,14 @@ mod tests { assert!(meta.remove_branch("main").is_none()); } + #[test] + fn parse_rejects_schema_mismatch_as_corrupt() { + assert!(parse(r#"{"default_branch":"main","branches":{}}"#).is_ok()); + assert!(parse("{not valid json").is_err()); + assert!(parse(r#"{"default_branch": 5}"#).is_err()); + assert!(parse("[]").is_err()); + } + #[test] fn roundtrip_json() { let mut meta = BranchMeta::new("main"); diff --git a/src/commands.rs b/src/commands.rs index fe93a097..6c1cba85 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -352,10 +352,11 @@ pub(crate) async fn handle_migrate_action(action: MigrateAction) -> tracedecay:: message: "could not open global DB for registry cleanup".to_string(), })?; let projects = global_db.list_code_projects(usize::MAX).await; - let prefix_path = prefix.as_deref().map(PathBuf::from); + let prefixes: Vec = prefix.iter().map(PathBuf::from).collect(); let stale = tracedecay::migrate::registry::stale_code_projects( - projects, - prefix_path.as_deref(), + &projects, + &prefixes, + tracedecay::migrate::registry::StaleRootScope::CanonicalRootMissing, ); let deleted = if apply { let project_ids: Vec = stale diff --git a/src/doctor.rs b/src/doctor.rs index bac63622..c612cba8 100644 --- a/src/doctor.rs +++ b/src/doctor.rs @@ -7,6 +7,7 @@ use std::path::{Component, Path, PathBuf}; use crate::agents::{self, DoctorCounters, HealthcheckContext}; use crate::display::{format_bytes, format_token_count}; +use crate::migrate::registry::code_project_root_exists; use crate::tracedecay::TraceDecay; pub mod heal; @@ -311,10 +312,6 @@ async fn check_stale_code_projects(dc: &mut DoctorCounters, gdb: &crate::global_ )); } -fn code_project_root_exists(project: &crate::global_db::CodeProjectRecord) -> bool { - Path::new(&project.canonical_root).exists() || Path::new(&project.display_root).exists() -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum DoctorStorageStatus { RepoLocal, diff --git a/src/doctor/heal.rs b/src/doctor/heal.rs index 0232dae2..13490187 100644 --- a/src/doctor/heal.rs +++ b/src/doctor/heal.rs @@ -1,27 +1,35 @@ //! Post-update health pass: safe, automatic repairs plus a concise summary //! of the doctor findings that still need a human decision. //! -//! Runs at the end of `tracedecay update` / `tracedecay post-update` (skip -//! with `--no-heal`). Every step is failure-tolerant: a failing check prints -//! a warning but never fails the update itself. Only remedies that are safe -//! to automate are applied: +//! Runs at the end of `tracedecay update` / `tracedecay post-update`. Running +//! by default (opt-out via `--no-heal`) is intentional product policy: the +//! hidden `post-update` subcommand fires from the self-update re-exec path, +//! so every successful `tracedecay update` heals the store unless the user +//! explicitly skips it. Every step is failure-tolerant: a failing check +//! prints a warning but never fails the update itself. Only remedies that +//! are safe to automate are applied: //! -//! - corrupt `branch-meta.json` files are quarantined (renamed to -//! `branch-meta.json.corrupt-`), which preserves evidence while -//! restoring the silent single-DB fallback, +//! - corrupt `branch-meta.json` files (anything [`crate::branch_meta::parse`] +//! rejects) are quarantined — renamed to +//! `branch-meta.json.corrupt-`, never deleted — preserving the +//! evidence while restoring the silent single-DB fallback, //! - registry rows whose project root no longer exists AND lives under the //! system temp directory are purged (the automated equivalent of -//! `tracedecay migrate registry-gc --prefix --apply`). +//! `tracedecay migrate registry-gc --prefix --apply`), and only when +//! BOTH the canonical and display roots are gone. +//! +//! Those auto-applied remedies are safe precisely because quarantine renames +//! instead of deleting and the GC removes only temp-rooted registry metadata +//! whose every known root has vanished — no user data is ever destroyed. //! //! Everything else (orphan store manifests, stale rows outside the temp //! directory, registry/manifest identity drift) is only reported. -use std::collections::HashSet; use std::path::{Path, PathBuf}; -use crate::global_db::GlobalDb; -use crate::migrate::registry::stale_code_projects; -use crate::storage::BRANCH_META_FILENAME; +use crate::global_db::{CodeProjectRecord, GlobalDb}; +use crate::migrate::registry::{code_project_root_exists, stale_code_projects, StaleRootScope}; +use crate::storage::{BRANCH_META_FILENAME, BRANCH_META_QUARANTINE_PREFIX}; /// A corrupt `branch-meta.json` that was renamed out of the way. #[derive(Debug, Clone, PartialEq, Eq)] @@ -34,7 +42,8 @@ pub struct BranchMetaQuarantine { #[derive(Debug, Default)] pub struct HealthPassReport { pub quarantined_branch_meta: Vec, - pub purged_temp_registry_rows: usize, + /// `None` when the global DB could not be opened, so the GC never ran. + pub purged_temp_registry_rows: Option, pub remaining_findings: Vec, pub warnings: Vec, } @@ -44,18 +53,55 @@ pub struct HealthPassReport { /// Never fails: every error is collected as a warning so a broken store can /// never abort `tracedecay update`. pub async fn run_post_update_health_pass() -> HealthPassReport { - let mut report = HealthPassReport::default(); eprintln!("\n\x1b[1mPost-update health pass\x1b[0m (skip with --no-heal)"); let Some(profile_root) = crate::config::user_data_dir() else { + let report = HealthPassReport { + warnings: vec!["could not determine the profile data directory".to_string()], + ..HealthPassReport::default() + }; + render_warnings(&report.warnings); + return report; + }; + + let report = compute_health_pass_report(&profile_root).await; + render_health_pass_report(&report); + report +} + +/// Applies the safe remedies and gathers everything the pass has to say into +/// a [`HealthPassReport`], without printing anything. +async fn compute_health_pass_report(profile_root: &Path) -> HealthPassReport { + let mut report = HealthPassReport::default(); + + let (quarantined, warnings) = quarantine_corrupt_branch_meta(profile_root); + report.quarantined_branch_meta = quarantined; + report.warnings.extend(warnings); + + // Opening the global DB applies its idempotent schema migrations — the + // same lazy upgrade every normal open path performs. + let Some(global_db) = GlobalDb::open().await else { report .warnings - .push("could not determine the profile data directory".to_string()); - print_warnings(&report.warnings); + .push("could not open the global DB for the health pass".to_string()); return report; }; - quarantine_corrupt_branch_meta(&profile_root, &mut report); + // One registry snapshot for the whole pass: the GC and the remaining + // findings below both work from this list. + let projects = global_db.list_code_projects(usize::MAX).await; + let (purged, purged_ids) = gc_stale_temp_registry_rows(&global_db, &projects).await; + report.purged_temp_registry_rows = Some(purged); + + let (findings, warnings) = + collect_remaining_findings(&global_db, profile_root, &projects, &purged_ids).await; + report.remaining_findings = findings; + report.warnings.extend(warnings); + report +} + +/// Prints the doctor-style summary for a computed report. +fn render_health_pass_report(report: &HealthPassReport) { if report.quarantined_branch_meta.is_empty() { eprintln!(" \x1b[32m✔\x1b[0m No corrupt branch metadata files"); } else { @@ -68,26 +114,12 @@ pub async fn run_post_update_health_pass() -> HealthPassReport { } } - // Opening the global DB applies its idempotent schema migrations — the - // same lazy upgrade every normal open path performs. - match GlobalDb::open().await { - Some(global_db) => { - gc_stale_temp_registry_rows(&global_db, &mut report).await; - if report.purged_temp_registry_rows > 0 { - eprintln!( - " \x1b[32m✔\x1b[0m Purged {} stale temp-root registry row(s)", - report.purged_temp_registry_rows - ); - } else { - eprintln!(" \x1b[32m✔\x1b[0m No stale temp-root registry rows"); - } - collect_remaining_findings(&global_db, &profile_root, &mut report).await; - } - None => { - report - .warnings - .push("could not open the global DB for the health pass".to_string()); + match report.purged_temp_registry_rows { + Some(0) => eprintln!(" \x1b[32m✔\x1b[0m No stale temp-root registry rows"), + Some(purged) => { + eprintln!(" \x1b[32m✔\x1b[0m Purged {purged} stale temp-root registry row(s)"); } + None => {} } if report.remaining_findings.is_empty() { @@ -98,23 +130,28 @@ pub async fn run_post_update_health_pass() -> HealthPassReport { eprintln!(" • {finding}"); } } - print_warnings(&report.warnings); - report + render_warnings(&report.warnings); } -fn print_warnings(warnings: &[String]) { +fn render_warnings(warnings: &[String]) { for warning in warnings { eprintln!(" \x1b[33mwarning:\x1b[0m health pass: {warning}"); } } -/// Renames every `branch-meta.json` under `/projects/*` that is -/// not valid JSON to `branch-meta.json.corrupt-`, preserving the -/// corrupt content as evidence while restoring the single-DB fallback. -fn quarantine_corrupt_branch_meta(profile_root: &Path, report: &mut HealthPassReport) { +/// Renames every `branch-meta.json` under `/projects/*` that +/// [`crate::branch_meta::parse`] rejects — the runtime's own definition of +/// corrupt, covering both invalid JSON and schema mismatches — to +/// `branch-meta.json.corrupt-`, preserving the corrupt content as +/// evidence while restoring the single-DB fallback. +/// +/// Returns the performed quarantines and any warnings. +fn quarantine_corrupt_branch_meta(profile_root: &Path) -> (Vec, Vec) { + let mut quarantines = Vec::new(); + let mut warnings = Vec::new(); let projects_root = profile_root.join("projects"); let Ok(entries) = std::fs::read_dir(&projects_root) else { - return; + return (quarantines, warnings); }; let mut meta_paths: Vec = entries .flatten() @@ -128,52 +165,52 @@ fn quarantine_corrupt_branch_meta(profile_root: &Path, report: &mut HealthPassRe let content = match std::fs::read_to_string(&path) { Ok(content) => content, Err(err) => { - report - .warnings - .push(format!("could not read '{}': {err}", path.display())); + warnings.push(format!("could not read '{}': {err}", path.display())); continue; } }; - if serde_json::from_str::(&content).is_ok() { + if crate::branch_meta::parse(&content).is_ok() { continue; } - let quarantined = path.with_file_name(format!("{BRANCH_META_FILENAME}.corrupt-{now}")); + let quarantined = path.with_file_name(format!("{BRANCH_META_QUARANTINE_PREFIX}{now}")); match std::fs::rename(&path, &quarantined) { - Ok(()) => report.quarantined_branch_meta.push(BranchMetaQuarantine { + Ok(()) => quarantines.push(BranchMetaQuarantine { original: path, quarantined, }), - Err(err) => report.warnings.push(format!( + Err(err) => warnings.push(format!( "could not quarantine corrupt '{}': {err}", path.display() )), } } + (quarantines, warnings) } -/// Purges registry rows whose canonical project root is gone AND lives under -/// the system temp directory. This is the only registry GC scope that is safe -/// to run without review. -async fn gc_stale_temp_registry_rows(global_db: &GlobalDb, report: &mut HealthPassReport) { - let projects = global_db.list_code_projects(usize::MAX).await; - let mut seen: HashSet = HashSet::new(); - let mut stale_ids: Vec = Vec::new(); - for prefix in temp_dir_prefixes() { - for project in stale_code_projects(projects.clone(), Some(&prefix)) { - // Extra safety over the manual `migrate registry-gc`: also - // require the display root to be gone before auto-deleting. - if super::code_project_root_exists(&project) { - continue; - } - if seen.insert(project.project_id.clone()) { - stale_ids.push(project.project_id); - } - } - } +/// Purges registry rows in the auto-GC scope: canonical root under the +/// system temp directory AND every known root gone +/// ([`StaleRootScope::AllRootsMissing`]) — the only registry GC scope that is +/// safe to run without review. +/// +/// Returns the purged row count plus the candidate ids, so the remaining +/// findings can exclude them from the shared pre-purge registry snapshot. +async fn gc_stale_temp_registry_rows( + global_db: &GlobalDb, + projects: &[CodeProjectRecord], +) -> (usize, Vec) { + let stale_ids: Vec = stale_code_projects( + projects, + &temp_dir_prefixes(), + StaleRootScope::AllRootsMissing, + ) + .into_iter() + .map(|project| project.project_id.clone()) + .collect(); if stale_ids.is_empty() { - return; + return (0, stale_ids); } - report.purged_temp_registry_rows = global_db.delete_code_projects(&stale_ids).await; + let purged = global_db.delete_code_projects(&stale_ids).await; + (purged, stale_ids) } /// The system temp directory in both its literal and canonicalized spellings, @@ -190,40 +227,45 @@ fn temp_dir_prefixes() -> Vec { } /// Summarizes the doctor findings that are NOT safe to auto-apply so the user -/// sees them at the end of `tracedecay update` output. +/// sees them at the end of `tracedecay update` output. `projects` is the +/// pre-purge registry snapshot; rows in `purged_ids` are skipped. +/// +/// Returns the findings and any warnings. async fn collect_remaining_findings( global_db: &GlobalDb, profile_root: &Path, - report: &mut HealthPassReport, -) { + projects: &[CodeProjectRecord], + purged_ids: &[String], +) -> (Vec, Vec) { + let mut findings = Vec::new(); let project_paths = global_db.list_project_paths().await; - let (orphan_count, issues) = super::orphan_store_manifest_report(profile_root, &project_paths); - report.warnings.extend(issues); + let (orphan_count, warnings) = + super::orphan_store_manifest_report(profile_root, &project_paths); if orphan_count > 0 { - report.remaining_findings.push(format!( + findings.push(format!( "{orphan_count} orphan profile store manifest(s) can reconstruct registry rows" )); } - let stale_rows = global_db - .list_code_projects(usize::MAX) - .await - .into_iter() - .filter(|project| !super::code_project_root_exists(project)) + let stale_rows = projects + .iter() + .filter(|project| !purged_ids.contains(&project.project_id)) + .filter(|project| !code_project_root_exists(project)) .count(); if stale_rows > 0 { - report.remaining_findings.push(format!( + findings.push(format!( "{stale_rows} stale code project registry row(s) outside the temp directory" )); } let drift = super::registry_drift::registry_drift_findings(global_db, profile_root).await; if !drift.is_empty() { - report.remaining_findings.push(format!( + findings.push(format!( "{} registry/store manifest identity drift finding(s)", drift.len() )); } + (findings, warnings) } #[cfg(test)] @@ -250,12 +292,11 @@ mod tests { r#"{"default_branch":"main","branches":{}}"#, ); - let mut report = HealthPassReport::default(); - quarantine_corrupt_branch_meta(dir.path(), &mut report); + let (quarantines, warnings) = quarantine_corrupt_branch_meta(dir.path()); - assert_eq!(report.quarantined_branch_meta.len(), 1); - assert!(report.warnings.is_empty()); - let quarantine = &report.quarantined_branch_meta[0]; + assert_eq!(quarantines.len(), 1); + assert!(warnings.is_empty()); + let quarantine = &quarantines[0]; assert_eq!(quarantine.original, corrupt); assert!(!corrupt.exists(), "corrupt file should be renamed away"); assert_eq!( @@ -269,18 +310,43 @@ mod tests { .file_name() .unwrap() .to_string_lossy() - .starts_with("branch-meta.json.corrupt-"), + .starts_with(BRANCH_META_QUARANTINE_PREFIX), "quarantine name should be branch-meta.json.corrupt-: {quarantine:?}" ); assert!(valid.exists(), "valid branch-meta must be left untouched"); } + #[test] + fn quarantine_treats_schema_mismatch_as_corrupt() { + let dir = tempfile::TempDir::new().unwrap(); + let projects_root = dir.path().join("projects"); + // Valid JSON, but not a valid BranchMeta — the runtime warns + // "corrupt" on every open, so the health pass must agree. + let schema_corrupt = + write_branch_meta(&projects_root, "proj_schema", r#"{"default_branch": 5}"#); + + let (quarantines, warnings) = quarantine_corrupt_branch_meta(dir.path()); + + assert!(warnings.is_empty()); + assert_eq!( + quarantines.len(), + 1, + "schema-corrupt branch-meta must be quarantined: {quarantines:?}" + ); + assert_eq!(quarantines[0].original, schema_corrupt); + assert!(!schema_corrupt.exists()); + assert_eq!( + std::fs::read_to_string(&quarantines[0].quarantined).unwrap(), + r#"{"default_branch": 5}"#, + "quarantined file must preserve the corrupt content as evidence" + ); + } + #[test] fn quarantine_is_a_no_op_without_a_projects_dir() { let dir = tempfile::TempDir::new().unwrap(); - let mut report = HealthPassReport::default(); - quarantine_corrupt_branch_meta(dir.path(), &mut report); - assert!(report.quarantined_branch_meta.is_empty()); - assert!(report.warnings.is_empty()); + let (quarantines, warnings) = quarantine_corrupt_branch_meta(dir.path()); + assert!(quarantines.is_empty()); + assert!(warnings.is_empty()); } } diff --git a/src/migrate/registry.rs b/src/migrate/registry.rs index 2fc907eb..28b7e12c 100644 --- a/src/migrate/registry.rs +++ b/src/migrate/registry.rs @@ -130,20 +130,46 @@ pub async fn apply_registry_reconstruction_report( } } -/// Filters registry rows whose canonical project root no longer exists on -/// disk, optionally scoped to roots under `prefix`. Shared by -/// `tracedecay migrate registry-gc` and the post-update health pass so both -/// agree on what counts as a GC candidate. -pub fn stale_code_projects( - projects: Vec, - prefix: Option<&Path>, -) -> Vec { +/// How dead a registry row's project root must be before the row counts as +/// stale. This is the single definition of both GC scopes, so a reader never +/// has to reassemble the effective condition from scattered half-checks. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StaleRootScope { + /// Manual `tracedecay migrate registry-gc` scope: the canonical root is + /// gone (the user reviews candidates before applying). + CanonicalRootMissing, + /// Post-update auto-GC scope: both the canonical and display roots are + /// gone — stricter, because nobody reviews the deletion. + AllRootsMissing, +} + +/// Returns true if the project's canonical or display root still exists. +pub fn code_project_root_exists(project: &CodeProjectRecord) -> bool { + Path::new(&project.canonical_root).exists() || Path::new(&project.display_root).exists() +} + +/// Filters registry rows that are stale under `scope`, restricted to +/// canonical roots under one of `prefixes` (an empty slice means no +/// restriction). Shared by `tracedecay migrate registry-gc` and the +/// post-update health pass so both agree on what counts as a GC candidate. +pub fn stale_code_projects<'a>( + projects: &'a [CodeProjectRecord], + prefixes: &[PathBuf], + scope: StaleRootScope, +) -> Vec<&'a CodeProjectRecord> { projects - .into_iter() + .iter() .filter(|project| { - prefix.is_none_or(|prefix| Path::new(&project.canonical_root).starts_with(prefix)) + let canonical_root = Path::new(&project.canonical_root); + prefixes.is_empty() + || prefixes + .iter() + .any(|prefix| canonical_root.starts_with(prefix)) + }) + .filter(|project| match scope { + StaleRootScope::CanonicalRootMissing => !Path::new(&project.canonical_root).exists(), + StaleRootScope::AllRootsMissing => !code_project_root_exists(project), }) - .filter(|project| !Path::new(&project.canonical_root).exists()) .collect() } diff --git a/src/storage.rs b/src/storage.rs index b08f208b..fdb347ec 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -12,6 +12,9 @@ pub const ENROLLMENT_FILENAME: &str = "enrollment.json"; pub const STORE_MANIFEST_FILENAME: &str = "store_manifest.json"; pub const SESSIONS_DB_FILENAME: &str = "sessions.db"; pub const BRANCH_META_FILENAME: &str = "branch-meta.json"; +/// Filename prefix for corrupt `branch-meta.json` files renamed out of the +/// way by the post-update health pass (`branch-meta.json.corrupt-`). +pub const BRANCH_META_QUARANTINE_PREFIX: &str = "branch-meta.json.corrupt-"; pub const STORE_MANIFEST_SCHEMA_VERSION: u32 = 1; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -459,7 +462,8 @@ impl PrivateStoreIo { Self::create_dir_all(parent)?; } reject_symlink_components(path, "private store file")?; - fs::write(path, contents)?; + Self::open_private(path, fs::OpenOptions::new().write(true).truncate(true))? + .write_all(contents)?; set_private_file_permissions(path) } @@ -471,14 +475,22 @@ impl PrivateStoreIo { Self::create_dir_all(parent)?; } reject_symlink_components(path, "private store file")?; - fs::OpenOptions::new() - .create(true) - .append(true) - .open(path)? + Self::open_private(path, fs::OpenOptions::new().append(true))? .write_all(format!("{line}\n").as_bytes())?; set_private_file_permissions(path) } + /// Opens `path` for writing, creating it if missing with owner-only + /// permissions applied at create time (Unix), so a fresh file never + /// exists with umask-default permissions before the trailing + /// `set_private_file_permissions` call. Pre-existing files keep their + /// mode here and are tightened by that trailing call. + fn open_private(path: &Path, options: &mut fs::OpenOptions) -> io::Result { + options.create(true); + apply_private_create_mode(options); + options.open(path) + } + pub fn write_file_atomically(path: &Path, temp_path: &Path, contents: &[u8]) -> io::Result<()> { if path_parent(path) != path_parent(temp_path) { return Err(invalid_input( @@ -718,6 +730,16 @@ fn set_private_file_permissions(path: &Path) -> std::io::Result<()> { fs::set_permissions(path, fs::Permissions::from_mode(0o600)) } +#[cfg(unix)] +fn apply_private_create_mode(options: &mut fs::OpenOptions) { + use std::os::unix::fs::OpenOptionsExt; + + options.mode(0o600); +} + +#[cfg(not(unix))] +fn apply_private_create_mode(_options: &mut fs::OpenOptions) {} + #[cfg(not(unix))] fn set_private_file_permissions(_path: &Path) -> std::io::Result<()> { Ok(()) diff --git a/tests/update_health_pass_test.rs b/tests/update_health_pass_test.rs index d54308f1..7d3f4723 100644 --- a/tests/update_health_pass_test.rs +++ b/tests/update_health_pass_test.rs @@ -15,6 +15,7 @@ use std::time::{Duration, Instant}; use common::apply_tracedecay_home_env; use tempfile::TempDir; use tracedecay::global_db::GlobalDb; +use tracedecay::storage::BRANCH_META_QUARANTINE_PREFIX; fn canonical_temp_path(path: &Path) -> PathBuf { #[cfg(windows)] @@ -128,7 +129,7 @@ fn quarantined_branch_meta_files(shard: &Path) -> Vec { .filter(|path| { path.file_name().is_some_and(|name| { name.to_string_lossy() - .starts_with("branch-meta.json.corrupt-") + .starts_with(BRANCH_META_QUARANTINE_PREFIX) }) }) .collect() @@ -178,6 +179,43 @@ fn post_update_quarantines_corrupt_branch_meta() { ); } +#[test] +fn post_update_quarantines_schema_corrupt_branch_meta() { + let home = TempDir::new().unwrap(); + let home_root = canonical_temp_path(home.path()); + let profile_root = home_root.join(".tracedecay"); + // Valid JSON, but not a valid BranchMeta — the runtime treats any schema + // mismatch as corrupt, so the health pass must quarantine it too. + let schema_corrupt = + write_branch_meta(&profile_root, "proj_schema", r#"{"default_branch": 5}"#); + + let mut command = post_update_command(&home_root); + command.arg("post-update"); + let output = run_with_timeout(command, cli_timeout()); + + assert_success(&output, "post-update"); + assert!( + !schema_corrupt.exists(), + "schema-corrupt branch-meta.json should be quarantined away" + ); + let quarantined = quarantined_branch_meta_files(schema_corrupt.parent().unwrap()); + assert_eq!( + quarantined.len(), + 1, + "exactly one quarantine file expected, got {quarantined:?}" + ); + assert_eq!( + std::fs::read_to_string(&quarantined[0]).unwrap(), + r#"{"default_branch": 5}"#, + "quarantine must preserve the corrupt content as evidence" + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("Quarantined 1 corrupt branch metadata file(s)"), + "stderr should report the quarantine\nstderr:\n{stderr}" + ); +} + #[test] fn post_update_no_heal_skips_health_pass() { let home = TempDir::new().unwrap(); From e97e4f3b934c92368bd788e8e688685e24d42b47 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 00:59:24 +0000 Subject: [PATCH 07/11] refactor: dedup scheduler skips from a single ledger read The scheduler gate now loads the run ledger once and threads the records through the run context, so gate-level and post-gate skip dedup share that one read and append_skipped_record is a pure append-unless-repeat with no second I/O pass. Also inlines tokio::time::timeout for the daemon shutdown deadline (a panic in shutdown_all no longer reads as success) and derives the session-reflector trust-label representatives from named memory::trust constants with a drift-guard test. --- src/automation/lifecycle.rs | 212 +++++++++++++++++++++++----- src/automation/memory_curator.rs | 2 +- src/automation/runner.rs | 4 +- src/automation/session_reflector.rs | 17 ++- src/daemon.rs | 66 ++------- src/memory/trust.rs | 24 +++- 6 files changed, 227 insertions(+), 98 deletions(-) diff --git a/src/automation/lifecycle.rs b/src/automation/lifecycle.rs index a262f903..fb989a77 100644 --- a/src/automation/lifecycle.rs +++ b/src/automation/lifecycle.rs @@ -40,6 +40,11 @@ pub(crate) struct AgentTaskRunContext<'a> { config: &'a AutomationConfig, task: AgentTaskKind, started_at: String, + /// Ledger records loaded once by [`Self::gate`] on the scheduler path. + /// Both gate-level and post-gate skips compute their repeat-skip dedup + /// from these cached records, so the append path never re-reads the + /// ledger. + ledger_records: Option>, } impl<'a> AgentTaskRunContext<'a> { @@ -58,6 +63,7 @@ impl<'a> AgentTaskRunContext<'a> { config, task, started_at: current_timestamp().to_string(), + ledger_records: None, } } @@ -65,8 +71,11 @@ impl<'a> AgentTaskRunContext<'a> { &self.started_at } - pub(crate) async fn gate(&self) -> Result { - task_run_gate(self.config, &self.dashboard_root, self.task, self.trigger).await + pub(crate) async fn gate(&mut self) -> Result { + let (gate, records) = + task_run_gate(self.config, &self.dashboard_root, self.task, self.trigger).await?; + self.ledger_records = records; + Ok(gate) } pub(crate) async fn skipped_parts( @@ -85,10 +94,23 @@ impl<'a> AgentTaskRunContext<'a> { reason, self.started_at(), report_task_key, + self.scheduler_skip_is_repeat(reason), ) .await } + /// Computes the repeat-skip dedup decision from the records cached by + /// [`Self::gate`], with no ledger I/O. A scheduler-trigger context whose + /// gate has not run yet has no cached records and conservatively persists + /// the skip. + fn scheduler_skip_is_repeat(&self, reason: &str) -> bool { + self.trigger == AutomationTrigger::Scheduler + && self + .ledger_records + .as_deref() + .is_some_and(|records| is_repeat_scheduler_skip(records, self.task, reason)) + } + pub(crate) fn finalizer(&self, input_hash: Option) -> AgentRunFinalizer<'_> { AgentRunFinalizer::new( &self.dashboard_root, @@ -121,17 +143,21 @@ pub(crate) fn task_skip_reason( None } +/// Evaluates the scheduler gate, returning the ledger records it loaded so +/// callers can reuse them for skip dedup instead of re-reading the ledger. +/// The ledger is read at most once per gate evaluation. pub(crate) async fn scheduler_gate( config: &AutomationConfig, dashboard_root: &Path, task: AgentTaskKind, trigger: AutomationTrigger, -) -> Result { +) -> Result<(SchedulerGate, Option>)> { if trigger != AutomationTrigger::Scheduler { - return Ok(SchedulerGate::Proceed(None)); + return Ok((SchedulerGate::Proceed(None), None)); } let now_secs = current_timestamp(); + let records = load_run_records(dashboard_root, 200).await?; let Some(lock) = AutomationTaskLock::try_acquire( dashboard_root, task, @@ -140,16 +166,15 @@ pub(crate) async fn scheduler_gate( ) .await? else { - return Ok(SchedulerGate::Skip("scheduler_lock_active")); + return Ok((SchedulerGate::Skip("scheduler_lock_active"), Some(records))); }; - let records = load_run_records(dashboard_root, 200).await?; let decision = schedule_decision(config, task, &records, now_secs); if let Some(reason) = scheduler_skip_reason(&decision, task) { - return Ok(SchedulerGate::Skip(reason)); + return Ok((SchedulerGate::Skip(reason), Some(records))); } - Ok(SchedulerGate::Proceed(Some(lock))) + Ok((SchedulerGate::Proceed(Some(lock)), Some(records))) } pub(crate) async fn task_run_gate( @@ -157,16 +182,21 @@ pub(crate) async fn task_run_gate( dashboard_root: &Path, task: AgentTaskKind, trigger: AutomationTrigger, -) -> Result { - match scheduler_gate(config, dashboard_root, task, trigger).await? { - SchedulerGate::Skip(reason) => Ok(SchedulerGate::Skip(reason)), +) -> Result<(SchedulerGate, Option>)> { + let (gate, records) = scheduler_gate(config, dashboard_root, task, trigger).await?; + let gate = match gate { + SchedulerGate::Skip(reason) => SchedulerGate::Skip(reason), SchedulerGate::Proceed(lock) => match task_skip_reason(config, task) { - Some(reason) => Ok(SchedulerGate::Skip(reason)), - None => Ok(SchedulerGate::Proceed(lock)), + Some(reason) => SchedulerGate::Skip(reason), + None => SchedulerGate::Proceed(lock), }, - } + }; + Ok((gate, records)) } +/// Appends a skipped run record unless the caller already determined it is a +/// repeat scheduler skip. Performs no ledger reads: `is_repeat` must be +/// computed from the records the gate evaluation loaded. #[allow(clippy::too_many_arguments)] pub(crate) async fn append_skipped_record( dashboard_root: &Path, @@ -177,6 +207,7 @@ pub(crate) async fn append_skipped_record( evidence_hash: Option, reason: &str, started_at: &str, + is_repeat: bool, ) -> Result { let record = ledger_record( run_id, @@ -195,9 +226,11 @@ pub(crate) async fn append_skipped_record( // skip condition (interval not elapsed, task disabled, ...) would append // thousands of identical records and drown real runs out of the ledger. // Persist only the first record of each consecutive identical skip. - if trigger == AutomationTrigger::Scheduler - && is_repeat_scheduler_skip(dashboard_root, task, reason).await? - { + // + // The gate's ledger read and this append are not atomic: two concurrent + // writers can both observe no prior skip and each append the "first" + // record. The duplicate is benign, so no cross-process locking is done. + if trigger == AutomationTrigger::Scheduler && is_repeat { return Ok(record); } append_run_record(dashboard_root, &record).await?; @@ -206,20 +239,22 @@ pub(crate) async fn append_skipped_record( /// True when the most recent ledger record for `task` is already a scheduler /// skip with the same reason. -async fn is_repeat_scheduler_skip( - dashboard_root: &Path, +/// +/// The skip reason is read out of `record.error`, inheriting the pre-existing +/// modeling wart that skipped runs store their reason in the error field. +fn is_repeat_scheduler_skip( + records: &[AutomationRunLedgerRecord], task: AgentTaskKind, reason: &str, -) -> Result { - let records = load_run_records(dashboard_root, 200).await?; - Ok(records +) -> bool { + records .iter() .find(|record| record.task == task) .is_some_and(|record| { record.trigger == AutomationTrigger::Scheduler && record.status == AutomationRunStatus::Skipped && record.error.as_deref() == Some(reason) - })) + }) } #[allow(clippy::too_many_arguments)] @@ -233,6 +268,7 @@ pub(crate) async fn skipped_run_parts( reason: &str, started_at: &str, report_task_key: Option<&'static str>, + is_repeat: bool, ) -> Result<(Value, AutomationRunLedgerRecord)> { let mut report = json!({ "status": "skipped", @@ -253,6 +289,7 @@ pub(crate) async fn skipped_run_parts( evidence_hash, reason, started_at, + is_repeat, ) .await?; Ok((report, record)) @@ -634,6 +671,7 @@ fn noop_output_for_task(task: AgentTaskKind) -> Value { #[cfg(test)] #[allow(clippy::unwrap_used, clippy::expect_used)] mod tests { + use super::super::config::{AutomationTaskConfig, AutomationTaskSet}; use super::*; #[test] @@ -644,6 +682,9 @@ mod tests { assert_ne!(first, second); } + /// Runs the production skip path: evaluate the gate (which caches ledger + /// records for scheduler triggers) and then record the skip, exactly as a + /// gate-level skip does in the task runners. async fn append_skip( dashboard_root: &Path, run_id: &str, @@ -652,18 +693,20 @@ mod tests { reason: &str, ) -> AutomationRunLedgerRecord { let config = AutomationConfig::default(); - append_skipped_record( - dashboard_root, - run_id, + let mut run = AgentTaskRunContext::new( + dashboard_root.to_path_buf(), + Some(run_id.to_string()), + "test", trigger, &config, task, - None, - reason, - "1000", - ) - .await - .expect("append skipped record") + ); + run.gate().await.expect("gate"); + let (_report, record) = run + .skipped_parts(None, reason, None) + .await + .expect("append skipped record"); + record } #[tokio::test] @@ -758,4 +801,109 @@ mod tests { let records = load_run_records(root, 50).await.expect("load records"); assert_eq!(records.len(), 2, "manual skips must always be recorded"); } + + fn scheduler_enabled_config() -> AutomationConfig { + AutomationConfig { + enabled: true, + backend: AutomationBackend::CodexAppServer, + host_mode: AutomationHostMode::Standalone, + tasks: AutomationTaskSet { + memory_curator: AutomationTaskConfig { + enabled: true, + schedule: Some("hourly".to_string()), + ..AutomationTaskConfig::default() + }, + ..AutomationTaskSet::default() + }, + ..AutomationConfig::default() + } + } + + /// Runs the production post-gate skip path: the gate proceeds (caching + /// ledger records), and the task body later decides to skip. + async fn post_gate_scheduler_skip(dashboard_root: &Path, run_id: &str, reason: &str) { + let config = scheduler_enabled_config(); + let mut run = AgentTaskRunContext::new( + dashboard_root.to_path_buf(), + Some(run_id.to_string()), + "test", + AutomationTrigger::Scheduler, + &config, + AgentTaskKind::MemoryCurator, + ); + let SchedulerGate::Proceed(lock) = run.gate().await.expect("gate") else { + panic!("gate must proceed so the skip is decided post-gate"); + }; + run.skipped_parts(None, reason, None) + .await + .expect("append post-gate skip"); + drop(lock); + } + + #[tokio::test] + async fn consecutive_identical_post_gate_scheduler_skips_persist_once() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + + post_gate_scheduler_skip(root, "run-1", "nothing_to_review").await; + post_gate_scheduler_skip(root, "run-2", "nothing_to_review").await; + + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!( + records.len(), + 1, + "repeat post-gate scheduler skip must not append a second record" + ); + assert_eq!(records[0].run_id, "run-1"); + } + + #[tokio::test] + async fn append_path_relies_solely_on_caller_computed_repeat_flag() { + let temp = tempfile::TempDir::new().expect("temp dir"); + let root = temp.path(); + let config = AutomationConfig::default(); + let task = AgentTaskKind::MemoryCurator; + + // Both identical scheduler skips persist when the caller reports + // is_repeat=false, even though the second is a repeat on disk: the + // append path must not perform its own ledger read to second-guess + // the flag computed from the gate's cached records. + for run_id in ["run-1", "run-2"] { + append_skipped_record( + root, + run_id, + AutomationTrigger::Scheduler, + &config, + task, + None, + "scheduler_interval_not_elapsed", + "1000", + false, + ) + .await + .expect("append skipped record"); + } + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!( + records.len(), + 2, + "append path must trust the caller-computed repeat flag" + ); + + append_skipped_record( + root, + "run-3", + AutomationTrigger::Scheduler, + &config, + task, + None, + "scheduler_interval_not_elapsed", + "1000", + true, + ) + .await + .expect("append skipped record"); + let records = load_run_records(root, 50).await.expect("load records"); + assert_eq!(records.len(), 2, "is_repeat=true must suppress the append"); + } } diff --git a/src/automation/memory_curator.rs b/src/automation/memory_curator.rs index bd7952a0..339459a8 100644 --- a/src/automation/memory_curator.rs +++ b/src/automation/memory_curator.rs @@ -51,7 +51,7 @@ pub async fn run_memory_curator_with_backend( backend: &dyn AgentTaskBackend, options: MemoryCuratorAutomationOptions, ) -> Result { - let run = AgentTaskRunContext::new( + let mut run = AgentTaskRunContext::new( cg.store_layout().dashboard_root.clone(), options.run_id.clone(), "memory_curator", diff --git a/src/automation/runner.rs b/src/automation/runner.rs index 1906ae5e..477a3f52 100644 --- a/src/automation/runner.rs +++ b/src/automation/runner.rs @@ -173,7 +173,7 @@ pub async fn run_session_reflector_with_backend( backend: &dyn AgentTaskBackend, options: SessionReflectorAutomationOptions, ) -> Result { - let run = AgentTaskRunContext::new( + let mut run = AgentTaskRunContext::new( cg.store_layout().dashboard_root.clone(), options.run_id.clone(), "session_reflector", @@ -429,7 +429,7 @@ pub async fn run_skill_writer_with_backend( backend: &dyn AgentTaskBackend, options: SkillWriterAutomationOptions, ) -> Result { - let run = AgentTaskRunContext::new( + let mut run = AgentTaskRunContext::new( cg.store_layout().dashboard_root.clone(), options.run_id.clone(), "skill_writer", diff --git a/src/automation/session_reflector.rs b/src/automation/session_reflector.rs index faf6b2a9..e6fc0374 100644 --- a/src/automation/session_reflector.rs +++ b/src/automation/session_reflector.rs @@ -4,6 +4,7 @@ use serde_json::{json, Value}; use crate::errors::Result; use crate::memory::retrieval::FactRetriever; +use crate::memory::trust::{DEFAULT_TRUST, HIGH_TRUST_REPRESENTATIVE, LOW_TRUST_REPRESENTATIVE}; use crate::memory::types::{AddFactRequest, MemoryCategory}; use crate::tracedecay::TraceDecay; @@ -233,16 +234,22 @@ async fn validate_fact_proposal( /// Accepts numeric trust in `[0, 1]` plus the `low`/`medium`/`high` bucket /// labels models frequently emit despite the numeric prompt instruction. -/// Buckets map to representative scores inside the matching -/// [`crate::memory::trust::trust_bucket`] range. +/// Buckets map to the representative scores defined next to +/// [`crate::memory::trust::trust_bucket`], so they cannot drift out of their +/// documented ranges. +/// +/// Deliberate decision: the prompt forbids string labels, but they are +/// accepted defensively rather than rejecting an otherwise valid fact. Note +/// that the "high" representative can clear auto-apply thresholds when +/// `auto_apply_memory_ops` is enabled — this is intentional. fn proposal_trust_value(value: &Value) -> Option { if let Some(trust) = value.as_f64() { return (0.0..=1.0).contains(&trust).then_some(trust); } match value.as_str()?.trim().to_ascii_lowercase().as_str() { - "low" => Some(0.15), - "medium" => Some(0.5), - "high" => Some(0.85), + "low" => Some(LOW_TRUST_REPRESENTATIVE), + "medium" => Some(DEFAULT_TRUST), + "high" => Some(HIGH_TRUST_REPRESENTATIVE), _ => None, } } diff --git a/src/daemon.rs b/src/daemon.rs index 4fd82634..739d15c2 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -2,8 +2,6 @@ use std::collections::HashMap; use std::fmt::Write; #[cfg(unix)] -use std::future::Future; -#[cfg(unix)] use std::os::unix::fs::PermissionsExt; use std::path::{Path, PathBuf}; #[cfg(unix)] @@ -794,11 +792,15 @@ async fn run_foreground_unix(socket_path: PathBuf) -> Result<()> { "daemon_shutdown", &[("socket", socket_path.display().to_string())], ); - let completed = await_shutdown_within_deadline( - async move { engine.shutdown_all().await }, - DAEMON_SHUTDOWN_DEADLINE, - ) - .await; + // Graceful shutdown persists tokens-saved counters and checkpoints WALs + // for every live project server sequentially; with many servers or large + // WALs that can exceed systemd's stop timeout, which then sends `SIGKILL` + // to the daemon. On timeout the shutdown future is dropped and we proceed + // to exit: the remaining persistence is best-effort and the database WAL + // keeps state crash-safe. + let completed = timeout(DAEMON_SHUTDOWN_DEADLINE, engine.shutdown_all()) + .await + .is_ok(); if !completed { log_daemon_event( "daemon_shutdown", @@ -815,29 +817,6 @@ async fn run_foreground_unix(socket_path: PathBuf) -> Result<()> { Ok(()) } -/// Runs the shutdown future on its own task and waits at most `deadline` for -/// it to finish, returning `true` when it completed in time. -/// -/// Graceful shutdown persists tokens-saved counters and checkpoints WALs for -/// every live project server sequentially; with many servers or large WALs -/// that can exceed systemd's stop timeout, which then sends `SIGKILL` to the -/// daemon. On timeout the task is aborted and we proceed to exit: the -/// remaining persistence is best-effort and the database WAL keeps state -/// crash-safe. -#[cfg(unix)] -async fn await_shutdown_within_deadline(shutdown: F, deadline: Duration) -> bool -where - F: Future + Send + 'static, -{ - let mut handle = tokio::spawn(shutdown); - if timeout(deadline, &mut handle).await.is_ok() { - true - } else { - handle.abort(); - false - } -} - #[cfg(unix)] fn set_owner_only_permissions(path: &Path, mode: u32) -> Result<()> { let permissions = std::fs::Permissions::from_mode(mode); @@ -2346,31 +2325,4 @@ mod tests { .expect("server B task") .expect("server B result"); } - - #[cfg(unix)] - #[tokio::test] - async fn shutdown_deadline_reports_completion_for_fast_shutdown() { - let completed = super::await_shutdown_within_deadline( - std::future::ready(()), - std::time::Duration::from_secs(5), - ) - .await; - assert!(completed, "instant shutdown should finish within deadline"); - } - - #[cfg(unix)] - #[tokio::test] - async fn shutdown_deadline_aborts_stalled_shutdown() { - let started = std::time::Instant::now(); - let completed = super::await_shutdown_within_deadline( - std::future::pending(), - std::time::Duration::from_millis(50), - ) - .await; - assert!(!completed, "stalled shutdown must report a timeout"); - assert!( - started.elapsed() < std::time::Duration::from_secs(5), - "timeout must return promptly instead of waiting on the stalled task" - ); - } } diff --git a/src/memory/trust.rs b/src/memory/trust.rs index 0a9edca7..f5e35730 100644 --- a/src/memory/trust.rs +++ b/src/memory/trust.rs @@ -8,6 +8,14 @@ pub const TRUST_MIN: f64 = 0.0; pub const TRUST_MAX: f64 = 1.0; pub const DEFAULT_TRUST: f64 = 0.5; pub const DEFAULT_MIN_TRUST: f64 = 0.3; +/// Lower bound of the "high" bucket in [`trust_bucket`]; scores in +/// `[DEFAULT_MIN_TRUST, HIGH_TRUST_THRESHOLD)` are "medium". +pub const HIGH_TRUST_THRESHOLD: f64 = 0.75; +/// Representative score for a "low" trust label, inside the low bucket. +pub const LOW_TRUST_REPRESENTATIVE: f64 = 0.15; +/// Representative score for a "high" trust label, inside the high bucket. +/// `DEFAULT_TRUST` is the representative for "medium". +pub const HIGH_TRUST_REPRESENTATIVE: f64 = 0.85; pub fn clamp_trust(score: f64) -> f64 { score.clamp(TRUST_MIN, TRUST_MAX) @@ -26,7 +34,7 @@ pub fn trust_bucket(score: f64) -> &'static str { let clamped = clamp_trust(score); if clamped < DEFAULT_MIN_TRUST { "low" - } else if clamped < 0.75 { + } else if clamped < HIGH_TRUST_THRESHOLD { "medium" } else { "high" @@ -42,3 +50,17 @@ pub fn trust_distribution(scores: &[f64]) -> (usize, usize, usize) { } }) } + +#[cfg(test)] +mod tests { + use super::*; + + /// Guards the label representative scores against bucket-boundary drift: + /// each representative must map back onto its own bucket. + #[test] + fn label_representatives_map_onto_their_buckets() { + assert_eq!(trust_bucket(LOW_TRUST_REPRESENTATIVE), "low"); + assert_eq!(trust_bucket(DEFAULT_TRUST), "medium"); + assert_eq!(trust_bucket(HIGH_TRUST_REPRESENTATIVE), "high"); + } +} From 7bae088fb0e9b8618c8c23d70f9d3f6bf8a36c3f Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 00:59:24 +0000 Subject: [PATCH 08/11] refactor: extract update flow from main.rs into update_cmd module Moves the update/post-update wiring (plugin refresh, daemon refresh, subprocess re-exec, health pass) into src/update_cmd.rs following the *_cmd convention, bringing main.rs to 871 lines. Also promotes the branch-DB tests' IsolatedEnv into tests/common as the canonical env-isolation helper. --- src/automation_cli.rs | 3 +- src/main.rs | 149 +------------------------------- src/startup_tests.rs | 6 +- src/update_cmd.rs | 150 +++++++++++++++++++++++++++++++++ tests/branch_db_safety_test.rs | 45 +--------- tests/common/mod.rs | 61 ++++++++++++++ 6 files changed, 222 insertions(+), 192 deletions(-) create mode 100644 src/update_cmd.rs diff --git a/src/automation_cli.rs b/src/automation_cli.rs index 2c1f3265..e09aa3f2 100644 --- a/src/automation_cli.rs +++ b/src/automation_cli.rs @@ -1,5 +1,6 @@ use crate::cli::*; -use crate::{parse_lcm_scope_arg, resolve_cli_project_root, tracedecay_bin_on_path}; +use crate::update_cmd::tracedecay_bin_on_path; +use crate::{parse_lcm_scope_arg, resolve_cli_project_root}; pub(crate) async fn handle_automation_command( action: AutomationAction, diff --git a/src/main.rs b/src/main.rs index be98b186..dc5ec35c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,6 +17,7 @@ mod project_cmd; mod sessions_cmd; mod status_cmd; mod tool_command; +mod update_cmd; pub use tracedecay::serve; @@ -352,148 +353,6 @@ fn maybe_run_silent_reinstall(user_config: &mut tracedecay::user_config::UserCon } } -fn refresh_generated_plugins() -> tracedecay::errors::Result<()> { - let home = tracedecay_home_dir()?; - let tracedecay_bin = tracedecay_bin_on_path()?; - eprintln!("Refreshing tracedecay-generated plugin artifacts (agent configs are not touched)"); - - // Detection-driven, not `installed_agents`-driven: each integration - // decides whether generated artifacts exist on this machine, so stale - // tracking state can neither skip a real install nor install anywhere new. - let mut refreshed_any = false; - let mut config_only_installed: Vec<&'static str> = Vec::new(); - let mut failures: Vec = Vec::new(); - for ag in tracedecay::agents::all_integrations() { - let ctx = tracedecay::agents::InstallContext { - home: home.clone(), - tracedecay_bin: tracedecay_bin.clone(), - tool_permissions: tracedecay::agents::expected_tool_perms(), - profile: None, - project_root: None, - dashboard: true, - }; - match ag.update_plugin(&ctx) { - Ok(tracedecay::agents::UpdatePluginOutcome::Refreshed(paths)) => { - refreshed_any = true; - for path in paths { - eprintln!( - " \x1b[32m✔\x1b[0m {}: refreshed {}", - ag.id(), - path.display() - ); - } - } - Ok(tracedecay::agents::UpdatePluginOutcome::NotInstalled) => {} - Ok(tracedecay::agents::UpdatePluginOutcome::ConfigOnly) => { - if ag.has_tracedecay(&home) { - config_only_installed.push(ag.id()); - } - } - Err(e) => failures.push(format!("{}: {e}", ag.id())), - } - } - if !config_only_installed.is_empty() { - eprintln!( - " Config-managed integrations left untouched: {} (run `tracedecay reinstall` to refresh their config entries)", - config_only_installed.join(", ") - ); - } - if !refreshed_any { - eprintln!("No generated plugin installs detected — nothing to update."); - } - if !failures.is_empty() { - return Err(tracedecay::errors::TraceDecayError::Config { - message: format!("update-plugin failed for {}", failures.join("; ")), - }); - } - - Ok(()) -} - -fn refresh_daemon_service() -> tracedecay::errors::Result<()> { - let tracedecay_bin = tracedecay_bin_on_path()?; - let spec = tracedecay::daemon::service_spec(tracedecay_bin, None)?; - let socket_path = tracedecay::daemon::installed_service_socket_path()? - .unwrap_or_else(|| spec.socket_path.clone()); - match tracedecay::daemon::refresh_installed_service(&spec)? { - Some(service_path) => { - eprintln!( - "\x1b[32m✔\x1b[0m Daemon service refreshed at {}", - service_path.display() - ); - eprintln!("Daemon socket: {}", socket_path.display()); - } - None => { - eprintln!("TraceDecay daemon service is not installed; skipping daemon restart."); - } - } - Ok(()) -} - -fn tracedecay_home_dir() -> tracedecay::errors::Result { - tracedecay::agents::home_dir().ok_or_else(|| tracedecay::errors::TraceDecayError::Config { - message: "could not determine home directory".to_string(), - }) -} - -pub(crate) fn tracedecay_bin_on_path() -> tracedecay::errors::Result { - tracedecay::agents::which_tracedecay().ok_or_else(|| { - tracedecay::errors::TraceDecayError::Config { - message: "tracedecay not found on PATH".to_string(), - } - }) -} - -fn run_update_steps(mut upgrade: U, mut post_update: P) -> tracedecay::errors::Result<()> -where - U: FnMut() -> tracedecay::errors::Result<()>, - P: FnMut() -> tracedecay::errors::Result<()>, -{ - upgrade()?; - post_update()?; - Ok(()) -} - -fn run_update_command(no_heal: bool) -> tracedecay::errors::Result<()> { - run_update_steps( - || tracedecay::upgrade::run_upgrade().map(|_| ()), - || run_post_update_subcommand(no_heal), - ) -} - -fn run_post_update_subcommand(no_heal: bool) -> tracedecay::errors::Result<()> { - let tracedecay_bin = tracedecay_bin_on_path()?; - let mut command = std::process::Command::new(&tracedecay_bin); - command.arg("post-update"); - if no_heal { - command.arg("--no-heal"); - } - let status = command - .status() - .map_err(|e| tracedecay::errors::TraceDecayError::Config { - message: format!("failed to run post-update with '{tracedecay_bin}': {e}"), - })?; - if status.success() { - return Ok(()); - } - Err(tracedecay::errors::TraceDecayError::Config { - message: format!("post-update failed with status: {status}"), - }) -} - -async fn run_post_update_tasks(no_heal: bool) -> tracedecay::errors::Result<()> { - refresh_generated_plugins()?; - if let Err(error) = refresh_daemon_service() { - eprintln!(" \x1b[33mwarning:\x1b[0m daemon service refresh failed: {error}"); - } - if no_heal { - eprintln!("Skipping post-update health pass (--no-heal)."); - } else { - tracedecay::doctor::heal::run_post_update_health_pass().await; - } - Ok(()) -} - async fn resolve_registered_project_root( project_id: Option, project_path: Option, @@ -631,7 +490,7 @@ async fn dispatch_command(command: Commands) -> tracedecay::errors::Result<()> { agent_cmd::handle_reinstall_command().await?; } Commands::UpdatePlugin => { - refresh_generated_plugins()?; + update_cmd::refresh_generated_plugins()?; } Commands::Uninstall { agent, @@ -784,10 +643,10 @@ async fn dispatch_command(command: Commands) -> tracedecay::errors::Result<()> { tracedecay::upgrade::run_upgrade()?; } Commands::Update { no_heal } => { - run_update_command(no_heal)?; + update_cmd::run_update_command(no_heal)?; } Commands::PostUpdate { no_heal } => { - run_post_update_tasks(no_heal).await?; + update_cmd::run_post_update_tasks(no_heal).await?; } Commands::Channel { channel } => match channel { Some(target) => { diff --git a/src/startup_tests.rs b/src/startup_tests.rs index e4b21df4..556a626c 100644 --- a/src/startup_tests.rs +++ b/src/startup_tests.rs @@ -3,8 +3,10 @@ use super::{ hermes_profile_targets, hermes_selected_profile_targets, validate_hermes_profile_flags, validate_hermes_project_root_flag, }, - is_local_install_command, run_update_steps, should_skip_agent_install_maintenance, - should_skip_startup_maintenance, Commands, + is_local_install_command, should_skip_agent_install_maintenance, + should_skip_startup_maintenance, + update_cmd::run_update_steps, + Commands, }; use std::cell::RefCell; use tempfile::TempDir; diff --git a/src/update_cmd.rs b/src/update_cmd.rs new file mode 100644 index 00000000..0ccd74aa --- /dev/null +++ b/src/update_cmd.rs @@ -0,0 +1,150 @@ +//! The `update` / `post-update` / `update-plugin` flow: binary upgrade via +//! subprocess re-exec, generated-plugin refresh, daemon service refresh, and +//! the post-update health pass. + +use std::path::PathBuf; + +pub(crate) fn refresh_generated_plugins() -> tracedecay::errors::Result<()> { + let home = tracedecay_home_dir()?; + let tracedecay_bin = tracedecay_bin_on_path()?; + eprintln!("Refreshing tracedecay-generated plugin artifacts (agent configs are not touched)"); + + // Detection-driven, not `installed_agents`-driven: each integration + // decides whether generated artifacts exist on this machine, so stale + // tracking state can neither skip a real install nor install anywhere new. + let mut refreshed_any = false; + let mut config_only_installed: Vec<&'static str> = Vec::new(); + let mut failures: Vec = Vec::new(); + for ag in tracedecay::agents::all_integrations() { + let ctx = tracedecay::agents::InstallContext { + home: home.clone(), + tracedecay_bin: tracedecay_bin.clone(), + tool_permissions: tracedecay::agents::expected_tool_perms(), + profile: None, + project_root: None, + dashboard: true, + }; + match ag.update_plugin(&ctx) { + Ok(tracedecay::agents::UpdatePluginOutcome::Refreshed(paths)) => { + refreshed_any = true; + for path in paths { + eprintln!( + " \x1b[32m✔\x1b[0m {}: refreshed {}", + ag.id(), + path.display() + ); + } + } + Ok(tracedecay::agents::UpdatePluginOutcome::NotInstalled) => {} + Ok(tracedecay::agents::UpdatePluginOutcome::ConfigOnly) => { + if ag.has_tracedecay(&home) { + config_only_installed.push(ag.id()); + } + } + Err(e) => failures.push(format!("{}: {e}", ag.id())), + } + } + if !config_only_installed.is_empty() { + eprintln!( + " Config-managed integrations left untouched: {} (run `tracedecay reinstall` to refresh their config entries)", + config_only_installed.join(", ") + ); + } + if !refreshed_any { + eprintln!("No generated plugin installs detected — nothing to update."); + } + if !failures.is_empty() { + return Err(tracedecay::errors::TraceDecayError::Config { + message: format!("update-plugin failed for {}", failures.join("; ")), + }); + } + + Ok(()) +} + +fn refresh_daemon_service() -> tracedecay::errors::Result<()> { + let tracedecay_bin = tracedecay_bin_on_path()?; + let spec = tracedecay::daemon::service_spec(tracedecay_bin, None)?; + let socket_path = tracedecay::daemon::installed_service_socket_path()? + .unwrap_or_else(|| spec.socket_path.clone()); + match tracedecay::daemon::refresh_installed_service(&spec)? { + Some(service_path) => { + eprintln!( + "\x1b[32m✔\x1b[0m Daemon service refreshed at {}", + service_path.display() + ); + eprintln!("Daemon socket: {}", socket_path.display()); + } + None => { + eprintln!("TraceDecay daemon service is not installed; skipping daemon restart."); + } + } + Ok(()) +} + +fn tracedecay_home_dir() -> tracedecay::errors::Result { + tracedecay::agents::home_dir().ok_or_else(|| tracedecay::errors::TraceDecayError::Config { + message: "could not determine home directory".to_string(), + }) +} + +pub(crate) fn tracedecay_bin_on_path() -> tracedecay::errors::Result { + tracedecay::agents::which_tracedecay().ok_or_else(|| { + tracedecay::errors::TraceDecayError::Config { + message: "tracedecay not found on PATH".to_string(), + } + }) +} + +pub(crate) fn run_update_steps( + mut upgrade: U, + mut post_update: P, +) -> tracedecay::errors::Result<()> +where + U: FnMut() -> tracedecay::errors::Result<()>, + P: FnMut() -> tracedecay::errors::Result<()>, +{ + upgrade()?; + post_update()?; + Ok(()) +} + +pub(crate) fn run_update_command(no_heal: bool) -> tracedecay::errors::Result<()> { + run_update_steps( + || tracedecay::upgrade::run_upgrade().map(|_| ()), + || run_post_update_subcommand(no_heal), + ) +} + +fn run_post_update_subcommand(no_heal: bool) -> tracedecay::errors::Result<()> { + let tracedecay_bin = tracedecay_bin_on_path()?; + let mut command = std::process::Command::new(&tracedecay_bin); + command.arg("post-update"); + if no_heal { + command.arg("--no-heal"); + } + let status = command + .status() + .map_err(|e| tracedecay::errors::TraceDecayError::Config { + message: format!("failed to run post-update with '{tracedecay_bin}': {e}"), + })?; + if status.success() { + return Ok(()); + } + Err(tracedecay::errors::TraceDecayError::Config { + message: format!("post-update failed with status: {status}"), + }) +} + +pub(crate) async fn run_post_update_tasks(no_heal: bool) -> tracedecay::errors::Result<()> { + refresh_generated_plugins()?; + if let Err(error) = refresh_daemon_service() { + eprintln!(" \x1b[33mwarning:\x1b[0m daemon service refresh failed: {error}"); + } + if no_heal { + eprintln!("Skipping post-update health pass (--no-heal)."); + } else { + tracedecay::doctor::heal::run_post_update_health_pass().await; + } + Ok(()) +} diff --git a/tests/branch_db_safety_test.rs b/tests/branch_db_safety_test.rs index 4ebc7f5b..abaab130 100644 --- a/tests/branch_db_safety_test.rs +++ b/tests/branch_db_safety_test.rs @@ -5,55 +5,12 @@ use std::path::Path; use std::path::PathBuf; use std::process::Command; -use common::TraceDecayStorageEnvGuard; -use tempfile::TempDir; +use common::IsolatedEnv; use tracedecay::branch::BranchAddOutcome; use tracedecay::branch_meta::load_branch_meta; use tracedecay::storage::resolve_layout_for_current_profile; use tracedecay::tracedecay::TraceDecay; -/// Serializes the tests in this binary: storage isolation swaps process-wide -/// env vars (`HOME`, `TRACEDECAY_DATA_DIR`, ...), so tests must not overlap. -static ENV_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(()); - -/// Keeps every test's project registration, store manifests, and -/// branch-meta writes inside a throwaway home instead of the developer's -/// real `~/.tracedecay` profile store. -struct IsolatedEnv { - _env_lock: tokio::sync::MutexGuard<'static, ()>, - storage: TraceDecayStorageEnvGuard, - _dir: TempDir, -} - -impl IsolatedEnv { - fn build(env_lock: tokio::sync::MutexGuard<'static, ()>) -> (Self, PathBuf) { - let dir = TempDir::new().unwrap(); - let storage = TraceDecayStorageEnvGuard::for_tempdir(&dir); - let project = dir.path().join("project"); - fs::create_dir_all(&project).unwrap(); - ( - Self { - _env_lock: env_lock, - storage, - _dir: dir, - }, - project, - ) - } - - async fn acquire() -> (Self, PathBuf) { - Self::build(ENV_LOCK.lock().await) - } - - fn acquire_blocking() -> (Self, PathBuf) { - Self::build(ENV_LOCK.blocking_lock()) - } - - fn home(&self) -> &Path { - self.storage.home() - } -} - fn git(project: &Path, args: &[&str]) { let output = Command::new("git") .args(["-c", "core.hooksPath=.git/no-hooks"]) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index d3ece00b..7b07ea60 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -59,8 +59,69 @@ impl Drop for EnvVarGuard { pub const GLOBAL_DB_ENV: &str = "TRACEDECAY_GLOBAL_DB"; /// Serializes tests within one binary that mutate process-wide env vars. +/// +/// Prefer [`IsolatedEnv`], which bundles this serialization with a throwaway +/// home and [`TraceDecayStorageEnvGuard`]; reach for this raw lock only when +/// a test needs finer-grained control over which env vars it swaps. pub static GLOBAL_DB_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); +/// Serializes [`IsolatedEnv`] users within one test binary: storage isolation +/// swaps process-wide env vars (`HOME`, `TRACEDECAY_DATA_DIR`, ...), so tests +/// must not overlap. +static ISOLATED_ENV_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(()); + +/// The canonical way to isolate env-mutating tests: serializes tests within +/// one binary and keeps every test's project registration, store manifests, +/// and branch-meta writes inside a throwaway home instead of the developer's +/// real `~/.tracedecay` profile store. +/// +/// Construct via [`IsolatedEnv::acquire`] (async tests) or +/// [`IsolatedEnv::acquire_blocking`] (sync tests); both return the guard plus +/// a ready-made `project` directory inside the temp home. +pub struct IsolatedEnv { + _env_lock: tokio::sync::MutexGuard<'static, ()>, + storage: TraceDecayStorageEnvGuard, + _dir: TempDir, +} + +impl IsolatedEnv { + fn build(env_lock: tokio::sync::MutexGuard<'static, ()>) -> (Self, PathBuf) { + let dir = tempdir_or_panic(); + let storage = TraceDecayStorageEnvGuard::for_tempdir(&dir); + let project = dir.path().join("project"); + fs::create_dir_all(&project).unwrap_or_else(|err| { + panic!( + "failed to create isolated project directory '{}': {err}", + project.display() + ) + }); + ( + Self { + _env_lock: env_lock, + storage, + _dir: dir, + }, + project, + ) + } + + pub async fn acquire() -> (Self, PathBuf) { + Self::build(ISOLATED_ENV_LOCK.lock().await) + } + + /// Sync counterpart of [`IsolatedEnv::acquire`] for plain `#[test]` fns. + /// + /// Warning: this uses `blocking_lock`, which panics if called from within + /// an async context — use [`IsolatedEnv::acquire`] there instead. + pub fn acquire_blocking() -> (Self, PathBuf) { + Self::build(ISOLATED_ENV_LOCK.blocking_lock()) + } + + pub fn home(&self) -> &Path { + self.storage.home() + } +} + /// Sets [`GLOBAL_DB_ENV`] to a test DB path for the guard's lifetime. pub struct GlobalDbEnvGuard { _env_guard: EnvVarGuard, From 822d921af9c8d315de3e716e9f6031708cbfec0f Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 01:25:45 +0000 Subject: [PATCH 09/11] fix: hold IsolatedEnv lock until storage guard restores env --- tests/common/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 7b07ea60..42ef34b9 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -79,9 +79,12 @@ static ISOLATED_ENV_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new /// [`IsolatedEnv::acquire_blocking`] (sync tests); both return the guard plus /// a ready-made `project` directory inside the temp home. pub struct IsolatedEnv { - _env_lock: tokio::sync::MutexGuard<'static, ()>, + // Field order matters: fields drop in declaration order, so the lock must + // be declared last. Dropping it first would let the next waiting test + // install its own isolated env, only for `storage`'s restore to clobber it. storage: TraceDecayStorageEnvGuard, _dir: TempDir, + _env_lock: tokio::sync::MutexGuard<'static, ()>, } impl IsolatedEnv { @@ -97,9 +100,9 @@ impl IsolatedEnv { }); ( Self { - _env_lock: env_lock, storage, _dir: dir, + _env_lock: env_lock, }, project, ) From 967379342963d37a45992c6a458b145097f62283 Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 01:28:17 +0000 Subject: [PATCH 10/11] ci: baseline over-length subject of published commit deb256e --- .github/conventional-commit-baseline.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/conventional-commit-baseline.txt b/.github/conventional-commit-baseline.txt index f288cbe2..d2882890 100644 --- a/.github/conventional-commit-baseline.txt +++ b/.github/conventional-commit-baseline.txt @@ -4,3 +4,4 @@ 035bb71cd9059b571e18bbd69059e6dbd80f2b80 # PR #100 split/profile-branch-foundation c4d4ce4c427988701cde38915f4f3a5b5fa7d52d # PR #101 split/mcp-tool-surface 5b4eeb1b9d279bfe64f289a80292bfd83fd852a1 # PR #102 split/daemon-transport +deb256e4276da2c7be8729a4db767e56ebffd4ca # PR #178 subject 86 chars; published before lint caught it, unrewritable without force-push From 2bcd343c87490ebe81d70ebe2806fc62dd87969f Mon Sep 17 00:00:00 2001 From: ScriptedAlchemy Date: Thu, 2 Jul 2026 01:38:40 +0000 Subject: [PATCH 11/11] chore: retrigger CI after dropped workflow events