From c0cb2d53be2c4b0bc6276f754aac739b54af5f53 Mon Sep 17 00:00:00 2001
From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com>
Date: Tue, 26 May 2026 11:55:50 -0400
Subject: [PATCH 1/2] Port Lite Stage 1+3 analysis detectors to Dashboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR (a) of the Dashboard port — engine seams + the two new fact paths that
do not depend on the scheduler/notification layer. Stage 2 (scheduled
analysis + finding notifications) drops on top of this as PR (b).

Lite-side workstreams ported here:

Stage 1 — Parameter sensitivity and plan regression:
- PARAMETER_SENSITIVITY: detects a single cached plan whose per-execution
  worker time varies wildly (classic parameter sniffing). Sourced from
  collect.query_stats; magnitude-driven scoring so a lone catastrophic
  offender still scores high. Drill-down surfaces the top five plans.
- PLAN_REGRESSION: detects a query whose currently-active plan has
  per-execution cost >= 2x the best plan that query previously used.
  Sourced from collect.query_store_data. Uses a 14-day comparison window
  (independent of the standard analysis window) so the days-old "best plan"
  baseline is present.

Stage 3 — Blocking chains and compile-wait scoring:
- BlockingChainReconstructor (new, pure static) walks blocked-process pairs
  into chains using composite (spid, last_tran_started) session identity so
  a reused SPID with a different transaction start cannot fabricate a chain.
  The 1900-01-01 "no transaction" sentinel is normalized to NULL.
- CollectBlockingChainFactsAsync re-parses blocked_process_report_xml at
  analysis time (no SQL-side typed columns yet) and emits one aggregate
  BLOCKING_CHAIN fact carrying the worst chain's apex, depth, and victim
  count — structure BLOCKING_EVENTS (a rate) is blind to.
- Scoring is magnitude-driven: Max(depth, victims) so one severe dimension
  scores high without being diluted. Amplifiers cover a sleeping apex,
  deadlocks present, and THREADPOOL. Forward and reverse edges connect
  BLOCKING_CHAIN to LCK / THREADPOOL / DEADLOCKS / BLOCKING_EVENTS.
- RESOURCE_SEMAPHORE_QUERY_COMPILE wait fact: already collected but
  unscored. Added a ramped (0.01, 0.10) threshold, compile-specific
  amplifiers (CPU and scheduler signals, not the runtime-grant amplifiers
  that would mislead), and edges to SOS_SCHEDULER_YIELD and CPU_SQL_PERCENT.

Cross-cutting Dashboard fixes:
- Ported Lite's GetDeterministicHashCode (FNV-1a) into a new ServerIdHelper.
  Swapped all six existing ServerName.GetHashCode() call sites in
  McpAnalysisTools to use it — string.GetHashCode() is randomized per
  process on .NET 10, so persisted config.analysis_findings.server_id and
  config.analysis_muted rows did not match the next launch's value for the
  same server name. New writes are stable; existing rows are a one-time
  reset (they were already broken across restart with the random hash).
- Added RootFactMetadata to both AnalysisStory and AnalysisFinding
  (ephemeral, not persisted). Populated in InferenceEngine.BuildStory and
  carried into SaveFindingsAsync — used by Stage 2's finding message
  formatter (PR (b)).
- Ported FormatBaselineContext and DayNames onto ToolRecommendations.
  Currently no caller; staged for the notification path in PR (b).

T-SQL adaptations from Lite's DuckDB queries verified against MS Learn:
- DECOMPRESS + LEFT(CAST(... AS NVARCHAR(MAX)), N) for query_text /
  query_sql_text (page-compressed varbinary(max)).
- WITH ... ROW_NUMBER() OVER ... WHERE rn = 1 in place of QUALIFY.
- OFFSET 0 ROWS FETCH NEXT N ROWS ONLY in place of LIMIT.
- (SELECT MAX(v) FROM (VALUES (a), (b)) AS x(v)) in place of GREATEST
  (SQL Server has no GREATEST before 2022).
- MAX(CAST(is_forced_plan AS tinyint)) in place of bool_or (bit type is
  invalid for MAX in T-SQL).
- query_plan_hash moved into GROUP BY rather than aggregated (MS Learn's
  MAX docs do not list binary types; using a documented construct).
- query_sql_text fetched via OUTER APPLY rather than carried through
  aggregations (MAX is invalid on varbinary(max)).
- TOP (5000) ORDER BY collection_time DESC over
  collect.blocking_BlockedProcessReport — backward CIX scan, sort-free
  (clustered index leads on collection_time).
- activity = 'blocked' (varchar literal, no N prefix) preserves the
  sargable predicate (an N-prefixed literal would force an nvarchar
  promotion of the column and break any future index seek).

Drift mitigation: BlockingChainReconstructor.cs is a verbatim copy of the
Lite file (29 diff lines, all in the header). A user-level
blocking-reconstructor-sync-checker agent at C:\Users\edarl\.claude\agents\
flags any drift between the two copies, mirroring the existing
planalyzer-sync-checker.

Verification:
- dotnet build Dashboard/Dashboard.csproj -c Debug: clean (0 warnings,
  0 errors).
- dotnet test Lite.Tests/Lite.Tests.csproj: 260/260 pass (Lite untouched).
- Reconstructor body parity: in sync with the Lite worktree.
- T-SQL verification via docs-first-verifier against MS Learn.
- C# correctness pass via code-reviewer: no issues.

Designed in a plan file and revised across three adversarial review rounds.
Round 1 caught the FormatBaselineContext/RootFactMetadata gaps; round 2
moved the .NET 10 GetHashCode randomization in-scope and restructured
FindingMessageFormatter for Stage 2's AlertContext shape; round 3 caught
the webhook-URL-absent silent-drop and the plain-text "Recent Events"
header (both fixes land in PR (b)).

Stage 2 — scheduled analysis + AnalysisNotificationService + Settings UI +
RecordAlert-when-channels-absent + EmailTemplateBuilder header rename — is
the committed follow-on (PR (b)).

Co-Authored-By: Claude Opus 4.7 (1M context) --- Dashboard/Analysis/AnalysisModels.cs | 12 + Dashboard/Analysis/BlockedProcessXmlParser.cs | 103 +++++ .../Analysis/BlockingChainReconstructor.cs | 364 ++++++++++++++++ Dashboard/Analysis/FactScorer.cs | 152 +++++++ Dashboard/Analysis/InferenceEngine.cs | 3 +- Dashboard/Analysis/RelationshipGraph.cs | 94 ++++ .../Analysis/SqlServerDrillDownCollector.cs | 336 +++++++++++++++ Dashboard/Analysis/SqlServerFactCollector.cs | 408 ++++++++++++++++++ Dashboard/Analysis/SqlServerFindingStore.cs | 4 +- Dashboard/Mcp/McpAnalysisTools.cs | 56 ++- Dashboard/Services/ServerIdHelper.cs | 41 ++ 11 files changed, 1565 insertions(+), 8 deletions(-) create mode 100644 Dashboard/Analysis/BlockedProcessXmlParser.cs create mode 100644 Dashboard/Analysis/BlockingChainReconstructor.cs create mode 100644 Dashboard/Services/ServerIdHelper.cs diff --git a/Dashboard/Analysis/AnalysisModels.cs b/Dashboard/Analysis/AnalysisModels.cs index 76718852..40f0a5f9 100644 --- a/Dashboard/Analysis/AnalysisModels.cs +++ b/Dashboard/Analysis/AnalysisModels.cs @@ -72,6 +72,12 @@ public class AnalysisStory public double? LeafFactValue { get; set; } public int FactCount { get; set; } public bool IsAbsolution { get; set; } + + /// + /// Metadata from the root fact (raw metric values used to assemble the story). + /// Ephemeral — copied onto the finding for the notification layer, not persisted. + /// + public Dictionary? RootFactMetadata { get; set; } } /// @@ -104,6 +110,12 @@ public class AnalysisFinding /// Contains supporting detail keyed by category (e.g., "top_deadlocks", "queries_at_spike"). /// public Dictionary? DrillDown { get; set; } + + /// + /// Metadata from the root fact carried in from . + /// Ephemeral — used by the notification layer for diagnosis context; not persisted. + /// + public Dictionary? 
RootFactMetadata { get; set; }
 }

 ///
diff --git a/Dashboard/Analysis/BlockedProcessXmlParser.cs b/Dashboard/Analysis/BlockedProcessXmlParser.cs
new file mode 100644
index 00000000..233b4cc2
--- /dev/null
+++ b/Dashboard/Analysis/BlockedProcessXmlParser.cs
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2026 Erik Darling, Darling Data LLC
+ *
+ * This file is part of the SQL Server Performance Monitor.
+ *
+ * Licensed under the MIT License. See LICENSE file in the project root for full license information.
+ */
+
+using System;
+using System.Globalization;
+using System.Xml;
+using System.Xml.Linq;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+/// <summary>
+/// Parses one <c>blocked_process_report_xml</c> document into a single
+/// <see cref="BlockingPairRow"/> for the reconstructor. Returns null when the
+/// document is malformed or missing the canonical blocked-process node.
+/// </summary>
+/// <remarks>
+/// The collector treats a null return as "skip this row, keep going" — high-volume,
+/// no per-row log noise.
+/// </remarks>
+internal static class BlockedProcessXmlParser
+{
+    /// <summary>
+    /// Parses a single blocked-process-report XML fragment. The reconstructor's
+    /// <see cref="BlockingChainReconstructor.MakeKey"/> normalizes the
+    /// 1900-01-01 "no transaction" sentinel — no special handling needed here.
+    /// </summary>
+    /// <remarks>
+    /// Attribute values are machine-written, so they are parsed with
+    /// <see cref="CultureInfo.InvariantCulture"/> — the result must not depend on the
+    /// culture or calendar of the machine running the Dashboard. Only
+    /// <see cref="XmlException"/> (a malformed document) is swallowed.
+    /// </remarks>
+    /// <param name="xml">The raw blocked_process_report_xml value.</param>
+    /// <param name="eventTime">The row's event_time — when the XE fired.</param>
+    /// <param name="databaseName">The row's database_name (blocked-side fallback).</param>
+    /// <param name="waitTimeMs">The row's wait_time_ms (blocked-side wait).</param>
+    /// <param name="lockMode">The row's lock_mode (blocked-side lock mode).</param>
+    /// <param name="lastTransactionStarted">The row's last_transaction_started (blocked-side tran start).</param>
+    /// <param name="blockedSpidFromRow">The row's spid — for activity='blocked' rows this is the blocked SPID.</param>
+    /// <returns>The parsed pair, or null when the XML is blank, malformed, or has no blocked process.</returns>
+    public static BlockingPairRow? Parse(
+        string xml,
+        DateTime eventTime,
+        string databaseName,
+        long waitTimeMs,
+        string lockMode,
+        DateTime? lastTransactionStarted,
+        int blockedSpidFromRow)
+    {
+        if (string.IsNullOrWhiteSpace(xml))
+            return null;
+
+        XElement doc;
+        try
+        {
+            doc = XElement.Parse(xml);
+        }
+        catch (XmlException)
+        {
+            // Malformed document — skip this row; the caller keeps going.
+            return null;
+        }
+
+        var blockedProcess = doc.Element("blocked-process")?.Element("process");
+        var blockingProcess = doc.Element("blocking-process")?.Element("process");
+
+        if (blockedProcess == null)
+            return null;
+
+        // SPID and last_transaction_started for the BLOCKED side come from the row;
+        // fall back to the XML attributes if the row was missing them.
+        var blockedSpid = blockedSpidFromRow > 0
+            ? blockedSpidFromRow
+            : ParseInt(blockedProcess.Attribute("spid")?.Value);
+        var blockingSpid = ParseInt(blockingProcess?.Attribute("spid")?.Value);
+
+        var blockedTran = lastTransactionStarted
+            ?? ParseTimestamp(blockedProcess.Attribute("lasttranstarted")?.Value);
+        // Blocking-side tran start lives only in the XML — the row carries blocked-side info.
+        var blockingTran = ParseTimestamp(blockingProcess?.Attribute("lasttranstarted")?.Value);
+
+        return new BlockingPairRow
+        {
+            EventTime = eventTime,
+            DatabaseName = !string.IsNullOrWhiteSpace(databaseName)
+                ? databaseName
+                : blockedProcess.Attribute("currentdbname")?.Value ?? string.Empty,
+            BlockedSpid = blockedSpid,
+            BlockedTranStarted = blockedTran,
+            BlockingSpid = blockingSpid,
+            BlockingTranStarted = blockingTran,
+            WaitTimeMs = waitTimeMs > 0
+                ? waitTimeMs
+                : ParseLong(blockedProcess.Attribute("waittime")?.Value),
+            LockMode = !string.IsNullOrEmpty(lockMode)
+                ? lockMode
+                : blockedProcess.Attribute("lockMode")?.Value ?? string.Empty,
+            // The row's status is the BLOCKED side's status — the reconstructor needs the
+            // BLOCKING side's status to detect sleeping-apex chains. Read it from the XML.
+            BlockingStatus = blockingProcess?.Attribute("status")?.Value ?? string.Empty,
+            BlockedSqlText = blockedProcess.Element("inputbuf")?.Value?.Trim() ?? string.Empty,
+            BlockingSqlText = blockingProcess?.Element("inputbuf")?.Value?.Trim() ?? string.Empty
+        };
+    }
+
+    /// <summary>Culture-invariant 32-bit integer parse; 0 when the text is absent or not a number.</summary>
+    private static int ParseInt(string? text) =>
+        int.TryParse(text, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed) ? parsed : 0;
+
+    /// <summary>Culture-invariant 64-bit integer parse; 0 when the text is absent or not a number.</summary>
+    private static long ParseLong(string? text) =>
+        long.TryParse(text, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed) ? parsed : 0;
+
+    /// <summary>Culture-invariant timestamp parse; null when the text is absent or not a date/time.</summary>
+    private static DateTime? ParseTimestamp(string? text) =>
+        DateTime.TryParse(text, CultureInfo.InvariantCulture, DateTimeStyles.None, out var parsed)
+            ? parsed
+            : (DateTime?)null;
+}
diff --git a/Dashboard/Analysis/BlockingChainReconstructor.cs b/Dashboard/Analysis/BlockingChainReconstructor.cs
new file mode 100644
index 00000000..ab1c393f
--- /dev/null
+++ b/Dashboard/Analysis/BlockingChainReconstructor.cs
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2026 Erik Darling, Darling Data LLC
+ *
+ * This file is part of the SQL Server Performance Monitor.
+ *
+ * Licensed under the MIT License. See LICENSE file in the project root for full license information.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace PerformanceMonitorDashboard.Analysis;
+
+///
+/// One blocked/blocker pair from blocked_process_reports — the raw input to reconstruction.
+///
+///
+/// This file is kept in sync with Lite/Analysis/BlockingChainReconstructor.cs.
+/// Any logic change here must land in Lite as well — see the
+/// blocking-reconstructor-sync-checker agent.
+///
+internal sealed class BlockingPairRow
+{
+    public DateTime EventTime { get; init; }
+    public string DatabaseName { get; init; } = string.Empty;
+    public int BlockedSpid { get; init; }
+    public DateTime? BlockedTranStarted { get; init; }
+    public int BlockingSpid { get; init; }
+    public DateTime? BlockingTranStarted { get; init; }
+    public long WaitTimeMs { get; init; }
+    public string LockMode { get; init; } = string.Empty;
+    public string BlockingStatus { get; init; } = string.Empty;
+    public string BlockedSqlText { get; init; } = string.Empty;
+    public string BlockingSqlText { get; init; } = string.Empty;
+}
+
+///
+/// Stable session identity. A SPID is reused across the analysis window, so the bare
+/// integer is not a session identity — the transaction start time disambiguates two
+/// sessions that reused one SPID.
+/// +internal readonly record struct SessionKey(int Spid, DateTime? TranStarted); + +/// One level (one blocked/blocker edge) of a reconstructed chain, for drill-down. +internal sealed class ChainLevel +{ + public int Level { get; init; } + public int BlockingSpid { get; init; } + public int BlockedSpid { get; init; } + public string LockMode { get; init; } = string.Empty; + public long WaitTimeMs { get; init; } + public string BlockingSqlText { get; init; } = string.Empty; + public string BlockedSqlText { get; init; } = string.Empty; +} + +/// A single reconstructed blocking chain, rooted at an apex head blocker. +internal sealed class ReconstructedChain +{ + public int ApexSpid { get; init; } + public bool ApexSleeping { get; init; } + public int Depth { get; init; } + public int VictimCount { get; init; } + public long MaxWaitMs { get; init; } + public double Magnitude { get; init; } + public IReadOnlyList Levels { get; init; } = Array.Empty(); +} + +/// Result of a reconstruction pass — chains ranked worst-first, plus cap flags. +internal sealed class BlockingReconstruction +{ + public IReadOnlyList Chains { get; init; } = Array.Empty(); + public bool DepthCapped { get; init; } + public bool TraversalTruncated { get; init; } + public bool CycleDetected { get; init; } +} + +/// +/// Reconstructs blocking chains (apex head blocker, depth, victim count) from the per-pair +/// blocked_process_reports rows. Pure — no DB dependency — so the collector and the +/// drill-down collector share one implementation and it is directly unit-testable. +/// +internal static class BlockingChainReconstructor +{ + /// + /// SQL Server's blocked-process-report emits lasttranstarted="1900-01-01T00:00:00" + /// (a real, parseable value — not NULL) for a session with no open transaction. + /// A transaction start at or before this floor is treated as "no transaction". 
+ /// + private static readonly DateTime SentinelFloor = new(1900, 1, 2); + + private sealed record EdgeInfo(long WaitMs, string LockMode, string BlockingSql, string BlockedSql); + + /// Builds a stable session key, normalizing the 1900-01-01 sentinel to NULL. + public static SessionKey MakeKey(int spid, DateTime? tranStarted) + { + var normalized = tranStarted.HasValue && tranStarted.Value > SentinelFloor ? tranStarted : null; + return new SessionKey(spid, normalized); + } + + public static BlockingReconstruction Reconstruct( + IEnumerable rows, int maxDepth, int maxPairs, int stepBudget) + { + var pairs = rows.Take(maxPairs).ToList(); + if (pairs.Count == 0) + return new BlockingReconstruction(); + + // Directed graph: blocker -> blocked. Edges deduped by max wait time (a pair + // re-fires every few seconds with a growing wait), keeping the worst row's detail. + var adjacency = new Dictionary>(); + var allNodes = new HashSet(); + var blockedNodes = new HashSet(); + var sleepingBlockers = new HashSet(); + + foreach (var row in pairs) + { + var blocker = MakeKey(row.BlockingSpid, row.BlockingTranStarted); + var blocked = MakeKey(row.BlockedSpid, row.BlockedTranStarted); + + allNodes.Add(blocker); + allNodes.Add(blocked); + blockedNodes.Add(blocked); + + if (string.Equals(row.BlockingStatus, "sleeping", StringComparison.OrdinalIgnoreCase)) + sleepingBlockers.Add(blocker); + + if (blocker.Equals(blocked)) + continue; // a session cannot block itself — guard against degenerate data + + if (!adjacency.TryGetValue(blocker, out var dests)) + adjacency[blocker] = dests = new Dictionary(); + + if (!dests.TryGetValue(blocked, out var existing) || row.WaitTimeMs > existing.WaitMs) + { + dests[blocked] = new EdgeInfo( + row.WaitTimeMs, row.LockMode ?? string.Empty, + row.BlockingSqlText ?? string.Empty, row.BlockedSqlText ?? string.Empty); + } + } + + var cycleDetected = HasCycle(allNodes, adjacency); + + // Roots: apexes (blockers that are never blocked). 
Subgraphs that are pure cycles + // have no apex — give each a fallback root so the chain is not silently dropped. + var roots = allNodes.Where(n => adjacency.ContainsKey(n) && !blockedNodes.Contains(n)).ToList(); + AddFallbackRoots(roots, allNodes, blockedNodes, adjacency); + + var steps = stepBudget; + var depthCapped = false; + var truncated = false; + var depthMemo = new Dictionary(); + + var chains = new List(roots.Count); + foreach (var root in roots) + { + var depth = LongestDepth(root, adjacency, maxDepth, !cycleDetected, depthMemo, + new HashSet(), ref steps, ref depthCapped, ref truncated); + var (victimCount, maxWait, levels) = WalkChain(root, adjacency, ref steps, ref truncated); + + var magnitude = Math.Max( + FactScorer.ApplyThresholdFormula(depth, 3, 8), + FactScorer.ApplyThresholdFormula(victimCount, 5, 25)); + + chains.Add(new ReconstructedChain + { + ApexSpid = root.Spid, + ApexSleeping = sleepingBlockers.Contains(root), + Depth = depth, + VictimCount = victimCount, + MaxWaitMs = maxWait, + Magnitude = magnitude, + Levels = levels + }); + } + + return new BlockingReconstruction + { + Chains = chains.OrderByDescending(c => c.Magnitude) + .ThenByDescending(c => c.Depth) + .ToList(), + DepthCapped = depthCapped, + TraversalTruncated = truncated, + CycleDetected = cycleDetected + }; + } + + /// Kahn's algorithm — true if the graph is not a DAG. 
+ private static bool HasCycle( + HashSet allNodes, + Dictionary> adjacency) + { + var inDegree = allNodes.ToDictionary(n => n, _ => 0); + foreach (var dests in adjacency.Values) + foreach (var dest in dests.Keys) + inDegree[dest]++; + + var queue = new Queue(inDegree.Where(kv => kv.Value == 0).Select(kv => kv.Key)); + var removed = 0; + while (queue.Count > 0) + { + var node = queue.Dequeue(); + removed++; + if (adjacency.TryGetValue(node, out var dests)) + foreach (var dest in dests.Keys) + if (--inDegree[dest] == 0) + queue.Enqueue(dest); + } + + return removed != allNodes.Count; + } + + /// + /// For any subgraph with no apex (a pure cycle), adds the highest-wait node as a + /// fallback root so the chain is reconstructed rather than silently dropped. + /// + private static void AddFallbackRoots( + List roots, + HashSet allNodes, + HashSet blockedNodes, + Dictionary> adjacency) + { + var reached = new HashSet(); + foreach (var root in roots) + MarkReachable(root, adjacency, reached); + + var orphans = allNodes.Where(n => adjacency.ContainsKey(n) && !reached.Contains(n)).ToList(); + while (orphans.Count > 0) + { + // Pick the orphan with the largest outgoing wait time as the fallback root. + var fallback = orphans + .OrderByDescending(n => adjacency[n].Values.Max(e => e.WaitMs)) + .First(); + roots.Add(fallback); + MarkReachable(fallback, adjacency, reached); + orphans = orphans.Where(n => !reached.Contains(n)).ToList(); + } + } + + private static void MarkReachable( + SessionKey start, + Dictionary> adjacency, + HashSet reached) + { + var stack = new Stack(); + if (reached.Add(start)) + stack.Push(start); + while (stack.Count > 0) + { + var node = stack.Pop(); + if (adjacency.TryGetValue(node, out var dests)) + foreach (var dest in dests.Keys) + if (reached.Add(dest)) + stack.Push(dest); + } + } + + /// + /// Longest downward path (in edges) from a node. 
Memoized when the graph is a DAG + /// (memo is path-independent there); on a cyclic graph memo is disabled and the + /// per-path visited set plus the global step budget bound the traversal. + /// + private static int LongestDepth( + SessionKey node, + Dictionary> adjacency, + int maxDepth, + bool useMemo, + Dictionary memo, + HashSet path, + ref int steps, + ref bool depthCapped, + ref bool truncated) + { + if (useMemo && memo.TryGetValue(node, out var cached)) + return cached; + + if (steps-- <= 0) + { + truncated = true; + return 0; + } + + if (path.Count >= maxDepth) + { + depthCapped = true; + return 0; + } + + var best = 0; + if (adjacency.TryGetValue(node, out var dests)) + { + path.Add(node); + foreach (var child in dests.Keys) + { + if (path.Contains(child)) + continue; // cycle guard + + var childDepth = LongestDepth(child, adjacency, maxDepth, useMemo, memo, path, + ref steps, ref depthCapped, ref truncated); + if (1 + childDepth > best) + best = 1 + childDepth; + } + path.Remove(node); + } + + if (useMemo) + memo[node] = best; + return best; + } + + /// + /// Walks the subtree under a root: distinct transitive victim count, the worst edge + /// wait time, and a BFS-ordered level list for drill-down. 
+ /// + private static (int VictimCount, long MaxWaitMs, List Levels) WalkChain( + SessionKey root, + Dictionary> adjacency, + ref int steps, + ref bool truncated) + { + var victims = new HashSet(); + var levels = new List(); + long maxWait = 0; + + var queue = new Queue<(SessionKey Node, int Level)>(); + var enqueued = new HashSet { root }; + queue.Enqueue((root, 0)); + + while (queue.Count > 0) + { + if (steps-- <= 0) + { + truncated = true; + break; + } + + var (node, level) = queue.Dequeue(); + if (!adjacency.TryGetValue(node, out var dests)) + continue; + + foreach (var (child, edge) in dests) + { + victims.Add(child); + if (edge.WaitMs > maxWait) + maxWait = edge.WaitMs; + + levels.Add(new ChainLevel + { + Level = level + 1, + BlockingSpid = node.Spid, + BlockedSpid = child.Spid, + LockMode = edge.LockMode, + WaitTimeMs = edge.WaitMs, + BlockingSqlText = edge.BlockingSql, + BlockedSqlText = edge.BlockedSql + }); + + if (enqueued.Add(child)) + queue.Enqueue((child, level + 1)); + } + } + + return (victims.Count, maxWait, levels); + } +} diff --git a/Dashboard/Analysis/FactScorer.cs b/Dashboard/Analysis/FactScorer.cs index 5605a88b..92f9785f 100644 --- a/Dashboard/Analysis/FactScorer.cs +++ b/Dashboard/Analysis/FactScorer.cs @@ -117,10 +117,27 @@ private static double ScoreBlockingFact(Fact fact) "BLOCKING_EVENTS" => ApplyThresholdFormula(value, 10, 50), // Deadlocks: concerning >5/hr (no critical — any sustained deadlocking is bad) "DEADLOCKS" => ApplyThresholdFormula(value, 5, null), + // Blocking chain: scored by structural magnitude. Value = worst-chain depth >= 1 + // for any emitted chain, so the value<=0 guard above never trips this arm. + "BLOCKING_CHAIN" => ScoreBlockingChain(fact), _ => 0.0 }; } + /// + /// Scores a BLOCKING_CHAIN fact by structural magnitude — the worse of chain depth + /// and transitive victim count. Max, not average, so one severe dimension scores high + /// without being diluted by the other. 
+ /// + private static double ScoreBlockingChain(Fact fact) + { + var depth = fact.Metadata.GetValueOrDefault("worst_chain_depth"); + var victims = fact.Metadata.GetValueOrDefault("worst_chain_victim_count"); + return Math.Max( + ApplyThresholdFormula(depth, 3, 8), + ApplyThresholdFormula(victims, 5, 25)); + } + /// /// Scores CPU utilization. Value is average SQL CPU %. /// @@ -189,6 +206,11 @@ private static double ScoreQueryFact(Fact fact) "QUERY_SPILLS" => ApplyThresholdFormula(fact.Value, 100, 1000), // High DOP queries: concerning at 5, critical at 20 in the period "QUERY_HIGH_DOP" => ApplyThresholdFormula(fact.Value, 5, 20), + // Parameter sensitivity: worst max/min worker-time ratio. Magnitude-driven — + // concerning at 10x, critical at 100x — so a lone catastrophic plan still scores high. + "PARAMETER_SENSITIVITY" => ApplyThresholdFormula(fact.Value, 10, 100), + // Plan regression: worst per-exec cost factor vs the best plan. Concerning 2x, critical 10x. + "PLAN_REGRESSION" => ApplyThresholdFormula(fact.Value, 2, 10), _ => 0.0 }; } @@ -380,6 +402,8 @@ private static List GetAmplifiers(Fact fact) "PAGEIOLATCH_SH" or "PAGEIOLATCH_EX" => PageiolatchAmplifiers(), "LATCH_EX" or "LATCH_SH" => LatchAmplifiers(), "BLOCKING_EVENTS" => BlockingEventsAmplifiers(), + "BLOCKING_CHAIN" => BlockingChainAmplifiers(), + "RESOURCE_SEMAPHORE_QUERY_COMPILE" => ResourceSemaphoreQueryCompileAmplifiers(), "DEADLOCKS" => DeadlockAmplifiers(), "LCK" => LckAmplifiers(), "CPU_SQL_PERCENT" => CpuSqlPercentAmplifiers(), @@ -388,6 +412,8 @@ private static List GetAmplifiers(Fact fact) "IO_WRITE_LATENCY_MS" => IoWriteLatencyAmplifiers(), "MEMORY_GRANT_PENDING" => MemoryGrantAmplifiers(), "QUERY_SPILLS" => QuerySpillAmplifiers(), + "PARAMETER_SENSITIVITY" => ParameterSensitivityAmplifiers(), + "PLAN_REGRESSION" => PlanRegressionAmplifiers(), "PERFMON_PLE" => PleAmplifiers(), "DB_CONFIG" => DbConfigAmplifiers(), "DISK_SPACE" => DiskSpaceAmplifiers(), @@ -395,6 +421,76 @@ private static 
List GetAmplifiers(Fact fact) }; } + /// + /// PARAMETER_SENSITIVITY: a single plan with wildly varying per-execution cost. + /// Corroborated by grant/spill divergence and memory-grant pressure. + /// + private static List ParameterSensitivityAmplifiers() => + [ + new() + { + Description = "Three or more sensitive plans — systemic parameter-sniffing problem", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("PARAMETER_SENSITIVITY", out var f) + && f.Metadata.GetValueOrDefault("offender_count") >= 3 + }, + new() + { + Description = "Memory grant varies with the plan — classic sniffing fingerprint", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("PARAMETER_SENSITIVITY", out var f) + && f.Metadata.GetValueOrDefault("grant_divergence") > 0 + }, + new() + { + Description = "Worst plan spills on some parameter values but not others", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("PARAMETER_SENSITIVITY", out var f) + && f.Metadata.GetValueOrDefault("spill_divergence") > 0 + }, + new() + { + Description = "Memory grant pressure present — sensitive plans competing for grants", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("MEMORY_GRANT_PENDING", out var f) && f.BaseSeverity > 0 + } + ]; + + /// + /// PLAN_REGRESSION: a query running a worse plan than one it performed well with. + /// Corroborated by a failing forced plan and by CPU pressure. 
+ /// + private static List PlanRegressionAmplifiers() => + [ + new() + { + Description = "Three or more regressed queries — systemic plan-choice instability", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("PLAN_REGRESSION", out var f) + && f.Metadata.GetValueOrDefault("offender_count") >= 3 + }, + new() + { + Description = "Worst regression is on a forced plan that is failing to apply", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("PLAN_REGRESSION", out var f) + && f.Metadata.GetValueOrDefault("latest_is_forced") > 0 + && f.Metadata.GetValueOrDefault("force_failure_count") > 0 + }, + new() + { + Description = "CPU spike present — regressed plan likely driving it", + Boost = 0.25, + Predicate = facts => facts.TryGetValue("CPU_SPIKE", out var f) && f.BaseSeverity > 0 + }, + new() + { + Description = "SQL Server CPU elevated — regressed plan contributing", + Boost = 0.2, + Predicate = facts => facts.TryGetValue("CPU_SQL_PERCENT", out var f) && f.BaseSeverity > 0 + } + ]; + /// /// SOS_SCHEDULER_YIELD: CPU starvation confirmed by parallelism waits. /// More amplifiers added when config and CPU utilization facts are available. @@ -558,6 +654,59 @@ private static List BlockingEventsAmplifiers() => } ]; + /// + /// BLOCKING_CHAIN: a reconstructed blocking pile-up, amplified by an abandoned apex + /// transaction and by the cascade symptoms a deep/wide chain produces. 
+ /// + private static List BlockingChainAmplifiers() => + [ + new() + { + Description = "Apex head blocker is sleeping — abandoned transaction at the top of the chain", + Boost = 0.4, + Predicate = facts => facts.TryGetValue("BLOCKING_CHAIN", out var f) + && f.Metadata.GetValueOrDefault("worst_apex_sleeping") > 0 + }, + new() + { + Description = "Deadlocks also present — chain blocking escalating to deadlocks", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0 + }, + new() + { + Description = "THREADPOOL waits present — chain victims pinning worker threads", + Boost = 0.3, + Predicate = facts => facts.ContainsKey("THREADPOOL") && facts["THREADPOOL"].BaseSeverity > 0 + } + ]; + + /// + /// RESOURCE_SEMAPHORE_QUERY_COMPILE: compile-gateway memory pressure. Corroborated by + /// CPU signals (compilation is CPU-heavy), not by runtime-grant signals. + /// + private static List ResourceSemaphoreQueryCompileAmplifiers() => + [ + new() + { + Description = "SOS_SCHEDULER_YIELD elevated — compilation competing for CPU", + Boost = 0.3, + Predicate = facts => HasSignificantWait(facts, "SOS_SCHEDULER_YIELD", 0.25) + }, + new() + { + Description = "SQL Server CPU > 80% — compilation a measurable share of CPU load", + Boost = 0.3, + Predicate = facts => facts.TryGetValue("CPU_SQL_PERCENT", out var cpu) && cpu.Value >= 80 + }, + new() + { + Description = "RESOURCE_SEMAPHORE also present — broad memory starvation, not isolated compile pressure", + Boost = 0.2, + Predicate = facts => facts.ContainsKey("RESOURCE_SEMAPHORE") && facts["RESOURCE_SEMAPHORE"].BaseSeverity > 0 + } + ]; + /// /// DEADLOCKS: deadlocks confirmed by blocking patterns. /// @@ -825,6 +974,9 @@ private static (double concerning, double? critical)? 
GetWaitThresholds(string w "PAGEIOLATCH_SH" => (0.25, null), "PAGEIOLATCH_EX" => (0.25, null), "RESOURCE_SEMAPHORE" => (0.01, null), + // Query-compile memory pressure — ramped: healthy servers see some compile-gateway + // waits, so 1% of period is concerning but 10% is critical. + "RESOURCE_SEMAPHORE_QUERY_COMPILE" => (0.01, 0.10), // Parallelism (CXCONSUMER is grouped into CXPACKET by collector) "CXPACKET" => (0.25, null), diff --git a/Dashboard/Analysis/InferenceEngine.cs b/Dashboard/Analysis/InferenceEngine.cs index 976bef43..992ae502 100644 --- a/Dashboard/Analysis/InferenceEngine.cs +++ b/Dashboard/Analysis/InferenceEngine.cs @@ -150,7 +150,8 @@ private static AnalysisStory BuildStory(List path, Dictionary HasFact(facts, "CXPACKET") && facts["CXPACKET"].Severity >= 0.3); + + // CPU_SPIKE → PLAN_REGRESSION (spike explained by a regressed plan) + AddEdge("CPU_SPIKE", "PLAN_REGRESSION", "cpu_spike", + "Plan regression present — a query is running a worse plan than before", + facts => HasFact(facts, "PLAN_REGRESSION") && facts["PLAN_REGRESSION"].BaseSeverity > 0); + + // CPU_SQL_PERCENT → PLAN_REGRESSION (sustained CPU explained by a regressed plan) + AddEdge("CPU_SQL_PERCENT", "PLAN_REGRESSION", "cpu_pressure", + "Plan regression present — a query is running a worse plan than before", + facts => HasFact(facts, "PLAN_REGRESSION") && facts["PLAN_REGRESSION"].BaseSeverity > 0); + + // CPU_SPIKE → PARAMETER_SENSITIVITY (spike explained by a parameter-sensitive plan) + AddEdge("CPU_SPIKE", "PARAMETER_SENSITIVITY", "cpu_spike", + "Parameter-sensitive plan present — one plan running expensively for some inputs", + facts => HasFact(facts, "PARAMETER_SENSITIVITY") && facts["PARAMETER_SENSITIVITY"].BaseSeverity > 0); } /* ── Memory Pressure ── */ @@ -182,6 +197,26 @@ private void BuildMemoryPressureEdges() AddEdge("PAGEIOLATCH_EX", "IO_READ_LATENCY_MS", "memory_pressure", "Read latency elevated — disk confirms buffer pool pressure", facts => HasFact(facts, 
"IO_READ_LATENCY_MS") && facts["IO_READ_LATENCY_MS"].BaseSeverity > 0); + + // MEMORY_GRANT_PENDING → PARAMETER_SENSITIVITY (grant pressure traced to a sensitive plan) + AddEdge("MEMORY_GRANT_PENDING", "PARAMETER_SENSITIVITY", "memory_grants", + "Parameter-sensitive plan present — its grant varies wildly with its inputs", + facts => HasFact(facts, "PARAMETER_SENSITIVITY") && facts["PARAMETER_SENSITIVITY"].BaseSeverity > 0); + + // RESOURCE_SEMAPHORE_QUERY_COMPILE → SOS_SCHEDULER_YIELD (compilation competing for CPU) + AddEdge("RESOURCE_SEMAPHORE_QUERY_COMPILE", "SOS_SCHEDULER_YIELD", "memory_grants", + "Scheduler yields elevated — query compilation competing for CPU", + facts => HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Value >= 0.25); + + // RESOURCE_SEMAPHORE_QUERY_COMPILE → CPU_SQL_PERCENT (compilation a share of CPU load) + AddEdge("RESOURCE_SEMAPHORE_QUERY_COMPILE", "CPU_SQL_PERCENT", "memory_grants", + "SQL Server CPU elevated — compilation is a measurable share of CPU load", + facts => HasFact(facts, "CPU_SQL_PERCENT") && facts["CPU_SQL_PERCENT"].Value >= 80); + + // SOS_SCHEDULER_YIELD → RESOURCE_SEMAPHORE_QUERY_COMPILE (CPU pressure traced to compiles) + AddEdge("SOS_SCHEDULER_YIELD", "RESOURCE_SEMAPHORE_QUERY_COMPILE", "memory_grants", + "Query-compile memory pressure — compilation contributing to CPU pressure", + facts => HasFact(facts, "RESOURCE_SEMAPHORE_QUERY_COMPILE") && facts["RESOURCE_SEMAPHORE_QUERY_COMPILE"].BaseSeverity > 0); } /* ── Blocking & Deadlocking ── */ @@ -241,6 +276,41 @@ private void BuildBlockingEdges() AddEdge("THREADPOOL", "BLOCKING_EVENTS", "thread_exhaustion", "Blocking events present — blocked queries holding worker threads", facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0); + + // BLOCKING_CHAIN → LCK (chain blocking visible in lock waits) + AddEdge("BLOCKING_CHAIN", "LCK", "blocking", + "Lock contention waits elevated — chain blocking visible in wait stats", 
+ facts => HasFact(facts, "LCK") && facts["LCK"].Severity >= 0.5); + + // BLOCKING_CHAIN → THREADPOOL (chain victims pinning worker threads) + AddEdge("BLOCKING_CHAIN", "THREADPOOL", "blocking", + "THREADPOOL waits present — chain victims consuming worker threads", + facts => HasFact(facts, "THREADPOOL") && facts["THREADPOOL"].Severity > 0); + + // BLOCKING_CHAIN → DEADLOCKS (chain blocking escalating) + AddEdge("BLOCKING_CHAIN", "DEADLOCKS", "blocking", + "Deadlocks also present — chain blocking escalating to deadlocks", + facts => HasFact(facts, "DEADLOCKS") && facts["DEADLOCKS"].BaseSeverity > 0); + + // BLOCKING_CHAIN → BLOCKING_EVENTS (chain confirmed by event volume) + AddEdge("BLOCKING_CHAIN", "BLOCKING_EVENTS", "blocking", + "Blocking event rate elevated — chain confirmed by event volume", + facts => HasFact(facts, "BLOCKING_EVENTS") && facts["BLOCKING_EVENTS"].BaseSeverity > 0); + + // BLOCKING_EVENTS → BLOCKING_CHAIN (event rate has structural depth) + AddEdge("BLOCKING_EVENTS", "BLOCKING_CHAIN", "blocking", + "Reconstructed blocking chain — the pile-up has structural depth", + facts => HasFact(facts, "BLOCKING_CHAIN") && facts["BLOCKING_CHAIN"].BaseSeverity > 0); + + // LCK → BLOCKING_CHAIN (lock waits form a transitive pile-up) + AddEdge("LCK", "BLOCKING_CHAIN", "lock_contention", + "Reconstructed blocking chain — lock waits form a transitive pile-up", + facts => HasFact(facts, "BLOCKING_CHAIN") && facts["BLOCKING_CHAIN"].BaseSeverity > 0); + + // THREADPOOL → BLOCKING_CHAIN (chain victims pinning worker threads) + AddEdge("THREADPOOL", "BLOCKING_CHAIN", "thread_exhaustion", + "Reconstructed blocking chain — chain victims pinning worker threads", + facts => HasFact(facts, "BLOCKING_CHAIN") && facts["BLOCKING_CHAIN"].BaseSeverity > 0); } /* ── I/O Pressure ── */ @@ -316,6 +386,30 @@ private void BuildQueryEdges() AddEdge("QUERY_HIGH_DOP", "SOS_SCHEDULER_YIELD", "query_performance", "Scheduler yields — high-DOP queries saturating CPU", facts => 
HasFact(facts, "SOS_SCHEDULER_YIELD") && facts["SOS_SCHEDULER_YIELD"].Severity >= 0.5); + + // PARAMETER_SENSITIVITY → MEMORY_GRANT_PENDING (sensitive plan's grant varies — grant pressure) + AddEdge("PARAMETER_SENSITIVITY", "MEMORY_GRANT_PENDING", "query_performance", + "Memory grant waiters — a parameter-sensitive plan's grant varies with its inputs", + facts => HasFact(facts, "MEMORY_GRANT_PENDING") && facts["MEMORY_GRANT_PENDING"].BaseSeverity > 0 + && facts.TryGetValue("PARAMETER_SENSITIVITY", out var ps) + && ps.Metadata.GetValueOrDefault("grant_divergence") > 0); + + // PARAMETER_SENSITIVITY → QUERY_SPILLS (sensitive plan spills on some parameter values) + AddEdge("PARAMETER_SENSITIVITY", "QUERY_SPILLS", "query_performance", + "Query spills — a parameter-sensitive plan spills on some parameter values", + facts => HasFact(facts, "QUERY_SPILLS") && facts["QUERY_SPILLS"].BaseSeverity > 0 + && facts.TryGetValue("PARAMETER_SENSITIVITY", out var ps) + && ps.Metadata.GetValueOrDefault("spill_divergence") > 0); + + // PLAN_REGRESSION → CPU_SQL_PERCENT (regressed plan driving CPU load) + AddEdge("PLAN_REGRESSION", "CPU_SQL_PERCENT", "query_performance", + "SQL Server CPU elevated — the regressed plan is burning CPU", + facts => HasFact(facts, "CPU_SQL_PERCENT") && facts["CPU_SQL_PERCENT"].BaseSeverity > 0); + + // PLAN_REGRESSION → CPU_SPIKE (regressed plan caused a CPU spike) + AddEdge("PLAN_REGRESSION", "CPU_SPIKE", "query_performance", + "CPU spike — the regressed plan is burning CPU", + facts => HasFact(facts, "CPU_SPIKE") && facts["CPU_SPIKE"].BaseSeverity > 0); } private static bool HasFact(IReadOnlyDictionary facts, string key) diff --git a/Dashboard/Analysis/SqlServerDrillDownCollector.cs b/Dashboard/Analysis/SqlServerDrillDownCollector.cs index cf435978..88c8a197 100644 --- a/Dashboard/Analysis/SqlServerDrillDownCollector.cs +++ b/Dashboard/Analysis/SqlServerDrillDownCollector.cs @@ -53,6 +53,9 @@ public async Task EnrichFindingsAsync(List findings, 
AnalysisCo if (pathKeys.Contains("BLOCKING_EVENTS")) await CollectTopBlockingChains(finding, context); + if (pathKeys.Contains("BLOCKING_CHAIN")) + await CollectReconstructedBlockingChains(finding, context); + if (pathKeys.Contains("CPU_SPIKE")) await CollectQueriesAtSpike(finding, context); @@ -85,6 +88,12 @@ public async Task EnrichFindingsAsync(List findings, AnalysisCo if (pathKeys.Any(k => k.StartsWith("BAD_ACTOR_", StringComparison.OrdinalIgnoreCase))) await CollectBadActorDetail(finding, context); + if (pathKeys.Contains("PARAMETER_SENSITIVITY")) + await CollectParameterSensitiveQueries(finding, context); + + if (pathKeys.Contains("PLAN_REGRESSION")) + await CollectRegressedQueries(finding, context); + // Plan analysis: for findings with top queries, analyze their cached plans await CollectPlanAnalysis(finding, context); @@ -775,4 +784,331 @@ FROM collect.query_stats }; } } + + /// + /// Top parameter-sensitive plans behind a PARAMETER_SENSITIVITY finding. + /// Re-runs the detector for the top 5 offenders. 
/// <summary>
/// Drill-down for a PARAMETER_SENSITIVITY finding. Queries collect.query_stats for the
/// analysis window, keeps the latest row per (database, query_hash, query_plan_hash),
/// applies the worker-time thresholds in the WHERE clause, and attaches up to five rows
/// ordered by max/min worker-time ratio under the "parameter_sensitive_queries" key.
/// Nothing is attached when the query returns no rows.
/// </summary>
/// <param name="finding">Finding whose DrillDown dictionary receives the result.</param>
/// <param name="context">Supplies TimeRangeStart / TimeRangeEnd for the collection_time window.</param>
private async Task CollectParameterSensitiveQueries(AnalysisFinding finding, AnalysisContext context)
{
    const string sql = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

WITH latest AS
(
    SELECT
        database_name,
        query_hash,
        query_plan_hash,
        execution_count,
        creation_time,
        min_worker_time,
        max_worker_time,
        min_grant_kb,
        max_grant_kb,
        min_spills,
        max_spills,
        query_text,
        ROW_NUMBER() OVER
        (
            PARTITION BY database_name, query_hash, query_plan_hash
            ORDER BY collection_time DESC
        ) AS rn
    FROM collect.query_stats
    WHERE collection_time >= @startTime
    AND collection_time <= @endTime
    AND execution_count_delta > 0
)
SELECT
    database_name,
    CONVERT(varchar(18), query_hash, 1) AS query_hash,
    CONVERT(varchar(18), query_plan_hash, 1) AS query_plan_hash,
    execution_count,
    min_worker_time,
    max_worker_time,
    CAST(max_worker_time AS float) / NULLIF(min_worker_time, 0) AS worker_ratio,
    CAST(max_grant_kb AS float) / NULLIF(min_grant_kb, 0) AS grant_ratio,
    CASE WHEN max_spills > 0 AND min_spills = 0 THEN 1 ELSE 0 END AS spill_divergence,
    LEFT(CAST(DECOMPRESS(query_text) AS NVARCHAR(MAX)), 500) AS query_text
FROM latest
WHERE rn = 1
AND min_worker_time >= 10000
AND max_worker_time >= 250000
AND execution_count >= 20
AND creation_time <= @startTime
AND CAST(max_worker_time AS float) / NULLIF(min_worker_time, 0) >= 10
ORDER BY worker_ratio DESC
OFFSET 0 ROWS FETCH NEXT 5 ROWS ONLY";

    using var conn = new SqlConnection(_connectionString);
    await conn.OpenAsync();

    using var command = conn.CreateCommand();
    command.CommandText = sql;
    command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
    command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

    var offenders = new List<object>();
    using var reader = await command.ExecuteReaderAsync();

    // Null-tolerant column accessors: NULL maps to the empty string or zero.
    string TextAt(int ordinal) => reader.IsDBNull(ordinal) ? "" : reader.GetString(ordinal);
    long LongAt(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));
    double DoubleAt(int ordinal) => reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));

    while (await reader.ReadAsync())
    {
        // Ordinals follow the final SELECT list of the query above.
        offenders.Add(new
        {
            database = TextAt(0),
            query_hash = TextAt(1),
            query_plan_hash = TextAt(2),
            execution_count = LongAt(3),
            min_worker_time_us = LongAt(4),
            max_worker_time_us = LongAt(5),
            worker_ratio = DoubleAt(6),
            grant_ratio = DoubleAt(7),
            spills_on_some_inputs = !reader.IsDBNull(8) && Convert.ToInt32(reader.GetValue(8)) == 1,
            query_text = TextAt(9)
        });
    }

    if (offenders.Count == 0)
    {
        return;
    }

    finding.DrillDown!["parameter_sensitive_queries"] = offenders;
}
/// <summary>
/// Drill-down for a PLAN_REGRESSION finding. Compares, per query, the most recently
/// executed plan against the plan with the lowest CPU per execution, and attaches up to
/// five queries whose regression factor (the larger of the CPU and duration ratios) is
/// at least 2 under the "regressed_queries" key.
/// The window is filtered on server_last_execution_time starting 14 days before
/// context.TimeRangeStart, not on the standard analysis window.
/// Nothing is attached when the query returns no rows.
/// </summary>
/// <param name="finding">Finding whose DrillDown dictionary receives the result.</param>
/// <param name="context">Supplies TimeRangeStart, from which the 14-day window start is derived.</param>
private async Task CollectRegressedQueries(AnalysisFinding finding, AnalysisContext context)
{
    const string sql = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

WITH deduped AS
(
    SELECT
        database_name,
        query_id,
        plan_id,
        query_plan_hash,
        count_executions,
        avg_cpu_time,
        avg_duration,
        server_last_execution_time,
        ROW_NUMBER() OVER
        (
            PARTITION BY database_name, query_id, plan_id, server_first_execution_time
            ORDER BY collection_time DESC
        ) AS rn
    FROM collect.query_store_data
    WHERE execution_type_desc = N'Regular'
    AND server_last_execution_time >= @windowStart
),
plan_agg AS
(
    -- query_plan_hash is invariant within a plan_id, so include it in the GROUP BY
    -- (MS Learn's MAX page does not list binary/varbinary in the accepted types).
    SELECT
        database_name,
        query_id,
        plan_id,
        query_plan_hash,
        SUM(count_executions) AS execs,
        CASE WHEN SUM(count_executions) > 0
             THEN SUM(avg_cpu_time * count_executions) / NULLIF(SUM(count_executions), 0)
             ELSE 0 END AS cpu_per_exec,
        CASE WHEN SUM(count_executions) > 0
             THEN SUM(avg_duration * count_executions) / NULLIF(SUM(count_executions), 0)
             ELSE 0 END AS dur_per_exec,
        MAX(server_last_execution_time) AS last_exec
    FROM deduped
    WHERE rn = 1
    GROUP BY database_name, query_id, plan_id, query_plan_hash
),
plan_dedup AS
(
    SELECT
        database_name,
        query_id,
        query_plan_hash,
        SUM(execs) AS execs,
        CASE WHEN SUM(execs) > 0
             THEN SUM(cpu_per_exec * execs) / NULLIF(SUM(execs), 0)
             ELSE 0 END AS cpu_per_exec,
        CASE WHEN SUM(execs) > 0
             THEN SUM(dur_per_exec * execs) / NULLIF(SUM(execs), 0)
             ELSE 0 END AS dur_per_exec,
        MAX(last_exec) AS last_exec
    FROM plan_agg
    GROUP BY database_name, query_id, query_plan_hash
    HAVING SUM(execs) >= 25
),
ranked AS
(
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY last_exec DESC) AS recency,
        ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY cpu_per_exec ASC) AS cheapness
    FROM plan_dedup
),
compared AS
(
    SELECT
        l.database_name,
        l.query_id,
        l.query_plan_hash AS latest_plan_hash,
        l.cpu_per_exec AS latest_cpu,
        l.dur_per_exec AS latest_dur,
        b.query_plan_hash AS best_plan_hash,
        b.cpu_per_exec AS best_cpu,
        b.dur_per_exec AS best_dur,
        (SELECT MAX(v)
         FROM (VALUES
             (CAST(l.cpu_per_exec AS float) / NULLIF(b.cpu_per_exec, 0)),
             (CAST(l.dur_per_exec AS float) / NULLIF(b.dur_per_exec, 0))
         ) AS x(v)) AS regression_factor
    FROM ranked AS l
    JOIN ranked AS b
      ON b.database_name = l.database_name
      AND b.query_id = l.query_id
      AND b.cheapness = 1
    WHERE l.recency = 1
    AND l.query_plan_hash <> b.query_plan_hash
)
SELECT
    c.database_name,
    c.query_id,
    CONVERT(varchar(18), c.latest_plan_hash, 1) AS latest_plan_hash,
    c.latest_cpu,
    c.latest_dur,
    CONVERT(varchar(18), c.best_plan_hash, 1) AS best_plan_hash,
    c.best_cpu,
    c.best_dur,
    c.regression_factor,
    -- query_sql_text is varbinary(max); fetch it via APPLY (MAX() on varbinary(max) is invalid).
    LEFT(CAST(DECOMPRESS(qt.query_sql_text) AS NVARCHAR(MAX)), 500) AS query_text
FROM compared AS c
OUTER APPLY
(
    SELECT TOP (1) qs.query_sql_text
    FROM collect.query_store_data AS qs
    WHERE qs.database_name = c.database_name
    AND qs.query_id = c.query_id
    AND qs.query_plan_hash = c.latest_plan_hash
    AND qs.server_last_execution_time >= @windowStart
    ORDER BY qs.server_last_execution_time DESC
) AS qt
WHERE c.regression_factor >= 2
ORDER BY c.regression_factor DESC
OFFSET 0 ROWS FETCH NEXT 5 ROWS ONLY";

    using var conn = new SqlConnection(_connectionString);
    await conn.OpenAsync();

    using var command = conn.CreateCommand();
    command.CommandText = sql;

    // Comparison window: 14 days back from the start of the analysis window.
    var windowStart = context.TimeRangeStart.AddDays(-14);
    command.Parameters.Add(new SqlParameter("@windowStart", windowStart));

    var regressed = new List<object>();
    using var reader = await command.ExecuteReaderAsync();

    // Null-tolerant column accessors: NULL maps to the empty string or zero.
    string TextAt(int ordinal) => reader.IsDBNull(ordinal) ? "" : reader.GetString(ordinal);
    long LongAt(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));
    double DoubleAt(int ordinal) => reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));

    while (await reader.ReadAsync())
    {
        // Ordinals follow the final SELECT list of the query above.
        regressed.Add(new
        {
            database = TextAt(0),
            query_id = LongAt(1),
            latest_plan_hash = TextAt(2),
            latest_cpu_per_exec_us = DoubleAt(3),
            latest_duration_per_exec_us = DoubleAt(4),
            best_plan_hash = TextAt(5),
            best_cpu_per_exec_us = DoubleAt(6),
            best_duration_per_exec_us = DoubleAt(7),
            regression_factor = DoubleAt(8),
            query_text = TextAt(9)
        });
    }

    if (regressed.Count == 0)
    {
        return;
    }

    finding.DrillDown!["regressed_queries"] = regressed;
}
/// <summary>
/// Drill-down for a BLOCKING_CHAIN finding. Reads up to 5000 'blocked' rows from
/// collect.blocking_BlockedProcessReport for the analysis window, parses each report
/// with BlockedProcessXmlParser, reconstructs chains with BlockingChainReconstructor,
/// and attaches the first three chains (apex, depth, victim count, max wait, and the
/// per-level structure) under the "reconstructed_blocking_chains" key.
/// Nothing is attached when no row parses or no chain is returned.
/// </summary>
/// <remarks>
/// NOTE(review): the first three chains are presumably the worst ones; that ordering is
/// decided by BlockingChainReconstructor, which is not visible here — confirm.
/// </remarks>
/// <param name="finding">Finding whose DrillDown dictionary receives the result.</param>
/// <param name="context">Supplies TimeRangeStart / TimeRangeEnd for the event_time window.</param>
private async Task CollectReconstructedBlockingChains(AnalysisFinding finding, AnalysisContext context)
{
    const string sql = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP (5000)
    event_time,
    database_name,
    blocked_process_report_xml,
    wait_time_ms,
    lock_mode,
    last_transaction_started,
    spid
FROM collect.blocking_BlockedProcessReport
WHERE collection_time >= @collectionWindow
AND event_time >= @startTime
AND event_time <= @endTime
AND activity = 'blocked'
ORDER BY collection_time DESC";

    using var conn = new SqlConnection(_connectionString);
    await conn.OpenAsync();

    using var command = conn.CreateCommand();
    command.CommandText = sql;
    // collection_time lower bound is one hour before the window start; event_time bounds the window itself.
    command.Parameters.Add(new SqlParameter("@collectionWindow", context.TimeRangeStart.AddHours(-1)));
    command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
    command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

    var pairs = await ReadPairsAsync(command, row =>
    {
        DateTime eventTime = row.IsDBNull(0) ? default : row.GetDateTime(0);
        string dbName = row.IsDBNull(1) ? string.Empty : row.GetString(1);
        string xml = row.IsDBNull(2) ? string.Empty : row.GetSqlXml(2).Value;
        long waitMs = row.IsDBNull(3) ? 0L : Convert.ToInt64(row.GetValue(3));
        string lockMode = row.IsDBNull(4) ? string.Empty : row.GetString(4);
        DateTime? tranStarted = row.IsDBNull(5) ? (DateTime?)null : row.GetDateTime(5);
        int spid = row.IsDBNull(6) ? 0 : Convert.ToInt32(row.GetValue(6));

        return BlockedProcessXmlParser.Parse(xml, eventTime, dbName, waitMs, lockMode, tranStarted, spid);
    });

    if (pairs.Count == 0)
    {
        return;
    }

    var reconstruction = BlockingChainReconstructor.Reconstruct(
        pairs, maxDepth: 50, maxPairs: 5000, stepBudget: 100_000);

    var chains = reconstruction.Chains
        .Take(3)
        .Select(chain => (object)new
        {
            apex_spid = chain.ApexSpid,
            apex_sleeping = chain.ApexSleeping,
            depth = chain.Depth,
            // Distinct sessions blocked under this apex over the window — cumulative, not peak-concurrent.
            victim_count = chain.VictimCount,
            max_wait_ms = chain.MaxWaitMs,
            levels = chain.Levels.Select(l => new
            {
                level = l.Level,
                blocking_spid = l.BlockingSpid,
                blocked_spid = l.BlockedSpid,
                lock_mode = l.LockMode,
                wait_time_ms = l.WaitTimeMs,
                blocking_sql = l.BlockingSqlText,
                blocked_sql = l.BlockedSqlText
            }).ToList()
        })
        .ToList();

    if (chains.Count > 0)
    {
        finding.DrillDown!["reconstructed_blocking_chains"] = chains;
    }

    // Runs the command and keeps every row the parser accepts (non-null result).
    // Generic so the element type follows whatever BlockedProcessXmlParser.Parse returns.
    static async Task<List<TPair>> ReadPairsAsync<TPair>(SqlCommand cmd, Func<SqlDataReader, TPair?> parseRow)
        where TPair : class
    {
        var parsed = new List<TPair>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            var pair = parseRow(reader);
            if (pair != null)
            {
                parsed.Add(pair);
            }
        }

        return parsed;
    }
}
/// <summary>
/// Detects parameter-sensitive cached plans: a single query_plan_hash whose min and max
/// per-execution worker time differ by at least 10x. Reads collect.query_stats for the
/// analysis window, keeps the latest row per (database, query_hash, query_plan_hash),
/// and emits one aggregate PARAMETER_SENSITIVITY fact whose value is the highest ratio.
/// The query returns at most 20 rows, so offender_count is capped at 20.
/// The min_*/max_* columns are described by the original author as cumulative over the
/// plan's cached lifetime, so the finding reads as "this plan, active now, has a history
/// of widely varying cost".
/// Failures are logged and swallowed; no fact is added when no row qualifies.
/// </summary>
/// <param name="context">Supplies the analysis window and the ServerId stamped on the fact.</param>
/// <param name="facts">List the fact is appended to.</param>
private async Task CollectParameterSensitivityFactsAsync(AnalysisContext context, List<Fact> facts)
{
    const string sql = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

WITH latest AS
(
    SELECT
        query_hash,
        query_plan_hash,
        database_name,
        execution_count,
        creation_time,
        min_worker_time,
        max_worker_time,
        min_grant_kb,
        max_grant_kb,
        min_spills,
        max_spills,
        ROW_NUMBER() OVER
        (
            PARTITION BY database_name, query_hash, query_plan_hash
            ORDER BY collection_time DESC
        ) AS rn
    FROM collect.query_stats
    WHERE collection_time >= @startTime
    AND collection_time <= @endTime
    AND execution_count_delta > 0
)
SELECT
    min_worker_time,
    max_worker_time,
    CAST(max_worker_time AS float) / NULLIF(min_worker_time, 0) AS worker_ratio,
    CAST(max_grant_kb AS float) / NULLIF(min_grant_kb, 0) AS grant_ratio,
    CASE WHEN max_spills > 0 AND min_spills = 0 THEN 1 ELSE 0 END AS spill_divergence
FROM latest
WHERE rn = 1
AND min_worker_time >= 10000
AND max_worker_time >= 250000
AND execution_count >= 20
AND creation_time <= @startTime
AND CAST(max_worker_time AS float) / NULLIF(min_worker_time, 0) >= 10
ORDER BY worker_ratio DESC
OFFSET 0 ROWS FETCH NEXT 20 ROWS ONLY";

    try
    {
        using var conn = new SqlConnection(_connectionString);
        await conn.OpenAsync();

        using var command = conn.CreateCommand();
        command.CommandText = sql;
        command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var offenderCount = 0;
        var worst = (MinWorkerUs: 0L, MaxWorkerUs: 0L, Ratio: 0.0, GrantRatio: 0.0, SpillDivergence: 0);

        using var reader = await command.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            offenderCount++;
            if (offenderCount > 1)
            {
                // Rows are ordered by worker_ratio DESC: only the first is captured, the rest are counted.
                continue;
            }

            worst = (
                MinWorkerUs: reader.IsDBNull(0) ? 0L : Convert.ToInt64(reader.GetValue(0)),
                MaxWorkerUs: reader.IsDBNull(1) ? 0L : Convert.ToInt64(reader.GetValue(1)),
                Ratio: reader.IsDBNull(2) ? 0.0 : Convert.ToDouble(reader.GetValue(2)),
                GrantRatio: reader.IsDBNull(3) ? 0.0 : Convert.ToDouble(reader.GetValue(3)),
                SpillDivergence: reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)));
        }

        if (offenderCount == 0)
        {
            return;
        }

        var metadata = new Dictionary<string, double>
        {
            ["offender_count"] = offenderCount,
            ["worst_ratio"] = worst.Ratio,
            ["worst_min_worker_us"] = worst.MinWorkerUs,
            ["worst_max_worker_us"] = worst.MaxWorkerUs,
            ["worst_grant_ratio"] = worst.GrantRatio,
            // Flag is 1 when the worst plan's max/min grant ratio is at least 5.
            ["grant_divergence"] = worst.GrantRatio >= 5 ? 1 : 0,
            ["spill_divergence"] = worst.SpillDivergence
        };

        facts.Add(new Fact
        {
            Source = "queries",
            Key = "PARAMETER_SENSITIVITY",
            Value = worst.Ratio,
            ServerId = context.ServerId,
            Metadata = metadata
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectParameterSensitivityFactsAsync failed", ex);
    }
}
/// <summary>
/// Detects plan regressions: a query whose most recently executed plan costs at least
/// 2x (CPU or duration per execution, whichever ratio is larger) the plan with the
/// lowest CPU per execution for the same query. Sourced from collect.query_store_data
/// and emitted as one aggregate PLAN_REGRESSION fact describing the worst offender.
/// Unlike collectors that window on collection_time, this one filters on
/// server_last_execution_time starting 14 days before context.TimeRangeStart.
/// The query returns at most 20 rows, so offender_count is capped at 20.
/// Failures are logged and swallowed; no fact is added when no row qualifies.
/// </summary>
/// <param name="context">Supplies TimeRangeStart and the ServerId stamped on the fact.</param>
/// <param name="facts">List the fact is appended to.</param>
private async Task CollectPlanRegressionFactsAsync(AnalysisContext context, List<Fact> facts)
{
    const string sql = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

WITH deduped AS
(
    -- Collapse incremental re-collections of the same open runtime-stats interval:
    -- keep only the latest collection_time row per logical interval.
    SELECT
        database_name,
        query_id,
        plan_id,
        query_plan_hash,
        count_executions,
        avg_cpu_time,
        avg_duration,
        server_last_execution_time,
        is_forced_plan,
        force_failure_count,
        ROW_NUMBER() OVER
        (
            PARTITION BY database_name, query_id, plan_id, server_first_execution_time
            ORDER BY collection_time DESC
        ) AS rn
    FROM collect.query_store_data
    WHERE execution_type_desc = N'Regular'
    AND server_last_execution_time >= @windowStart
),
plan_agg AS
(
    -- Execution-weighted per-exec cost per plan_id. query_plan_hash is invariant
    -- within a plan_id, so include it in the GROUP BY rather than aggregating it
    -- (MS Learn's MAX page does not list binary/varbinary in the accepted types).
    SELECT
        database_name,
        query_id,
        plan_id,
        query_plan_hash,
        SUM(count_executions) AS execs,
        CASE WHEN SUM(count_executions) > 0
             THEN SUM(avg_cpu_time * count_executions) / NULLIF(SUM(count_executions), 0)
             ELSE 0 END AS cpu_per_exec,
        CASE WHEN SUM(count_executions) > 0
             THEN SUM(avg_duration * count_executions) / NULLIF(SUM(count_executions), 0)
             ELSE 0 END AS dur_per_exec,
        MAX(server_last_execution_time) AS last_exec,
        MAX(CAST(is_forced_plan AS tinyint)) AS is_forced_plan,
        MAX(force_failure_count) AS force_failure_count
    FROM deduped
    WHERE rn = 1
    GROUP BY database_name, query_id, plan_id, query_plan_hash
),
plan_dedup AS
(
    -- Collapse plan_ids that share a query_plan_hash (a recompile can produce an
    -- identical plan under a new plan_id); keep only plans with enough executions.
    SELECT
        database_name,
        query_id,
        query_plan_hash,
        SUM(execs) AS execs,
        CASE WHEN SUM(execs) > 0
             THEN SUM(cpu_per_exec * execs) / NULLIF(SUM(execs), 0)
             ELSE 0 END AS cpu_per_exec,
        CASE WHEN SUM(execs) > 0
             THEN SUM(dur_per_exec * execs) / NULLIF(SUM(execs), 0)
             ELSE 0 END AS dur_per_exec,
        MAX(last_exec) AS last_exec,
        MAX(is_forced_plan) AS is_forced_plan,
        MAX(force_failure_count) AS force_failure_count
    FROM plan_agg
    GROUP BY database_name, query_id, query_plan_hash
    HAVING SUM(execs) >= 25
),
ranked AS
(
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY last_exec DESC) AS recency,
        ROW_NUMBER() OVER (PARTITION BY database_name, query_id ORDER BY cpu_per_exec ASC) AS cheapness
    FROM plan_dedup
),
compared AS
(
    -- Latest active plan vs the best-performing plan for the same query.
    SELECT
        l.query_id,
        l.cpu_per_exec AS latest_cpu,
        l.dur_per_exec AS latest_dur,
        l.is_forced_plan AS latest_is_forced,
        l.force_failure_count AS force_failure_count,
        b.cpu_per_exec AS best_cpu,
        b.dur_per_exec AS best_dur,
        (SELECT MAX(v)
         FROM (VALUES
             (CAST(l.cpu_per_exec AS float) / NULLIF(b.cpu_per_exec, 0)),
             (CAST(l.dur_per_exec AS float) / NULLIF(b.dur_per_exec, 0))
         ) AS x(v)) AS regression_factor
    FROM ranked AS l
    JOIN ranked AS b
      ON b.database_name = l.database_name
      AND b.query_id = l.query_id
      AND b.cheapness = 1
    WHERE l.recency = 1
    AND l.query_plan_hash <> b.query_plan_hash
)
SELECT
    query_id,
    latest_cpu,
    latest_dur,
    latest_is_forced,
    force_failure_count,
    best_cpu,
    best_dur,
    regression_factor
FROM compared
WHERE regression_factor >= 2
ORDER BY regression_factor DESC
OFFSET 0 ROWS FETCH NEXT 20 ROWS ONLY";

    try
    {
        using var conn = new SqlConnection(_connectionString);
        await conn.OpenAsync();

        using var command = conn.CreateCommand();
        command.CommandText = sql;
        command.Parameters.Add(new SqlParameter("@windowStart", context.TimeRangeStart.AddDays(-14)));

        var offenderCount = 0;
        var worstFactor = 0.0;
        var worstQueryId = 0L;
        var worstLatestCpu = 0.0;
        var worstBestCpu = 0.0;
        var worstDimension = 1; // 1 = cpu, 2 = duration
        var worstLatestForced = 0;
        var worstForceFailures = 0L;

        using var reader = await command.ExecuteReaderAsync();

        // Null-tolerant column accessors: NULL maps to zero.
        long LongAt(int ordinal) => reader.IsDBNull(ordinal) ? 0L : Convert.ToInt64(reader.GetValue(ordinal));
        double DoubleAt(int ordinal) => reader.IsDBNull(ordinal) ? 0.0 : Convert.ToDouble(reader.GetValue(ordinal));

        while (await reader.ReadAsync())
        {
            offenderCount++;
            if (offenderCount > 1)
            {
                // Rows are ordered by regression_factor DESC: only the first is captured, the rest are counted.
                continue;
            }

            var latestDur = DoubleAt(2);
            var bestDur = DoubleAt(6);

            worstQueryId = LongAt(0);
            worstLatestCpu = DoubleAt(1);
            worstLatestForced = !reader.IsDBNull(3) && Convert.ToInt32(reader.GetValue(3)) > 0 ? 1 : 0;
            worstForceFailures = LongAt(4);
            worstBestCpu = DoubleAt(5);
            worstFactor = DoubleAt(7);

            // Recompute both ratios client-side to record which dimension drove the factor.
            var cpuRatio = worstBestCpu > 0 ? worstLatestCpu / worstBestCpu : 0.0;
            var durRatio = bestDur > 0 ? latestDur / bestDur : 0.0;
            worstDimension = cpuRatio >= durRatio ? 1 : 2;
        }

        if (offenderCount == 0)
        {
            return;
        }

        var metadata = new Dictionary<string, double>
        {
            ["offender_count"] = offenderCount,
            ["worst_regression_factor"] = worstFactor,
            ["worst_query_id"] = worstQueryId,
            ["latest_cpu_per_exec_us"] = worstLatestCpu,
            ["best_cpu_per_exec_us"] = worstBestCpu,
            ["regressed_dimension"] = worstDimension,
            ["latest_is_forced"] = worstLatestForced,
            ["force_failure_count"] = worstForceFailures
        };

        facts.Add(new Fact
        {
            Source = "queries",
            Key = "PLAN_REGRESSION",
            Value = worstFactor,
            ServerId = context.ServerId,
            Metadata = metadata
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectPlanRegressionFactsAsync failed", ex);
    }
}

/// <summary>
/// Reconstructs blocking chains from collect.blocking_BlockedProcessReport (rows with
/// activity = 'blocked') and emits one aggregate BLOCKING_CHAIN fact describing the
/// first chain returned by the reconstructor: apex spid, depth, victim count, max wait,
/// plus overall maxima and the reconstructor's truncation/cycle flags.
/// The report XML is parsed at analysis time; a row the parser returns null for is skipped.
/// Failures are logged and swallowed; no fact is added when no row parses or no chain is returned.
/// </summary>
/// <remarks>
/// NOTE(review): Chains[0] is treated as the worst chain; that ordering is decided by
/// BlockingChainReconstructor, which is not visible here — confirm.
/// </remarks>
/// <param name="context">Supplies the analysis window and the ServerId stamped on the fact.</param>
/// <param name="facts">List the fact is appended to.</param>
private async Task CollectBlockingChainFactsAsync(AnalysisContext context, List<Fact> facts)
{
    const int maxPairs = 5000;
    const int maxDepth = 50;
    const int stepBudget = 100_000;

    // The original author notes that ordering by collection_time DESC avoids a sort and that
    // event_time acts as a residual predicate — NOTE(review): index layout is not visible here.
    const string sql = @"
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;

SELECT TOP (5000)
    event_time,
    database_name,
    blocked_process_report_xml,
    wait_time_ms,
    lock_mode,
    last_transaction_started,
    spid
FROM collect.blocking_BlockedProcessReport
WHERE collection_time >= @collectionWindow
AND event_time >= @startTime
AND event_time <= @endTime
AND activity = 'blocked'
ORDER BY collection_time DESC";

    try
    {
        using var conn = new SqlConnection(_connectionString);
        await conn.OpenAsync();

        using var command = conn.CreateCommand();
        command.CommandText = sql;

        // collection_time lower bound is one hour before the window start; event_time bounds the window itself.
        command.Parameters.Add(new SqlParameter("@collectionWindow", context.TimeRangeStart.AddHours(-1)));
        command.Parameters.Add(new SqlParameter("@startTime", context.TimeRangeStart));
        command.Parameters.Add(new SqlParameter("@endTime", context.TimeRangeEnd));

        var pairs = await ReadPairsAsync(command, row =>
        {
            DateTime eventTime = row.IsDBNull(0) ? default : row.GetDateTime(0);
            string dbName = row.IsDBNull(1) ? string.Empty : row.GetString(1);
            string xml = row.IsDBNull(2) ? string.Empty : row.GetSqlXml(2).Value;
            long waitMs = row.IsDBNull(3) ? 0L : Convert.ToInt64(row.GetValue(3));
            string lockMode = row.IsDBNull(4) ? string.Empty : row.GetString(4);
            DateTime? tranStarted = row.IsDBNull(5) ? (DateTime?)null : row.GetDateTime(5);
            int spid = row.IsDBNull(6) ? 0 : Convert.ToInt32(row.GetValue(6));

            return BlockedProcessXmlParser.Parse(xml, eventTime, dbName, waitMs, lockMode, tranStarted, spid);
        });

        if (pairs.Count == 0)
        {
            return;
        }

        var reconstruction = BlockingChainReconstructor.Reconstruct(pairs, maxDepth, maxPairs, stepBudget);
        var chains = reconstruction.Chains;
        if (chains.Count == 0)
        {
            return;
        }

        var worst = chains[0];

        var metadata = new Dictionary<string, double>
        {
            ["worst_chain_depth"] = worst.Depth,
            ["worst_chain_victim_count"] = worst.VictimCount,
            ["worst_apex_spid"] = worst.ApexSpid,
            ["worst_apex_sleeping"] = worst.ApexSleeping ? 1 : 0,
            ["worst_chain_max_wait_ms"] = worst.MaxWaitMs,
            ["total_reconstructed_chains"] = chains.Count,
            ["deepest_chain_overall"] = chains.Max(c => c.Depth),
            ["max_victim_count_overall"] = chains.Max(c => c.VictimCount),
            ["depth_capped"] = reconstruction.DepthCapped ? 1 : 0,
            ["traversal_truncated"] = reconstruction.TraversalTruncated ? 1 : 0,
            ["cycle_detected"] = reconstruction.CycleDetected ? 1 : 0
        };

        facts.Add(new Fact
        {
            Source = "blocking",
            Key = "BLOCKING_CHAIN",
            Value = worst.Depth,
            ServerId = context.ServerId,
            Metadata = metadata
        });
    }
    catch (Exception ex)
    {
        Logger.Error("SqlServerFactCollector.CollectBlockingChainFactsAsync failed", ex);
    }

    // Runs the command and keeps every row the parser accepts (non-null result).
    // Generic so the element type follows whatever BlockedProcessXmlParser.Parse returns.
    static async Task<List<TPair>> ReadPairsAsync<TPair>(SqlCommand cmd, Func<SqlDataReader, TPair?> parseRow)
        where TPair : class
    {
        var parsed = new List<TPair>();
        using var reader = await cmd.ExecuteReaderAsync();
        while (await reader.ReadAsync())
        {
            var pair = parseRow(reader);
            if (pair != null)
            {
                parsed.Add(pair);
            }
        }

        return parsed;
    }
}
/// These queries don't necessarily cause server-level symptoms but waste resources diff --git a/Dashboard/Analysis/SqlServerFindingStore.cs b/Dashboard/Analysis/SqlServerFindingStore.cs index 0fd0d73e..3d9ceb8d 100644 --- a/Dashboard/Analysis/SqlServerFindingStore.cs +++ b/Dashboard/Analysis/SqlServerFindingStore.cs @@ -123,7 +123,9 @@ public async Task> SaveFindingsAsync( RootFactValue = story.RootFactValue, LeafFactKey = story.LeafFactKey, LeafFactValue = story.LeafFactValue, - FactCount = story.FactCount + FactCount = story.FactCount, + // Carried in-memory only; no analysis_findings column for it. + RootFactMetadata = story.RootFactMetadata }; await InsertFindingAsync(finding); diff --git a/Dashboard/Mcp/McpAnalysisTools.cs b/Dashboard/Mcp/McpAnalysisTools.cs index c20db796..0a33516e 100644 --- a/Dashboard/Mcp/McpAnalysisTools.cs +++ b/Dashboard/Mcp/McpAnalysisTools.cs @@ -39,7 +39,7 @@ public static async Task AnalyzeServer( try { var analysisService = CreateAnalysisService(resolved.Value.Service); - var serverId = resolved.Value.ServerName.GetHashCode(); + var serverId = ServerIdHelper.GetDeterministicHashCode(resolved.Value.ServerName); var findings = await analysisService.AnalyzeAsync(serverId, resolved.Value.ServerName, hours_back); if (analysisService.InsufficientDataMessage != null) @@ -116,7 +116,7 @@ public static async Task GetAnalysisFacts( try { var analysisService = CreateAnalysisService(resolved.Value.Service); - var serverId = resolved.Value.ServerName.GetHashCode(); + var serverId = ServerIdHelper.GetDeterministicHashCode(resolved.Value.ServerName); var facts = await analysisService.CollectAndScoreFactsAsync(serverId, resolved.Value.ServerName, hours_back); if (facts.Count == 0) @@ -196,7 +196,7 @@ public static async Task CompareAnalysis( try { var analysisService = CreateAnalysisService(resolved.Value.Service); - var serverId = resolved.Value.ServerName.GetHashCode(); + var serverId = 
ServerIdHelper.GetDeterministicHashCode(resolved.Value.ServerName); var now = DateTime.UtcNow; var comparisonStart = now.AddHours(-hours_back); @@ -261,7 +261,7 @@ public static async Task AuditConfig( try { var analysisService = CreateAnalysisService(resolved.Value.Service); - var serverId = resolved.Value.ServerName.GetHashCode(); + var serverId = ServerIdHelper.GetDeterministicHashCode(resolved.Value.ServerName); var facts = await analysisService.CollectAndScoreFactsAsync(serverId, resolved.Value.ServerName, 1); var factsByKey = facts.ToDictionary(f => f.Key, f => f); @@ -331,7 +331,7 @@ public static async Task GetAnalysisFindings( try { var analysisService = CreateAnalysisService(resolved.Value.Service); - var serverId = resolved.Value.ServerName.GetHashCode(); + var serverId = ServerIdHelper.GetDeterministicHashCode(resolved.Value.ServerName); var findings = await analysisService.GetRecentFindingsAsync(serverId, hours_back); if (findings.Count == 0) @@ -372,7 +372,7 @@ public static async Task MuteAnalysisFinding( try { var analysisService = CreateAnalysisService(resolved.Value.Service); - var serverId = resolved.Value.ServerName.GetHashCode(); + var serverId = ServerIdHelper.GetDeterministicHashCode(resolved.Value.ServerName); var finding = new AnalysisFinding { ServerId = serverId, StoryPathHash = story_path_hash, StoryPath = story_path_hash }; await analysisService.MuteFindingAsync(finding, reason); @@ -459,6 +459,50 @@ public static System.Collections.Generic.List GetForStoryPath(string sto return result; } + + private static readonly string[] DayNames = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]; + + /// + /// Formats baseline context from anomaly fact metadata into a structured dictionary + /// (deviation, ratio, baseline mean/stddev, time bucket, tier, samples). Used by MCP + /// output and by the analysis notification path's finding formatter. + /// + internal static System.Collections.Generic.Dictionary? 
FormatBaselineContext( + System.Collections.Generic.Dictionary metadata) + { + var result = new System.Collections.Generic.Dictionary(); + + if (metadata.TryGetValue("deviation_sigma", out var sigma)) + result["deviation"] = $"{sigma:F1}σ"; + + if (metadata.TryGetValue("ratio", out var ratio)) + result["ratio"] = $"{ratio:F1}x"; + + if (metadata.TryGetValue("baseline_mean", out var mean)) + result["baseline_mean"] = Math.Round(mean, 2); + + if (metadata.TryGetValue("baseline_mean_ms", out var meanMs)) + result["baseline_mean"] = Math.Round(meanMs, 2); + + if (metadata.TryGetValue("baseline_stddev", out var stddev)) + result["baseline_stddev"] = Math.Round(stddev, 2); + + if (metadata.TryGetValue("baseline_hour", out var hour) && + metadata.TryGetValue("baseline_dow", out var dow)) + { + var dowIdx = (int)dow; + var dayName = dowIdx >= 0 && dowIdx < DayNames.Length ? DayNames[dowIdx] : "?"; + result["bucket"] = hour >= 0 ? $"{dayName} {(int)hour:00}:00" : "flat"; + } + + if (metadata.TryGetValue("baseline_tier", out var tier)) + result["tier"] = tier switch { 0 => "full", 1 => "hour_only", _ => "flat" }; + + if (metadata.TryGetValue("baseline_samples", out var samples)) + result["baseline_samples"] = (int)samples; + + return result.Count > 0 ? result : null; + } } internal sealed record ToolRecommendation( diff --git a/Dashboard/Services/ServerIdHelper.cs b/Dashboard/Services/ServerIdHelper.cs new file mode 100644 index 00000000..5c86bec7 --- /dev/null +++ b/Dashboard/Services/ServerIdHelper.cs @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2026 Erik Darling, Darling Data LLC + * + * This file is part of the SQL Server Performance Monitor. + * + * Licensed under the MIT License. See LICENSE file in the project root for full license information. + */ + +namespace PerformanceMonitorDashboard.Services; + +/// +/// Deterministic int server-id derivation used by the analysis pipeline. 
+/// +/// +/// string.GetHashCode() is randomized per process on .NET Core / .NET 10, +/// so persisted rows in config.analysis_findings and config.analysis_muted +/// would not match the next launch's value for the same server name. This helper +/// produces a stable FNV-1a hash so writes survive restart and are consistent across +/// the MCP entry points and any scheduled-analysis path. +/// +/// +internal static class ServerIdHelper +{ + /// + /// Process-independent FNV-1a hash of a string. Identical implementation to + /// Lite/Services/RemoteCollectorService.GetDeterministicHashCode so + /// Dashboard and Lite produce the same id for the same server name. + /// + public static int GetDeterministicHashCode(string value) + { + unchecked + { + var hash = (int)2166136261; + foreach (var c in value) + { + hash = (hash ^ c) * 16777619; + } + return hash; + } + } +} From 1f08b5bb879dbfb69b9dfde9782cc427f1d75fe4 Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Tue, 26 May 2026 12:12:42 -0400 Subject: [PATCH 2/2] Wire scheduled analysis + finding notifications (Dashboard Stage 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR (b) of the Dashboard port — drops on top of c0cb2d5. Closes the "act" stage: high-severity findings produced by the engine in PR (a) now ride the same email/Slack/Teams channels (and Alerts history tab) as the existing threshold alerts, on a configurable schedule. Scheduler (Dashboard/Services/AnalysisScheduler.cs, new): - Owns its own DispatcherTimer — separate cadence and gating from _alertCheckTimer. Gated by AnalysisNotificationsEnabled; zero cost when off. - Per-server fresh AnalysisService (IsAnalyzing is a single-instance flag, so a shared instance whose task is abandoned on timeout would block every other server). 
- ConcurrentDictionary in-flight tracking, cleared only on real task completion via ContinueWith — a permanently-hung server is parked, not relaunched. Accepted limitation, matches Lite Stage 2. - Task.WhenAny per-server timeout with a linked CTS so the orphaned Task.Delay timer is cancelled the moment analyze wins the race. - _cycleRunning re-entrancy guard against the timer firing again while a slow cycle is still in progress. - Stop() cancels the CTS so shutdown drops out cleanly instead of waiting out the full per-server timeout window. Notifier + formatter (Dashboard/Services/AnalysisNotificationService.cs, new — sealed, with FindingMessageFormatter as internal static in the same file): - {serverId}:{StoryPathHash} cooldown so a recurring finding does not re-notify every analysis cycle. In-memory only; lost on app restart (accepted limitation, matches Lite). - Diagnosis routed into structured AlertContext.Details rows (Story, Severity, Notify threshold, Confidence, Facts, Database, Window). Dashboard's TrySendAlertEmailAsync has no detailText parameter, so the structured route replaces Lite's detailText payload — renders inside the same 600px email template the threshold alerts use. - BuildContext walks finding.DrillDown values as JsonElement via a System.Text.Json round-trip — robust to any drill-down shape. - ServerId lookup: resolve ServerConnection.Id by ServerName for the alert log so keys line up with the existing threshold-alert engine (code-reviewer caught this). Falls back to the stable int id if the server was removed between cycle start and notify. - RecordAlert fallback when no channel is configured: round-3 review caught a webhook-flag-on-with-URL-absent silent-drop. The fallback predicate now requires both the prefs flag AND the URL via WebhookAlertService.GetTeamsWebhookUrl / GetSlackWebhookUrl so the Alerts history tab still logs findings when no channel can send. 
Documents the asymmetric-cooldown limitation (SMTP cooldown can suppress a RecordAlert that the analysis cooldown would have allowed) as accepted. - FindingMessageFormatter stays static — notifyThreshold is threaded through as a method parameter rather than read inline (Dashboard has no App.* statics like Lite). UI + settings (Dashboard/Models/UserPreferences.cs + Dashboard/SettingsWindow.xaml + .xaml.cs): - Five new properties: AnalysisNotificationsEnabled (false), AnalysisIntervalMinutes (30), AnalysisNotifySeverity (1.5), AnalysisNotifyCooldownMinutes (360), AnalysisTimeoutSeconds (120). - Bounds enforced at consumption (AnalysisScheduler clamps interval 5-360 in Configure and timeout 30-600 in RunCycleAsync; NotifyAsync clamps severity 0-2 and cooldown 30-10080). UI also validates the three exposed values with the same range constants. Cooldown and timeout are intentionally preferences.json-only. - "Automated Analysis Notifications" group added after the MCP section on the General tab — enable checkbox + interval (minutes) + severity threshold (0.0-2.0). - Ships disabled. MainWindow wiring (Dashboard/MainWindow.xaml.cs): - Two new fields: _analysisNotificationService and _analysisScheduler. Names chosen to avoid colliding with _notificationService (tray service constructed in MainWindow_Loaded). - Constructed in the ctor right after _alertCheckTimer — all deps (_serverManager, _credentialService, _preferencesService, _emailAlertService) exist by that point. - _analysisScheduler.Configure() called in MainWindow_Loaded after ConfigureAlertCheckTimer, and again on settings save — paralleling the existing _alertCheckTimer pattern so a toggle in Settings takes immediate effect without restart. - _analysisScheduler.Stop() in MainWindow_Closing. EmailTemplateBuilder.cs header rename: - Hard-coded "RECENT EVENTS" was misleading for both threshold alerts (the section already carried non-event content like sessions and TempDB metrics) and findings. 
Renamed to "DETAILS" / "--- Details ---" in both the HTML and plain-text bodies (round-3 review caught the plain-text occurrence the round-2 patch missed). Webhook rendering is unaffected — it already flattened AlertContext.Details without using this header. Verification: - dotnet build Dashboard/Dashboard.csproj -c Debug: clean (0 warnings, 0 errors). - code-reviewer pass: surfaced ServerId-format-mismatch and a few minor items; the real bug and two minor cleanups (linked CTS, Stop() on close, asymmetric-cooldown comment) applied. - avalonia-gotcha-reviewer: confirmed pure WPF, no Avalonia traps. Three Lite stages now have full Dashboard equivalents through both PR (a) and PR (b). The original four problem scenarios — long blocking chains, parameter sensitivity / plan regressions, resource contention including the compile-wait fact — are detected by both apps and routed into the notification surface on both apps. Co-Authored-By: Claude Opus 4.7 (1M context) --- Dashboard/MainWindow.xaml.cs | 22 ++ Dashboard/Models/UserPreferences.cs | 10 + .../Services/AnalysisNotificationService.cs | 353 ++++++++++++++++++ Dashboard/Services/AnalysisScheduler.cs | 199 ++++++++++ Dashboard/Services/EmailTemplateBuilder.cs | 4 +- Dashboard/SettingsWindow.xaml | 35 ++ Dashboard/SettingsWindow.xaml.cs | 31 ++ 7 files changed, 652 insertions(+), 2 deletions(-) create mode 100644 Dashboard/Services/AnalysisNotificationService.cs create mode 100644 Dashboard/Services/AnalysisScheduler.cs diff --git a/Dashboard/MainWindow.xaml.cs b/Dashboard/MainWindow.xaml.cs index 2591e5f6..d218edeb 100644 --- a/Dashboard/MainWindow.xaml.cs +++ b/Dashboard/MainWindow.xaml.cs @@ -65,6 +65,13 @@ public partial class MainWindow : Window private readonly DispatcherTimer _alertCheckTimer; private readonly EmailAlertService _emailAlertService; private readonly CredentialService _credentialService; + + // Scheduled analysis-finding notifications — separate cadence and gating from + // the threshold-alert 
engine above. Owns its own DispatcherTimer internally; + // re-Configured after every settings save. Field name avoids colliding with + // _notificationService (the tray-notification service constructed in Loaded). + private readonly AnalysisNotificationService _analysisNotificationService; + private readonly AnalysisScheduler _analysisScheduler; private readonly ConcurrentDictionary _lastBlockingAlert = new(); private readonly ConcurrentDictionary _lastDeadlockAlert = new(); private readonly ConcurrentDictionary _lastHighCpuAlert = new(); @@ -111,6 +118,14 @@ public MainWindow() _alertCheckTimer = new DispatcherTimer(); _alertCheckTimer.Tick += AlertCheckTimer_Tick; + /* Scheduled analysis-finding notifications. Constructed alongside the + alert engine (all dependencies exist by this point); started by + _analysisScheduler.Configure() in MainWindow_Loaded. */ + _analysisNotificationService = new AnalysisNotificationService( + _emailAlertService, _preferencesService, _serverManager); + _analysisScheduler = new AnalysisScheduler( + _serverManager, _credentialService, _preferencesService, _analysisNotificationService); + _displayRefreshTimer = new DispatcherTimer { Interval = TimeSpan.FromSeconds(30) @@ -169,6 +184,7 @@ private async void MainWindow_Loaded(object sender, RoutedEventArgs e) LoadSidebarState(); ConfigureConnectionStatusTimer(); ConfigureAlertCheckTimer(); + _analysisScheduler.Configure(); UpdateAlertBadge(); StartMcpServerIfEnabled(); @@ -318,6 +334,11 @@ private void MainWindow_Closing(object? sender, System.ComponentModel.CancelEven try { Task.Run(StopMcpServerAsync).Wait(TimeSpan.FromSeconds(10)); } catch { /* shutdown best-effort */ } + // Stop the scheduled-analysis timer + cancel its in-flight cycle so the + // per-server Task.Delay timers can drop out cleanly instead of waiting + // out their full timeout during shutdown. 
+ _analysisScheduler?.Stop(); + // Save alert history to disk _emailAlertService?.SaveAlertLog(); @@ -1157,6 +1178,7 @@ private void Settings_Click(object sender, RoutedEventArgs e) { ConfigureConnectionStatusTimer(); ConfigureAlertCheckTimer(); + _analysisScheduler.Configure(); _landingPage?.RefreshAutoRefreshSettings(); foreach (TabItem tab in ServerTabControl.Items) diff --git a/Dashboard/Models/UserPreferences.cs b/Dashboard/Models/UserPreferences.cs index 6b1de362..4b128617 100644 --- a/Dashboard/Models/UserPreferences.cs +++ b/Dashboard/Models/UserPreferences.cs @@ -131,6 +131,16 @@ public int EmailCooldownMinutes public bool McpEnabled { get; set; } = false; public int McpPort { get; set; } = 5150; + // Automated analysis notifications (Stage 2) + // Bounds are enforced where these are consumed (the scheduler and the + // notification service), not here — keeps the prefs surface simple and + // lets clamps be visible at the consumption sites. + public bool AnalysisNotificationsEnabled { get; set; } = false; + public int AnalysisIntervalMinutes { get; set; } = 30; + public double AnalysisNotifySeverity { get; set; } = 1.5; + public int AnalysisNotifyCooldownMinutes { get; set; } = 360; + public int AnalysisTimeoutSeconds { get; set; } = 120; + // CSV export settings public string CsvSeparator { get; set; } = GetDefaultCsvSeparator(); diff --git a/Dashboard/Services/AnalysisNotificationService.cs b/Dashboard/Services/AnalysisNotificationService.cs new file mode 100644 index 00000000..424e0262 --- /dev/null +++ b/Dashboard/Services/AnalysisNotificationService.cs @@ -0,0 +1,353 @@ +/* + * Performance Monitor Dashboard + * Copyright (c) 2026 Darling Data, LLC + * Licensed under the MIT License - see LICENSE file for details + */ + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; +using PerformanceMonitorDashboard.Analysis; +using 
PerformanceMonitorDashboard.Helpers; +using PerformanceMonitorDashboard.Interfaces; +using PerformanceMonitorDashboard.Mcp; + +namespace PerformanceMonitorDashboard.Services +{ + /// + /// Routes high-severity analysis findings into the notification channels. + /// Filters by severity, dedups per finding (so a recurring finding does not + /// re-notify every analysis cycle), composes a structured message, and hands off + /// to — which fans out to email + Slack + Teams + /// and logs to alert_history.json. + /// + /// + /// Called by with the findings returned by each + /// completed AnalysisService.AnalyzeAsync. Never throws. + /// + /// + public sealed class AnalysisNotificationService + { + private readonly EmailAlertService _emailAlertService; + private readonly IUserPreferencesService _preferencesService; + private readonly IServerManager _serverManager; + + /// + /// Per-finding re-notification cooldown, keyed "{serverId}:{StoryPathHash}". + /// In-memory only — lost on app restart (each active finding then re-notifies once). + /// Never evicts; bounded by servers × distinct story patterns. + /// + private readonly ConcurrentDictionary _cooldowns = new(); + + public AnalysisNotificationService( + EmailAlertService emailAlertService, + IUserPreferencesService preferencesService, + IServerManager serverManager) + { + _emailAlertService = emailAlertService; + _preferencesService = preferencesService; + _serverManager = serverManager; + } + + /// + /// Notifies on every finding at or above the configured severity that is not + /// inside its re-notification cooldown. Never throws. + /// + public async Task NotifyAsync(IReadOnlyList findings) + { + if (findings is null || findings.Count == 0) + return; + + var prefs = _preferencesService.GetPreferences(); + // Bounds enforced at consumption — keeps the prefs surface simple. 
+ var threshold = Math.Clamp(prefs.AnalysisNotifySeverity, 0.0, 2.0); + var cooldownMinutes = Math.Clamp(prefs.AnalysisNotifyCooldownMinutes, 30, 10080); + var cooldown = TimeSpan.FromMinutes(cooldownMinutes); + var now = DateTime.UtcNow; + + // Round-2/3 review: webhook-flag-on-with-URL-absent silently drops the alert + // (WebhookAlertService short-circuits at the URL check). Require both the flag + // AND the URL to count a channel as "attempted", so the fallback RecordAlert + // fires when no channel can actually send. + // + // Known asymmetry: emailWouldLog is a *configuration* check, not a *delivery* + // check. If SMTP is configured but inside its EmailCooldownMinutes window for + // (serverId, metricName), TrySendAlertEmailAsync silently skips RecordAlert. + // With default cooldowns (15-minute SMTP, 360-minute analysis) this is hard + // to hit — the analysis per-finding cooldown is the much longer of the two. + // A user who lowers AnalysisNotifyCooldownMinutes below EmailCooldownMinutes + // can lose alert-history rows during the SMTP cooldown window; accepted. 
+ var emailWouldLog = + prefs.SmtpEnabled + && !string.IsNullOrWhiteSpace(prefs.SmtpServer) + && !string.IsNullOrWhiteSpace(prefs.SmtpFromAddress) + && !string.IsNullOrWhiteSpace(prefs.SmtpRecipients); + var webhooksAttempted = + (prefs.TeamsWebhookEnabled && !string.IsNullOrWhiteSpace(WebhookAlertService.GetTeamsWebhookUrl())) + || (prefs.SlackWebhookEnabled && !string.IsNullOrWhiteSpace(WebhookAlertService.GetSlackWebhookUrl())); + + foreach (var finding in findings) + { + if (finding.Severity < threshold) + continue; + + var key = $"{finding.ServerId}:{finding.StoryPathHash}"; + if (_cooldowns.TryGetValue(key, out var last) && now - last < cooldown) + continue; + + try + { + var metricName = FindingMessageFormatter.MetricName(finding); + var currentValue = FindingMessageFormatter.CurrentValue(finding); + var thresholdDisplay = threshold.ToString("F1"); + var context = FindingMessageFormatter.BuildContext(finding, threshold); + + /* Use the matching ServerConnection.Id (GUID string) when we can find + it — keeps alert_history.json keys consistent with the threshold-alert + engine. Fall back to the finding's stable int id (as a string) if the + lookup misses (server removed between cycle start and notify). */ + var matchedServer = _serverManager.GetAllServers() + .FirstOrDefault(s => string.Equals(s.ServerName, finding.ServerName, StringComparison.OrdinalIgnoreCase)); + var serverId = matchedServer?.Id ?? finding.ServerId.ToString(); + + /* TrySendAlertEmailAsync fans out to email + Slack + Teams and records + an alert log row for each channel that actually fired. Returns no + success/failure signal — cooldown is stamped regardless. */ + await _emailAlertService.TrySendAlertEmailAsync( + metricName, + finding.ServerName, + currentValue, + thresholdDisplay, + serverId, + context); + + /* Fallback: when neither channel is configured to log, EmailAlertService + never calls RecordAlert and the finding silently disappears from the + Alerts tab. 
Log it ourselves so the history is complete regardless of + SMTP/webhook configuration. */ + if (!emailWouldLog && !webhooksAttempted) + { + _emailAlertService.RecordAlert( + serverId, + finding.ServerName, + metricName, + currentValue, + thresholdDisplay, + alertSent: false, + notificationType: "tray", + muted: false, + detailText: FindingMessageFormatter.PlainTextDiagnosis(finding, threshold)); + } + + _cooldowns[key] = now; + } + catch (Exception ex) + { + /* TrySendAlertEmailAsync is documented never to throw; this guards a + formatter defect so one bad finding cannot abort the rest. */ + Logger.Error( + $"AnalysisNotificationService: failed to notify on finding {finding.StoryPathHash}: {ex.GetType().Name}: {ex.Message}"); + } + } + } + } + + /// + /// Composes the arguments for an analysis-finding notification. The engine never + /// populates , so the readable message is + /// built here from the finding's structured fields and drill-down detail. + /// + /// + /// Static — Dashboard reads settings via , + /// not App.* statics like Lite, so notifyThreshold is threaded + /// through as a method parameter rather than read inline. + /// + /// + internal static class FindingMessageFormatter + { + private const int FieldValueLimit = 300; + + /// + /// Alert metric name. The "Analysis: " prefix groups these in the Alerts tab; the + /// short hash suffix makes each distinct finding unique, so EmailAlertService's own + /// {serverId}:{metricName} cooldown cannot collapse two findings sharing a Category. + /// + public static string MetricName(AnalysisFinding finding) + { + var hash = finding.StoryPathHash ?? string.Empty; + var shortHash = hash.Length >= 8 ? hash[..8] : hash; + var category = string.IsNullOrEmpty(finding.Category) ? "finding" : finding.Category; + return $"Analysis: {category} [{shortHash}]"; + } + + /// + /// Headline value — the root fact and its value, plus baseline context for anomaly findings. 
+ /// + public static string CurrentValue(AnalysisFinding finding) + { + var root = string.IsNullOrEmpty(finding.RootFactKey) ? finding.Category : finding.RootFactKey; + var sb = new StringBuilder(root); + + if (finding.RootFactValue.HasValue) + sb.Append($" ({finding.RootFactValue.Value:F1})"); + + if (finding.RootFactMetadata is { Count: > 0 }) + { + var baseline = ToolRecommendations.FormatBaselineContext(finding.RootFactMetadata); + if (baseline is { Count: > 0 }) + { + var parts = baseline.Select(kv => $"{Humanize(kv.Key)} {kv.Value}"); + sb.Append(" — ").Append(string.Join(", ", parts)); + } + } + + return sb.ToString(); + } + + /// + /// Plain-text diagnosis block — used only as the RecordAlert.detailText payload + /// when no notification channel is configured (so the Alerts history tab still shows + /// a searchable summary). Multi-line to match the existing alert-detail TextBox shape. + /// + public static string PlainTextDiagnosis(AnalysisFinding finding, double notifyThreshold) + { + var sb = new StringBuilder(); + sb.AppendLine($" Story: {finding.StoryPath}"); + sb.AppendLine($" Severity: {finding.Severity:F2} (notify threshold {notifyThreshold:F1})"); + sb.AppendLine($" Confidence: {finding.Confidence:F2}"); + sb.AppendLine($" Facts in chain: {finding.FactCount}"); + + if (!string.IsNullOrEmpty(finding.DatabaseName)) + sb.AppendLine($" Database: {finding.DatabaseName}"); + + if (finding.TimeRangeStart.HasValue && finding.TimeRangeEnd.HasValue) + sb.AppendLine($" Window: {finding.TimeRangeStart.Value:u} - {finding.TimeRangeEnd.Value:u}"); + + return sb.ToString().TrimEnd(); + } + + /// + /// Builds the structured for the alert template. + /// First detail item is the Diagnosis summary; subsequent items are the + /// finding's drill-down detail flattened into label/value pairs. 
+ /// + public static AlertContext BuildContext(AnalysisFinding finding, double notifyThreshold) + { + var context = new AlertContext(); + + /* Diagnosis summary — fits inside the 600px email template (label column 120px, value column ~480px). */ + var diagnosis = new AlertDetailItem { Heading = "Diagnosis" }; + diagnosis.Fields.Add(("Story", finding.StoryPath ?? string.Empty)); + diagnosis.Fields.Add(("Severity", finding.Severity.ToString("F2"))); + diagnosis.Fields.Add(("Notify threshold", notifyThreshold.ToString("F1"))); + diagnosis.Fields.Add(("Confidence", finding.Confidence.ToString("F2"))); + diagnosis.Fields.Add(("Facts", finding.FactCount.ToString())); + if (!string.IsNullOrEmpty(finding.DatabaseName)) + diagnosis.Fields.Add(("Database", finding.DatabaseName)); + if (finding.TimeRangeStart.HasValue && finding.TimeRangeEnd.HasValue) + diagnosis.Fields.Add(("Window", $"{finding.TimeRangeStart.Value:u} → {finding.TimeRangeEnd.Value:u}")); + context.Details.Add(diagnosis); + + /* Drill-down values are anonymous types behind object (a bare object, or a + List of them). Round-trip through System.Text.Json and walk as + JsonElement — robust to any shape DrillDownCollector emits. */ + if (finding.DrillDown is { Count: > 0 }) + { + foreach (var (key, value) in finding.DrillDown) + { + if (value is null) + continue; + + var item = new AlertDetailItem { Heading = Humanize(key) }; + try + { + FlattenInto(item.Fields, JsonSerializer.SerializeToElement(value)); + } + catch + { + /* Unexpected value shape — skip this drill-down entry, keep the rest. */ + continue; + } + + if (item.Fields.Count > 0) + context.Details.Add(item); + } + } + + return context; + } + + /// + /// Flattens one drill-down value into label/value field pairs. Arrays are capped at + /// the first 3 elements; nested objects/arrays are rendered as compact JSON. 
+ /// + private static void FlattenInto(List<(string Label, string Value)> fields, JsonElement element) + { + switch (element.ValueKind) + { + case JsonValueKind.Array: + var index = 0; + foreach (var child in element.EnumerateArray()) + { + if (index >= 3) + break; + index++; + + if (child.ValueKind == JsonValueKind.Object) + { + foreach (var prop in child.EnumerateObject()) + fields.Add(($"#{index} {Humanize(prop.Name)}", ScalarText(prop.Value))); + } + else + { + fields.Add(($"#{index}", ScalarText(child))); + } + } + break; + + case JsonValueKind.Object: + foreach (var prop in element.EnumerateObject()) + fields.Add((Humanize(prop.Name), ScalarText(prop.Value))); + break; + + default: + fields.Add(("value", ScalarText(element))); + break; + } + } + + /// Renders a single JSON value as truncated display text. + private static string ScalarText(JsonElement element) + { + return element.ValueKind switch + { + JsonValueKind.String => Truncate(element.GetString() ?? string.Empty), + JsonValueKind.Number => element.GetRawText(), + JsonValueKind.True => "true", + JsonValueKind.False => "false", + JsonValueKind.Null => string.Empty, + // Nested object/array — show compact raw JSON rather than recursing further. + _ => Truncate(element.GetRawText()) + }; + } + + /// Turns a snake_case key into spaced Title Case ("top_blocking_chains" → "Top Blocking Chains"). + private static string Humanize(string key) + { + if (string.IsNullOrEmpty(key)) + return key; + + var words = key.Replace('_', ' ').Split(' ', StringSplitOptions.RemoveEmptyEntries); + return string.Join(' ', words.Select(w => char.ToUpperInvariant(w[0]) + w[1..])); + } + + private static string Truncate(string text) + { + return text.Length <= FieldValueLimit ? 
text : text[..FieldValueLimit] + "…"; + } + } +}
diff --git a/Dashboard/Services/AnalysisScheduler.cs b/Dashboard/Services/AnalysisScheduler.cs
new file mode 100644
index 00000000..3e3c474b
--- /dev/null
+++ b/Dashboard/Services/AnalysisScheduler.cs
@@ -0,0 +1,245 @@
+/*
+ * Performance Monitor Dashboard
+ * Copyright (c) 2026 Darling Data, LLC
+ * Licensed under the MIT License - see LICENSE file for details
+ */
+
+using System;
+using System.Collections.Concurrent;
+using System.Threading;
+using System.Threading.Tasks;
+using System.Windows.Threading;
+using PerformanceMonitorDashboard.Analysis;
+using PerformanceMonitorDashboard.Helpers;
+using PerformanceMonitorDashboard.Interfaces;
+
+namespace PerformanceMonitorDashboard.Services
+{
+    /// <summary>
+    /// Runs the triage engine (<see cref="AnalysisService"/>) for each connected server on
+    /// a configurable interval and routes high-severity findings to
+    /// <see cref="AnalysisNotificationService"/>. Lives next to the threshold-alert timer
+    /// in MainWindow's lifetime; entirely separate cadence and gating.
+    /// </summary>
+    /// <remarks>
+    /// Gated by the <c>AnalysisNotificationsEnabled</c> preference — zero cost
+    /// when off. Per-server in-flight tracking prevents a hung connection from piling
+    /// up overlapping analysis tasks for the same server.
+    /// </remarks>
+    public sealed class AnalysisScheduler
+    {
+        private readonly IServerManager _serverManager;
+        private readonly ICredentialService _credentialService;
+        private readonly IUserPreferencesService _preferencesService;
+        private readonly AnalysisNotificationService _notificationService;
+        private readonly DispatcherTimer _timer;
+
+        /// <summary>
+        /// Server-ids whose previous analysis is still running. A hung connection that
+        /// outlived its timeout would otherwise pile up tasks. The marker is cleared on
+        /// real task completion (in the ContinueWith), never on the timeout fast-path —
+        /// so a permanently-hung server is parked, not relaunched. Accepted limitation:
+        /// matches Lite Stage 2. Only the keys matter; the value is an unused placeholder.
+        /// </summary>
+        private readonly ConcurrentDictionary<int, byte> _inFlight = new();
+
+        /* Cancelled once by Stop() and never replaced. */
+        private readonly CancellationTokenSource _cts = new();
+
+        /* Set while OnTick is awaiting a cycle; makes overlapping ticks return early. */
+        private bool _cycleRunning;
+
+        /// <summary>
+        /// Stores the collaborators and wires <see cref="OnTick"/> to a new, not yet
+        /// started <see cref="DispatcherTimer"/>. Call <see cref="Configure"/> to start it.
+        /// </summary>
+        public AnalysisScheduler(
+            IServerManager serverManager,
+            ICredentialService credentialService,
+            IUserPreferencesService preferencesService,
+            AnalysisNotificationService notificationService)
+        {
+            _serverManager = serverManager;
+            _credentialService = credentialService;
+            _preferencesService = preferencesService;
+            _notificationService = notificationService;
+
+            _timer = new DispatcherTimer();
+            _timer.Tick += OnTick;
+        }
+
+        /// <summary>
+        /// Applies the latest interval from preferences and starts or stops the timer.
+        /// Safe to call repeatedly — call after any settings save.
+        /// </summary>
+        /// <remarks>
+        /// The stored interval is clamped to 5–360 minutes before it is applied.
+        /// </remarks>
+        public void Configure()
+        {
+            var prefs = _preferencesService.GetPreferences();
+
+            if (!prefs.AnalysisNotificationsEnabled)
+            {
+                _timer.Stop();
+                return;
+            }
+
+            var intervalMinutes = Math.Clamp(prefs.AnalysisIntervalMinutes, 5, 360);
+            _timer.Interval = TimeSpan.FromMinutes(intervalMinutes);
+
+            if (!_timer.IsEnabled)
+                _timer.Start();
+        }
+
+        /// <summary>
+        /// Stops the timer and cancels any in-flight cycle. Called on app shutdown.
+        /// </summary>
+        /// <remarks>
+        /// The cancellation source is never replaced, so a cycle started after this call
+        /// exits at its first cancellation check.
+        /// </remarks>
+        public void Stop()
+        {
+            _timer.Stop();
+            try
+            {
+                _cts.Cancel();
+            }
+            catch (ObjectDisposedException)
+            {
+                /* Already disposed — no-op. */
+            }
+        }
+
+        /// <summary>
+        /// Timer tick handler: runs one analysis cycle unless the previous one is still
+        /// in progress. Every exception is logged here instead of escaping the
+        /// <c>async void</c> handler.
+        /// </summary>
+        private async void OnTick(object? sender, EventArgs e)
+        {
+            /* Re-entrancy guard: if the previous cycle still has not finished (e.g. a
+               slow server), don't pile up another cycle on top of it. */
+            if (_cycleRunning)
+                return;
+
+            _cycleRunning = true;
+            try
+            {
+                await RunCycleAsync(_cts.Token);
+            }
+            catch (Exception ex)
+            {
+                Logger.Error($"AnalysisScheduler: cycle failed: {ex.GetType().Name}: {ex.Message}");
+            }
+            finally
+            {
+                _cycleRunning = false;
+            }
+        }
+
+        /// <summary>
+        /// Analyzes every server from <see cref="IServerManager"/> one after another,
+        /// giving each at most the configured timeout (clamped to 30–600 seconds), and
+        /// passes the findings of each analysis that finished in time to the
+        /// notification service.
+        /// </summary>
+        /// <param name="cancellationToken">Ends the cycle at the next check when cancelled.</param>
+        private async Task RunCycleAsync(CancellationToken cancellationToken)
+        {
+            var prefs = _preferencesService.GetPreferences();
+            if (!prefs.AnalysisNotificationsEnabled)
+                return;
+
+            var timeoutSeconds = Math.Clamp(prefs.AnalysisTimeoutSeconds, 30, 600);
+            var timeout = TimeSpan.FromSeconds(timeoutSeconds);
+
+            foreach (var server in _serverManager.GetAllServers())
+            {
+                if (cancellationToken.IsCancellationRequested)
+                    break;
+
+                /* Stable, process-independent server id — matches the MCP entry points
+                   so persisted analysis_findings/analysis_muted rows align. */
+                var serverId = ServerIdHelper.GetDeterministicHashCode(server.ServerName);
+                var displayName = server.DisplayNameWithIntent;
+
+                /* Skip a server whose previous analysis is still running. */
+                if (!_inFlight.TryAdd(serverId, 0))
+                    continue;
+
+                try
+                {
+                    var connectionString = server.GetConnectionString(_credentialService);
+
+                    /* Fresh AnalysisService per server: IsAnalyzing is a single instance
+                       flag, so a shared instance whose task is abandoned on timeout would
+                       block analysis for every other server. */
+                    var planFetcher = new SqlServerPlanFetcher(connectionString);
+                    var analysisService = new AnalysisService(connectionString, planFetcher);
+                    var analyzeTask = analysisService.AnalyzeAsync(serverId, displayName, hoursBack: 4);
+
+                    /* Flipped on the timeout path below. From then on nobody awaits
+                       analyzeTask, so the continuation is the only place left where a
+                       late failure can be observed and logged. */
+                    var abandoned = false;
+
+                    /* Clear the in-flight marker only when the task truly finishes — not
+                       when the timeout below moves us on — so a hung server is not relaunched. */
+                    _ = analyzeTask.ContinueWith(
+                        completed =>
+                        {
+                            _inFlight.TryRemove(serverId, out _);
+
+                            /* Reading Exception marks the fault as observed. Only the
+                               abandoned case is logged here; otherwise the await below
+                               rethrows it into the catch block, which logs it.
+                               NOTE(review): runs on a thread-pool thread — assumes
+                               Logger may be called off the UI thread; confirm. */
+                            if (Volatile.Read(ref abandoned) && completed.Exception is { } fault)
+                            {
+                                var cause = fault.GetBaseException();
+                                Logger.Error(
+                                    $"AnalysisScheduler: abandoned analysis for {displayName} later failed: {cause.GetType().Name}: {cause.Message}");
+                            }
+                        },
+                        TaskScheduler.Default);
+
+                    /* Linked CTS so the Task.Delay timer is cancelled the moment analyze
+                       wins the race — otherwise the orphaned timer lingers for the full
+                       timeout window after we have moved on. */
+                    using var delayCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
+                    var finished = await Task.WhenAny(analyzeTask, Task.Delay(timeout, delayCts.Token));
+                    delayCts.Cancel();
+
+                    if (cancellationToken.IsCancellationRequested)
+                        break;
+
+                    if (finished != analyzeTask)
+                    {
+                        Volatile.Write(ref abandoned, true);
+                        Logger.Warning(
+                            $"AnalysisScheduler: scheduled analysis for {displayName} exceeded {timeoutSeconds}s — skipped this cycle");
+                        continue;
+                    }
+
+                    var findings = await analyzeTask;
+                    await _notificationService.NotifyAsync(findings);
+                }
+                catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+                {
+                    break;
+                }
+                catch (Exception ex)
+                {
+                    Logger.Error(
+                        $"AnalysisScheduler: scheduled analysis failed for {displayName}: {ex.GetType().Name}: {ex.Message}");
+                    /* If analyzeTask was never created (e.g. ctor threw), the continuation
+                       never ran — clear the marker defensively. */
+                    _inFlight.TryRemove(serverId, out _);
+                }
+            }
+        }
+    }
+}
diff --git a/Dashboard/Services/EmailTemplateBuilder.cs b/Dashboard/Services/EmailTemplateBuilder.cs index c76ceff1..6b501416 100644 --- a/Dashboard/Services/EmailTemplateBuilder.cs +++ b/Dashboard/Services/EmailTemplateBuilder.cs @@ -200,7 +200,7 @@ private static void AppendDetailSection(StringBuilder sb, AlertContext context) /* Separator + heading */ sb.Append("
 
"); sb.Append(""); - sb.Append($"RECENT EVENTS"); + sb.Append($"DETAILS"); sb.Append(""); foreach (var item in context.Details) @@ -266,7 +266,7 @@ private static string BuildPlainTextBody( if (context?.Details?.Count > 0) { - sb.Append($"\r\n--- Recent Events ---\r\n"); + sb.Append($"\r\n--- Details ---\r\n"); foreach (var item in context.Details) { sb.Append($"\r\n {item.Heading}\r\n"); diff --git a/Dashboard/SettingsWindow.xaml b/Dashboard/SettingsWindow.xaml index b797e09f..264565de 100644 --- a/Dashboard/SettingsWindow.xaml +++ b/Dashboard/SettingsWindow.xaml @@ -366,6 +366,41 @@ FontStyle="Italic" Foreground="Gray" VerticalAlignment="Center" Margin="10,0,0,0" FontSize="11"/> + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Dashboard/SettingsWindow.xaml.cs b/Dashboard/SettingsWindow.xaml.cs index bbdbebe1..0b813dcf 100644 --- a/Dashboard/SettingsWindow.xaml.cs +++ b/Dashboard/SettingsWindow.xaml.cs @@ -151,6 +151,12 @@ private void LoadSettings() McpPortTextBox.IsEnabled = prefs.McpEnabled; UpdateMcpStatus(prefs); + // Automated analysis notifications (Stage 2). Cooldown and timeout are + // intentionally not exposed in the UI — preferences.json only. + AnalysisNotificationsEnabledCheckBox.IsChecked = prefs.AnalysisNotificationsEnabled; + AnalysisIntervalMinutesTextBox.Text = prefs.AnalysisIntervalMinutes.ToString(CultureInfo.InvariantCulture); + AnalysisNotifySeverityTextBox.Text = prefs.AnalysisNotifySeverity.ToString("F1", CultureInfo.InvariantCulture); + // System tray settings MinimizeToTrayCheckBox.IsChecked = prefs.MinimizeToTray; NotificationsEnabledCheckBox.IsChecked = prefs.NotificationsEnabled; @@ -751,6 +757,31 @@ private async void OkButton_Click(object sender, RoutedEventArgs e) else validationErrors.Add($"MCP port must be between 1024 and {IPEndPoint.MaxPort}.\nPorts 0–1023 are well-known privileged ports reserved by the operating system."); + // Automated analysis notifications (Stage 2). 
Bounds are also enforced at + // consumption (AnalysisScheduler.Configure, AnalysisNotificationService.NotifyAsync), + // but validating here catches typos early. + prefs.AnalysisNotificationsEnabled = AnalysisNotificationsEnabledCheckBox.IsChecked == true; + + if (int.TryParse(AnalysisIntervalMinutesTextBox.Text?.Trim(), NumberStyles.Integer, CultureInfo.InvariantCulture, out int analysisInterval) + && analysisInterval >= 5 && analysisInterval <= 360) + { + prefs.AnalysisIntervalMinutes = analysisInterval; + } + else + { + validationErrors.Add("Analysis interval must be between 5 and 360 minutes."); + } + + if (double.TryParse(AnalysisNotifySeverityTextBox.Text?.Trim(), NumberStyles.Float, CultureInfo.InvariantCulture, out double analysisSeverity) + && analysisSeverity >= 0.0 && analysisSeverity <= 2.0) + { + prefs.AnalysisNotifySeverity = analysisSeverity; + } + else + { + validationErrors.Add("Analysis notify severity must be between 0.0 and 2.0."); + } + if (validationErrors.Count > 0) { MessageBox.Show(