From bbd61e2e665ec913b621b4df19d3dfb062d6aad9 Mon Sep 17 00:00:00 2001 From: SungJin1212 Date: Mon, 8 Jun 2026 09:06:07 +0900 Subject: [PATCH] improve slow query logging with additional context and statistics Signed-off-by: SungJin1212 --- CHANGELOG.md | 1 + pkg/frontend/transport/handler.go | 44 +++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea57cca48f..f7821e21fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ * [ENHANCEMENT] Distributor: Added `cortex_distributor_received_histogram_buckets` metric to track number of buckets in received native histogram samples before validation, per user. #7569 * [ENHANCEMENT] Distributor: Add `WrappedHistogram` with configurable size limit (`-validation.max-native-histogram-size-bytes`) to cap native histogram protobuf size before unmarshalling. #7570 * [ENHANCEMENT] Ingester: Add lazy regex evaluation on head postings cache miss. Defers expensive regex matchers on high-cardinality labels to per-series filtering when a selective equality matcher already narrows the result set. Configured via `-blocks-storage.expanded_postings_cache.head.lazy-matcher-max-cardinality` (disabled by default). #7553 +* [ENHANCEMENT] Query Frontend: Improve the slow query log with `component`, `source`, `user_agent`, `engine_type`, `block_store_type`, and query stats fields to aid slow query diagnosis. #7601 * [BUGFIX] Querier: Fix queryWithRetry and labelsWithRetry returning (nil, nil) on cancelled context by propagating ctx.Err(). #7370 * [BUGFIX] Metrics Helper: Fix non-deterministic bucket order in merged histograms by sorting buckets after map iteration, matching Prometheus client library behavior. #7380 * [BUGFIX] Distributor: Return HTTP 401 Unauthorized when tenant ID resolution fails in the Prometheus Remote Write 2.0 path. #7389 diff --git a/pkg/frontend/transport/handler.go b/pkg/frontend/transport/handler.go index 08f08582bb..7de7e5e580 100644 --- a/pkg/frontend/transport/handler.go +++ b/pkg/frontend/transport/handler.go @@ -322,7 +322,7 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { queryString = f.parseRequestQueryString(r, buf) } if shouldReportSlowQuery { - f.reportSlowQuery(r, queryString, queryResponseTime) + f.reportSlowQuery(r, queryString, queryResponseTime, source, stats) if f.cfg.QueryStatsEnabled { f.getOrCreateSlowQueryMetric().WithLabelValues(source, userID).Inc() } @@ -423,18 +423,58 @@ func (f *Handler) logQueryRequest(r *http.Request, queryString url.Values, sourc } // reportSlowQuery reports slow queries. -func (f *Handler) reportSlowQuery(r *http.Request, queryString url.Values, queryResponseTime time.Duration) { +func (f *Handler) reportSlowQuery(r *http.Request, queryString url.Values, queryResponseTime time.Duration, source string, stats *querier_stats.QueryStats) { logMessage := []any{ "msg", "slow query detected", "method", r.Method, "host", r.Host, "path", r.URL.Path, + "source", source, "time_taken", queryResponseTime.String(), } + grafanaFields := formatGrafanaStatsFields(r) if len(grafanaFields) > 0 { logMessage = append(logMessage, grafanaFields...) } + + if userAgent := r.Header.Get("User-Agent"); len(userAgent) > 0 { + logMessage = append(logMessage, "user_agent", userAgent) + } + if engineType := r.Header.Get(engine.TypeHeader); len(engineType) > 0 { + logMessage = append(logMessage, "engine_type", engineType) + } + if blockStoreType := r.Header.Get(querier.BlockStoreTypeHeader); len(blockStoreType) > 0 { + logMessage = append(logMessage, "block_store_type", blockStoreType) + } + if wallTime := stats.LoadWallTime(); wallTime > 0 { + logMessage = append(logMessage, "query_wall_time_seconds", wallTime.Seconds()) + } + if storageWallTime := stats.LoadQueryStorageWallTime(); storageWallTime > 0 { + logMessage = append(logMessage, "query_storage_wall_time_seconds", storageWallTime.Seconds()) + } + if n := stats.LoadFetchedSeries(); n > 0 { + logMessage = append(logMessage, "fetched_series_count", n) + } + if n := stats.LoadFetchedChunks(); n > 0 { + logMessage = append(logMessage, "fetched_chunks_count", n) + } + if n := stats.LoadFetchedSamples(); n > 0 { + logMessage = append(logMessage, "fetched_samples_count", n) + } + if n := stats.LoadScannedSamples(); n > 0 { + logMessage = append(logMessage, "samples_scanned", n) + } + if n := stats.LoadFetchedChunkBytes(); n > 0 { + logMessage = append(logMessage, "fetched_chunks_bytes", n) + } + if n := stats.LoadFetchedDataBytes(); n > 0 { + logMessage = append(logMessage, "fetched_data_bytes", n) + } + if n := stats.LoadSplitQueries(); n > 0 { + logMessage = append(logMessage, "split_queries", n) + } + logMessage = append(logMessage, formatQueryString(queryString)...) level.Info(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)