diff --git a/CHANGELOG.md b/CHANGELOG.md index 66d2be50ef2..bfd7504aed2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [FEATURE] Ruler: Add per-tenant `ruler_alert_generator_url_template` runtime config option to customize alert generator URLs using Go templates. Supports Grafana Explore, Perses, and other UIs. #7302 * [FEATURE] Distributor: Add experimental `-distributor.enable-start-timestamp` flag for Prometheus Remote Write 2.0. When enabled, `StartTimestamp (ST)` is ingested. #7371 * [FEATURE] Memberlist: Add `-memberlist.cluster-label` and `-memberlist.cluster-label-verification-disabled` to prevent accidental cross-cluster gossip joins and support rolling label rollout. #7385 * [FEATURE] Querier: Add timeout classification to classify query timeouts as 4XX (user error) or 5XX (system error) based on phase timing. When enabled, queries that spend most of their time in PromQL evaluation return `422 Unprocessable Entity` instead of `503 Service Unavailable`. #7374 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 2dc7d8bfe12..8c2adf98027 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4388,6 +4388,16 @@ query_rejection: # external labels for alerting rules [ruler_external_labels: <map of string (labelName) to string (labelValue)> | default = []] +# Per-tenant external URL for the ruler. If set, it overrides the global +# -ruler.external.url for this tenant's alert notifications. +[ruler_external_url: <string> | default = ""] + +# Go text/template for alert generator URLs. Available variables: .ExternalURL +# (resolved external URL) and .Expression (PromQL expression). Built-in +# functions like urlquery are available. If empty, uses default Prometheus +# /graph format. +[ruler_alert_generator_url_template: <string> | default = ""] + # Enable to allow rules to be evaluated with data from a single zone, if other # zones are not available.
[rules_partial_data: <boolean> | default = false] diff --git a/docs/getting-started/.env b/docs/getting-started/.env index 52b62bd990b..81b6cc44d5b 100644 --- a/docs/getting-started/.env +++ b/docs/getting-started/.env @@ -2,4 +2,4 @@ CORTEX_VERSION=v1.20.1 GRAFANA_VERSION=10.4.2 PROMETHEUS_VERSION=v3.2.1 SEAWEEDFS_VERSION=3.67 -PERSES_VERSION=v0.49-distroless-debug +PERSES_VERSION=v0.53.1-distroless-debug diff --git a/docs/getting-started/cortex-config.yaml b/docs/getting-started/cortex-config.yaml index 1b24084ad3f..9351b788f18 100644 --- a/docs/getting-started/cortex-config.yaml +++ b/docs/getting-started/cortex-config.yaml @@ -82,6 +82,14 @@ frontend_worker: # https://cortexmetrics.io/docs/configuration/configuration-file/#ruler_config ruler: enable_api: true + external_url: http://localhost:9009 + alertmanager_url: http://localhost:9009/alertmanager + +# Per-tenant runtime configuration (hot-reloaded without restart). +# This file configures per-tenant overrides such as custom alert generator +# URL templates for Grafana, Perses, or any metrics explorer.
+runtime_config: + file: /config/runtime-config.yaml # https://cortexmetrics.io/docs/configuration/configuration-file/#ruler_storage_config ruler_storage: diff --git a/docs/getting-started/docker-compose.yaml b/docs/getting-started/docker-compose.yaml index 1c48394b16c..47ac1d7e2e3 100644 --- a/docs/getting-started/docker-compose.yaml +++ b/docs/getting-started/docker-compose.yaml @@ -17,6 +17,7 @@ services: - -config.file=/config/cortex-config.yaml volumes: - ./cortex-config.yaml:/config/cortex-config.yaml:ro + - ./runtime-config.yaml:/config/runtime-config.yaml:ro ports: - "9009:9009" healthcheck: @@ -47,6 +48,8 @@ services: volumes: - ./perses/config.yaml:/etc/perses/config/config.yaml:ro - ./perses/datasource.yaml:/etc/perses/resources/datasource.yaml:ro + - ./perses/datasource-tenant-a.yaml:/etc/perses/resources/datasource-tenant-a.yaml:ro + - ./perses/datasource-tenant-b.yaml:/etc/perses/resources/datasource-tenant-b.yaml:ro - ./perses/project.yaml:/etc/perses/resources/project.yaml:ro - ./perses/dashboards/cortex-writes.yaml:/etc/perses/resources/cortex-writes.yaml:ro prometheus: diff --git a/docs/getting-started/grafana-datasource-docker.yaml b/docs/getting-started/grafana-datasource-docker.yaml index a40cce5e65f..41385b9d2f0 100644 --- a/docs/getting-started/grafana-datasource-docker.yaml +++ b/docs/getting-started/grafana-datasource-docker.yaml @@ -5,6 +5,7 @@ apiVersion: 1 datasources: - name: Cortex type: prometheus + uid: cortex access: proxy orgId: 1 url: http://cortex:9009/api/prom @@ -22,6 +23,7 @@ datasources: isDefault: true - name: Tenant A type: prometheus + uid: tenant-a access: proxy orgId: 1 url: http://cortex:9009/api/prom @@ -71,3 +73,25 @@ datasources: secureJsonData: httpHeaderValue1: cortex version: 1 + - orgId: 1 + name: Tenant A Alertmanager + type: alertmanager + access: proxy + url: http://cortex:9009/ + jsonData: + httpHeaderName1: X-Scope-OrgID + implementation: cortex + secureJsonData: + httpHeaderValue1: tenant-a + version: 1 + - 
orgId: 1 + name: Tenant B Alertmanager + type: alertmanager + access: proxy + url: http://cortex:9009/ + jsonData: + httpHeaderName1: X-Scope-OrgID + implementation: cortex + secureJsonData: + httpHeaderValue1: tenant-b + version: 1 diff --git a/docs/getting-started/perses/config.yaml b/docs/getting-started/perses/config.yaml index b87f81bc0f6..ba04acce34e 100644 --- a/docs/getting-started/perses/config.yaml +++ b/docs/getting-started/perses/config.yaml @@ -8,7 +8,7 @@ security: database: file: extension: yaml - folder: /perses + folder: /tmp/perses-data schemas: datasources_path: /etc/perses/cue/schemas/datasources @@ -16,6 +16,11 @@ schemas: panels_path: /etc/perses/cue/schemas/panels queries_path: /etc/perses/cue/schemas/queries variables_path: /etc/perses/cue/schemas/variables + +frontend: + explorer: + enable: true + provisioning: folders: - /etc/perses/resources \ No newline at end of file diff --git a/docs/getting-started/perses/dashboards/cortex-writes.yaml b/docs/getting-started/perses/dashboards/cortex-writes.yaml index 8705ad5f556..a7de3b2795b 100644 --- a/docs/getting-started/perses/dashboards/cortex-writes.yaml +++ b/docs/getting-started/perses/dashboards/cortex-writes.yaml @@ -4,7 +4,7 @@ metadata: createdAt: 2025-03-24T19:15:47.468680767Z updatedAt: 2025-03-24T19:43:53.000136362Z version: 12 - project: default + project: cortex spec: display: name: Cortex / Writes diff --git a/docs/getting-started/perses/datasource-tenant-a.yaml b/docs/getting-started/perses/datasource-tenant-a.yaml new file mode 100644 index 00000000000..78d67370828 --- /dev/null +++ b/docs/getting-started/perses/datasource-tenant-a.yaml @@ -0,0 +1,14 @@ +kind: GlobalDatasource +metadata: + name: TenantA +spec: + default: false + plugin: + kind: PrometheusDatasource + spec: + proxy: + kind: HTTPProxy + spec: + url: http://cortex:9009/api/prom + headers: + X-Scope-OrgID: tenant-a diff --git a/docs/getting-started/perses/datasource-tenant-b.yaml 
b/docs/getting-started/perses/datasource-tenant-b.yaml new file mode 100644 index 00000000000..40f80a67492 --- /dev/null +++ b/docs/getting-started/perses/datasource-tenant-b.yaml @@ -0,0 +1,14 @@ +kind: GlobalDatasource +metadata: + name: TenantB +spec: + default: false + plugin: + kind: PrometheusDatasource + spec: + proxy: + kind: HTTPProxy + spec: + url: http://cortex:9009/api/prom + headers: + X-Scope-OrgID: tenant-b diff --git a/docs/getting-started/perses/project.yaml b/docs/getting-started/perses/project.yaml index a39681c7841..3b1a1ad9835 100644 --- a/docs/getting-started/perses/project.yaml +++ b/docs/getting-started/perses/project.yaml @@ -1,6 +1,6 @@ kind: Project metadata: - name: default + name: cortex spec: display: - name: "default" \ No newline at end of file + name: "Cortex" \ No newline at end of file diff --git a/docs/getting-started/runtime-config.yaml b/docs/getting-started/runtime-config.yaml new file mode 100644 index 00000000000..5fa09833fec --- /dev/null +++ b/docs/getting-started/runtime-config.yaml @@ -0,0 +1,25 @@ +# Runtime configuration with per-tenant overrides. +# This file is hot-reloaded by Cortex without requiring a restart. +# +# The examples below demonstrate per-tenant alert generator URL templates. +# Each tenant can have a different URL format for alert "Source" links. + +overrides: + # Tenant using Grafana Explore for alert generator URLs. + # Clicking "Source" on an alert in Alertmanager opens Grafana Explore + # with the PromQL expression pre-filled. + tenant-a: + ruler_external_url: "http://localhost:3000" + ruler_alert_generator_url_template: >- + {{ .ExternalURL }}/explore?schemaVersion=1&panes=%7B%22default%22:%7B%22datasource%22:%22tenant-a%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22{{ urlquery .Expression }}%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D%7D&orgId=1 + + # Tenant using Perses for alert generator URLs. 
+ # Clicking "Source" on an alert opens Perses explore view with + # the PromQL expression pre-filled and the TenantB datasource selected. + tenant-b: + ruler_external_url: http://localhost:8080 + ruler_alert_generator_url_template: >- + {{ .ExternalURL }}/explore?explorer=Prometheus-PrometheusExplorer&data=%7B%22tab%22%3A%22graph%22%2C%22queries%22%3A%5B%7B%22kind%22%3A%22TimeSeriesQuery%22%2C%22spec%22%3A%7B%22plugin%22%3A%7B%22kind%22%3A%22PrometheusTimeSeriesQuery%22%2C%22spec%22%3A%7B%22datasource%22%3A%7B%22kind%22%3A%22PrometheusDatasource%22%2C%22name%22%3A%22tenantb%22%7D%2C%22query%22%3A%22{{ urlquery .Expression }}%22%7D%7D%7D%7D%5D%7D + + # Tenants without overrides use the global ruler.external.url + # and the default Prometheus /graph format. diff --git a/docs/getting-started/single-binary.md b/docs/getting-started/single-binary.md index 6321a1c238e..4b7c93ceb14 100644 --- a/docs/getting-started/single-binary.md +++ b/docs/getting-started/single-binary.md @@ -214,6 +214,133 @@ docker run --network cortex-docs-getting-started_default \ Configure Alertmanager notification policies in Grafana: [Alerting → Notification policies](http://localhost:3000/alerting/notifications?search=&alertmanager=Cortex%20Alertmanager) +## Step 7: Per-Tenant Alert Generator URLs (Optional) + +Cortex supports customizing the "Source" link on alerts per-tenant using Go `text/template` strings. This lets each tenant's alerts link back to their preferred metrics explorer — Grafana Explore, Perses, or any other tool. 
+ +The getting-started example includes a `runtime-config.yaml` with two tenant configurations: +- **tenant-a**: Alert source links point to **Grafana Explore** +- **tenant-b**: Alert source links point to **Perses** + +### How It Works + +The `ruler_alert_generator_url_template` field accepts a Go template with two variables: +- `{{ .ExternalURL }}` — the resolved external URL for this tenant (set via `ruler_external_url`) +- `{{ .Expression }}` — the PromQL expression that triggered the alert + +Built-in Go template functions like `urlquery` are available for URL encoding. + +Example for Grafana Explore: +```yaml +ruler_external_url: "http://localhost:3000" +ruler_alert_generator_url_template: >- + {{ .ExternalURL }}/explore?expr={{ urlquery .Expression }} +``` + +### Try It Out + +1. **Load alertmanager configs** for tenant-a and tenant-b: + +```sh +# Upload alertmanager config for tenant-a +curl -X POST http://localhost:9009/api/v1/alerts \ + -H "X-Scope-OrgID: tenant-a" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +alertmanager_config: | + receivers: + - name: default-receiver + route: + receiver: default-receiver + group_wait: 5s + group_interval: 10s +EOF + +# Upload alertmanager config for tenant-b +curl -X POST http://localhost:9009/api/v1/alerts \ + -H "X-Scope-OrgID: tenant-b" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +alertmanager_config: | + receivers: + - name: default-receiver + route: + receiver: default-receiver + group_wait: 5s + group_interval: 10s +EOF +``` + +2. 
**Load demo alert rules** that fire immediately: + +```sh +# Alert rules for tenant-a +curl -X POST http://localhost:9009/api/v1/rules/demo \ + -H "X-Scope-OrgID: tenant-a" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +name: demo_alerts +interval: 10s +rules: + - alert: HighMemoryUsage + expr: vector(85) > 80 + for: 0m + labels: + severity: warning + annotations: + summary: "Memory usage is above 80%" + - alert: HighErrorRate + expr: vector(5.2) > 5 + for: 0m + labels: + severity: critical + annotations: + summary: "Error rate exceeds 5%" +EOF + +# Alert rules for tenant-b +curl -X POST http://localhost:9009/api/v1/rules/demo \ + -H "X-Scope-OrgID: tenant-b" \ + -H "Content-Type: application/yaml" \ + --data-binary @- <<'EOF' +name: demo_alerts +interval: 10s +rules: + - alert: DiskSpaceLow + expr: vector(92) > 90 + for: 0m + labels: + severity: critical + annotations: + summary: "Disk space usage above 90%" + - alert: HighLatency + expr: vector(3.5) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: "P99 latency exceeds 2s" +EOF +``` + +3. **Wait ~30 seconds** for the ruler to evaluate rules and send alerts to the alertmanager. + +4. **View alerts in Grafana** at [Alerting → Alert groups](http://localhost:3000/alerting/groups?groupBy=alertname): + - Select the **Tenant A Alertmanager** datasource — click "See source" to open Grafana Explore + - Select the **Tenant B Alertmanager** datasource — click "See source" to open Perses + +5. 
**Verify generator URLs** via the API: + +```sh +# Tenant A: Grafana Explore URLs +curl -s "http://localhost:9009/alertmanager/api/v2/alerts" \ + -H "X-Scope-OrgID: tenant-a" | jq '.[].generatorURL' + +# Tenant B: Perses URLs +curl -s "http://localhost:9009/alertmanager/api/v2/alerts" \ + -H "X-Scope-OrgID: tenant-b" | jq '.[].generatorURL' +``` + ## Explore and Experiment Now that everything is running, try these experiments to learn how Cortex works: @@ -306,6 +433,7 @@ This setup uses several configuration files. Here's what each does: |----------------------------------|---------------------------------------------------------------| | `docker-compose.yaml` | Defines all services (Cortex, Prometheus, Grafana, SeaweedFS) | | `cortex-config.yaml` | Cortex configuration (storage, limits, components) | +| `runtime-config.yaml` | Per-tenant runtime overrides (alert generator URL templates) | | `prometheus-config.yaml` | Prometheus configuration with remote_write to Cortex | | `grafana-datasource-docker.yaml` | Grafana datasource pointing to Cortex | | `rules.yaml` | Example recording rules | diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 3a13151b4c6..ff9a5995b2b 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "net/url" "time" @@ -19,6 +20,7 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/util/strutil" "github.com/weaveworks/common/httpgrpc" "github.com/weaveworks/common/user" @@ -173,6 +175,8 @@ type RulesLimits interface { RulerQueryOffset(userID string) time.Duration DisabledRuleGroups(userID string) validation.DisabledRuleGroups RulerExternalLabels(userID string) labels.Labels + RulerExternalURL(userID string) string + RulerAlertGeneratorURLTemplate(userID string) string } type QueryExecutor func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) 
@@ -374,15 +378,56 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi // for graceful shutdown of rules that are still in execution even in case the cortex context is canceled. prometheusContext := user.InjectOrgID(context.WithoutCancel(ctx), userID) + // Resolve the per-tenant external URL for ManagerOptions.ExternalURL. + // This *url.URL is set once at manager creation and cannot be refreshed + // without recreating the manager. It powers the {{ externalURL }} and + // {{ pathPrefix }} template functions (not {{ $externalURL }}). + externalURL := cfg.ExternalURL.URL + if tenantURL := overrides.RulerExternalURL(userID); tenantURL != "" { + if parsed, err := url.Parse(tenantURL); err == nil { + externalURL = parsed + } else { + level.Warn(logger).Log("msg", "failed to parse per-tenant ruler external URL, using global", "user", userID, "url", tenantURL, "err", err) + } + } + + // resolveExternalURL returns the per-tenant external URL string, + // re-reading from runtime config on each call so that changes + // take effect without restarting the ruler. + globalExternalURLStr := cfg.ExternalURL.String() + resolveExternalURL := func() string { + if tenantURL := overrides.RulerExternalURL(userID); tenantURL != "" { + return tenantURL + } + return globalExternalURLStr + } + + // Cache for the parsed generator URL template. The closure below is called + // on every alert send; caching avoids re-parsing the template each time. + // The cache is invalidated if the template string changes via runtime config. 
+ tmplCache := &generatorURLTemplateCache{} + return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, evalMetrics.TotalWritesVec.WithLabelValues(userID), evalMetrics.FailedWritesVec.WithLabelValues(userID)), - Queryable: q, - QueryFunc: queryFunc, - Context: prometheusContext, - ExternalURL: cfg.ExternalURL.URL, - NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), + Queryable: q, + QueryFunc: queryFunc, + Context: prometheusContext, + ExternalURL: externalURL, + NotifyFunc: SendAlerts(notifier, func(expr string) string { + externalURLStr := resolveExternalURL() + tmplStr := overrides.RulerAlertGeneratorURLTemplate(userID) + if tmplStr == "" { + return externalURLStr + strutil.TableLinkForExpression(expr) + } + result, err := executeGeneratorURLTemplate(tmplCache, tmplStr, externalURLStr, expr) + if err != nil { + level.Warn(logger).Log("msg", "failed to execute generator URL template, falling back to prometheus format", "err", err) + return externalURLStr + strutil.TableLinkForExpression(expr) + } + return result + }), Logger: util_log.GoKitLogToSlog(log.With(logger, "user", userID)), Registerer: reg, OutageTolerance: cfg.OutageTolerance, diff --git a/pkg/ruler/external_url.go b/pkg/ruler/external_url.go new file mode 100644 index 00000000000..0928413a889 --- /dev/null +++ b/pkg/ruler/external_url.go @@ -0,0 +1,56 @@ +package ruler + +import ( + "sync" +) + +// userExternalURL tracks per-user resolved external URLs and detects changes. 
+type userExternalURL struct { + global string + limits RulesLimits + + mtx sync.Mutex + users map[string]string +} + +func newUserExternalURL(global string, limits RulesLimits) *userExternalURL { + return &userExternalURL{ + global: global, + limits: limits, + + mtx: sync.Mutex{}, + users: map[string]string{}, + } +} + +func (e *userExternalURL) update(userID string) (string, bool) { + tenantURL := e.limits.RulerExternalURL(userID) + resolved := e.global + if tenantURL != "" { + resolved = tenantURL + } + + e.mtx.Lock() + defer e.mtx.Unlock() + + if prev, ok := e.users[userID]; ok && prev == resolved { + return resolved, false + } + + e.users[userID] = resolved + return resolved, true +} + +func (e *userExternalURL) remove(user string) { + e.mtx.Lock() + defer e.mtx.Unlock() + delete(e.users, user) +} + +func (e *userExternalURL) cleanup() { + e.mtx.Lock() + defer e.mtx.Unlock() + for user := range e.users { + delete(e.users, user) + } +} diff --git a/pkg/ruler/external_url_test.go b/pkg/ruler/external_url_test.go new file mode 100644 index 00000000000..50b88563e8e --- /dev/null +++ b/pkg/ruler/external_url_test.go @@ -0,0 +1,67 @@ +package ruler + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestUserExternalURL(t *testing.T) { + limits := ruleLimits{} + e := newUserExternalURL("http://global:9090", &limits) + + const userID = "test-user" + + t.Run("global URL used when no per-tenant override", func(t *testing.T) { + e.remove(userID) + url, changed := e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + }) + + t.Run("no change on second update", func(t *testing.T) { + url, changed := e.update(userID) + require.False(t, changed) + require.Equal(t, "http://global:9090", url) + }) + + t.Run("per-tenant URL overrides global", func(t *testing.T) { + limits.mtx.Lock() + limits.externalURL = "http://tenant:3000" + limits.mtx.Unlock() + + url, changed := e.update(userID) + require.True(t, changed) + 
require.Equal(t, "http://tenant:3000", url) + }) + + t.Run("no change when per-tenant URL is the same", func(t *testing.T) { + url, changed := e.update(userID) + require.False(t, changed) + require.Equal(t, "http://tenant:3000", url) + }) + + t.Run("revert to global when per-tenant override removed", func(t *testing.T) { + limits.mtx.Lock() + limits.externalURL = "" + limits.mtx.Unlock() + + url, changed := e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + }) + + t.Run("remove and cleanup lifecycle", func(t *testing.T) { + e.remove(userID) + // After remove, next update should report changed + url, changed := e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + + e.cleanup() + // After cleanup, next update should report changed + url, changed = e.update(userID) + require.True(t, changed) + require.Equal(t, "http://global:9090", url) + }) +} diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index d44a0d95829..86611201899 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -53,6 +53,9 @@ type DefaultMultiTenantManager struct { // Per-user externalLabels. userExternalLabels *userExternalLabels + // Per-user externalURL. 
+ userExternalURL *userExternalURL + // rules backup rulesBackupManager *rulesBackupManager @@ -101,6 +104,7 @@ func NewDefaultMultiTenantManager(cfg Config, limits RulesLimits, managerFactory ruleEvalMetrics: evalMetrics, notifiers: map[string]*rulerNotifier{}, userExternalLabels: newUserExternalLabels(cfg.ExternalLabels, limits), + userExternalURL: newUserExternalURL(cfg.ExternalURL.String(), limits), notifiersDiscoveryMetrics: notifiersDiscoveryMetrics, mapper: newMapper(cfg.RulePath, logger), userManagers: map[string]RulesManager{}, @@ -166,6 +170,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou r.removeNotifier(userID) r.mapper.cleanupUser(userID) r.userExternalLabels.remove(userID) + r.userExternalURL.remove(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) r.configUpdatesTotal.DeleteLabelValues(userID) @@ -210,6 +215,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user return } externalLabels, externalLabelsUpdated := r.userExternalLabels.update(user) + externalURL, externalURLUpdated := r.userExternalURL.update(user) existing := true manager := r.getRulesManager(user, ctx) @@ -222,13 +228,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user return } - if !existing || rulesUpdated || externalLabelsUpdated { + if !existing || rulesUpdated || externalLabelsUpdated || externalURLUpdated { level.Debug(r.logger).Log("msg", "updating rules", "user", user) r.configUpdatesTotal.WithLabelValues(user).Inc() - if (rulesUpdated || externalLabelsUpdated) && existing { + if (rulesUpdated || externalLabelsUpdated || externalURLUpdated) && existing { r.updateRuleCache(user, manager.RuleGroups()) } - err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, r.cfg.ExternalURL.String(), r.ruleGroupIterationFunc) + err = manager.Update(r.cfg.EvaluationInterval, files, externalLabels, externalURL, 
r.ruleGroupIterationFunc) r.deleteRuleCache(user) if err != nil { r.lastReloadSuccessful.WithLabelValues(user).Set(0) @@ -443,6 +449,7 @@ func (r *DefaultMultiTenantManager) Stop() { // cleanup user rules directories r.mapper.cleanup() r.userExternalLabels.cleanup() + r.userExternalURL.cleanup() } func (m *DefaultMultiTenantManager) ValidateRuleGroup(g rulefmt.RuleGroup) []error { diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 57ee59e370a..ee8dd00ede3 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -1,6 +1,7 @@ package ruler import ( + "bytes" "context" "flag" "fmt" @@ -12,6 +13,7 @@ import ( "sort" "strings" "sync" + "text/template" "time" "github.com/go-kit/log" @@ -26,7 +28,6 @@ import ( "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/promql/parser" promRules "github.com/prometheus/prometheus/rules" - "github.com/prometheus/prometheus/util/strutil" "github.com/weaveworks/common/user" "golang.org/x/sync/errgroup" @@ -507,7 +508,7 @@ type sender interface { // It filters any non-firing alerts from the input. // // Copied from Prometheus's main.go. -func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { +func SendAlerts(n sender, generatorURLFn func(expr string) string) promRules.NotifyFunc { return func(ctx context.Context, expr string, alerts ...*promRules.Alert) { var res []*notifier.Alert @@ -516,7 +517,7 @@ func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { StartsAt: alert.FiredAt, Labels: alert.Labels, Annotations: alert.Annotations, - GeneratorURL: externalURL + strutil.TableLinkForExpression(expr), + GeneratorURL: generatorURLFn(expr), } if !alert.ResolvedAt.IsZero() { a.EndsAt = alert.ResolvedAt @@ -532,6 +533,76 @@ func SendAlerts(n sender, externalURL string) promRules.NotifyFunc { } } +// generatorURLTemplateData holds the variables available in generator URL templates. 
+type generatorURLTemplateData struct { + ExternalURL string + Expression string +} + +// generatorURLTemplateCache caches a parsed text/template keyed on the template string. +// If the template string changes (e.g., via runtime config), the cache is invalidated. NOTE(review): the cache has no locking; confirm the notify closure that uses it is never invoked concurrently. +type generatorURLTemplateCache struct { + tmplStr string + tmpl *template.Template +} + +// getOrParse returns a parsed template, reusing the cached one if the template string +// hasn't changed. This avoids re-parsing on every alert send. +func (c *generatorURLTemplateCache) getOrParse(tmplStr string) (*template.Template, error) { + if c.tmpl != nil && c.tmplStr == tmplStr { + return c.tmpl, nil + } + tmpl, err := template.New("generator_url").Parse(tmplStr) + if err != nil { + return nil, err + } + c.tmplStr = tmplStr + c.tmpl = tmpl + return tmpl, nil +} + +// executeGeneratorURLTemplate executes a Go text/template to produce a generator URL. +// We intentionally use text/template instead of html/template because the output is a URL, +// not HTML. HTML-escaping would corrupt URL characters (e.g., & → &amp;). The output is +// validated to ensure it uses http/https scheme to prevent javascript: or data: injection. +func executeGeneratorURLTemplate(cache *generatorURLTemplateCache, tmplStr, externalURL, expr string) (string, error) { + tmpl, err := cache.getOrParse(tmplStr) + if err != nil { + return "", err + } + var buf bytes.Buffer + if err := tmpl.Execute(&buf, generatorURLTemplateData{ + ExternalURL: externalURL, + Expression: expr, + }); err != nil { + return "", err + } + result := buf.String() + if err := validateGeneratorURL(result); err != nil { + return "", err + } + return result, nil +} + +// validateGeneratorURL checks that the URL is well-formed, uses http or https scheme, +// and does not contain HTML in the fragment.
+func validateGeneratorURL(rawURL string) error { + u, err := url.Parse(rawURL) + if err != nil { + return fmt.Errorf("invalid generator URL: %w", err) + } + if u.Scheme != "http" && u.Scheme != "https" { + return fmt.Errorf("generator URL has unsupported scheme %q, must be http or https", u.Scheme) + } + if u.Host == "" { + return fmt.Errorf("generator URL is missing host") + } + if strings.ContainsAny(u.Fragment, "<>") { + return fmt.Errorf("generator URL fragment contains invalid characters") + } + return nil +} + func ruleGroupDisabled(ruleGroup *rulespb.RuleGroupDesc, disabledRuleGroupsForUser validation.DisabledRuleGroups) bool { for _, disabledRuleGroupForUser := range disabledRuleGroupsForUser { if ruleGroup.Namespace == disabledRuleGroupForUser.Namespace && diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index e5738945cb4..51635a65523 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -35,6 +35,7 @@ import ( promRules "github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/util/annotations" + "github.com/prometheus/prometheus/util/strutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -89,14 +90,16 @@ func defaultRulerConfig(t testing.TB) Config { } type ruleLimits struct { - mtx sync.RWMutex - tenantShard float64 - maxRulesPerRuleGroup int - maxRuleGroups int - disabledRuleGroups validation.DisabledRuleGroups - maxQueryLength time.Duration - queryOffset time.Duration - externalLabels labels.Labels + mtx sync.RWMutex + tenantShard float64 + maxRulesPerRuleGroup int + maxRuleGroups int + disabledRuleGroups validation.DisabledRuleGroups + maxQueryLength time.Duration + queryOffset time.Duration + externalLabels labels.Labels + externalURL string + alertGeneratorURLTemplate string } func (r *ruleLimits) setRulerExternalLabels(lset labels.Labels) { @@ -147,6 +150,18 @@ func (r *ruleLimits) 
RulerExternalLabels(_ string) labels.Labels { return r.externalLabels } +func (r *ruleLimits) RulerExternalURL(_ string) string { + r.mtx.RLock() + defer r.mtx.RUnlock() + return r.externalURL +} + +func (r *ruleLimits) RulerAlertGeneratorURLTemplate(_ string) string { + r.mtx.RLock() + defer r.mtx.RUnlock() + return r.alertGeneratorURLTemplate +} + func newEmptyQueryable() storage.Queryable { return storage.QueryableFunc(func(mint, maxt int64) (storage.Querier, error) { return emptyQuerier{}, nil @@ -2684,10 +2699,13 @@ func (s senderFunc) Send(alerts ...*notifier.Alert) { func TestSendAlerts(t *testing.T) { testCases := []struct { - in []*promRules.Alert - exp []*notifier.Alert + name string + in []*promRules.Alert + exp []*notifier.Alert + generatorURLFn func(expr string) string }{ { + name: "prometheus format with valid until", in: []*promRules.Alert{ { Labels: labels.FromStrings("l1", "v1"), @@ -2706,8 +2724,12 @@ func TestSendAlerts(t *testing.T) { GeneratorURL: "http://localhost:9090/graph?g0.expr=up&g0.tab=1", }, }, + generatorURLFn: func(expr string) string { + return "http://localhost:9090" + strutil.TableLinkForExpression(expr) + }, }, { + name: "prometheus format with resolved at", in: []*promRules.Alert{ { Labels: labels.FromStrings("l1", "v1"), @@ -2726,25 +2748,170 @@ func TestSendAlerts(t *testing.T) { GeneratorURL: "http://localhost:9090/graph?g0.expr=up&g0.tab=1", }, }, + generatorURLFn: func(expr string) string { + return "http://localhost:9090" + strutil.TableLinkForExpression(expr) + }, }, { - in: []*promRules.Alert{}, + name: "empty alerts", + in: []*promRules.Alert{}, + generatorURLFn: func(expr string) string { + return "http://localhost:9090" + strutil.TableLinkForExpression(expr) + }, + }, + { + name: "custom template format", + in: []*promRules.Alert{ + { + Labels: labels.FromStrings("l1", "v1"), + Annotations: labels.FromStrings("a2", "v2"), + ActiveAt: time.Unix(1, 0), + FiredAt: time.Unix(2, 0), + ValidUntil: time.Unix(3, 0), + }, + }, 
+ exp: []*notifier.Alert{ + { + Labels: labels.FromStrings("l1", "v1"), + Annotations: labels.FromStrings("a2", "v2"), + StartsAt: time.Unix(2, 0), + EndsAt: time.Unix(3, 0), + GeneratorURL: "http://grafana.example.com/explore?expr=up", + }, + }, + generatorURLFn: func(expr string) string { + cache := &generatorURLTemplateCache{} + result, _ := executeGeneratorURLTemplate(cache, + "{{ .ExternalURL }}/explore?expr={{ urlquery .Expression }}", + "http://grafana.example.com", expr) + return result + }, }, } - for i, tc := range testCases { - t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - senderFunc := senderFunc(func(alerts ...*notifier.Alert) { + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + sf := senderFunc(func(alerts ...*notifier.Alert) { if len(tc.in) == 0 { t.Fatalf("sender called with 0 alert") } - require.Equal(t, tc.exp, alerts) + if tc.exp != nil { + require.Equal(t, tc.exp, alerts) + } }) - SendAlerts(senderFunc, "http://localhost:9090")(context.TODO(), "up", tc.in...) + SendAlerts(sf, tc.generatorURLFn)(context.TODO(), "up", tc.in...) 
}) } } +func TestExecuteGeneratorURLTemplate(t *testing.T) { + testCases := []struct { + name string + tmplStr string + externalURL string + expr string + expected string + expectErr bool + }{ + { + name: "basic template with expression", + tmplStr: "{{ .ExternalURL }}/graph?expr={{ .Expression }}", + externalURL: "http://prometheus:9090", + expr: "up", + expected: "http://prometheus:9090/graph?expr=up", + }, + { + name: "template with urlquery", + tmplStr: "{{ .ExternalURL }}/explore?expr={{ urlquery .Expression }}", + externalURL: "http://grafana.example.com", + expr: "rate(http_requests_total[5m])", + expected: "http://grafana.example.com/explore?expr=rate%28http_requests_total%5B5m%5D%29", + }, + { + name: "invalid template returns error", + tmplStr: "{{ .Invalid", + expectErr: true, + }, + { + name: "template with multiple variables", + tmplStr: "{{ .ExternalURL }}/explore?left=%7B%22queries%22:%5B%7B%22expr%22:%22{{ urlquery .Expression }}%22%7D%5D%7D", + externalURL: "http://grafana:3000", + expr: "up", + expected: "http://grafana:3000/explore?left=%7B%22queries%22:%5B%7B%22expr%22:%22up%22%7D%5D%7D", + }, + { + name: "javascript URI scheme is rejected", + tmplStr: "javascript://alert('xss')", + externalURL: "http://localhost:3000", + expr: "up", + expectErr: true, + }, + { + name: "data URI scheme is rejected", + tmplStr: "data:text/html,", + externalURL: "http://localhost:3000", + expr: "up", + expectErr: true, + }, + { + name: "fragment with script tag is rejected", + tmplStr: "{{ .ExternalURL }}/explore#<script>alert('xss')</script>", + externalURL: "http://localhost:3000", + expr: "up", + expectErr: true, + }, + { + name: "missing host is rejected", + tmplStr: "http:///path", + externalURL: "http://localhost:3000", + expr: "up", + expectErr: true, + }, + { + name: "valid URL with fragment is allowed", + tmplStr: "{{ .ExternalURL }}/explore#tab=graph", + externalURL: "http://localhost:3000", + expr: "up", + expected: "http://localhost:3000/explore#tab=graph", + }, + } + + for _, tc 
:= range testCases { + t.Run(tc.name, func(t *testing.T) { + cache := &generatorURLTemplateCache{} + result, err := executeGeneratorURLTemplate(cache, tc.tmplStr, tc.externalURL, tc.expr) + if tc.expectErr { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, tc.expected, result) + } + }) + } +} + +func TestGeneratorURLTemplateCaching(t *testing.T) { + cache := &generatorURLTemplateCache{} + + // First call parses and caches the template. + result1, err := executeGeneratorURLTemplate(cache, "{{ .ExternalURL }}/graph?expr={{ urlquery .Expression }}", "http://localhost:9090", "up") + require.NoError(t, err) + require.Equal(t, "http://localhost:9090/graph?expr=up", result1) + cachedTmpl := cache.tmpl + + // Same template string reuses the cached parsed template. + result2, err := executeGeneratorURLTemplate(cache, "{{ .ExternalURL }}/graph?expr={{ urlquery .Expression }}", "http://localhost:9090", "rate(http_requests_total[5m])") + require.NoError(t, err) + require.Equal(t, "http://localhost:9090/graph?expr=rate%28http_requests_total%5B5m%5D%29", result2) + require.Same(t, cachedTmpl, cache.tmpl, "expected cached template to be reused") + + // Different template string invalidates the cache. 
+ result3, err := executeGeneratorURLTemplate(cache, "{{ .ExternalURL }}/explore?expr={{ urlquery .Expression }}", "http://grafana:3000", "up") + require.NoError(t, err) + require.Equal(t, "http://grafana:3000/explore?expr=up", result3) + require.NotSame(t, cachedTmpl, cache.tmpl, "expected cache to be invalidated for new template string") +} + // Tests for whether the Ruler is able to recover ALERTS_FOR_STATE state func TestRecoverAlertsPostOutage(t *testing.T) { // Test Setup diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 36284110a05..a9b92c866c5 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -10,6 +10,7 @@ import ( "math" "regexp" "strings" + "text/template" "time" "github.com/cespare/xxhash/v2" @@ -217,13 +218,15 @@ type Limits struct { QueryRejection QueryRejection `yaml:"query_rejection" json:"query_rejection" doc:"nocli|description=Configuration for query rejection."` // Ruler defaults and limits. - RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` - RulerTenantShardSize float64 `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` - RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` - RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` - RulerQueryOffset model.Duration `yaml:"ruler_query_offset" json:"ruler_query_offset"` - RulerExternalLabels labels.Labels `yaml:"ruler_external_labels" json:"ruler_external_labels" doc:"nocli|description=external labels for alerting rules"` - RulesPartialData bool `yaml:"rules_partial_data" json:"rules_partial_data" doc:"nocli|description=Enable to allow rules to be evaluated with data from a single zone, if other zones are not available.|default=false"` + RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" 
json:"ruler_evaluation_delay_duration"` + RulerTenantShardSize float64 `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` + RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` + RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` + RulerQueryOffset model.Duration `yaml:"ruler_query_offset" json:"ruler_query_offset"` + RulerExternalLabels labels.Labels `yaml:"ruler_external_labels" json:"ruler_external_labels" doc:"nocli|description=external labels for alerting rules"` + RulerExternalURL string `yaml:"ruler_external_url" json:"ruler_external_url" doc:"nocli|description=Per-tenant external URL for the ruler. If set, it overrides the global -ruler.external.url for this tenant's alert notifications."` + RulerAlertGeneratorURLTemplate string `yaml:"ruler_alert_generator_url_template" json:"ruler_alert_generator_url_template" doc:"nocli|description=Go text/template for alert generator URLs. Available variables: .ExternalURL (resolved external URL) and .Expression (PromQL expression). Built-in functions like urlquery are available. If empty, uses default Prometheus /graph format."` + RulesPartialData bool `yaml:"rules_partial_data" json:"rules_partial_data" doc:"nocli|description=Enable to allow rules to be evaluated with data from a single zone, if other zones are not available.|default=false"` // Store-gateway. 
StoreGatewayTenantShardSize float64 `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -435,6 +438,12 @@ func (l *Limits) Validate(nameValidationScheme model.ValidationScheme, shardByAl } } + if l.RulerAlertGeneratorURLTemplate != "" { + if _, err := template.New("").Parse(l.RulerAlertGeneratorURLTemplate); err != nil { + return fmt.Errorf("invalid ruler_alert_generator_url_template: %w", err) + } + } + return nil } func (l *Limits) ValidateQueryLimits(userID string, closeIdleTSDBTimeout time.Duration) error { @@ -1205,6 +1214,14 @@ func (o *Overrides) RulerExternalLabels(userID string) labels.Labels { return o.GetOverridesForUser(userID).RulerExternalLabels } +func (o *Overrides) RulerExternalURL(userID string) string { + return o.GetOverridesForUser(userID).RulerExternalURL +} + +func (o *Overrides) RulerAlertGeneratorURLTemplate(userID string) string { + return o.GetOverridesForUser(userID).RulerAlertGeneratorURLTemplate +} + // MaxRegexPatternLength returns the maximum length of an unoptimized regex pattern. // This is only used in Ingester. func (o *Overrides) MaxRegexPatternLength(userID string) int { diff --git a/schemas/cortex-config-schema.json b/schemas/cortex-config-schema.json index feb746bdb22..84724f8d76e 100644 --- a/schemas/cortex-config-schema.json +++ b/schemas/cortex-config-schema.json @@ -5521,6 +5521,10 @@ "x-cli-flag": "frontend.results-cache-ttl", "x-format": "duration" }, + "ruler_alert_generator_url_template": { + "description": "Go text/template for alert generator URLs. Available variables: .ExternalURL (resolved external URL) and .Expression (PromQL expression). Built-in functions like urlquery are available. 
If empty, uses default Prometheus /graph format.", + "type": "string" + }, "ruler_evaluation_delay_duration": { "default": "0s", "description": "Deprecated(use ruler.query-offset instead) and will be removed in v1.19.0: Duration to delay the evaluation of rules to ensure the underlying metrics have been pushed to Cortex.", @@ -5534,6 +5538,10 @@ "description": "external labels for alerting rules", "type": "object" }, + "ruler_external_url": { + "description": "Per-tenant external URL for the ruler. If set, it overrides the global -ruler.external.url for this tenant's alert notifications.", + "type": "string" + }, "ruler_max_rule_groups_per_tenant": { "default": 0, "description": "Maximum number of rule groups per-tenant. 0 to disable.",