diff --git a/deno.json b/deno.json index c73a63da4..b67d3ee41 100644 --- a/deno.json +++ b/deno.json @@ -33,6 +33,7 @@ "./examples/astro", "./examples/fresh", "./examples/hono-sample", + "./examples/monitoring", "./examples/rfc-9421-test", "./test/smoke/harness" ], diff --git a/deno.lock b/deno.lock index 18f6f8f18..b1fe95475 100644 --- a/deno.lock +++ b/deno.lock @@ -9380,6 +9380,11 @@ "jsr:@hono/hono@^4.7.1" ] }, + "examples/monitoring": { + "dependencies": [ + "npm:@opentelemetry/api@^1.9.1" + ] + }, "examples/rfc-9421-test": { "dependencies": [ "jsr:@hono/hono@^4.7.1" diff --git a/docs/manual/monitoring.md b/docs/manual/monitoring.md index a6180e381..e34c0af71 100644 --- a/docs/manual/monitoring.md +++ b/docs/manual/monitoring.md @@ -29,9 +29,13 @@ are the integration points most backends share, not because Fedify prefers them. Everything here applies to any backend that ingests OTLP or scrapes Prometheus; where a vendor's setup begins, this guide stops and points you at their documentation. +The [runnable monitoring example] packages the Collector, Prometheus, Grafana, +alert rules, dashboard provisioning, and a small synthetic metric source into a +Docker Compose stack you can start locally. [Prometheus]: https://prometheus.io/ [OpenTelemetry Collector]: https://opentelemetry.io/docs/collector/ +[runnable monitoring example]: https://github.com/fedify-dev/fedify/tree/main/examples/monitoring Before you begin diff --git a/docs/manual/opentelemetry.md b/docs/manual/opentelemetry.md index 3ba5bd6c1..23246eb06 100644 --- a/docs/manual/opentelemetry.md +++ b/docs/manual/opentelemetry.md @@ -923,9 +923,13 @@ For turning these metrics into a production dashboard and alert rules, see the [*Production monitoring* guide](./monitoring.md). It maps the metrics above to the federation-health questions operators ask, with PromQL examples, the OpenTelemetry-to-Prometheus naming translation, and cardinality guidance for -dashboard and alert authors. 
+dashboard and alert authors. The +[runnable monitoring example] +contains a Docker Compose stack with Prometheus, Grafana, OpenTelemetry +Collector, provisioned alert rules, and a starter dashboard. [URI Template]: https://datatracker.ietf.org/doc/html/rfc6570 +[runnable monitoring example]: https://github.com/fedify-dev/fedify/tree/main/examples/monitoring Semantic [attributes] for ActivityPub diff --git a/examples/README.md b/examples/README.md index 44abf77d6..1927e26ec 100644 --- a/examples/README.md +++ b/examples/README.md @@ -17,6 +17,7 @@ added in the future.[^1] - [Hono integration sample](./hono-sample/) - [Fastify integration example](./fastify/) - [Fedify–Express integration example](./express/) + - [Fedify monitoring example](./monitoring/) - [Fedify–Next.js 16 integration example using `@fedify/next`](./next-integration/) - [Fedify–Next.js 14 integration example](./next14-app-router/) - [Fedify–Next.js 15 integration example](./next15-app-router/) diff --git a/examples/monitoring/README.md b/examples/monitoring/README.md new file mode 100644 index 000000000..139c66dd8 --- /dev/null +++ b/examples/monitoring/README.md @@ -0,0 +1,229 @@ +Fedify monitoring example +========================= + +This example starts a local monitoring stack for Fedify OpenTelemetry metrics: + + - OpenTelemetry Collector receives OTLP metrics on ports 4317 and 4318. + - Prometheus scrapes the Collector on port 9464 and loads example alert + rules. + - Grafana starts on port 3000 with a provisioned Prometheus data source and + the *Fedify overview* dashboard. + - A small Deno process emits synthetic Fedify-shaped metrics so the dashboard + is populated immediately. + +The sample process is not a Fedify application. It exists only to make the +stack observable before you connect your own app. + + +Prerequisites +------------- + +Install Docker Compose or a compatible implementation such as Podman Compose. 
+From the repository root, run: + +~~~~ sh +docker compose -f examples/monitoring/compose.yaml up +~~~~ + +Then open: + + - [Grafana] + - [Prometheus] + - [Collector Prometheus endpoint] + +Grafana anonymous viewer access is enabled for this local example only. Do not +copy that authentication setting into production. + +Stop the stack with: + +~~~~ sh +docker compose -f examples/monitoring/compose.yaml down +~~~~ + +[Grafana]: http://localhost:3000/d/fedify-overview/fedify-overview +[Prometheus]: http://localhost:9090/ +[Collector Prometheus endpoint]: http://localhost:9464/metrics + + +Validate the example +-------------------- + +The repository includes a validation script for the monitoring files: + +~~~~ sh +mise run test:monitoring +~~~~ + +That command checks the Deno scripts, Docker Compose file, Prometheus config, +Prometheus alert rules and rule tests, and OpenTelemetry Collector config. + +To also start the stack and verify that Prometheus, Grafana, the Collector +target, the provisioned dashboard, and the sample Fedify metrics are reachable: + +~~~~ sh +deno run -A examples/monitoring/validate.ts --smoke +~~~~ + +The smoke test uses a separate Compose project name and tears the stack down +afterward. + + +Connect a Fedify app +-------------------- + +Leave the monitoring stack running and point your application at the Collector. +For a Deno 2.4 or later application, the built-in OpenTelemetry exporter is the +shortest path: + +~~~~ sh +OTEL_DENO=true \ +OTEL_SERVICE_NAME=my-fedify-app \ +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 \ +OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf \ +deno run -A your_fedify_app.ts +~~~~ + +If your app runs in another Compose service on the same network, use the +service name instead of `localhost`: + +~~~~ sh +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +~~~~ + +For Node.js, Bun, or a custom SDK setup, configure an OpenTelemetry +`MeterProvider` and an OTLP metrics exporter before starting the Fedify server. 
+Fedify uses the global meter provider by default for most metrics, or the +explicit `meterProvider` option passed to `createFederation()`. Document +loader metrics (`activitypub.document.fetch`, `activitypub.document.cache`) +require an explicit `meterProvider`; see the [OpenTelemetry manual] for that +detail. + +Once your app is exporting, you can stop the synthetic sample service: + +~~~~ sh +docker compose -f examples/monitoring/compose.yaml stop sample-metrics +~~~~ + +[OpenTelemetry manual]: ../../docs/manual/opentelemetry.md + + +Metric compatibility +-------------------- + +This example is the runnable companion to the [production monitoring guide]. +That guide explains why the dashboard groups by bounded labels such as +`fedify.endpoint`, `fedify.queue.role`, `activitypub.processing.result`, +`activitypub.lookup.result`, and `activitypub.remote.host`, while avoiding raw +actor IDs, object IDs, inbox URLs, and full route parameter values. + +The dashboard and alert rules use the metric names documented in the +[OpenTelemetry manual]. They cover the Fedify metrics introduced and expanded +through the OpenTelemetry work tracked by issues such as [#316], [#619], +[#735], [#736], [#737], [#738], [#739], [#740], [#741], and [#742]. + +If you change the OpenTelemetry Collector Prometheus translation settings, or +if you export these metrics to a backend that keeps dots in metric and label +names, update the PromQL expressions accordingly. 
+ +[production monitoring guide]: ../../docs/manual/monitoring.md +[#316]: https://github.com/fedify-dev/fedify/issues/316 +[#619]: https://github.com/fedify-dev/fedify/issues/619 +[#735]: https://github.com/fedify-dev/fedify/issues/735 +[#736]: https://github.com/fedify-dev/fedify/issues/736 +[#737]: https://github.com/fedify-dev/fedify/issues/737 +[#738]: https://github.com/fedify-dev/fedify/issues/738 +[#739]: https://github.com/fedify-dev/fedify/issues/739 +[#740]: https://github.com/fedify-dev/fedify/issues/740 +[#741]: https://github.com/fedify-dev/fedify/issues/741 +[#742]: https://github.com/fedify-dev/fedify/issues/742 + + +What the dashboard shows +------------------------ + +The dashboard focuses on bounded, aggregate labels: + +HTTP request performance +: Request rate and p95 latency by `fedify.endpoint`. + +Queue health +: Queue depth, in-flight tasks, enqueue rate, completion rate, and task + latency by `fedify.queue.role`. + +Inbox processing +: p95 listener processing latency and inbound activity outcomes. With a + queued inbox this measures worker side effects, not the remote server's + HTTP wait time. + +Outbound delivery +: Outbox activity outcomes, delivery attempts split by success, failure + ratio, and permanent failures by HTTP status. + +Signature and lookup health +: Signature verification latency, key-fetch latency, document and key lookup + latency, lookup outcomes, verification failures, and public key lookup + failures. + +Peer discovery +: WebFinger and actor discovery outcomes and p95 latency. + +Resource context +: Process memory and CPU metrics emitted by the sample process. In a real + deployment, replace or extend these panels with runtime, database, cache, + queue backend, host, or platform metrics from your own instrumentation. + +The dashboard deliberately avoids raw actor IDs, object IDs, inbox URLs, full +remote URLs, and route parameter values as labels. Keep that property when +you adapt the JSON for your own deployment. 
+ + +Alert rules +----------- + +*prometheus-rules.yaml* contains starter rules for common Fedify production +symptoms: + + - Collector target down. + - Missing Fedify metrics from an expected target. + - Queue falling behind. + - Queue depth above an example threshold. + - Outbound delivery failure ratio above 20%. + - Permanent delivery failures. + - Remote `404`/`410` spikes. + - Sustained inbox processing latency. + - Signature verification failures. + - Discovery and public key lookup failures. + +The thresholds are examples, not Fedify defaults. Watch normal traffic before +you page on these values in production. Alerts that describe remote churn, +such as `404` and `410` spikes, are marked as investigation alerts rather than +paging alerts. + + +Troubleshooting +--------------- + +The dashboard is empty +: Check [Prometheus targets]. The `otel-collector` target should be up. + Then check whether `fedify_http_server_request_count_total` exists in + Prometheus. If it does not, confirm the app has OpenTelemetry enabled and + is exporting to the [Collector OTLP endpoint]. + +`fedify.queue.depth` is missing +: Queue depth is emitted only when the configured message queue backend + implements `MessageQueue.getDepth()`. Use the enqueue-versus-completion + panels when depth is unavailable. + +Ports are already in use +: Edit *compose.yaml* and change the host-side ports. The container + ports should stay the same unless you also update the related config files. + +Prometheus metric names look different +: This example uses the Collector's default Prometheus translation, where + dots become underscores, counters gain `_total`, and millisecond + histograms gain `_milliseconds_bucket`, `_milliseconds_sum`, and + `_milliseconds_count` series. If you change Collector translation options, + update dashboard and alert PromQL to match. 
+ +[Prometheus targets]: http://localhost:9090/targets +[Collector OTLP endpoint]: http://localhost:4318 diff --git a/examples/monitoring/compose.yaml b/examples/monitoring/compose.yaml new file mode 100644 index 000000000..66c596dd3 --- /dev/null +++ b/examples/monitoring/compose.yaml @@ -0,0 +1,69 @@ +name: fedify-monitoring + +services: + otel-collector: + image: docker.io/otel/opentelemetry-collector-contrib:0.154.0 + command: ["--config=/etc/otelcol-contrib/config.yaml"] + volumes: + - ./otel-collector.yaml:/etc/otelcol-contrib/config.yaml:ro,z + ports: + - "127.0.0.1:4317:4317" + - "127.0.0.1:4318:4318" + - "127.0.0.1:9464:9464" + + prometheus: + image: docker.io/prom/prometheus:v3.5.4 + command: + - --config.file=/etc/prometheus/prometheus.yaml + - --web.enable-lifecycle + volumes: + - ./prometheus.yaml:/etc/prometheus/prometheus.yaml:ro,z + - ./prometheus-rules.yaml:/etc/prometheus/prometheus-rules.yaml:ro,z + ports: + - "127.0.0.1:9090:9090" + depends_on: + - otel-collector + + grafana: + image: docker.io/grafana/grafana:13.0.2 + environment: + GF_AUTH_ANONYMOUS_ENABLED: "true" + GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer + GF_AUTH_BASIC_ENABLED: "false" + GF_AUTH_DISABLE_LOGIN_FORM: "true" + GF_ANALYTICS_CHECK_FOR_PLUGIN_UPDATES: "false" + GF_ANALYTICS_CHECK_FOR_UPDATES: "false" + GF_PLUGINS_PREINSTALL: "" + GF_PLUGINS_PREINSTALL_SYNC: "" + GF_USERS_DEFAULT_THEME: light + volumes: + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro,z + - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro,z + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro,z + ports: + - "127.0.0.1:3000:3000" + depends_on: + - prometheus + + sample-metrics: + image: docker.io/denoland/deno:2.7.13 + command: + - deno + - run + - --config + - /workspace/deno.json + - --node-modules-dir=none + - --allow-env + - --allow-net + - /workspace/sample-metrics.ts + environment: + OTEL_DENO: "true" + OTEL_EXPORTER_OTLP_ENDPOINT: 
http://otel-collector:4318 + OTEL_EXPORTER_OTLP_PROTOCOL: http/protobuf + OTEL_METRIC_EXPORT_INTERVAL: "1000" + OTEL_SERVICE_NAME: fedify-monitoring-sample + volumes: + - ./deno.json:/workspace/deno.json:ro,z + - ./sample-metrics.ts:/workspace/sample-metrics.ts:ro,z + depends_on: + - otel-collector diff --git a/examples/monitoring/deno.json b/examples/monitoring/deno.json new file mode 100644 index 000000000..8d35135a8 --- /dev/null +++ b/examples/monitoring/deno.json @@ -0,0 +1,5 @@ +{ + "imports": { + "@opentelemetry/api": "npm:@opentelemetry/api@^1.9.1" + } +} diff --git a/examples/monitoring/grafana/dashboards/fedify-overview.json b/examples/monitoring/grafana/dashboards/fedify-overview.json new file mode 100644 index 000000000..2a75ad588 --- /dev/null +++ b/examples/monitoring/grafana/dashboards/fedify-overview.json @@ -0,0 +1,1321 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations and Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "Fedify production monitoring guide", + "tooltip": "Open the Fedify manual chapter for dashboard and alert guidance.", + "type": "link", + "url": "https://fedify.dev/manual/monitoring" + } + ], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": 
"background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "count(fedify_http_server_request_count_total) or vector(0)", + "legendFormat": "series", + "refId": "A" + } + ], + "title": "Fedify metric series", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "down" + }, + "1": { + "color": "green", + "text": "up" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "up{job=\"otel-collector\"}", + "legendFormat": "collector", + "refId": "A" + } + ], + "title": "Collector scrape", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": 
"reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (fedify_endpoint) (rate(fedify_http_server_request_count_total[5m]))", + "legendFormat": "{{fedify_endpoint}}", + "refId": "A" + } + ], + "title": "HTTP request rate by Fedify endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, fedify_endpoint) (rate(fedify_http_server_request_duration_milliseconds_bucket[5m])))", + "legendFormat": "{{fedify_endpoint}} p95", + "refId": "A" + } + ], + "title": "HTTP request p95 latency by endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false 
+ }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "max by (fedify_queue_role) (fedify_queue_depth{fedify_queue_depth_state=\"queued\"})", + "legendFormat": "{{fedify_queue_role}} queued", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (fedify_queue_role) (fedify_queue_task_in_flight)", + "legendFormat": "{{fedify_queue_role}} in flight", + "refId": "B" + } + ], + "title": "Queue backlog and in-flight tasks", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (fedify_queue_role) (rate(fedify_queue_task_enqueued_total[5m]))", + "legendFormat": "{{fedify_queue_role}} enqueued", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (fedify_queue_role) (rate(fedify_queue_task_completed_total[5m]))", + "legendFormat": 
"{{fedify_queue_role}} completed", + "refId": "B" + } + ], + "title": "Queue enqueue and completion rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le) (rate(activitypub_inbox_processing_duration_milliseconds_bucket[5m])))", + "legendFormat": "inbox p95", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, fedify_queue_role) (rate(fedify_queue_task_duration_milliseconds_bucket[5m])))", + "legendFormat": "{{fedify_queue_role}} task p95", + "refId": "B" + } + ], + "title": "Inbox and queue task latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + 
"mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_delivery_success) (rate(activitypub_delivery_sent_total[5m]))", + "legendFormat": "success={{activitypub_delivery_success}}", + "refId": "A" + } + ], + "title": "Outbound delivery attempts by success", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.05 + }, + { + "color": "red", + "value": 0.2 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "((sum(rate(activitypub_delivery_sent_total{activitypub_delivery_success=\"false\"}[5m])) or vector(0)) / sum(rate(activitypub_delivery_sent_total[5m]))) and sum(rate(activitypub_delivery_sent_total[5m])) > 0", + "legendFormat": "failure ratio", + "refId": "A" + } + ], + "title": "Outbound failure ratio", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (http_response_status_code) (rate(activitypub_delivery_permanent_failure_total[5m]))", + "legendFormat": "{{http_response_status_code}}", + "refId": "A" + } + ], + "title": "Permanent delivery failures by status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, activitypub_signature_kind) (rate(activitypub_signature_verification_duration_milliseconds_bucket[5m])))", + "legendFormat": "{{activitypub_signature_kind}} verification", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, activitypub_signature_kind) (rate(activitypub_signature_key_fetch_duration_milliseconds_bucket[5m])))", + "legendFormat": "{{activitypub_signature_kind}} key fetch", + "refId": "B" + } + ], + "title": "Signature verification and key fetch 
p95 latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_verification_failure_reason) (rate(activitypub_signature_verification_failure_total[5m]))", + "legendFormat": "{{activitypub_verification_failure_reason}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_lookup_result) (rate(activitypub_key_lookup_total{activitypub_lookup_result=~\"not_found|invalid|network_error|error\"}[5m]))", + "legendFormat": "key {{activitypub_lookup_result}}", + "refId": "B" + } + ], + "title": "Signature and public key failures", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + 
"sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (webfinger_lookup_result) (rate(webfinger_lookup_total[5m]))", + "legendFormat": "webfinger {{webfinger_lookup_result}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_actor_discovery_result) (rate(activitypub_actor_discovery_total[5m]))", + "legendFormat": "actor {{activitypub_actor_discovery_result}}", + "refId": "B" + } + ], + "title": "Peer discovery and WebFinger outcomes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, webfinger_lookup_result) (rate(webfinger_lookup_duration_milliseconds_bucket[5m])))", + "legendFormat": "webfinger {{webfinger_lookup_result}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, activitypub_actor_discovery_result) (rate(activitypub_actor_discovery_duration_milliseconds_bucket[5m])))", + "legendFormat": "actor {{activitypub_actor_discovery_result}}", + "refId": "B" + } + ], + "title": "Peer discovery and WebFinger p95 latency", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_processing_result) (rate(activitypub_inbox_activity_total[5m]))", + "legendFormat": "{{activitypub_processing_result}}", + "refId": "A" + } + ], + "title": "Inbox activity outcomes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_processing_result) (rate(activitypub_outbox_activity_total[5m]))", + "legendFormat": "{{activitypub_processing_result}}", + "refId": "A" + } + ], + "title": "Outbox activity outcomes", + "type": "timeseries" 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, activitypub_lookup_kind) (rate(activitypub_key_lookup_duration_milliseconds_bucket[5m])))", + "legendFormat": "key {{activitypub_lookup_kind}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum by (le, activitypub_lookup_kind) (rate(activitypub_document_fetch_duration_milliseconds_bucket[5m])))", + "legendFormat": "document {{activitypub_lookup_kind}}", + "refId": "B" + } + ], + "title": "Document and key lookup p95 latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 52 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_lookup_result) (rate(activitypub_document_fetch_total[5m]))", + "legendFormat": "document {{activitypub_lookup_result}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (activitypub_lookup_result) (rate(activitypub_key_lookup_total[5m]))", + "legendFormat": "key {{activitypub_lookup_result}}", + "refId": "B" + } + ], + "title": "Document and key lookup outcomes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (process_memory_type) (process_memory_usage_bytes)", + "legendFormat": "{{process_memory_type}}", + "refId": "A" + } + ], + "title": "Process memory context", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 60 + }, + "id": 20, + "options": { + 
"legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "13.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (process_cpu_state) (rate(process_cpu_time_seconds_total[5m]))", + "legendFormat": "{{process_cpu_state}}", + "refId": "A" + } + ], + "title": "Process CPU context", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "fedify", + "activitypub", + "opentelemetry", + "prometheus" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Fedify overview", + "uid": "fedify-overview", + "version": 1, + "weekStart": "" +} diff --git a/examples/monitoring/grafana/provisioning/dashboards/dashboards.yaml b/examples/monitoring/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 000000000..4a5f38677 --- /dev/null +++ b/examples/monitoring/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: Fedify dashboards + orgId: 1 + folder: Fedify + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards diff --git a/examples/monitoring/grafana/provisioning/datasources/datasources.yaml b/examples/monitoring/grafana/provisioning/datasources/datasources.yaml new file mode 100644 index 000000000..f34f58d38 --- /dev/null +++ b/examples/monitoring/grafana/provisioning/datasources/datasources.yaml @@ -0,0 +1,15 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + httpMethod: POST + prometheusType: Prometheus + prometheusVersion: 3.5.4 + timeInterval: 5s diff --git 
a/examples/monitoring/otel-collector.yaml b/examples/monitoring/otel-collector.yaml new file mode 100644 index 000000000..7c13d5328 --- /dev/null +++ b/examples/monitoring/otel-collector.yaml @@ -0,0 +1,23 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + +exporters: + prometheus: + endpoint: 0.0.0.0:9464 + add_metric_suffixes: true + +service: + pipelines: + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus] diff --git a/examples/monitoring/prometheus-rules.test.yaml b/examples/monitoring/prometheus-rules.test.yaml new file mode 100644 index 000000000..170344fe8 --- /dev/null +++ b/examples/monitoring/prometheus-rules.test.yaml @@ -0,0 +1,52 @@ +rule_files: + - /workspace/prometheus-rules.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: 'activitypub_delivery_sent_total{activitypub_delivery_success="false"}' + values: "0+1x30" + - series: 'activitypub_delivery_sent_total{activitypub_delivery_success="true"}' + values: "0+1x30" + alert_rule_test: + - eval_time: 20m + alertname: FedifyOutboundDeliveryFailing + exp_alerts: + - exp_labels: + severity: page + exp_annotations: + summary: "Over 20% of outbound delivery attempts are failing" + description: "A sustained delivery failure ratio across all peers usually points at local egress, DNS, proxy, or signing problems." + guidance: "Paging alert: inspect delivery spans, network egress, and recent key or signature changes." + + - interval: 1m + input_series: + - series: 'webfinger_lookup_total{webfinger_lookup_result="error"}' + values: "0+3x40" + alert_rule_test: + - eval_time: 30m + alertname: FedifyDiscoveryFailures + exp_alerts: + - exp_labels: + severity: ticket + exp_annotations: + summary: "Sustained WebFinger or actor discovery failures" + description: "Actor discovery is failing often enough to affect federation with new or refreshed peers." 
+ guidance: "Investigation alert: check remote DNS, redirects, malformed WebFinger responses, and outbound fetch policy." + + - interval: 1m + input_series: + - series: 'activitypub_actor_discovery_total{activitypub_actor_discovery_result="error"}' + values: "0+3x40" + alert_rule_test: + - eval_time: 30m + alertname: FedifyDiscoveryFailures + exp_alerts: + - exp_labels: + severity: ticket + exp_annotations: + summary: "Sustained WebFinger or actor discovery failures" + description: "Actor discovery is failing often enough to affect federation with new or refreshed peers." + guidance: "Investigation alert: check remote DNS, redirects, malformed WebFinger responses, and outbound fetch policy." diff --git a/examples/monitoring/prometheus-rules.yaml b/examples/monitoring/prometheus-rules.yaml new file mode 100644 index 000000000..40462686d --- /dev/null +++ b/examples/monitoring/prometheus-rules.yaml @@ -0,0 +1,157 @@ +groups: + - name: fedify.rules + rules: + - alert: FedifyCollectorTargetDown + expr: up{job="otel-collector"} == 0 + for: 2m + labels: + severity: page + annotations: + summary: "Fedify metrics scrape target is down" + description: "Prometheus cannot scrape the OpenTelemetry Collector endpoint that exposes Fedify metrics." + guidance: "Paging alert: check the Collector process, network path, and Prometheus scrape configuration." + + - alert: FedifyMetricsMissing + expr: absent(fedify_http_server_request_count_total) + for: 10m + labels: + severity: page + annotations: + summary: "Fedify metrics are missing" + description: "No Fedify HTTP server metrics have been seen for 10 minutes." + guidance: "Paging alert when this target should be serving traffic: check OpenTelemetry SDK startup and OTLP export settings." 
+ + - alert: FedifyQueueFallingBehind + expr: | + sum by (fedify_queue_role) (rate(fedify_queue_task_enqueued_total[10m])) + - ( + sum by (fedify_queue_role) (rate(fedify_queue_task_completed_total[10m])) + or sum by (fedify_queue_role) (rate(fedify_queue_task_enqueued_total[10m])) * 0 + ) + > 0 + for: 30m + labels: + severity: page + annotations: + summary: "Fedify {{ $labels.fedify_queue_role }} queue is not draining" + description: "Task enqueue rate has stayed above completion rate for 30 minutes." + guidance: "Paging alert: add worker capacity or investigate the queue backend and slow listener or delivery dependencies." + + - alert: FedifyQueueDepthHigh + expr: | + max by (fedify_queue_role) ( + fedify_queue_depth{fedify_queue_depth_state="queued"} + ) > 1000 + for: 15m + labels: + severity: ticket + annotations: + summary: "Fedify {{ $labels.fedify_queue_role }} queue depth is high" + description: "The queue backend reports more than 1000 queued messages for 15 minutes." + guidance: "Investigation alert: replace this example threshold with a value learned from your normal traffic." + + - alert: FedifyOutboundDeliveryFailing + expr: | + ( + (sum(rate(activitypub_delivery_sent_total{ + activitypub_delivery_success="false" + }[5m])) or vector(0)) + / sum(rate(activitypub_delivery_sent_total[5m])) + ) + > 0.2 + and sum(rate(activitypub_delivery_sent_total[5m])) > 0 + for: 10m + labels: + severity: page + annotations: + summary: "Over 20% of outbound delivery attempts are failing" + description: "A sustained delivery failure ratio across all peers usually points at local egress, DNS, proxy, or signing problems." + guidance: "Paging alert: inspect delivery spans, network egress, and recent key or signature changes." 
+ + - alert: FedifyPermanentDeliveryFailures + expr: | + sum(rate(activitypub_delivery_permanent_failure_total[5m])) > 1 + for: 15m + labels: + severity: ticket + annotations: + summary: "Permanent ActivityPub delivery failures increased" + description: "Remote servers are rejecting deliveries with permanent-failure status codes." + guidance: "Investigation alert: break down by http_response_status_code and use topk by activitypub_remote_host only while investigating." + + - alert: FedifyRemoteGoneSpike + expr: | + sum(increase(activitypub_delivery_permanent_failure_total{ + http_response_status_code=~"404|410" + }[1h])) > 50 + labels: + severity: ticket + annotations: + summary: "Elevated 404/410 responses from remote inboxes" + description: "A remote instance may have removed accounts, changed paths, or gone away." + guidance: "Investigation alert, not a page: consider pruning orphaned follower records after confirming the affected peers." + + - alert: FedifyInboxLatencyHigh + expr: | + histogram_quantile( + 0.95, + sum by (le) ( + rate(activitypub_inbox_processing_duration_milliseconds_bucket[5m]) + ) + ) > 2000 + for: 15m + labels: + severity: page + annotations: + summary: "Inbox processing p95 above 2s for 15 minutes" + description: "Inbox listener side effects are slow. Behind a queue this affects eventual processing, not the remote HTTP response." + guidance: "Paging alert: compare with queue backlog and signature key fetch duration to find the slow dependency." + + - alert: FedifySignatureVerificationFailures + expr: | + sum by (activitypub_verification_failure_reason) ( + increase(activitypub_signature_verification_failure_total[5m]) + ) > 10 + for: 15m + labels: + severity: page + annotations: + summary: "Sustained signature verification failures" + description: "Fedify is rejecting inbound signed activities for reason {{ $labels.activitypub_verification_failure_reason }}." 
+ guidance: "Paging alert when broad across peers: check clock drift, actor key rotation, and remote key fetch failures." + + - alert: FedifyDiscoveryFailures + expr: | + ( + sum(increase(webfinger_lookup_total{ + webfinger_lookup_result=~"invalid|network_error|error" + }[10m])) or vector(0) + ) + + ( + sum(increase(activitypub_actor_discovery_total{ + activitypub_actor_discovery_result!="resolved" + }[10m])) or vector(0) + ) + > 20 + for: 15m + labels: + severity: ticket + annotations: + summary: "Sustained WebFinger or actor discovery failures" + description: "Actor discovery is failing often enough to affect federation with new or refreshed peers." + guidance: "Investigation alert: check remote DNS, redirects, malformed WebFinger responses, and outbound fetch policy." + + - alert: FedifyKeyLookupFailures + expr: | + sum by (activitypub_lookup_result) ( + increase(activitypub_key_lookup_total{ + activitypub_lookup_result=~"not_found|invalid|network_error|error" + }[10m]) + ) > 10 + for: 15m + labels: + severity: page + annotations: + summary: "Sustained public key lookup failures" + description: "Public key lookup failures with result {{ $labels.activitypub_lookup_result }} are high enough to break signature verification." + guidance: "Paging alert if broad across peers: check outbound network access, document loader configuration, and key cache behavior." 
diff --git a/examples/monitoring/prometheus.yaml b/examples/monitoring/prometheus.yaml new file mode 100644 index 000000000..e01285479 --- /dev/null +++ b/examples/monitoring/prometheus.yaml @@ -0,0 +1,15 @@ +global: + scrape_interval: 5s + evaluation_interval: 5s + +rule_files: + - /etc/prometheus/prometheus-rules.yaml + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ["localhost:9090"] + + - job_name: otel-collector + static_configs: + - targets: ["otel-collector:9464"] diff --git a/examples/monitoring/sample-metrics.ts b/examples/monitoring/sample-metrics.ts new file mode 100644 index 000000000..ef83be388 --- /dev/null +++ b/examples/monitoring/sample-metrics.ts @@ -0,0 +1,465 @@ +import { metrics } from "@opentelemetry/api"; + +const meter = metrics.getMeter("fedify.monitoring.example", "1.0.0"); + +const createType = "https://www.w3.org/ns/activitystreams#Create"; +const followType = "https://www.w3.org/ns/activitystreams#Follow"; +const noteType = "https://www.w3.org/ns/activitystreams#Note"; + +const httpRequestCount = meter.createCounter( + "fedify.http.server.request.count", + { unit: "{request}" }, +); +const httpRequestDuration = meter.createHistogram( + "fedify.http.server.request.duration", + { unit: "ms" }, +); +const queueTaskEnqueued = meter.createCounter( + "fedify.queue.task.enqueued", + { unit: "{task}" }, +); +const queueTaskStarted = meter.createCounter( + "fedify.queue.task.started", + { unit: "{task}" }, +); +const queueTaskCompleted = meter.createCounter( + "fedify.queue.task.completed", + { unit: "{task}" }, +); +const queueTaskFailed = meter.createCounter( + "fedify.queue.task.failed", + { unit: "{task}" }, +); +const queueTaskDuration = meter.createHistogram( + "fedify.queue.task.duration", + { unit: "ms" }, +); +const queueTaskInFlight = meter.createUpDownCounter( + "fedify.queue.task.in_flight", + { unit: "{task}" }, +); +const queueDepth = meter.createObservableGauge( + "fedify.queue.depth", + { unit: 
"{message}" }, +); +const deliverySent = meter.createCounter( + "activitypub.delivery.sent", + { unit: "{attempt}" }, +); +const deliveryPermanentFailure = meter.createCounter( + "activitypub.delivery.permanent_failure", + { unit: "{failure}" }, +); +const deliveryDuration = meter.createHistogram( + "activitypub.delivery.duration", + { unit: "ms" }, +); +const inboxActivity = meter.createCounter( + "activitypub.inbox.activity", + { unit: "{activity}" }, +); +const inboxProcessingDuration = meter.createHistogram( + "activitypub.inbox.processing_duration", + { unit: "ms" }, +); +const outboxActivity = meter.createCounter( + "activitypub.outbox.activity", + { unit: "{activity}" }, +); +const fanoutRecipients = meter.createHistogram( + "activitypub.fanout.recipients", + { unit: "{recipient}" }, +); +const circuitBreakerStateChange = meter.createCounter( + "activitypub.circuit_breaker.state_change", + { unit: "{change}" }, +); +const signatureVerificationFailure = meter.createCounter( + "activitypub.signature.verification_failure", + { unit: "{failure}" }, +); +const signatureVerificationDuration = meter.createHistogram( + "activitypub.signature.verification.duration", + { unit: "ms" }, +); +const signatureKeyFetchDuration = meter.createHistogram( + "activitypub.signature.key_fetch.duration", + { unit: "ms" }, +); +const keyLookup = meter.createCounter( + "activitypub.key.lookup", + { unit: "{lookup}" }, +); +const keyLookupDuration = meter.createHistogram( + "activitypub.key.lookup.duration", + { unit: "ms" }, +); +const documentFetch = meter.createCounter( + "activitypub.document.fetch", + { unit: "{fetch}" }, +); +const documentFetchDuration = meter.createHistogram( + "activitypub.document.fetch.duration", + { unit: "ms" }, +); +const objectLookup = meter.createCounter( + "activitypub.object.lookup", + { unit: "{lookup}" }, +); +const actorDiscovery = meter.createCounter( + "activitypub.actor.discovery", + { unit: "{discovery}" }, +); +const actorDiscoveryDuration = 
meter.createHistogram( + "activitypub.actor.discovery.duration", + { unit: "ms" }, +); +const webFingerLookup = meter.createCounter( + "webfinger.lookup", + { unit: "{lookup}" }, +); +const webFingerLookupDuration = meter.createHistogram( + "webfinger.lookup.duration", + { unit: "ms" }, +); +const webFingerHandle = meter.createCounter( + "webfinger.handle", + { unit: "{request}" }, +); +const webFingerHandleDuration = meter.createHistogram( + "webfinger.handle.duration", + { unit: "ms" }, +); +const processMemoryUsage = meter.createObservableGauge( + "process.memory.usage", + { unit: "By" }, +); +const processCpuTime = meter.createCounter( + "process.cpu.time", + { unit: "s" }, +); + +type QueueRole = "inbox" | "outbox" | "fanout"; +type QueueState = "queued" | "ready" | "delayed"; + +const queueRoles: QueueRole[] = ["inbox", "outbox", "fanout"]; +const queueStates: QueueState[] = ["queued", "ready", "delayed"]; +const queueDepthValues = new Map(); +const inFlightValues = new Map( + queueRoles.map((role) => [role, 0]), +); + +queueDepth.addCallback((observableResult) => { + for (const role of queueRoles) { + for (const state of queueStates) { + observableResult.observe( + queueDepthValues.get(`${role}:${state}`) ?? 
0, + { + "fedify.queue.role": role, + "fedify.queue.backend": "InProcessMessageQueue", + "fedify.queue.depth.state": state, + "fedify.federation.instance_id": "sample", + }, + ); + } + } +}); + +processMemoryUsage.addCallback((observableResult) => { + const usage = Deno.memoryUsage(); + observableResult.observe(usage.rss, { "process.memory.type": "rss" }); + observableResult.observe( + usage.heapUsed, + { "process.memory.type": "heap_used" }, + ); + observableResult.observe( + usage.heapTotal, + { "process.memory.type": "heap_total" }, + ); +}); + +let tick = 0; + +function wave(base: number, amplitude: number, divisor: number): number { + return Math.max(0, Math.round(base + Math.sin(tick / divisor) * amplitude)); +} + +function recordQueueDepths(): void { + for (const role of queueRoles) { + const roleOffset = role === "inbox" ? 0 : role === "outbox" ? 8 : 3; + queueDepthValues.set(`${role}:queued`, wave(24 + roleOffset, 10, 5)); + queueDepthValues.set(`${role}:ready`, wave(6 + roleOffset / 2, 3, 4)); + queueDepthValues.set(`${role}:delayed`, wave(3 + roleOffset / 4, 2, 7)); + + const nextInFlight = wave(role === "outbox" ? 5 : 2, 2, 3); + const previousInFlight = inFlightValues.get(role) ?? 0; + queueTaskInFlight.add(nextInFlight - previousInFlight, { + "fedify.queue.role": role, + "fedify.queue.backend": "InProcessMessageQueue", + }); + inFlightValues.set(role, nextInFlight); + } +} + +function recordHttpMetrics(): void { + const requests = [ + { method: "GET", endpoint: "actor", route: "/users/{identifier}", ms: 18 }, + { + method: "POST", + endpoint: "inbox", + route: "/users/{identifier}/inbox", + ms: 85, + }, + { + method: "GET", + endpoint: "webfinger", + route: "/.well-known/webfinger", + ms: 12, + }, + { + method: "GET", + endpoint: "outbox", + route: "/users/{identifier}/outbox", + ms: 34, + }, + ]; + for (const request of requests) { + const statusCode = tick % 31 === 0 && request.endpoint === "inbox" + ? 
500 + : 200; + const attrs = { + "http.request.method": request.method, + "fedify.endpoint": request.endpoint, + "fedify.route.template": request.route, + "http.response.status_code": statusCode, + }; + httpRequestCount.add(1, attrs); + httpRequestDuration.record(request.ms + wave(0, 8, 3), attrs); + } +} + +function recordQueueMetrics(): void { + for (const role of queueRoles) { + const attrs = { + "fedify.queue.role": role, + "fedify.queue.backend": "InProcessMessageQueue", + "activitypub.activity.type": role === "inbox" ? followType : createType, + }; + queueTaskEnqueued.add(role === "outbox" ? 3 : 1, { + ...attrs, + "fedify.queue.task.attempt": tick % 17 === 0 ? 1 : 0, + }); + queueTaskStarted.add(role === "outbox" ? 2 : 1, attrs); + queueTaskCompleted.add(role === "outbox" ? 2 : 1, { + ...attrs, + "fedify.queue.task.result": "completed", + }); + queueTaskDuration.record(role === "outbox" ? 140 + wave(0, 35, 4) : 45, { + ...attrs, + "fedify.queue.task.result": "completed", + }); + } + + if (tick % 23 === 0) { + queueTaskFailed.add(1, { + "fedify.queue.role": "outbox", + "fedify.queue.backend": "InProcessMessageQueue", + "activitypub.activity.type": createType, + "fedify.queue.task.result": "failed", + }); + } +} + +function recordActivityMetrics(): void { + inboxActivity.add(2, { + "activitypub.processing.result": "processed", + "activitypub.activity.type": followType, + }); + inboxProcessingDuration.record(55 + wave(0, 25, 4), { + "activitypub.activity.type": followType, + }); + if (tick % 19 === 0) { + inboxActivity.add(1, { + "activitypub.processing.result": "rejected", + "activitypub.activity.type": noteType, + }); + } + + outboxActivity.add(3, { + "activitypub.processing.result": "queued", + "activitypub.activity.type": createType, + }); + if (tick % 13 === 0) { + outboxActivity.add(1, { + "activitypub.processing.result": "retried", + "activitypub.activity.type": createType, + }); + } + fanoutRecipients.record(4 + (tick % 8), { + 
"activitypub.activity.type": createType, + }); +} + +function recordDeliveryMetrics(): void { + const hosts = ["mastodon.example", "pixelfed.example", "misskey.example"]; + for (const host of hosts) { + const success = !(tick % 11 === 0 && host === "misskey.example"); + const attrs = { + "activitypub.remote.host": host, + "activitypub.delivery.success": success, + "activitypub.activity.type": createType, + }; + deliverySent.add(host === "mastodon.example" ? 4 : 1, attrs); + deliveryDuration.record(success ? 130 + wave(0, 40, 6) : 950, attrs); + } + + if (tick % 29 === 0) { + deliveryPermanentFailure.add(1, { + "activitypub.remote.host": "gone.example", + "http.response.status_code": 410, + }); + } + if (tick % 31 === 0) { + circuitBreakerStateChange.add(1, { + "activitypub.remote.host": "slow.example", + "activitypub.circuit_breaker.state": "open", + }); + } +} + +function recordSignatureAndLookupMetrics(): void { + signatureVerificationDuration.record(4 + wave(0, 3, 3), { + "activitypub.signature.kind": "http", + "activitypub.signature.result": "verified", + "http_signatures.algorithm": "rsa-sha256", + }); + signatureKeyFetchDuration.record(18 + wave(0, 7, 5), { + "activitypub.signature.kind": "http", + "activitypub.signature.key_fetch.result": tick % 5 === 0 + ? 
"fetched" + : "hit", + }); + if (tick % 37 === 0) { + signatureVerificationFailure.add(1, { + "activitypub.verification.failure_reason": "keyFetchError", + "activitypub.remote.host": "keys.example", + }); + signatureVerificationDuration.record(260, { + "activitypub.signature.kind": "http", + "activitypub.signature.result": "rejected", + "http_signatures.failure_reason": "keyFetchError", + }); + } + + keyLookup.add(1, { + "activitypub.lookup.kind": "public_key", + "activitypub.lookup.result": "fetched", + "activitypub.remote.host": "keys.example", + "activitypub.cache.enabled": true, + "http.response.status_code": 200, + }); + keyLookupDuration.record(24 + wave(0, 9, 4), { + "activitypub.lookup.kind": "public_key", + "activitypub.lookup.result": "fetched", + "activitypub.remote.host": "keys.example", + "activitypub.cache.enabled": true, + "http.response.status_code": 200, + }); + + documentFetch.add(1, { + "activitypub.lookup.kind": "object", + "activitypub.lookup.result": "fetched", + "activitypub.remote.host": "objects.example", + "activitypub.cache.enabled": true, + "http.response.status_code": 200, + }); + documentFetchDuration.record(42 + wave(0, 12, 6), { + "activitypub.lookup.kind": "object", + "activitypub.lookup.result": "fetched", + "activitypub.remote.host": "objects.example", + "activitypub.cache.enabled": true, + "http.response.status_code": 200, + }); + objectLookup.add(1, { + "activitypub.lookup.kind": tick % 4 === 0 ? 
"actor" : "object", + "activitypub.remote.host": "objects.example", + }); +} + +function recordDiscoveryMetrics(): void { + actorDiscovery.add(1, { + "activitypub.actor.discovery.result": "resolved", + "activitypub.remote.host": "mastodon.example", + }); + actorDiscoveryDuration.record(60 + wave(0, 20, 5), { + "activitypub.actor.discovery.result": "resolved", + "activitypub.remote.host": "mastodon.example", + }); + webFingerLookup.add(1, { + "webfinger.lookup.result": "found", + "webfinger.resource.scheme": "acct", + "activitypub.remote.host": "mastodon.example", + "http.response.status_code": 200, + }); + webFingerLookupDuration.record(38 + wave(0, 14, 5), { + "webfinger.lookup.result": "found", + "webfinger.resource.scheme": "acct", + "activitypub.remote.host": "mastodon.example", + "http.response.status_code": 200, + }); + webFingerHandle.add(1, { + "webfinger.handle.result": "resolved", + "webfinger.resource.scheme": "acct", + "http.response.status_code": 200, + }); + webFingerHandleDuration.record(10 + wave(0, 4, 4), { + "webfinger.handle.result": "resolved", + "webfinger.resource.scheme": "acct", + "http.response.status_code": 200, + }); + + if (tick % 41 === 0) { + actorDiscovery.add(1, { + "activitypub.actor.discovery.result": "not_found", + "activitypub.remote.host": "missing.example", + }); + webFingerLookup.add(1, { + "webfinger.lookup.result": "not_found", + "webfinger.resource.scheme": "acct", + "activitypub.remote.host": "missing.example", + "http.response.status_code": 404, + }); + } +} + +function recordResourceMetrics(): void { + processCpuTime.add(0.02 + (tick % 5) / 100, { + "process.cpu.state": "user", + }); + processCpuTime.add(0.01, { "process.cpu.state": "system" }); +} + +function recordAll(): void { + tick++; + recordQueueDepths(); + recordHttpMetrics(); + recordQueueMetrics(); + recordActivityMetrics(); + recordDeliveryMetrics(); + recordSignatureAndLookupMetrics(); + recordDiscoveryMetrics(); + recordResourceMetrics(); +} + +recordAll(); 
+const interval = setInterval(recordAll, 1000);
+
+if (Deno.build.os !== "windows") {
+  Deno.addSignalListener("SIGTERM", () => {
+    clearInterval(interval);
+    Deno.exit(0);
+  });
+}
+
+console.log("Fedify monitoring sample metrics are being exported over OTLP.");
+await new Promise(() => {});
diff --git a/examples/monitoring/validate.ts b/examples/monitoring/validate.ts
new file mode 100644
index 000000000..77b365afd
--- /dev/null
+++ b/examples/monitoring/validate.ts
@@ -0,0 +1,393 @@
+import { dirname, fromFileUrl, join } from "@std/path";
+
+// Paths to the example's configuration files, resolved next to this script.
+const exampleDir = dirname(fromFileUrl(import.meta.url));
+const composeFile = join(exampleDir, "compose.yaml");
+const prometheusConfig = join(exampleDir, "prometheus.yaml");
+const prometheusRules = join(exampleDir, "prometheus-rules.yaml");
+const prometheusRuleTests = join(exampleDir, "prometheus-rules.test.yaml");
+const collectorConfig = join(exampleDir, "otel-collector.yaml");
+const dashboardFile = join(
+  exampleDir,
+  "grafana",
+  "dashboards",
+  "fedify-overview.json",
+);
+const sampleMetrics = join(exampleDir, "sample-metrics.ts");
+const validateScript = join(exampleDir, "validate.ts");
+
+// Image whose promtool binary runs the offline Prometheus checks.
+const prometheusImage = "docker.io/prom/prometheus:v3.5.4";
+
+// Compose project name passed with -p for the smoke stack.
+const projectName = "fedify-monitoring-validate";
+// Smoke checks run only when --smoke is passed on the command line.
+const smoke = Deno.args.includes("--smoke");
+// Abort timeout applied to each HTTP probe.
+const requestTimeoutMs = 5_000;
+
+/** Options accepted by {@link run}. */
+interface RunOptions {
+  /** Working directory for the child process. */
+  cwd?: string;
+  /** Return normally instead of throwing when the command exits non-zero. */
+  noThrow?: boolean;
+  /** Do not echo the command's output after it completes. */
+  quiet?: boolean;
+}
+
+/**
+ * Runs a command to completion, logging the label and command line first.
+ *
+ * @param label Human-readable step name used in logs and error messages.
+ * @param command Executable to spawn.
+ * @param args Arguments passed to the executable.
+ * @param options See {@link RunOptions}.
+ * @returns The decoded stdout followed by the decoded stderr.
+ * @throws {Error} If the command exits non-zero and `noThrow` is not set; the
+ *   captured output is printed to stderr first.
+ */
+async function run(
+  label: string,
+  command: string,
+  args: string[],
+  options: RunOptions = {},
+): Promise<string> {
+  console.log(`\n> ${label}`);
+  console.log(` ${command} ${args.join(" ")}`);
+  const output = await new Deno.Command(command, {
+    args,
+    cwd: options.cwd,
+    stdout: "piped",
+    stderr: "piped",
+  }).output();
+  const decoder = new TextDecoder();
+  const stdout = decoder.decode(output.stdout);
+  const stderr = decoder.decode(output.stderr);
+  if (!output.success && !options.noThrow) {
+    if (stdout.trim()) console.error(stdout.trim());
+    if (stderr.trim()) console.error(stderr.trim());
+    throw new Error(`${label} failed with exit code ${output.code}`);
+  }
+  if (!options.quiet) {
+    if (stdout.trim()) console.log(stdout.trim());
+    if (stderr.trim()) console.error(stderr.trim());
+  }
+  return stdout + stderr;
+}
+
+/** Verifies that `docker --version` and `docker compose version` succeed. */
+async function checkTools(): Promise<void> {
+  await run("Check Docker", "docker", ["--version"]);
+  await run("Check Docker Compose", "docker", ["compose", "version"]);
+}
+
+/**
+ * Type-checks the scripts and validates the Compose, Prometheus, and
+ * Collector configuration without starting the stack.
+ */
+async function staticChecks(): Promise<void> {
+  await run("Deno check sample metric generator", Deno.execPath(), [
+    "check",
+    "--config",
+    join(exampleDir, "deno.json"),
+    sampleMetrics,
+  ]);
+  await run("Deno check validation script", Deno.execPath(), [
+    "check",
+    validateScript,
+  ]);
+  await run("Docker Compose config", "docker", [
+    "compose",
+    "-f",
+    composeFile,
+    "config",
+  ]);
+  await run("Prometheus config", "docker", [
+    "run",
+    "--rm",
+    "-v",
+    `${prometheusConfig}:/etc/prometheus/prometheus.yaml:ro,z`,
+    "-v",
+    `${prometheusRules}:/etc/prometheus/prometheus-rules.yaml:ro,z`,
+    "--entrypoint",
+    "promtool",
+    prometheusImage,
+    "check",
+    "config",
+    "/etc/prometheus/prometheus.yaml",
+  ]);
+  await run("Prometheus rules", "docker", [
+    "run",
+    "--rm",
+    "-v",
+    `${prometheusRules}:/workspace/prometheus-rules.yaml:ro,z`,
+    "--entrypoint",
+    "promtool",
+    prometheusImage,
+    "check",
+    "rules",
+    "/workspace/prometheus-rules.yaml",
+  ]);
+  await run("Prometheus rule tests", "docker", [
+    "run",
+    "--rm",
+    "-v",
+    `${prometheusRules}:/workspace/prometheus-rules.yaml:ro,z`,
+    "-v",
+    `${prometheusRuleTests}:/workspace/prometheus-rules.test.yaml:ro,z`,
+    "--entrypoint",
+    "promtool",
+    prometheusImage,
+    "test",
+    "rules",
+    "/workspace/prometheus-rules.test.yaml",
+  ]);
+  await run("OpenTelemetry Collector config", "docker", [
+    "run",
+    "--rm",
+    "-v",
+    `${collectorConfig}:/etc/otelcol-contrib/config.yaml:ro,z`,
+    "docker.io/otel/opentelemetry-collector-contrib:0.154.0",
+    "validate",
+    "--config=/etc/otelcol-contrib/config.yaml",
+  ]);
+}
+
+/**
+ * Polls `check` with a one-second pause between attempts until it is true.
+ *
+ * @param label Name printed when ready and used in the timeout error.
+ * @param check Probe to run; a thrown error counts as "not ready yet".
+ * @param timeoutMs Overall deadline in milliseconds.
+ * @throws {Error} If the deadline passes; includes the last probe error.
+ */
+async function waitFor(
+  label: string,
+  check: () => Promise<boolean>,
+  timeoutMs = 120_000,
+): Promise<void> {
+  const deadline = Date.now() + timeoutMs;
+  let lastError: unknown;
+  while (Date.now() < deadline) {
+    try {
+      if (await check()) {
+        console.log(`${label}: ready`);
+        return;
+      }
+    } catch (error) {
+      lastError = error;
+    }
+    await new Promise((resolve) => setTimeout(resolve, 1000));
+  }
+  throw new Error(
+    `${label} did not become ready within ${timeoutMs} ms` +
+      (lastError == null ? "" : `: ${lastError}`),
+  );
+}
+
+/**
+ * Fetches a URL and parses the body as JSON.
+ *
+ * @throws {Error} If the response status is not OK.
+ */
+async function fetchJson(url: string): Promise<unknown> {
+  const response = await fetch(url, {
+    signal: AbortSignal.timeout(requestTimeoutMs),
+  });
+  if (!response.ok) {
+    // Release the unread body before reporting the failure.
+    await response.body?.cancel();
+    throw new Error(`${url} returned HTTP ${response.status}`);
+  }
+  return await response.json();
+}
+
+/** Narrows a value to a non-null object so its properties can be inspected. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value != null;
+}
+
+/** Returns whether evaluating the expression yields a non-empty array. */
+async function prometheusQuery(expression: string): Promise<boolean> {
+  const result = await evaluatePrometheusExpression(expression);
+  return Array.isArray(result) && result.length > 0;
+}
+
+/**
+ * Evaluates an expression through the Prometheus HTTP query API.
+ *
+ * @returns The `data.result` field of the response.
+ * @throws {Error} If the response is not a `success` envelope.
+ */
+async function evaluatePrometheusExpression(
+  expression: string,
+): Promise<unknown> {
+  const url = new URL("http://localhost:9090/api/v1/query");
+  url.searchParams.set("query", expression);
+  const json = await fetchJson(url.href);
+  if (!isRecord(json) || json.status !== "success" || !isRecord(json.data)) {
+    throw new Error(`Prometheus rejected query: ${expression}`);
+  }
+  return json.data.result;
+}
+
+/** Recursively collects every non-blank `expr` string in a JSON tree. */
+function collectDashboardExpressions(value: unknown): string[] {
+  if (Array.isArray(value)) return value.flatMap(collectDashboardExpressions);
+  if (!isRecord(value)) return [];
+  const expressions: string[] = [];
+  if (typeof value.expr === "string" && value.expr.trim() !== "") {
+    expressions.push(value.expr);
+  }
+  for (const child of Object.values(value)) {
+    expressions.push(...collectDashboardExpressions(child));
+  }
+  return expressions;
+}
+
+/**
+ * Evaluates each distinct dashboard expression against Prometheus.
+ *
+ * @throws {Error} If the dashboard has no expressions or a query is rejected.
+ */
+async function checkDashboardQueries(): Promise<void> {
+  const dashboard = JSON.parse(await Deno.readTextFile(dashboardFile));
+  const expressions = [...new Set(collectDashboardExpressions(dashboard))];
+  if (expressions.length < 1) {
+    throw new Error("No Prometheus expressions found in the Grafana dashboard");
+  }
+  await Promise.all(expressions.map(evaluatePrometheusExpression));
+  console.log(`Grafana dashboard queries: ${expressions.length} valid`);
+}
+
+type SmokeSignal = "SIGINT" | "SIGTERM";
+
+/**
+ * Registers a signal listener where the platform supports it.
+ *
+ * @returns `false` when skipped (SIGTERM on Windows), otherwise `true`.
+ */
+function addSmokeSignalListener(
+  signal: SmokeSignal,
+  listener: () => void,
+): boolean {
+  if (Deno.build.os === "windows" && signal === "SIGTERM") return false;
+  Deno.addSignalListener(signal, listener);
+  return true;
+}
+
+/**
+ * Starts the Compose stack, waits for each service and the sample metrics,
+ * checks the dashboard queries and provisioning, and always tears down.
+ */
+async function smokeChecks(): Promise<void> {
+  const composeDown = async (options: RunOptions = {}): Promise<void> => {
+    await run("Stop smoke stack", "docker", [
+      "compose",
+      "-p",
+      projectName,
+      "-f",
+      composeFile,
+      "down",
+      "--remove-orphans",
+      "--volumes",
+    ], { noThrow: true, ...options });
+  };
+  // Teardown is memoized as a promise, not a boolean latch: the signal
+  // handler and the `finally` path both await the same `docker compose down`
+  // instead of one of them returning while it is still in flight.
+  let teardown: Promise<void> | undefined;
+  const stopSmokeStack = (): Promise<void> => {
+    teardown ??= composeDown();
+    return teardown;
+  };
+  const cleanupAfterSignal = (signal: SmokeSignal, code: number) => {
+    void (async () => {
+      console.error(`\nReceived ${signal}; cleaning up smoke stack.`);
+      try {
+        await stopSmokeStack();
+      } finally {
+        // Exit with the signal's conventional code even if teardown failed.
+        Deno.exit(code);
+      }
+    })();
+  };
+  const onSigint = () => cleanupAfterSignal("SIGINT", 130);
+  const onSigterm = () => cleanupAfterSignal("SIGTERM", 143);
+
+  // Remove leftovers from an earlier run; this does not count as teardown.
+  await composeDown({ quiet: true });
+
+  const sigintRegistered = addSmokeSignalListener("SIGINT", onSigint);
+  const sigtermRegistered = addSmokeSignalListener("SIGTERM", onSigterm);
+
+  try {
+    await run("Start smoke stack", "docker", [
+      "compose",
+      "-p",
+      projectName,
+      "-f",
+      composeFile,
+      "up",
+      "-d",
+    ]);
+
+    await waitFor("Prometheus", async () => {
+      const response = await fetch("http://localhost:9090/-/ready", {
+        signal: AbortSignal.timeout(requestTimeoutMs),
+      });
+      await response.body?.cancel();
+      return response.ok;
+    });
+    await waitFor("Grafana", async () => {
+      const json = await fetchJson("http://localhost:3000/api/health");
+      return isRecord(json) && json.database === "ok";
+    });
+    await waitFor("OpenTelemetry Collector scrape target", async () => {
+      const json = await fetchJson("http://localhost:9090/api/v1/targets");
+      if (!isRecord(json) || !isRecord(json.data)) return false;
+      const targets = json.data.activeTargets;
+      if (!Array.isArray(targets)) return false;
+      return targets.some((target) =>
+        isRecord(target) &&
+        target.health === "up" &&
+        isRecord(target.labels) &&
+        target.labels.job === "otel-collector"
+      );
+    });
+    await waitFor(
+      "Fedify sample metrics",
+      () => prometheusQuery("fedify_http_server_request_count_total"),
+    );
+    await checkDashboardQueries();
+    await waitFor("Fedify dashboard provisioning", async () => {
+      const json = await fetchJson(
+        "http://localhost:3000/api/search?query=Fedify",
+      );
+      return Array.isArray(json) &&
+        json.some((entry) =>
+          isRecord(entry) && entry.uid === "fedify-overview"
+        );
+    });
+  } finally {
+    // Keep the listeners installed until teardown completes so a signal
+    // arriving during cleanup waits for it instead of killing the process.
+    try {
+      await stopSmokeStack();
+    } finally {
+      if (sigintRegistered) Deno.removeSignalListener("SIGINT", onSigint);
+      if (sigtermRegistered) Deno.removeSignalListener("SIGTERM", onSigterm);
+    }
+  }
+}
+
+try {
+  await checkTools();
+  await staticChecks();
+  if (smoke) await smokeChecks();
+  console.log("\nMonitoring example validation passed.");
+} catch (error) {
+  console.error(error instanceof Error ? error.message : String(error));
+  Deno.exit(1);
+}
diff --git a/examples/test-examples/mod.ts b/examples/test-examples/mod.ts
index 421b6909c..5c5318a8a 100644
--- a/examples/test-examples/mod.ts
+++ b/examples/test-examples/mod.ts
@@ -316,6 +316,11 @@ const SKIPPED_EXAMPLES: SkippedExample[] = [
     reason:
       "No actor dispatcher configured; federation lookup cannot be verified",
   },
+  {
+    name: "monitoring",
+    reason:
+      "Docker Compose monitoring stack; validate with mise run test:monitoring",
+  },
   {
     name: "rfc-9421-test",
     reason:
diff --git a/mise.toml b/mise.toml
index cebfcc4dc..8adf4d3f5 100644
--- a/mise.toml
+++ b/mise.toml
@@ -183,6 +183,10 @@ done
 description = "Run tests for all example projects"
 run = "deno run -A examples/test-examples/mod.ts"
 
+[tasks."test:monitoring"]
+description = "Validate the monitoring example configuration"
+run = "deno run -A examples/monitoring/validate.ts"
+
 [tasks."test:init"]
 description = "Run tests for the init package"
 run = "deno task -f @fedify/init test-init"