diff --git a/code/chapter09/docker-compose.yml b/code/chapter09/docker-compose.yml new file mode 100644 index 00000000..508ad8d3 --- /dev/null +++ b/code/chapter09/docker-compose.yml @@ -0,0 +1,88 @@ +version: '3.8' + +services: + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "13000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana-storage:/var/lib/grafana + depends_on: + - prometheus + - loki + - tempo + dns_search: [] + networks: + - observability + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - "19090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-storage:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + dns_search: [] + networks: + - observability + + loki: + image: grafana/loki:latest + container_name: loki + ports: + - "13100:3100" + volumes: + - loki-storage:/loki + dns_search: [] + networks: + - observability + + tempo: + image: grafana/tempo:latest + container_name: tempo + ports: + - "13200:3200" + - "14317:4317" + - "14318:4318" + volumes: + - tempo-storage:/var/tempo + - ./tempo-config.yaml:/etc/tempo/tempo-config.yaml + command: [ "-config.file=/etc/tempo/tempo-config.yaml" ] + dns_search: [] + networks: + - observability + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + ports: + - "24317:4317" + - "24318:4318" + - "19411:9411" + volumes: + - ./otel-collector-config.yml:/etc/otel-collector-config.yml + command: [ "--config=/etc/otel-collector-config.yml" ] + depends_on: + - loki + - prometheus + - tempo + dns_search: [] + networks: + - observability + +volumes: + grafana-storage: + prometheus-storage: + loki-storage: + tempo-storage: + +networks: + observability: + driver: bridge diff --git a/code/chapter09/images/grafana-loki-payment-service-logs.png 
b/code/chapter09/images/grafana-loki-payment-service-logs.png new file mode 100644 index 00000000..c8156b38 Binary files /dev/null and b/code/chapter09/images/grafana-loki-payment-service-logs.png differ diff --git a/code/chapter09/images/grafana-prometheus-payment-attempts-metric.png b/code/chapter09/images/grafana-prometheus-payment-attempts-metric.png new file mode 100644 index 00000000..e880caaf Binary files /dev/null and b/code/chapter09/images/grafana-prometheus-payment-attempts-metric.png differ diff --git a/code/chapter09/images/grafana-tempo-all-payment-traces.png b/code/chapter09/images/grafana-tempo-all-payment-traces.png new file mode 100644 index 00000000..d5f16ff9 Binary files /dev/null and b/code/chapter09/images/grafana-tempo-all-payment-traces.png differ diff --git a/code/chapter09/images/grafana-tempo-authorize-trace-detail.png b/code/chapter09/images/grafana-tempo-authorize-trace-detail.png new file mode 100644 index 00000000..56c31414 Binary files /dev/null and b/code/chapter09/images/grafana-tempo-authorize-trace-detail.png differ diff --git a/code/chapter09/images/grafana-tempo-gateway-health-traces.png b/code/chapter09/images/grafana-tempo-gateway-health-traces.png new file mode 100644 index 00000000..f26b195e Binary files /dev/null and b/code/chapter09/images/grafana-tempo-gateway-health-traces.png differ diff --git a/code/chapter09/otel-collector-config.yml b/code/chapter09/otel-collector-config.yml new file mode 100644 index 00000000..57f06d51 --- /dev/null +++ b/code/chapter09/otel-collector-config.yml @@ -0,0 +1,46 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 10s + send_batch_size: 1024 + +exporters: + debug: + verbosity: detailed + + otlp/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + otlphttp/loki: + endpoint: http://loki:3100/otlp + tls: + insecure: true + + prometheus: + endpoint: "0.0.0.0:8889" + +service: + pipelines: + 
traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/tempo, debug] + + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus, debug] + + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/loki, debug] diff --git a/code/chapter09/payment/README.adoc b/code/chapter09/payment/README.adoc index e3b65582..1b029852 100644 --- a/code/chapter09/payment/README.adoc +++ b/code/chapter09/payment/README.adoc @@ -1,4 +1,4 @@ -= Payment Service += Payment Service — MicroProfile Telemetry 2.1 :toc: macro :toclevels: 3 :icons: font @@ -7,937 +7,741 @@ toc::[] -This microservice is part of the Jakarta EE 10 and MicroProfile 6.1-based e-commerce application. It handles payment processing and transaction management. +This microservice demonstrates MicroProfile Telemetry 2.1 integrated with the LGTM observability stack (Loki, Grafana, Tempo, Prometheus) via an OpenTelemetry Collector. It is part of the Chapter 09 MicroProfile tutorial. -== Features +The service processes payments with full fault tolerance (retry, fallback, circuit breaker, bulkhead, timeout) and exports **traces**, **metrics**, and **logs** over OTLP to the LGTM stack. 
-* Payment transaction processing -* Dynamic configuration management via MicroProfile Config -* RESTful API endpoints with JSON support -* Custom ConfigSource implementation -* OpenAPI documentation -* **MicroProfile Fault Tolerance with Retry Policies** -* **Circuit Breaker protection for external services** -* **Fallback mechanisms for service resilience** -* **Bulkhead pattern for concurrency control** -* **Timeout protection for long-running operations** +== Architecture -== MicroProfile Fault Tolerance Implementation - -The Payment Service implements comprehensive fault tolerance patterns using MicroProfile Fault Tolerance annotations: - -=== Retry Policies - -The service implements different retry strategies based on operation criticality: - -==== Payment Authorization Retry (@Retry) -* **Max Retries**: 3 attempts -* **Delay**: 1000ms with 500ms jitter -* **Max Duration**: 10 seconds -* **Retry On**: RuntimeException, WebApplicationException -* **Use Case**: Standard payment authorization with exponential backoff - -[source,java] ----- -@Retry( - maxRetries = 3, - delay = 2000, - maxDuration = 10000 - jitter = 500, - retryOn = {RuntimeException.class, WebApplicationException.class} -) ----- - -=== Circuit Breaker Protection - -Payment capture operations use circuit breaker pattern: - -[source,java] ---- -@CircuitBreaker( - failureRatio = 0.5, - requestVolumeThreshold = 4, - delay = 5000 -) + Payment Service (Open Liberty) + │ MicroProfile Telemetry 2.1 + │ OTLP gRPC → port 24317 (Codespaces) + ▼ port 4317 (local) + OpenTelemetry Collector + ├──► Tempo (traces) — port 14317 → Tempo:4317 + ├──► Loki (logs) — HTTP → Loki:3100/otlp + └──► Prometheus (metrics) — scrape endpoint :8889 + │ + ▼ + Grafana (unified dashboard) — port 13000 ---- -* **Failure Ratio**: 50% failure rate triggers circuit opening -* **Request Volume**: Minimum 4 requests for evaluation -* **Recovery Delay**: 5 seconds before attempting recovery +== Prerequisites -=== Timeout Protection +* 
JDK 21 or higher +* Maven 3.9.0 or higher +* Docker and Docker Compose -Operations with potential long delays are protected with timeouts: - -[source,java] ----- -@Timeout(value = 3000) ----- - -=== Bulkhead Pattern - -The bulkhead pattern limits concurrent requests to prevent system overload: +== Project Structure -[source,java] ---- -@Bulkhead(value = 5) +code/chapter09/ +├── docker-compose.yml # LGTM + OTel Collector stack +├── otel-collector-config.yml # OTel Collector pipeline config +├── prometheus.yml # Prometheus scrape config +├── tempo-config.yaml # Grafana Tempo v3 config +└── payment/ + ├── pom.xml + └── src/main/ + ├── java/io/microprofile/tutorial/store/payment/ + │ ├── entity/PaymentDetails.java + │ ├── exception/ + │ ├── resource/PaymentResource.java # REST endpoints + │ └── service/PaymentService.java # Business logic + telemetry + ├── liberty/config/server.xml # Open Liberty features + └── resources/META-INF/ + └── microprofile-config.properties # OTLP / sampler config ---- -* **Concurrent Requests**: Limited to 5 concurrent requests -* **Excess Requests**: Rejected immediately instead of queuing -* **Use Case**: Protect service from traffic spikes and cascading failures - -=== Fallback Mechanisms - -All critical operations have fallback methods that provide graceful degradation: - -* **Payment Authorization Fallback**: Returns service unavailable with retry instructions +== LGTM Observability Stack -== Endpoints +All observability infrastructure lives in `code/chapter09/`. Start from that directory. 
-=== GET /payment/api/payment-config -* Returns all current payment configuration values -* Example: `GET http://localhost:9080/payment/api/payment-config` -* Response: `{"gateway.endpoint":"https://api.paymentgateway.com"}` +=== docker-compose.yml -=== POST /payment/api/payment-config -* Updates a payment configuration value -* Example: `POST http://localhost:9080/payment/api/payment-config` -* Request body: `{"key": "payment.gateway.endpoint", "value": "https://new-api.paymentgateway.com"}` -* Response: `{"key":"payment.gateway.endpoint","value":"https://new-api.paymentgateway.com","message":"Configuration updated successfully"}` +The stack exposes ports offset from the defaults to avoid conflicts in GitHub Codespaces: -=== POST /payment/api/authorize -* Processes a payment authorization with retry policy -* **Retry Configuration**: 3 attempts, 1s delay, 500ms jitter -* **Fallback**: Service unavailable response -* Example: `POST http://localhost:9080/payment/api/authorize` -* Request body: `{"cardNumber":"4111111111111111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":100.00}` -* Response: `{"status":"success", "message":"Payment authorized successfully", "transactionId":"TXN1234567890", "amount":100.00}` -* Fallback Response: `{"status":"failed", "message":"Payment gateway unavailable. 
Please try again later.", "fallback":true}` - -=== POST /payment/api/payment-config/process-example -* Example endpoint demonstrating payment processing with configuration -* Example: `POST http://localhost:9080/payment/api/payment-config/process-example` -* Request body: `{"cardNumber":"4111111111111111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":100.00}` -* Response: `{"amount":100.00,"message":"Payment processed successfully","status":"success","configUsed":{"gatewayEndpoint":"https://new-api.paymentgateway.com"}}` - -== Building and Running the Service - -=== Prerequisites +[options="header"] +|=== +|Service |Image |Host Port |Container Port |Purpose +|Grafana |grafana/grafana:latest |13000 |3000 |Unified dashboard +|Prometheus |prom/prometheus:latest |19090 |9090 |Metrics storage +|Loki |grafana/loki:latest |13100 |3100 |Log storage +|Tempo |grafana/tempo:latest |13200, 14317, 14318 |3200, 4317, 4318 |Trace storage +|OTel Collector |otel/opentelemetry-collector-contrib:latest |24317, 24318 |4317, 4318 |Telemetry gateway +|=== -* JDK 17 or higher -* Maven 3.6.0 or higher +IMPORTANT: `dns_search: []` is set on every service. Without it, Docker's embedded DNS in GitHub Codespaces appends an Azure search domain (e.g. `gjgsbu0qvyie5dgfl5s3ewuwyc.ax.internal.cloudapp.net`) to short hostnames like `tempo`, causing `no such host` errors when Grafana tries to connect to its data sources. 
+ +[source,yaml] +---- +version: '3.8' + +services: + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "13000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana-storage:/var/lib/grafana + depends_on: + - prometheus + - loki + - tempo + dns_search: [] + networks: + - observability + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - "19090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-storage:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + dns_search: [] + networks: + - observability + + loki: + image: grafana/loki:latest + container_name: loki + ports: + - "13100:3100" + volumes: + - loki-storage:/loki + dns_search: [] + networks: + - observability + + tempo: + image: grafana/tempo:latest + container_name: tempo + ports: + - "13200:3200" + - "14317:4317" + - "14318:4318" + volumes: + - tempo-storage:/var/tempo + - ./tempo-config.yaml:/etc/tempo/tempo-config.yaml + command: [ "-config.file=/etc/tempo/tempo-config.yaml" ] + dns_search: [] + networks: + - observability + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + ports: + - "24317:4317" + - "24318:4318" + - "19411:9411" + volumes: + - ./otel-collector-config.yml:/etc/otel-collector-config.yml + command: [ "--config=/etc/otel-collector-config.yml" ] + depends_on: + - loki + - prometheus + - tempo + dns_search: [] + networks: + - observability + +volumes: + grafana-storage: + prometheus-storage: + loki-storage: + tempo-storage: + +networks: + observability: + driver: bridge +---- + +=== otel-collector-config.yml + +The OTel Collector receives all three signals on OTLP ports and fans them out to the appropriate backend. 
+ +[source,yaml] +---- +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 10s + send_batch_size: 1024 + +exporters: + debug: + verbosity: detailed + + otlp_grpc/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + otlp_http/loki: + endpoint: http://loki:3100/otlp + tls: + insecure: true + + prometheus: + endpoint: "0.0.0.0:8889" + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp_grpc/tempo, debug] + + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus, debug] + + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlp_http/loki, debug] +---- + +NOTE: The `debug` exporter prints received telemetry to the collector's stdout. Use `docker compose logs otel-collector` to confirm data is flowing. Remove it in production to reduce noise. + +=== tempo-config.yaml + +Grafana Tempo v3 removed the top-level `ingester` field. Use this minimal v3-compatible configuration: + +[source,yaml] +---- +server: + http_listen_port: 3200 + log_level: info -=== Local Development +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 -[source,bash] +storage: + trace: + backend: local + wal: + path: /var/tempo/wal + local: + path: /var/tempo/blocks ---- -# Build the application -mvn clean package -# Run the application with Liberty -mvn liberty:run ----- +=== prometheus.yml -The server will start on port 9080 (HTTP) and 9081 (HTTPS). 
+Prometheus scrapes itself and the OTel Collector's Prometheus exporter endpoint: -=== Docker - -[source,bash] +[source,yaml] ---- -# Build and run with Docker -./run-docker.sh ----- - -== Project Structure - -* `src/main/java/io/microprofile/tutorial/PaymentRestApplication.java` - Jakarta Restful web service application class -* `src/main/java/io/microprofile/tutorial/store/payment/config/` - Configuration classes -* `src/main/java/io/microprofile/tutorial/store/payment/resource/` - REST resource endpoints -* `src/main/java/io/microprofile/tutorial/store/payment/service/` - Business logic services -* `src/main/java/io/microprofile/tutorial/store/payment/entity/` - Data models -* `src/main/resources/META-INF/services/` - Service provider configuration -* `src/main/liberty/config/` - Liberty server configuration +global: + scrape_interval: 15s + evaluation_interval: 15s -== Custom ConfigSource +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] -The Payment Service implements a custom MicroProfile ConfigSource named `PaymentServiceConfigSource` that provides payment-specific configuration with high priority (ordinal: 600). 
- -=== Available Configuration Properties + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:8889'] +---- -[cols="1,2,2", options="header"] -|=== -|Property -|Description -|Default Value +== Payment Service Configuration -|payment.gateway.endpoint -|Payment gateway endpoint URL -|https://api.paymentgateway.com -|=== +=== server.xml — Open Liberty Features -=== Testing ConfigSource Endpoints +The service runs on the `microProfile-7.1` platform, which includes MicroProfile Telemetry 2.1: -You can test the ConfigSource endpoints using curl or any REST client: - -[source,bash] +[source,xml] ---- -# Get current configuration -curl -s http://localhost:9080/payment/api/payment-config | json_pp + + + microProfile-7.1 + jakartaEE-10.0 + restfulWS + jsonp + jsonb + cdi + mpConfig + mpOpenAPI + mpHealth + mpMetrics + mpTelemetry + mpFaultTolerance + + + + + + +---- + +=== microprofile-config.properties — Telemetry Settings -# Update configuration property -curl -s -X POST -H "Content-Type: application/json" \ - -d '{"key":"payment.gateway.endpoint", "value":"https://new-api.paymentgateway.com"}' \ - http://localhost:9080/payment/api/payment-config | json_pp +[source,properties] +---- +# MicroProfile Telemetry Configuration +otel.service.name=payment-service +otel.sdk.disabled=false -# Test payment processing with the configuration -curl -s -X POST -H "Content-Type: application/json" \ - -d '{"cardNumber":"4111111111111111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":100.00}' \ - http://localhost:9080/payment/api/payment-config/process-example | json_pp +# OTLP Exporter Configuration +# Use port 4317 locally; use port 24317 when running in GitHub Codespaces +otel.exporter.otlp.endpoint=http://localhost:24317 +otel.traces.exporter=otlp +otel.metrics.exporter=otlp +otel.logs.exporter=otlp -# Test basic payment authorization -curl -s -X POST -H "Content-Type: application/json" \ - 
http://localhost:9080/payment/api/authorize | json_pp +# Sampling — always sample, respecting parent decision +otel.traces.sampler=parentbased_always_on ---- -=== Implementation Details +IMPORTANT: When running locally (not in Codespaces), change the endpoint to `http://localhost:4317` to match the OTel Collector's standard OTLP gRPC port. In Codespaces the host port is `24317`. -The custom ConfigSource is implemented in the following classes: +== Code Implementation -* `PaymentServiceConfigSource.java` - Implements the MicroProfile ConfigSource interface -* `PaymentConfig.java` - Utility class for accessing configuration properties +=== PaymentService.java — CDI Telemetry Injection -Example usage in application code: +MicroProfile Telemetry 2.1 exposes `Tracer` and `Meter` as CDI beans. Inject them directly — do not use `GlobalOpenTelemetry.get*()`: [source,java] ---- -// Inject standard MicroProfile Config -@Inject -@ConfigProperty(name="payment.gateway.endpoint") -private String endpoint; - -// Or use the utility class -String gatewayUrl = PaymentConfig.getConfigProperty("payment.gateway.endpoint"); ----- - -The custom ConfigSource provides a higher priority (ordinal: 600) than system properties and environment variables, allowing for service-specific defaults while still enabling override via standard mechanisms. - -=== MicroProfile Config Sources - -MicroProfile Config uses a prioritized set of configuration sources. The payment service uses the following configuration sources in order of priority (highest to lowest): - -1. Custom ConfigSource (`PaymentServiceConfigSource`) - Ordinal: 600 -2. System properties - Ordinal: 400 -3. Environment variables - Ordinal: 300 -4. microprofile-config.properties file - Ordinal: 100 - -==== Updating Configuration Values - -You can update configuration properties through different methods: - -===== 1. 
Using the REST API (runtime) +@ApplicationScoped +public class PaymentService { -This uses the custom ConfigSource and persists only for the current server session: + @Inject + Tracer tracer; // io.opentelemetry.api.trace.Tracer -[source,bash] ----- -curl -X POST -H "Content-Type: application/json" \ - -d '{"key":"payment.gateway.endpoint", "value":"https://test-api.paymentgateway.com"}' \ - http://localhost:9080/payment/api/payment-config ----- + @Inject + Meter meter; // io.opentelemetry.api.metrics.Meter -===== 2. Using System Properties (startup) + private LongCounter paymentAttemptsCounter; -[source,bash] + @PostConstruct + public void init() { + paymentAttemptsCounter = meter + .counterBuilder("payment.attempts") + .setDescription("Number of payment attempts by result") + .setUnit("1") + .build(); + } +} ---- -# Linux/macOS -mvn liberty:run -Dpayment.gateway.endpoint=https://sys-api.paymentgateway.com -# Windows -mvn liberty:run "-Dpayment.gateway.endpoint=https://sys-api.paymentgateway.com" ----- +NOTE: The `Meter` is used in `@PostConstruct` to build instruments — not in `init()` directly on the `Tracer`, which is used per-request. Building instruments in `@PostConstruct` ensures they are registered once at startup. -===== 3. 
Using Environment Variables (startup) +=== Manual Span Creation -Environment variable names must follow the MicroProfile Config convention (uppercase with underscores): +The `processPayment` method creates a child span with business attributes: -[source,bash] +[source,java] ---- -# Linux/macOS -export PAYMENT_GATEWAY_ENDPOINT=https://env-api.paymentgateway.com -mvn liberty:run - -# Windows PowerShell -$env:PAYMENT_GATEWAY_ENDPOINT="https://env-api.paymentgateway.com" -mvn liberty:run - -# Windows CMD -set PAYMENT_GATEWAY_ENDPOINT=https://env-api.paymentgateway.com -mvn liberty:run +@Asynchronous +@Timeout(3000) +@Retry(maxRetries = 3, delay = 2000, jitter = 500, + retryOn = PaymentProcessingException.class, + abortOn = CriticalPaymentException.class) +@Fallback(fallbackMethod = "fallbackProcessPayment") +@Bulkhead(value = 5) +public CompletionStage processPayment(PaymentDetails paymentDetails) + throws PaymentProcessingException { + + Span span = tracer.spanBuilder("payment.process") + .setAttribute("payment.amount", paymentDetails.getAmount().toString()) + .setAttribute("payment.method", "credit_card") + .setAttribute("payment.service", "payment-service") + .startSpan(); + + try (Scope scope = span.makeCurrent()) { + span.setAttribute("payment.status", "IN_PROGRESS"); + span.addEvent("Starting payment processing"); + + // ... business logic ... + + paymentAttemptsCounter.add(1, + Attributes.of(AttributeKey.stringKey("result"), "success")); + span.setAttribute("payment.status", "SUCCESS"); + span.setStatus(StatusCode.OK); + span.addEvent("Payment processed successfully"); + return CompletableFuture.completedFuture("{\"status\":\"success\",...}"); + } finally { + span.end(); + } +} ---- -===== 4. 
Using microprofile-config.properties File - -Edit the file at `src/main/resources/META-INF/microprofile-config.properties`: - -[source,properties] ----- -# Update the endpoint -payment.gateway.endpoint=https://config-api.paymentgateway.com ----- +The `payment.attempts` counter is incremented with a `result` attribute on every outcome: `success`, `failed`, or `fallback`. This enables per-outcome metrics in Prometheus. -Then rebuild and restart the application: +=== Circuit Breaker — checkGatewayHealth -[source,bash] +[source,java] ---- -mvn clean package liberty:run +@CircuitBreaker(requestVolumeThreshold = 4, failureRatio = 0.75, + delay = 1000, successThreshold = 2) +public boolean checkGatewayHealth() { + // Simulates a network call to the payment gateway + simulateNetworkCall(200); + if (Math.random() > 0.9) { + throw new RuntimeException("Payment gateway not responding"); + } + return true; +} ---- -==== Testing Configuration Changes - -After changing a configuration property, you can verify it was updated by calling: +=== Async Notification — sendPaymentNotification -[source,bash] +[source,java] ---- -curl http://localhost:9080/payment/api/payment-config +@Asynchronous +@Bulkhead(5) +public CompletionStage sendPaymentNotification( + String paymentId, String recipient) { + simulateNetworkCall(300); + return CompletableFuture.completedFuture( + "Notification sent to " + recipient + " for payment " + paymentId); +} ---- -== Documentation - -=== OpenAPI - -The payment service automatically generates OpenAPI documentation using MicroProfile OpenAPI annotations. 
+== API Endpoints -* OpenAPI UI: `http://localhost:9080/payment/api/openapi-ui/` -* OpenAPI JSON: `http://localhost:9080/payment/api/openapi` +Base URL: `http://localhost:9080/payment` -=== MicroProfile Config Specification - -For more information about MicroProfile Config, refer to the official documentation: - -* https://download.eclipse.org/microprofile/microprofile-config-3.1/microprofile-config-spec-3.1.html - -=== Related Resources - -* MicroProfile: https://microprofile.io/ -* Jakarta EE: https://jakarta.ee/ -* Open Liberty: https://openliberty.io/ - -== Troubleshooting +[options="header"] +|=== +|Method |Path |Description |Fault Tolerance +|POST |`/authorize?amount={value}` |Process payment by amount |`@Retry`, `@Fallback`, `@Bulkhead`, `@Timeout`, `@Asynchronous` +|POST |`/payments` |Process payment with full card details |`@Retry`, `@Fallback`, `@Bulkhead`, `@Timeout`, `@Asynchronous` +|POST |`/verify` |Verify payment — runs validation, fraud check, funds check |`@Asynchronous` +|GET |`/health/gateway` |Check gateway health |`@CircuitBreaker` +|POST |`/notify/{paymentId}?recipient={email}` |Send async payment notification |`@Asynchronous`, `@Bulkhead` +|=== -=== Common Issues +OpenAPI UI: http://localhost:9080/openapi/ui/ -==== Port Conflicts +=== PaymentDetails request body -If you encounter a port conflict when starting the server, you can change the ports in the `pom.xml` file: +Used by `POST /payments` and `POST /verify`: -[source,xml] +[source,json] ---- -9080 -9081 +{ + "cardNumber": "4111111111111111", + "cardHolderName": "Test User", + "expiryDate": "12/25", + "securityCode": "123", + "amount": 99.99 +} ---- -==== ConfigSource Not Loading - -If the custom ConfigSource is not loading, check the following: - -1. Verify the service provider configuration file exists at: - `src/main/resources/META-INF/services/org.eclipse.microprofile.config.spi.ConfigSource` - -2. 
Ensure it contains the correct fully qualified class name: - `io.microprofile.tutorial.store.payment.config.PaymentServiceConfigSource` - -==== Deployment Errors - -For CWWKZ0004E deployment errors, check the server logs at: -`target/liberty/wlp/usr/servers/mpServer/logs/messages.log` +NOTE: Card numbers ending in `0000` trigger a fraud-check failure in the verification flow. Amounts above `1000` trigger an insufficient-funds failure. -== Testing Fault Tolerance Features +== Building and Running -=== Automated Test Scripts +=== Step 1 — Start the LGTM observability stack -The Payment Service includes several test scripts to demonstrate and validate fault tolerance features: - -==== test-payment-basic.sh - -Basic functionality test to verify core payment operations: - -* Configuration retrieval -* Simple payment processing -* Error handling +Open a terminal and run from the `code/chapter09/` directory: [source,bash] ---- -# Test basic payment operations -chmod +x test-payment-basic.sh -./test-payment-basic.sh +cd code/chapter09 +docker compose up -d +docker compose ps ---- -==== test-payment-retry.sh -Tests various retry scenarios with different triggers: +Expected: all five services (`grafana`, `prometheus`, `loki`, `tempo`, `otel-collector`) show status `running`. -* Normal payment processing (successful) -* Failed payment with retry (card ending in "0000") -* Verification with random failures -* Invalid input handling +Verify each backend is healthy: [source,bash] ---- -# Test retry scenarios -chmod +x test-payment-retry.sh -./test-payment-retry.sh +curl -s http://localhost:13200/ready # Tempo → "ready" +curl -s http://localhost:13100/ready # Loki → "ready" +curl -s http://localhost:19090/-/healthy # Prometheus → "Prometheus Server is Healthy." ---- -==== test-payment-concurrent-load.sh +=== Step 2 — Configure Grafana data sources -Tests the service under concurrent load: +Open Grafana at http://localhost:13000 (login: `admin` / `admin`). 
-* Multiple simultaneous requests -* Observing thread handling -* Response time analysis +Go to *Connections → Data sources → Add data source* and add: -[source,bash] ----- -# Test service under concurrent load -chmod +x test-payment-concurrent-load.sh -./test-payment-concurrent-load.sh ----- +[options="header"] +|=== +|Type |Name |URL +|Prometheus |Prometheus |`http://prometheus:9090` +|Loki |Loki |`http://loki:3100` +|Tempo |Tempo |`http://tempo:3200` +|=== -==== test-payment-async.sh +Click *Save & test* for each. All three should show a success message. -Analyzes asynchronous processing behavior: +NOTE: Use the Docker service names (`prometheus`, `loki`, `tempo`) as hostnames — not `localhost`. Grafana runs inside the same Docker network as the backends, so service-name DNS resolution works. -* Response time measurement -* Thread utilization -* Future completion patterns +=== Step 3 — Build and start the payment service + +Open a second terminal from the `code/chapter09/payment/` directory: [source,bash] ---- -# Analyze asynchronous processing -chmod +x test-payment-async.sh -./test-payment-async.sh +cd code/chapter09/payment +mvn clean package +mvn liberty:run ---- -==== test-payment-bulkhead.sh -Demonstrates the bulkhead pattern by sending concurrent requests: - -* Concurrent request handling -* Bulkhead limit verification (5 requests) -* Rejection of excess requests -* Service recovery after load reduction +Wait for the message: -[source,bash] ---- -# Test bulkhead functionality with concurrent requests -chmod +x test-payment-bulkhead.sh -./test-payment-bulkhead.sh +[AUDIT] CWWKF0011I: The server mpServer is ready to run a smarter planet. ---- -==== test-payment-async-analysis.sh +The service is now available at http://localhost:9080/payment. 
-Analyzes asynchronous processing behavior: +=== Step 4 — Generate telemetry traffic -* Response time measurement -* Thread utilization -* Future completion patterns +Run the following to exercise all telemetry paths: [source,bash] ---- -# Analyze asynchronous processing -chmod +x test-payment-async-analysis.sh -./test-payment-async-analysis.sh ----- - -=== Running the Tests +# 1. Simple authorize (retry + fallback path) +curl -s -X POST "http://localhost:9080/payment/authorize?amount=75.50" -To run any of these test scripts: +# 2. Full payment with card details (creates payment.process span) +curl -s -X POST "http://localhost:9080/payment/payments" \ + -H "Content-Type: application/json" \ + -d '{"cardNumber":"4111111111111111","cardHolderName":"Test User","expiryDate":"12/25","securityCode":"123","amount":99.99}' -[source,bash] ----- -# Make the script executable -chmod +x test-payment-bulkhead.sh +# 3. Verification flow (validation → fraud check → funds check sub-spans) +curl -s -X POST "http://localhost:9080/payment/verify" \ + -H "Content-Type: application/json" \ + -d '{"cardNumber":"4111111111111111","cardHolderName":"Test User","expiryDate":"12/25","securityCode":"123","amount":150.00}' -# Run the script -./test-payment-bulkhead.sh ----- +# 4. Trigger fraud check failure (card ending 0000) +curl -s -X POST "http://localhost:9080/payment/verify" \ + -H "Content-Type: application/json" \ + -d '{"cardNumber":"4111111110000","cardHolderName":"Test User","expiryDate":"12/25","securityCode":"123","amount":50.00}' -You can also run all test scripts in sequence: +# 5. Gateway health check (circuit breaker path) +curl -s "http://localhost:9080/payment/health/gateway" -[source,bash] ----- -# Run all test scripts -for script in test-payment-*.sh; do - echo "Running $script..." - chmod +x $script - ./$script - echo "----------------------------------------" - sleep 2 -done +# 6. 
Async notification +curl -s -X POST "http://localhost:9080/payment/notify/PAY-12345?recipient=ops@example.com" ---- -== Configuration Properties +Run each command several times to produce enough data for Grafana to display. -=== Fault Tolerance Configuration +== Verifying Telemetry in Grafana -The following properties can be configured via MicroProfile Config: +=== Traces in Tempo -[cols="1,2,2", options="header"] -|=== -|Property -|Description -|Default Value +1. Open Grafana → *Explore* → select *Tempo* +2. Switch to the *Search* tab +3. Set *Service Name* = `payment-service` +4. Click *Run query* -|payment.gateway.endpoint -|Payment gateway endpoint URL -|https://api.paymentgateway.com +You should see recent traces. Click any trace to expand the span tree. Look for: -|payment.retry.maxRetries -|Maximum retry attempts for payment operations -|3 +* Root HTTP span: `POST /payment/payments` or `POST /payment/verify` +* Child span: `payment.process` with attributes `payment.amount`, `payment.method`, `payment.status` +* For the verify flow: `validatePaymentDetails`, `performFraudCheck`, `verifyFundsAvailability`, `recordTransaction` steps appear as log events on the span -|payment.retry.delay -|Delay between retry attempts (milliseconds) -|1000 +The following screenshot shows all payment-service traces listed in Tempo after generating traffic: -|payment.circuitbreaker.failureRatio -|Circuit breaker failure ratio threshold -|0.5 +image::../images/grafana-tempo-all-payment-traces.png[Grafana Tempo — all payment-service traces,role="related thumb right"] -|payment.circuitbreaker.requestVolumeThreshold -|Minimum requests for circuit breaker evaluation -|4 +You can filter by span name (e.g. 
`GET /payment/api/health/gateway`) to narrow down traces for a specific endpoint: -|payment.timeout.duration -|Timeout duration for payment operations (milliseconds) -|3000 +image::../images/grafana-tempo-gateway-health-traces.png[Grafana Tempo — gateway health check traces filtered by span name,role="related thumb right"] -|payment.bulkhead.value -|Maximum concurrent requests for bulkhead -|5 -|=== +Click any row to open the trace detail panel. The example below shows the full span tree for a `POST /payment/api/authorize` request: -== Fault Tolerance Implementation Details +image::../images/grafana-tempo-authorize-trace-detail.png[Grafana Tempo — POST /payment/api/authorize trace detail with span tree,role="related thumb right"] -=== Server Configuration +=== Logs in Loki -The MicroProfile Fault Tolerance feature is enabled in the Liberty server configuration: +The following screenshot shows Loki logs for the payment-service, including the log volume histogram and individual log lines with `traceId` links back to Tempo: -[source,xml] +image::../images/grafana-loki-payment-service-logs.png[Grafana Loki — payment-service logs with volume histogram and trace correlation,role="related thumb right"] + +1. Open Grafana → *Explore* → select *Loki* +2. Enter one of these LogQL queries: ++ +[source] +---- +{exporter="OTLP"} ---- -mpFaultTolerance ++ +or filter by service: ++ +[source] ---- +{service_name="payment-service"} +---- ++ +3. Look for log lines that contain `traceId` — click the trace link to jump directly to the correlated trace in Tempo. 
-=== Code Implementation +=== Metrics in Prometheus -==== PaymentService Class +The following screenshot shows the `payment_attempts_total` counter plotted over time, broken down by `result` label (`failed` and `success`): -The PaymentService class is annotated with `@ApplicationScoped` to ensure proper fault tolerance behavior: +image::../images/grafana-prometheus-payment-attempts-metric.png[Grafana Prometheus — payment_attempts_total metric time series by result label,role="related thumb right"] -[source,java] +1. Open Grafana → *Explore* → select *Prometheus* (or use http://localhost:19090 directly) +2. Query the custom payment counter: ++ +[source] ---- -@ApplicationScoped -public class PaymentService { - // ... -} +payment_attempts_total ---- - -==== Authorization Method - -[source,java] ++ +3. Break it down by result label: ++ +[source] ---- -@Retry( - maxRetries = 3, - delay = 1000, - jitter = 500, - maxDuration = 10000, - retryOn = {RuntimeException.class, WebApplicationException.class} -) -@Fallback(fallbackMethod = "fallbackPaymentAuthorization") -public PaymentResponse processPayment(PaymentRequest request) { - // Payment processing logic -} - -public PaymentResponse fallbackPaymentAuthorization(PaymentRequest request) { - // Fallback logic for payment authorization - return new PaymentResponse("failed", "Payment gateway unavailable. Please try again later.", true); -} +payment_attempts_total{result="success"} +payment_attempts_total{result="failed"} +payment_attempts_total{result="fallback"} +---- ++ +4. Also query HTTP server metrics emitted automatically by MicroProfile Telemetry: ++ +[source] +---- +http_server_request_duration_seconds_count +---- ++ +5. Fault tolerance metrics from MicroProfile Fault Tolerance: ++ +[source] +---- +ft_retry_calls_total +ft_circuitbreaker_state_total +ft_bulkhead_calls_total ---- -=== Key Implementation Benefits - -==== 1. 
Resilience -- Service continues operating despite external service failures -- Automatic recovery from transient failures -- Protection against cascading failures - -==== 2. User Experience -- Reduced timeout errors through retry mechanisms -- Graceful degradation with meaningful error messages -- Improved service availability - -==== 3. Operational Excellence -- Configurable fault tolerance parameters -- Comprehensive logging and monitoring -- Clear separation of concerns between business logic and resilience - -==== 4. Enterprise Readiness -- Production-ready fault tolerance patterns -- Compliance with microservices best practices -- Integration with MicroProfile ecosystem - -== MicroProfile Fault Tolerance Patterns - -=== Retry Pattern - -The retry pattern allows the service to automatically retry failed operations: - -* **@Retry**: Automatically retries failed operations -* **Parameters**: maxRetries, delay, jitter, maxDuration, retryOn, abortOn -* **Use Case**: Transient failures in external service calls - -=== Circuit Breaker Pattern - -The circuit breaker pattern prevents cascading failures: - -* **@CircuitBreaker**: Tracks failure rates and opens circuit when threshold is reached -* **Parameters**: failureRatio, requestVolumeThreshold, delay -* **States**: Closed (normal), Open (failing), Half-Open (testing recovery) -* **Use Case**: Protect against downstream service failures - -=== Timeout Pattern - -The timeout pattern prevents operations from hanging indefinitely: - -* **@Timeout**: Sets maximum duration for operations -* **Parameters**: value, unit -* **Use Case**: Prevent indefinite waiting for slow external services - -=== Bulkhead Pattern - -The bulkhead pattern limits concurrent requests: - -* **@Bulkhead**: Sets maximum concurrent executions -* **Parameters**: value, waitingTaskQueue (for async) -* **Use Case**: Prevent system overload during traffic spikes - -=== Fallback Pattern - -The fallback pattern provides alternatives when operations 
fail: - -* **@Fallback**: Specifies alternative method when operation fails -* **Parameters**: fallbackMethod, applyOn, skipOn -* **Use Case**: Graceful degradation for failed operations - -== Fault Tolerance Best Practices - -=== Configuring Retry Policies - -When configuring retry policies, consider these best practices: - -* **Operation Criticality**: Use more aggressive retry policies for critical operations -* **Retry Delay**: Implement exponential backoff for external service calls -* **Jitter**: Add random jitter to prevent thundering herd problems -* **Max Duration**: Set an overall timeout to prevent excessive retries -* **Abort Conditions**: Define specific exceptions that should abort retry attempts - -=== Circuit Breaker Configuration - -For effective circuit breaker implementation: - -* **Failure Ratio**: Set appropriate threshold based on expected error rates (typically 0.3-0.5) -* **Request Volume**: Set minimum request count to prevent premature circuit opening -* **Recovery Delay**: Allow sufficient time for downstream services to recover -* **Monitoring**: Track circuit state transitions for operational visibility - -=== Bulkhead Strategies - -Choose the appropriate bulkhead strategy: - -* **Synchronous Bulkhead**: Limits concurrent executions for thread-constrained systems -* **Asynchronous Bulkhead**: Provides a waiting queue for manageable load spikes -* **Isolation Levels**: Consider using separate bulkheads for different types of operations - -=== Fallback Implementation - -Implement effective fallback mechanisms: - -* **Graceful Degradation**: Return partial results when possible -* **Meaningful Responses**: Provide clear error messages to clients -* **Operation Queuing**: Queue failed operations for later processing -* **Fallback Chain**: Implement multiple fallback levels for critical operations - -=== Combining Fault Tolerance Annotations - -When combining multiple fault tolerance annotations: - -* **Execution Order**: Understand the 
execution order (Fallback → Retry → CircuitBreaker → Timeout → Bulkhead) -* **Compatibility**: Ensure annotations work together as expected -* **Resource Impact**: Consider the resource impact of combined annotations -* **Testing**: Test all combinations of annotation behaviors - -== Troubleshooting Fault Tolerance Issues - -=== Common Fault Tolerance Issues - -==== 1. Ineffective Retry Policies - -**Symptoms**: -* Operations fail without retrying -* Excessive retries causing performance issues - -**Solutions**: -* Verify exceptions match retryOn parameter -* Check that delay and jitter are appropriate -* Ensure maxDuration allows sufficient time for retries - -==== 2. Circuit Breaker Problems - -**Symptoms**: -* Circuit opens too frequently -* Circuit never opens despite failures -* Circuit remains open indefinitely - -**Solutions**: -* Adjust failureRatio based on expected error rates -* Increase requestVolumeThreshold if premature opening occurs -* Verify that delay allows sufficient recovery time -* Ensure exceptions are properly handled - -==== 3. Timeout Issues - -**Symptoms**: -* Operations timeout too quickly -* Timeouts not triggering as expected +== Troubleshooting -**Solutions**: -* Adjust timeout duration based on operation complexity -* Ensure timeout is shorter than upstream timeouts -* Verify that timeout unit is properly specified +=== Payment service cannot reach OTel Collector -==== 4. Bulkhead Restrictions +Symptom in Liberty logs: -**Symptoms**: -* Too many rejections during normal load -* Service overloaded despite bulkhead +---- +Failed to export logs. The request could not be executed. +Failed to connect to localhost/[0:0:0:0:0:0:0:1]:24317 +---- -**Solutions**: -* Adjust bulkhead value based on resource capacity -* Consider using asynchronous bulkheads with waiting queue -* Implement client-side load balancing for better distribution +Cause: The `otel.exporter.otlp.endpoint` in `microprofile-config.properties` points to the wrong port. 
-==== 5. Fallback Failures +Fix: In GitHub Codespaces use port `24317`; locally use `4317`: -**Symptoms**: -* Fallbacks not triggering despite failures -* Fallbacks throwing unexpected exceptions +[source,properties] +---- +# Codespaces +otel.exporter.otlp.endpoint=http://localhost:24317 -**Solutions**: -* Verify fallback method signature matches original method -* Ensure fallback method handles exceptions properly -* Check that fallback logic is fully tested +# Local Docker +otel.exporter.otlp.endpoint=http://localhost:4317 +---- -=== Diagnosing with Metrics +=== Grafana data source "Failed to connect to Tempo/Loki/Prometheus" -MicroProfile Metrics provides valuable insight into fault tolerance behavior: +Symptom: -[source,bash] ---- -# Total number of retry attempts -curl https://localhost:9080/metrics?name=ft_retry_retries_total - -# Bulkhead calls total -curl http://localhost:9080/metrics?name=ft_bulkhead_calls_total - -# Timeout execution duration -curl http://localhost:9080/payment/metrics/application?name=ft_timeout_executionDuration_nanoseconds +Get "http://tempo:3200/api/echo": dial tcp: lookup tempo on 127.0.0.11:53: no such host ---- -=== Server Log Analysis +Cause: GitHub Codespaces injects an Azure DNS search domain. Docker's embedded DNS (127.0.0.11) appends it to short names like `tempo`, producing an NXDOMAIN. -Liberty server logs provide detailed information about fault tolerance operations: +Fix: The `docker-compose.yml` already sets `dns_search: []` on every service to suppress the search domain. 
If you see this error, ensure you are using the latest `docker-compose.yml` and recreate all containers: [source,bash] ---- -tail -f target/liberty/wlp/usr/servers/mpServer/logs/messages.log | grep -E "Retry|CircuitBreaker|Timeout|Bulkhead|Fallback" +cd code/chapter09 +docker compose down +docker compose up -d ---- -Look for messages indicating: -* Retry attempts and success/failure -* Circuit breaker state transitions -* Timeout exceptions -* Bulkhead rejections -* Fallback method invocations - -== Resources and References - -=== MicroProfile Fault Tolerance Specification - -For detailed information about MicroProfile Fault Tolerance, refer to: - -* https://download.eclipse.org/microprofile/microprofile-fault-tolerance-4.0/microprofile-fault-tolerance-spec-4.0.html - -=== API Documentation - -* https://download.eclipse.org/microprofile/microprofile-fault-tolerance-4.0/apidocs/ - -=== Fault Tolerance Guides - -* https://openliberty.io/guides/microprofile-fallback.html -* https://openliberty.io/guides/retry-timeout.html -* https://openliberty.io/guides/circuit-breaker.html -* https://openliberty.io/guides/bulkhead.html - -=== Best Practices Resources - -* https://microprofile.io/ -* https://www.ibm.com/docs/en/was-liberty/base?topic=liberty-microprofile-fault-tolerance - -== MicroProfile Telemetry Implementation +=== Tempo container exits immediately -The Payment Service implements distributed tracing using MicroProfile Telemetry 1.1, which is based on OpenTelemetry standards. This enables end-to-end visibility of payment transactions across microservices and external dependencies. 
+Symptom in `docker compose logs tempo`: -=== Telemetry Configuration - -The service is configured to send telemetry data to Jaeger, enabling comprehensive transaction monitoring: - -==== Application Configuration (microprofile-config.properties) - -[source,properties] ---- -# MicroProfile Telemetry Configuration -otel.service.name=payment-service -otel.sdk.disabled=false -otel.metrics.exporter=none -otel.logs.exporter=none +field ingester not found in type app.Config ---- -=== Automatic Instrumentation - -MicroProfile Telemetry provides automatic instrumentation for: +Cause: Tempo v3 removed the `ingester` top-level key. Ensure `tempo-config.yaml` does not contain an `ingester:` section. The configuration in this repo is already v3-compatible. -* Jakarta Restful Web Services endpoints (inbound and outbound HTTP requests) -* CDI method invocations -* MicroProfile Rest Client calls +=== Loki container exits immediately -This enables tracing without modifying application code, capturing: +Symptom in `docker compose logs loki`: -* HTTP request information (method, URL, status code) -* Transaction timing and duration -* Service dependencies and call hierarchy - -=== Manual Instrumentation - -For enhanced visibility, the Payment Service also implements manual instrumentation: - -[source,java] ---- -private Tracer tracer; // Injected tracer for OpenTelemetry - -@PostConstruct -public void init() { - // Programmatic tracer access - the correct approach - this.tracer = GlobalOpenTelemetry.getTracer("payment-service", "1.0.0"); - logger.info("Tracer initialized successfully"); -} - -// Create explicit span with business context -Span span = tracer.spanBuilder("payment.process") - .setAttribute("payment.amount", paymentDetails.getAmount().toString()) - .setAttribute("payment.method", "credit_card") - .setAttribute("payment.service", "payment-service") - .startSpan(); - -try (io.opentelemetry.context.Scope scope = span.makeCurrent()) { - // Business logic here - 
span.addEvent("Starting payment processing"); - - // Add result information - span.setStatus(StatusCode.OK); -} catch (Exception e) { - // Record error details - span.recordException(e); - span.setStatus(StatusCode.ERROR, e.getMessage()); - throw e; -} finally { - span.end(); // Always end the span -} +/etc/loki/local-config.yml does not exist ---- -=== Key Telemetry Points - -The service captures telemetry at critical transaction points: - -1. **Payment Authorization**: Complete trace of payment authorization flow -2. **Payment Verification**: Detailed verification steps with fraud check results -3. **External Service Calls**: Timing of gateway communications -4. **Retry Operations**: Visibility into retry attempts and fallbacks -5. **Error Handling**: Detailed error context and fault tolerance behavior - -=== Business Context Enrichment +Cause: An override `command:` was pointing to a `.yml` file but the image default uses `.yaml`. The `docker-compose.yml` in this repo does not override the Loki command, so the image default applies and the container starts correctly. -Traces are enriched with business context to enable business-oriented analysis: - -* **Payment Amounts**: Track transaction values for business insights -* **Payment Methods**: Categorize by payment method for pattern analysis -* **Transaction IDs**: Correlate with order management systems -* **Processing Time**: Measure critical business SLAs -* **Error Categories**: Classify errors for targeted improvements - -=== Viewing Telemetry Data - -Telemetry data can be viewed in Jaeger UI: +=== No traces appear in Tempo +1. 
Confirm the OTel Collector is receiving data: ++ [source,bash] ---- -# Start Jaeger container (if not already running) -docker run --rm --name jaeger \ - -p 16686:16686 \ - -p 4317:4317 \ - -p 4318:4318 \ - -p 5778:5778 \ - -p 9411:9411 \ - jaegertracing/jaeger:2.7.0 - -# Access Jaeger UI -open http://localhost:16686 +docker compose logs --tail=30 otel-collector ---- ++ +You should see `ResourceSpans` logged by the `debug` exporter. -In the Jaeger UI: -1. Select "payment-service" from the Service dropdown -2. Choose an operation or search by transaction attributes -3. Explore the full transaction trace across services +2. Confirm `otel.sdk.disabled=false` in `microprofile-config.properties`. -=== Troubleshooting Telemetry - -If telemetry data is not appearing in Jaeger: - -1. **Verify Jaeger is running** with OTLP ports exposed (4317, 4318) -2. **Check Liberty server configuration** in server.xml -3. **Validate application configuration** in microprofile-config.properties -4. **Ensure trace application is enabled** with `` -5. **Check network connectivity** between the service and Jaeger -6. **Inspect Liberty server logs** for telemetry-related messages - -=== Testing Telemetry +3. 
Confirm the service restarted after the config change: ++ +[source,bash] +---- +# Stop (Ctrl+C in the mvn liberty:run terminal), then restart +mvn liberty:run +---- -To generate and verify telemetry data: +== Clean Up [source,bash] ---- -# Generate sample telemetry with payment request -curl -X POST -H "Content-Type: application/json" \ - -d '{"cardNumber":"4111-1111-1111-1111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":75.50}' \ - http://localhost:9080/payment/api/payments +# Stop the payment service (Ctrl+C in the mvn liberty:run terminal) +mvn liberty:stop -# Check for payment service in Jaeger UI services dropdown -curl -s http://localhost:16686/api/services +# Stop and remove all LGTM containers and volumes +cd code/chapter09 +docker compose down -v ---- -=== Benefits of Telemetry Implementation +== Related Resources -1. **End-to-End Transaction Visibility**: Follow payment flows across services -2. **Performance Monitoring**: Identify bottlenecks and optimization opportunities -3. **Error Detection**: Quickly locate and diagnose failures -4. **Dependency Analysis**: Understand service dependencies and impacts -5. **Business Insights**: Correlate technical metrics with business outcomes -6. 
**Operational Excellence**: Improve MTTR and system reliability \ No newline at end of file +* https://microprofile.io/specifications/microprofile-telemetry/[MicroProfile Telemetry 2.1 Specification] +* https://opentelemetry.io/docs/[OpenTelemetry Documentation] +* https://grafana.com/docs/tempo/[Grafana Tempo] +* https://grafana.com/docs/loki/[Grafana Loki] +* https://openliberty.io/docs/latest/microprofile-telemetry.html[Open Liberty MicroProfile Telemetry] diff --git a/code/chapter09/payment/pom.xml b/code/chapter09/payment/pom.xml index 65a1c5b5..60db6035 100644 --- a/code/chapter09/payment/pom.xml +++ b/code/chapter09/payment/pom.xml @@ -12,8 +12,8 @@ UTF-8 - 17 - 17 + 21 + 21 UTF-8 UTF-8 @@ -33,7 +33,7 @@ org.projectlombok lombok - 1.18.26 + 1.18.36 provided @@ -49,7 +49,7 @@ org.eclipse.microprofile microprofile - 6.1 + 7.1 pom provided @@ -58,8 +58,8 @@ io.opentelemetry opentelemetry-api - 1.32.0 - compile + 1.48.0 + provided @@ -73,6 +73,16 @@ ${project.artifactId} + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 21 + + + io.openliberty.tools @@ -90,4 +100,4 @@ - \ No newline at end of file + diff --git a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java index ae9258fb..b48c0c07 100644 --- a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java +++ b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java @@ -11,8 +11,10 @@ import io.microprofile.tutorial.store.payment.service.PaymentService; import jakarta.enterprise.context.RequestScoped; import jakarta.inject.Inject; +import jakarta.ws.rs.GET; import jakarta.ws.rs.POST; import jakarta.ws.rs.Path; +import jakarta.ws.rs.PathParam; import jakarta.ws.rs.Produces; import jakarta.ws.rs.QueryParam; import jakarta.ws.rs.Consumes; 
@@ -22,10 +24,13 @@ import java.math.BigDecimal; import java.util.concurrent.CompletionStage; import java.util.UUID; +import java.util.logging.Logger; @RequestScoped @Path("/") public class PaymentResource { + + private static final Logger logger = Logger.getLogger(PaymentResource.class.getName()); @Inject @ConfigProperty(name = "payment.gateway.endpoint") @@ -131,4 +136,72 @@ public Response verifyPaymentWithTelemetry(PaymentDetails paymentDetails) } } + @GET + @Path("/health/gateway") + @Produces(MediaType.APPLICATION_JSON) + @Operation(summary = "Check gateway health", description = "Check payment gateway health with circuit breaker protection") + @APIResponses(value = { + @APIResponse(responseCode = "200", description = "Gateway is healthy"), + @APIResponse(responseCode = "503", description = "Gateway is unavailable") + }) + public Response checkGatewayHealth() { + try { + boolean healthy = paymentService.checkGatewayHealth(); + + if (healthy) { + return Response.ok() + .entity("{\"status\":\"healthy\",\"message\":\"Payment gateway is operational\"}") + .build(); + } + + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"status\":\"unhealthy\",\"message\":\"Payment gateway is not responding\"}") + .build(); + } catch (Exception e) { + logger.warning("Gateway health check failed: " + e.getMessage()); + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"status\":\"circuit_open\",\"message\":\"Circuit breaker is open - gateway appears to be down\"}") + .build(); + } + } + + @POST + @Path("/notify/{paymentId}") + @Produces(MediaType.APPLICATION_JSON) + @Operation(summary = "Send payment notification", description = "Send asynchronous payment notification with bulkhead protection") + @APIResponses(value = { + @APIResponse(responseCode = "200", description = "Notification sent or queued"), + @APIResponse(responseCode = "503", description = "Bulkhead rejected request") + }) + public CompletionStage sendNotification( + 
@PathParam("paymentId") String paymentId, + @QueryParam("recipient") String recipient + ) { + + if (recipient == null || recipient.isEmpty()) { + recipient = "default@example.com"; + } + + return paymentService.sendPaymentNotification(paymentId, recipient) + .thenApply(result -> { + logger.info("Notification result: " + result); + return Response.ok() + .entity("{\"status\":\"success\",\"message\":\"" + result + "\"}") + .build(); + }) + .exceptionally(ex -> { + logger.warning("Notification failed: " + ex.getMessage()); + + if (ex.getMessage() != null && ex.getMessage().contains("BulkheadException")) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"status\":\"rejected\",\"message\":\"Too many concurrent notifications - please try again later\"}") + .build(); + } + + return Response.status(Response.Status.INTERNAL_SERVER_ERROR) + .entity("{\"status\":\"error\",\"message\":\"Notification processing failed\"}") + .build(); + }); + } + } diff --git a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java index da3ed2ba..92d971f3 100644 --- a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java +++ b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java @@ -1,21 +1,27 @@ package io.microprofile.tutorial.store.payment.service; -import io.microprofile.tutorial.store.payment.exception.PaymentProcessingException; -import io.opentelemetry.api.GlobalOpenTelemetry; -import io.opentelemetry.api.trace.Tracer; -import io.opentelemetry.api.trace.Span; import io.microprofile.tutorial.store.payment.entity.PaymentDetails; import io.microprofile.tutorial.store.payment.exception.CriticalPaymentException; +import io.microprofile.tutorial.store.payment.exception.PaymentProcessingException; + +import 
io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.LongCounter; +import io.opentelemetry.api.metrics.Meter; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.Tracer; import org.eclipse.microprofile.faulttolerance.Asynchronous; import org.eclipse.microprofile.faulttolerance.Bulkhead; +import org.eclipse.microprofile.faulttolerance.CircuitBreaker; import org.eclipse.microprofile.faulttolerance.Fallback; import org.eclipse.microprofile.faulttolerance.Retry; import org.eclipse.microprofile.faulttolerance.Timeout; -import jakarta.enterprise.context.ApplicationScoped; import jakarta.annotation.PostConstruct; - +import jakarta.enterprise.context.ApplicationScoped; +import jakarta.inject.Inject; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionStage; @@ -26,56 +32,56 @@ public class PaymentService { private static final Logger logger = Logger.getLogger(PaymentService.class.getName()); - private Tracer tracer; // Injected tracer for OpenTelemetry + @Inject + Tracer tracer; + + @Inject + Meter meter; + + private LongCounter paymentAttemptsCounter; @PostConstruct public void init() { - // Programmatic tracer access - the correct approach - this.tracer = GlobalOpenTelemetry.getTracer("payment-service", "1.0.0"); - logger.info("Tracer initialized successfully"); + paymentAttemptsCounter = meter + .counterBuilder("payment.attempts") + .setDescription("Number of payment attempts by result") + .setUnit("1") + .build(); + logger.info("PaymentService initialized with telemetry instrumentation"); } - /** - * Process the payment request with automatic tracing via MicroProfile Telemetry. - * The mpTelemetry feature automatically creates spans for this method. 
- * - * @param paymentDetails details of the payment - * @return response message indicating success or failure - * @throws PaymentProcessingException if a transient issue occurs - */ @Asynchronous @Timeout(3000) @Retry(maxRetries = 3, delay = 2000, jitter = 500, retryOn = PaymentProcessingException.class, abortOn = CriticalPaymentException.class) @Fallback(fallbackMethod = "fallbackProcessPayment") - @Bulkhead(value=5) + @Bulkhead(value = 5) public CompletionStage processPayment(PaymentDetails paymentDetails) throws PaymentProcessingException { - // Create explicit span for payment processing to help with debugging Span span = tracer.spanBuilder("payment.process") - .setAttribute("payment.amount", paymentDetails.getAmount().toString()) - .setAttribute("payment.method", "credit_card") - .setAttribute("payment.service", "payment-service") - .startSpan(); - + .setAttribute("payment.amount", paymentDetails.getAmount().toString()) + .setAttribute("payment.method", "credit_card") + .setAttribute("payment.service", "payment-service") + .startSpan(); + try (io.opentelemetry.context.Scope scope = span.makeCurrent()) { - // MicroProfile Telemetry automatically traces this method String maskedCardNumber = maskCardNumber(paymentDetails.getCardNumber()); - - logger.info(String.format("Processing payment - Amount: %s, Card: %s", + logger.info(String.format("Processing payment - Amount: %s, Card: %s", paymentDetails.getAmount(), maskedCardNumber)); - + + span.setAttribute("payment.status", "IN_PROGRESS"); span.addEvent("Starting payment processing"); simulateDelay(); - // Simulating a transient failure if (Math.random() > 0.7) { - span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Payment processing failed"); + paymentAttemptsCounter.add(1, Attributes.of(AttributeKey.stringKey("result"), "failed")); + span.setStatus(StatusCode.ERROR, "Payment processing failed"); span.addEvent("Payment processing failed due to transient error"); logger.warning("Payment processing 
failed due to transient error"); throw new PaymentProcessingException("Temporary payment processing failure"); } - // Simulating successful processing - span.setStatus(io.opentelemetry.api.trace.StatusCode.OK); + paymentAttemptsCounter.add(1, Attributes.of(AttributeKey.stringKey("result"), "success")); + span.setAttribute("payment.status", "SUCCESS"); + span.setStatus(StatusCode.OK); span.addEvent("Payment processed successfully"); logger.info("Payment processed successfully"); return CompletableFuture.completedFuture("{\"status\":\"success\", \"message\":\"Payment processed successfully.\"}"); @@ -84,51 +90,74 @@ public CompletionStage processPayment(PaymentDetails paymentDetails) thr } } - /** - * Fallback method when payment processing fails. - * Automatically traced by MicroProfile Telemetry. - * - * @param paymentDetails details of the payment - * @return response message for fallback - */ public CompletionStage fallbackProcessPayment(PaymentDetails paymentDetails) { - logger.warning(() -> String.format("Fallback invoked for payment - Amount: %s", + paymentAttemptsCounter.add(1, Attributes.of(AttributeKey.stringKey("result"), "fallback")); + logger.warning(() -> String.format("Fallback invoked for payment - Amount: %s", paymentDetails.getAmount())); - return CompletableFuture.completedFuture("{\"status\":\"failed\", \"message\":\"Payment service is currently unavailable.\"}"); } - /** - * Masks a credit card number for security in logs and traces. - * Only the last 4 digits are shown, all others are replaced with 'X'. 
- * - * @param cardNumber The full card number - * @return A masked card number (e.g., "XXXXXXXXXXXX1234") - */ + @CircuitBreaker(requestVolumeThreshold = 4, failureRatio = 0.75, delay = 1000, successThreshold = 2) + public boolean checkGatewayHealth() { + logger.info("Checking payment gateway health"); + simulateNetworkCall(200); + if (Math.random() > 0.9) { + logger.warning("Gateway health check failed"); + throw new RuntimeException("Payment gateway not responding"); + } + logger.info("Gateway is healthy"); + return true; + } + + @Asynchronous + @Bulkhead(5) + public CompletionStage sendPaymentNotification(String paymentId, String recipient) { + logger.info(() -> String.format("Sending payment notification - Payment ID: %s, Recipient: %s", + paymentId, recipient)); + simulateNetworkCall(300); + logger.info("Payment notification sent successfully"); + return CompletableFuture.completedFuture( + String.format("Notification sent to %s for payment %s", recipient, paymentId)); + } + + @Asynchronous + public CompletionStage verifyPaymentWithTelemetry(PaymentDetails paymentDetails, String transactionId) + throws PaymentProcessingException { + logger.info(() -> String.format("Starting payment verification - Transaction ID: %s", transactionId)); + + try { + validatePaymentDetails(paymentDetails); + performFraudCheck(paymentDetails, transactionId); + verifyFundsAvailability(paymentDetails); + recordTransaction(paymentDetails, transactionId); + + logger.info("Payment verification completed successfully"); + return CompletableFuture.completedFuture( + String.format("{\"status\":\"verified\", \"transaction_id\":\"%s\", \"message\":\"Payment verification complete.\"}", + transactionId)); + } catch (Exception e) { + logger.severe(() -> String.format("Payment verification failed: %s", e.getMessage())); + throw e; + } + } + private String maskCardNumber(String cardNumber) { if (cardNumber == null || cardNumber.length() < 4) { return "INVALID_CARD"; } - int visibleDigits = 4; int 
length = cardNumber.length(); - StringBuilder masked = new StringBuilder(); - for (int i = 0; i < length - visibleDigits; i++) { + for (int i = 0; i < length - 4; i++) { masked.append('X'); } - masked.append(cardNumber.substring(length - visibleDigits)); - + masked.append(cardNumber.substring(length - 4)); return masked.toString(); } - /** - * Simulate a delay in processing to demonstrate timeout. - * This method will be automatically traced by MicroProfile Telemetry. - */ private void simulateDelay() { try { logger.fine("Starting payment processing delay simulation"); - Thread.sleep(1500); // Simulated long-running task + Thread.sleep(1500); logger.fine("Payment processing delay simulation completed"); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -137,119 +166,48 @@ private void simulateDelay() { } } - /** - * Processes a comprehensive payment verification with multiple steps. - * Each method call will be automatically traced by MicroProfile Telemetry. - * - * @param paymentDetails The payment details to verify - * @param transactionId The unique transaction ID - * @return A detailed verification result - * @throws PaymentProcessingException if verification fails - */ - @Asynchronous - public CompletionStage verifyPaymentWithTelemetry(PaymentDetails paymentDetails, String transactionId) - throws PaymentProcessingException { - - logger.info(() -> String.format("Starting payment verification - Transaction ID: %s", transactionId)); - - try { - // Step 1: Validate payment details - validatePaymentDetails(paymentDetails); - - // Step 2: Check for fraud indicators - performFraudCheck(paymentDetails, transactionId); - - // Step 3: Verify funds with bank - verifyFundsAvailability(paymentDetails); - - // Step 4: Record transaction - recordTransaction(paymentDetails, transactionId); - - logger.info("Payment verification completed successfully"); - return CompletableFuture.completedFuture( - String.format("{\"status\":\"verified\", 
\"transaction_id\":\"%s\", \"message\":\"Payment verification complete.\"}", - transactionId)); - } catch (Exception e) { - logger.severe(() -> String.format("Payment verification failed: %s", e.getMessage())); - throw e; - } - } - - /** - * Validates payment details - automatically traced - */ private void validatePaymentDetails(PaymentDetails details) throws PaymentProcessingException { logger.info("Validating payment details"); - - boolean isValid = details.getCardNumber() != null && - details.getCardNumber().length() >= 15 && - details.getExpiryDate() != null && - details.getAmount() != null && - details.getAmount().doubleValue() > 0; - + boolean isValid = details.getCardNumber() != null && + details.getCardNumber().length() >= 15 && + details.getExpiryDate() != null && + details.getAmount() != null && + details.getAmount().doubleValue() > 0; if (!isValid) { logger.warning("Payment details validation failed"); throw new PaymentProcessingException("Payment details validation failed"); } - logger.info("Payment details validation successful"); } - - /** - * Performs fraud check - automatically traced - */ + private void performFraudCheck(PaymentDetails details, String transactionId) throws PaymentProcessingException { logger.info(() -> String.format("Performing fraud check for transaction: %s", transactionId)); - - // Simulate external service call simulateNetworkCall(300); - - // Simulate fraud detection (cards ending with "0000" are flagged) boolean isSafe = !details.getCardNumber().endsWith("0000"); - if (!isSafe) { logger.warning("Potential fraud detected"); throw new PaymentProcessingException("Fraud check failed"); } - logger.info("Fraud check passed"); } - - /** - * Verifies funds availability - automatically traced - */ + private void verifyFundsAvailability(PaymentDetails details) throws PaymentProcessingException { logger.info(() -> String.format("Verifying funds availability - Amount: %s", details.getAmount())); - - // Simulate banking service call 
simulateNetworkCall(500); - - // Simulate funds verification (amounts over 1000 fail) boolean hasFunds = details.getAmount().doubleValue() <= 1000; - if (!hasFunds) { logger.warning("Insufficient funds detected"); throw new PaymentProcessingException("Insufficient funds"); } - logger.info("Sufficient funds verified"); } - - /** - * Records transaction - automatically traced - */ + private void recordTransaction(PaymentDetails details, String transactionId) { logger.info(() -> String.format("Recording transaction: %s", transactionId)); - - // Simulate database operation simulateNetworkCall(200); - logger.info("Transaction recorded successfully"); } - - /** - * Simulates network calls or database operations - automatically traced - */ + private void simulateNetworkCall(int milliseconds) { try { logger.fine(() -> String.format("Simulating network call - Duration: %dms", milliseconds)); diff --git a/code/chapter09/payment/src/main/liberty/config/server.xml b/code/chapter09/payment/src/main/liberty/config/server.xml index 2d5c7290..c680bc91 100644 --- a/code/chapter09/payment/src/main/liberty/config/server.xml +++ b/code/chapter09/payment/src/main/liberty/config/server.xml @@ -1,7 +1,7 @@ + microProfile-7.1 jakartaEE-10.0 - microProfile-6.1 restfulWS jsonp jsonb @@ -11,7 +11,6 @@ mpHealth mpMetrics mpTelemetry - mpOpenTracing mpFaultTolerance diff --git a/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties b/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties index d41c5ed4..f7fcce07 100644 --- a/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties +++ b/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties @@ -13,5 +13,14 @@ io.microprofile.tutorial.store.payment.service.PaymentService/processPayment/Ret # MicroProfile Telemetry Configuration otel.service.name=payment-service otel.sdk.disabled=false -otel.metrics.exporter=none -otel.logs.exporter=none \ 
No newline at end of file + +# OTLP Exporter Configuration +# Override with the forwarded port when running in GitHub Codespaces: http://localhost:24317 +otel.exporter.otlp.endpoint=http://localhost:24317 +otel.traces.exporter=otlp +otel.metrics.exporter=otlp +otel.logs.exporter=otlp + +# Sampling +otel.traces.sampler=parentbased_always_on + diff --git a/code/chapter09/payment/src/main/webapp/index.html b/code/chapter09/payment/src/main/webapp/index.html index f7ba4adc..7d20f937 100644 --- a/code/chapter09/payment/src/main/webapp/index.html +++ b/code/chapter09/payment/src/main/webapp/index.html @@ -262,7 +262,7 @@

Links

MicroProfile Config, Fault Tolerance & Telemetry Demo | Payment Service

-

Powered by Open Liberty, MicroProfile 6.1 (Config 3.0, Fault Tolerance 4.0, Telemetry 1.1)

+

Powered by Open Liberty, MicroProfile 7.1 (Config 3.1, Fault Tolerance 4.1, Telemetry 2.1)

diff --git a/code/chapter09/prometheus.yml b/code/chapter09/prometheus.yml new file mode 100644 index 00000000..eae5740a --- /dev/null +++ b/code/chapter09/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:8889'] diff --git a/code/chapter09/tempo-config.yaml b/code/chapter09/tempo-config.yaml new file mode 100644 index 00000000..8ccb42b2 --- /dev/null +++ b/code/chapter09/tempo-config.yaml @@ -0,0 +1,20 @@ +server: + http_listen_port: 3200 + log_level: info + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +storage: + trace: + backend: local + wal: + path: /var/tempo/wal + local: + path: /var/tempo/blocks diff --git a/modules/ROOT/pages/chapter09/index.adoc b/modules/ROOT/pages/chapter09/index.adoc index 912de480..0bd2b131 100644 --- a/modules/ROOT/pages/chapter09/index.adoc +++ b/modules/ROOT/pages/chapter09/index.adoc @@ -1,26 +1,29 @@ = MicroProfile Telemetry -Microservices-based applications have better scalability, flexibility, and resilience, but they suffer from additional challenges regarding availability and performance monitoring. This makes observability critical to ensure these distributed systems operate reliably. +Microservices-based applications offer scalability, flexibility, and resilience, but they also introduce challenges in availability and performance monitoring. Observability is critical to ensure that these distributed systems operate reliably. -MicroProfile Telemetry specification provides a set of vendor-neutral APIs for instrumenting, collecting, and exporting telemetry data such as traces, metrics, and logs. 
It is built on the foundation of https://opentelemetry.io/[OpenTelemetry] from the https://www.cncf.io/[Cloud Native Computing Foundation (CNCF)] project, an open-source observability framework. +https://opentelemetry.io/[OpenTelemetry], from the https://www.cncf.io/[Cloud Native Computing Foundation (CNCF)] project, is an open-source observability framework that provides standardized APIs, SDKs, and tools to create, collect, and manage telemetry data, including traces, metrics, and logs. The MicroProfile Telemetry specification defines how OpenTelemetry components integrate with MicroProfile, which helps applications participate in distributed tracing environments with a consistent, vendor-neutral experience. -In this chapter, we will explore the fundamentals of MicroProfile Telemetry, covering topics such as tracing concepts, instrumenting Telemetry, setting up tracing providers, context propagation and correlation, analyzing traces, security considerations for tracing, and more. By the end of this chapter, you will learn how to effectively leverage distributed tracing for debugging, performance monitoring, and system optimization. +This chapter explores the fundamentals of MicroProfile Telemetry, including tracing concepts, telemetry instrumentation, tracing provider setup, context propagation and correlation, trace analysis, and security considerations. By the end of this chapter, developers can use distributed tracing effectively for debugging, performance monitoring, and system optimization. 
-== Topics to be covered +== Topics Covered * Introduction to MicroProfile Telemetry -* Tracing Concepts -** Spans +* Tracing Concepts +** Spans ** Traces ** Context Propagation ** Correlation * Instrumenting OpenTelemetry * Tools for Trace Analysis -* Exporting the Traces +* Exporting Telemetry Data * Types of Telemetry +* Metrics +* Logs * Agent Instrumentation * Analyzing Traces * Security Considerations for Tracing +* What's New in MicroProfile Telemetry 2.1 == Introduction to MicroProfile Telemetry @@ -30,16 +33,16 @@ Some of the key challenges in microservices-based applications include: * *Complexity due to Distributed Architecture*: Microservices are often deployed across multiple nodes, containers, or cloud environments, making it challenging to track requests as they move through the system. This lack of visibility increases debugging complexity, making it harder to identify bottlenecks and analyze system behavior. * *Polyglot Architecture*: Microservices are developed using multiple programming languages (e.g., Java, Python, and Go) and frameworks, resulting in inconsistent telemetry data and a lack of standardization in observability. This fragmentation makes correlating logs, traces, and metrics across services difficult. -* *Latency*: Communication between Microservices involves latency, and all of this adds up as requests traverse several services. This makes it difficult to identify the root causes of issues. -Ensuring High Availability: Failures in one microservice can affect the entire system, impacting multiple dependent microservices. This can lead to downtime or degraded performance, resulting in lost revenue and diminished user trust. +* *Latency*: Communication between microservices introduces latency, and this latency accumulates as requests traverse several services. This makes it difficult to identify root causes. +* *High Availability*: Failures in one microservice can affect the entire system, including dependent services. 
This can lead to downtime or degraded performance, resulting in lost revenue and diminished user trust. -To address these challenges, MicroProfile Telemetry specification provides a standardized set of APIs for capturing telemetry data, including trace information and context propagation, to improve observability in distributed systems. By enabling seamless tracing, developers can analyze system behavior, troubleshoot service interactions, and ensure application reliability. +To address these challenges, the MicroProfile Telemetry specification provides a standardized set of APIs for capturing telemetry data, including trace information and context propagation, to improve observability in distributed systems. By enabling seamless tracing, developers can analyze system behavior, troubleshoot service interactions, and improve application reliability. -MicroProfile Telemetry is vendor-neutral. It allows developers to switch between different OpenTelemetry implementations without modifying their application code. This flexibility ensures that MicroProfile applications can easily integrate with various observability platforms, making it easier to adopt, scale, and maintain Telemetry in modern cloud-native environments. +MicroProfile Telemetry is vendor-neutral. It allows developers to switch between OpenTelemetry implementations without modifying application code. This flexibility helps MicroProfile applications integrate with different observability platforms, making telemetry easier to adopt, scale, and maintain in modern cloud-native environments. == Tracing Concepts -Tracing is critical for observability. It allows developers to inspect the flow of requests as they traverse through distributed systems. Tracing provides visibility into the interactions and dependencies within a system by breaking down a request into multiple spans, and connecting them into traces with context propagated across services. +Tracing is critical for observability. 
It allows developers to inspect request flow across distributed systems. Tracing provides visibility into system interactions and dependencies by breaking a request into multiple spans and connecting those spans into traces with context propagated across services. === Spans @@ -59,16 +62,16 @@ A *trace* is a collection of related spans representing the end-to-end execution For example: ``` API Gateway (Root Span) + -│ +│ ├── Order Service (Child Span) + -│ │ +│ │ │ ├── Database Query (Another Child Span) + │ │ ├── Fetch Order Details + │ │ ├── Process Order Data + │ │ └── Return Data to Order Service + -│ │ +│ │ │ └── Return Response to API Gateway + -│ +│ └── API Gateway Sends Final Response to User ``` @@ -78,13 +81,13 @@ API Gateway (Root Span) + === Correlation -Context propagation is vital for connecting distributed spans and understanding their relationship ensuring trace metadata remains correlated as it travels with requests across service boundaries. +Context propagation is vital for connecting distributed spans and understanding their relationships. It ensures that trace metadata remains correlated as it travels with requests across service boundaries. *Correlation* is the process of associating related spans and traces across multiple services and threads to form a cohesive view of a transaction. Correlation enables developers to: * Identify the source of bottlenecks or errors in distributed systems. * Understand the dependencies and interactions between services. -When viewing logs, the +traceId+ and +spanId+ allow you to link specific log entries to the corresponding spans in your tracing system. +When viewing logs, the +traceId+ and +spanId+ allow developers to link specific log entries to the corresponding spans in their tracing system. * *Trace ID*: A unique identifier shared across all spans in a single trace. * *Span ID*: A unique identifier for a single span. It is linked to a parent span, forming a hierarchy. 
@@ -105,14 +108,14 @@ To enable tracing and exporting of telemetry data, include the MicroProfile Tele org.eclipse.microprofile.telemetry microprofile-telemetry-api - 1.1 + 2.1 provided ---- === *Step 2: Create a Tracer* -MicroProfile automatically traces requests, but you can manually instrument your code using OpenTelementry APIs. +MicroProfile automatically traces requests, but developers can manually instrument their code by using OpenTelemetry APIs. A *Tracer* is a core component of OpenTelemetry, responsible for *creating spans* and *managing trace data* within the application. To use it, inject a +Tracer+ instance into your MicroProfile service: @@ -132,7 +135,7 @@ public class PaymentService { public void processPayment(String orderId, double amount) { // Create a custom span for tracing the payment process Span span = tracer.spanBuilder("payment.process").startSpan(); - + try { span.setAttribute("order.id", orderId); span.setAttribute("payment.amount", amount); @@ -169,7 +172,7 @@ Use the Tracer to create a span that represents a specific operation or activity Span span = tracer.spanBuilder("my-span").startSpan(); ---- -The method `spanBuilder("my-span")` creates a new named span, which represents a specific operation within the application's execution flow. This helps in tracing and monitoring the operation as part of a distributed system. Calling `startSpan()` marks the beginning of the span lifecycle, ensuring that the span is actively recorded until it is explicitly ended. This allows telemetry data to be captured for performance analysis, debugging, and observability. +The method `spanBuilder("my-span")` creates a named span that represents a specific operation in the application's execution flow. This helps trace and monitor that operation as part of a distributed system. Calling `startSpan()` marks the beginning of the span lifecycle and records data until the span is explicitly ended. 
This telemetry data supports performance analysis, debugging, and observability. === *Step 4: Add Attributes to the Span* @@ -220,6 +223,9 @@ https://www.jaegertracing.io/[Jaeger] is an open-source distributed tracing syst One of Jaeger’s key capabilities is service dependency analysis, which helps identify how microservices interact, providing insights into latency, failures, and request propagation. It also supports adaptive sampling strategies, allowing developers to control the volume of traces collected to optimize performance without overwhelming storage and processing resources. Additionally, Jaeger offers built-in storage options, allowing trace data to be stored in Elasticsearch, Cassandra, or Kafka, making it scalable and flexible for various deployment environments. + +*Note*: While Jaeger excels at distributed tracing, for comprehensive observability that covers logs, metrics, and traces, consider using the LGTM stack (described in the "Verify the Traces" section) as an integrated solution that combines Logs (Loki), Grafana, Traces (Tempo), and Metrics (Prometheus). + === Zipkin https://zipkin.io/[Zipkin] is a distributed tracing system designed to help developers visualize and diagnose latency issues in microservices-based applications. It provides a lightweight and fast tracing solution, making it ideal for quick deployment with minimal resource usage. Its simplicity and efficiency make it a popular choice for teams looking to implement tracing without significant infrastructure overhead. @@ -231,61 +237,352 @@ One of Zipkin’s core strengths is its tag-based searching, which allows develo https://grafana.com/oss/tempo/[Grafana Tempo] is a distributed tracing backend. Unlike Jaeger and Zipkin, Tempo does not require indexing as it only requires object storage, making it highly scalable and cost-efficient for handling large volumes of trace data. 
This unique approach allows Tempo to store traces efficiently without increasing storage and query overhead, making it an ideal choice for high-performance microservices environments. One of Tempo’s key advantages is its tight integration with Grafana dashboards, enabling developers to correlate logs, metrics, and traces within a unified observability platform. Additionally, Tempo offers multi-backend support, meaning it can ingest and process trace data from OpenTelemetry, Jaeger, and Zipkin sources, ensuring compatibility with existing tracing setups. Its scalability makes it well-suited for large-scale microservices architectures, where efficiently managing distributed tracing data is crucial.
-== Exporting the Traces
+== Exporting Telemetry Data
+
+To export telemetry data, configure the exporter type and endpoint in `src/main/resources/META-INF/microprofile-config.properties`.
+MicroProfile Telemetry 2.0 and later support all three signal types: traces, metrics, and logs.
+For OTLP (OpenTelemetry Protocol) export, add the following configuration:
+
+[source]
+----
+# Enable OpenTelemetry
+otel.sdk.disabled=false
+
+# Set the OTLP exporter endpoint (gRPC default: port 4317)
+otel.exporter.otlp.endpoint=http://localhost:4317
+
+# Define the service name
+otel.service.name=payment-service
+
+# Sampling: parentbased_always_on is the default
+otel.traces.sampler=parentbased_always_on
+----
-To export the traces we need to configure the exporter type and endpoint in the `src/main/resources/META-INF/microprofile-config.properties`.
-For using OTLP (OpenTelemetry Protocol) export, you need to add the following configuration in: +Configure signal-specific exporters only when developers need to override the shared OTLP endpoint or protocol: [source] ---- -# Enable OpenTelemetry +# Traces exporter (default: otlp) otel.traces.exporter=otlp +# Metrics exporter (default: otlp) +otel.metrics.exporter=otlp + +# Logs exporter (default: otlp) +otel.logs.exporter=otlp +---- + +This configuration sends telemetry data directly to an observability backend, enabling real-time distributed tracing, metrics collection, and log correlation. Ensure that the observability backend (for example, Jaeger for traces, or Grafana with Tempo and Loki) is running to receive telemetry data. + +OTLP is the native standard for OpenTelemetry. It allows developers to use multiple observability platforms without changing instrumentation, providing a unified, vendor-neutral telemetry solution. + +=== Verify the Traces + +After enabling tracing and configuring the exporter, verify that the traces are being captured and sent to the observability backend. This step confirms that the MicroProfile Telemetry setup functions correctly and that distributed tracing data is available for monitoring and debugging. + +==== Run LGTM (Logs, Grafana, Traces, and Metrics) + +https://github.com/grafana/docker-otel-lgtm[LGTM] is a comprehensive Docker-based observability stack that combines multiple open-source tools into a single, unified platform for collecting, storing, and visualizing telemetry data. It provides an integrated solution that consolidates logs, metrics, and traces in one place, simplifying observability management for developers. 
+
+LGTM includes:
+
+* *Logs (Loki)*: A log aggregation system for storing and querying logs
+* *Grafana*: A powerful visualization platform for dashboards and analytics
+* *Traces (Tempo)*: A distributed tracing backend for storing and analyzing traces
+* *Metrics (Prometheus)*: A time-series database for collecting and querying metrics
+* *OpenTelemetry Collector*: An intermediary for receiving and processing telemetry data
+
+===== Set Up LGTM with Docker Compose
+
+To run the complete LGTM stack, create a `docker-compose.yml` file in your project directory with the following configuration. The `tempo-config.yaml` file mounted into the `tempo` service is provided with the chapter's sample code and configures Tempo's OTLP receivers and local trace storage. Note that only the `otel-collector` publishes the OTLP ports (4317/4318) on the host; Tempo's OTLP ports are remapped to avoid a host-port conflict:
+
+[source, yaml]
+----
+version: '3.8'
+
+services:
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    environment:
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+    volumes:
+      - grafana-storage:/var/lib/grafana
+    depends_on:
+      - prometheus
+      - loki
+      - tempo
+
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml
+      - prometheus-storage:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+
+  loki:
+    image: grafana/loki:latest
+    container_name: loki
+    ports:
+      - "3100:3100"
+    volumes:
+      - loki-storage:/loki
+
+  tempo:
+    image: grafana/tempo:latest
+    container_name: tempo
+    ports:
+      - "3200:3200"
+      - "14317:4317"
+      - "14318:4318"
+    volumes:
+      - tempo-storage:/var/tempo
+      - ./tempo-config.yaml:/etc/tempo/tempo-config.yaml
+    command: [ "-config.file=/etc/tempo/tempo-config.yaml" ]
+
+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:latest
+    container_name: otel-collector
+    ports:
+      - "4317:4317"
+      - "4318:4318"
+      - "9411:9411"
+    volumes:
+      - ./otel-collector-config.yml:/etc/otel-collector-config.yml
+    command: [ "--config=/etc/otel-collector-config.yml" ]
+    depends_on:
+      - loki
+      - prometheus
+      - tempo
+
+volumes:
+  grafana-storage:
+  prometheus-storage:
+  loki-storage:
+ 
tempo-storage:
+----
+
+===== Configure OpenTelemetry Collector
+
+Create an `otel-collector-config.yml` file to configure the OpenTelemetry Collector to receive telemetry data and export it to the appropriate backends:
+
+[source, yaml]
+----
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+    timeout: 10s
+    send_batch_size: 1024
+
+exporters:
+  debug:
+    verbosity: detailed
+
+  prometheus:
+    endpoint: "0.0.0.0:9411"
+
+  otlp:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+
+  otlphttp/loki:
+    endpoint: http://loki:3100/otlp
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp, debug]
+
+    metrics:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [prometheus, debug]
+
+    logs:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlphttp/loki, debug]
+----
+
+===== Configure Prometheus
+
+Create a `prometheus.yml` file to configure Prometheus to scrape metrics:
+
+[source, yaml]
+----
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'otel-collector'
+    static_configs:
+      - targets: ['otel-collector:9411']
+----
+
+===== Start the LGTM Stack
+
+To start all services, run the following command in the directory containing the `docker-compose.yml` file:
+
+[source, bash]
+----
+docker-compose up -d
+----
+
+Verify that all services are running:
+
+[source, bash]
+----
+docker-compose ps
+----
+
+===== Configure MicroProfile Application for LGTM
+
+To send telemetry data to the LGTM stack, update the `src/main/resources/META-INF/microprofile-config.properties` file in your MicroProfile application with the following configuration:
+
+[source]
+----
+# Enable OpenTelemetry
+otel.sdk.disabled=false
+
 # Set the OTLP exporter endpoint
-otel.exporter.otlp.endpoint=http://localhost:4317
+otel.exporter.otlp.endpoint=http://otel-collector:4317
 
 # Define the service name
-otel.service.name=payment-service
+otel.service.name=payment-service
 
-# Sampling rate: (1.0 = always, 0.5 = 50%, 0.0 = never)
+# Sampling: parentbased_always_on is the default
 otel.traces.sampler=parentbased_always_on
+
+# Configure traces exporter
+otel.traces.exporter=otlp
+
+# Configure metrics exporter
+otel.metrics.exporter=otlp
+
+# Configure logs exporter
+otel.logs.exporter=otlp
 ----
 
-This sends traces directly to a observability tool, enabling real-time distributed tracing and performance monitoring. To ensure proper tracing, your observability tool (for e.g. Jaeger) must be running to receive trace data.
+===== Access the LGTM Components
 
-Using OTLP is advantageous because it is the native standard for OpenTelemetry, ensuring seamless integration with a wide range of observability tools. One of its key benefits is that it allows developers to use multiple observability platforms without changing instrumentation, providing a unified and vendor-neutral tracing solution.
+Once the LGTM stack is running and your MicroProfile application is sending telemetry data, access the various components to monitor your services:
 
-=== Verify the Traces
+====== Grafana
+
+Access the Grafana dashboards at `http://localhost:3000`. The default username is `admin` and the default password is `admin`. You can create custom dashboards to visualize metrics, logs, and traces.
 
-Once tracing is enabled and the appropriate exporter is configured, the next step is to verify that traces are being captured and sent to the observability backend. This ensures that the MicroProfile Telemetry setup is functioning correctly and that distributed tracing data is available for monitoring and debugging.
+To set up data sources in Grafana:
 
-==== Run Jaeger
+1. Navigate to *Configuration* -> *Data Sources*
+2.
Add the following data sources:
+ - *Prometheus*: `http://prometheus:9090`
+ - *Loki*: `http://loki:3100`
+ - *Tempo*: `http://tempo:3200`
 
-The simplest way to run Jaeger is with Docker using the command as below:
+====== View Logs in Grafana/Loki
 
+1. Open Grafana at `http://localhost:3000`
+2. Click on *Explore* in the left sidebar
+3. Select *Loki* as the data source
+4. Use the log query syntax to filter logs. For example:
++
+[source]
+----
+{job="payment-service"} |= "error"
+----
++
+This query retrieves all error logs from the payment service. You can also filter by trace ID to correlate logs with specific traces:
++
+[source]
+----
+{job="payment-service"} |= "trace_id=abc123"
+----
+
+====== View Metrics in Prometheus
+
+1. Access Prometheus directly at `http://localhost:9090` or through Grafana
+2. In the *Prometheus* tab, use PromQL (Prometheus Query Language) to query metrics. For example:
++
+[source]
+----
+http_requests_total{service="payment-service"}
+----
++
+This query retrieves the total number of HTTP requests for the payment service. You can also create graphs by clicking the *Graph* tab and add custom panels to Grafana dashboards for long-term monitoring.
+
+====== View Traces in Grafana with Tempo
+
+1. Open Grafana at `http://localhost:3000`
+2. Click on *Explore* in the left sidebar
+3. Select *Tempo* as the data source
+4. Search for traces using various criteria:
+ - *Service Name*: Select the payment service to view traces for that service
+ - *Trace ID*: Enter a specific trace ID to find a particular trace
+ - *Operation*: Filter by operation name (e.g., `payment.process`)
+ - *Duration*: Set a time range to find traces within a specific duration
+ - *Status*: Filter by trace status (e.g., OK, ERROR)
+5.
Click on a trace to view its detailed breakdown:
+ - *Timeline*: View the temporal relationship between spans
+ - *Span Details*: Examine individual span attributes, events, and exceptions
+ - *Logs*: View logs associated with the trace by clicking on the *Logs* tab
+ - *Metrics*: View metrics related to the trace (if configured)
+6. Use the *Service Graph* to visualize service dependencies and identify bottlenecks or performance issues
+
+===== Troubleshooting LGTM
+
+If telemetry data is not appearing in LGTM, follow these troubleshooting steps:
+
+1. **Verify Services are Running**: Confirm that all LGTM services are running:
++
 [source, bash]
 ----
-docker run -d --name jaeger \
-  -e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \
-  -p 5775:5775/udp \
-  -p 6831:6831/udp \
-  -p 6832:6832/udp \
-  -p 5778:5778 \
-  -p 16686:16686 \
-  -p 14268:14268 \
-  -p 14250:14250 \
-  -p 9411:9411 \
-  jaegertracing/all-in-one:latest
+docker-compose ps
 ----
 
-The above command runs the *all-in-one* Jaeger container, which includes the agent, collector, query service, and UI.
+2. **Check Network Connectivity**: Ensure that the MicroProfile application can reach the OpenTelemetry Collector. If running outside Docker, use the host IP instead of `localhost` or container names.
+
+3. **Verify Configuration**: Confirm that the OTLP endpoint in the application configuration matches the OpenTelemetry Collector address.
 
-The Jaeger UI can be accessed at: `https://:16686`.
+4. **Check Logs**: View the logs from the OpenTelemetry Collector and other LGTM services to identify any errors:
++
 [source, bash]
 ----
+docker-compose logs otel-collector
+docker-compose logs tempo
+docker-compose logs loki
+docker-compose logs prometheus
 ----
 
-Ensure all the services of our MicroProfile E-commerce applications are running.
+5. **Verify Data Flow**: Use the OpenTelemetry Collector's `debug` exporter (named `logging` in older Collector releases) to confirm that telemetry data is being received and processed.
-Search using parameters like operation name, time range, or service for the traces associated with different microservices and confirm that the telemetry data is visible. -View a detailed breakdown of each span within the trace, including timing and attributes. +6. **Test Application Requests**: Ensure that the MicroProfile application is processing requests. Generate some HTTP requests to trigger telemetry data collection: ++ +[source, bash] +---- +curl -X GET http://localhost:8080/api/payments/123 +---- == Types of Telemetry @@ -326,7 +623,7 @@ public class PaymentService { } ---- -Every time processPayment is called, a new span is created. The span is automatically linked to the current trace context. No need for explicit span creation or lifecycle management. You can use `@WithSpan` for tracing key business operations, such as order processing, payment handling, or API requests. +Each time `processPayment` is called, a new span is created. The span is automatically linked to the current trace context. This approach avoids explicit span creation and lifecycle management. You can use `@WithSpan` to trace key business operations, such as order processing, payment handling, or API requests. ==== Using `SpanBuilder` for Custom Spans @@ -357,11 +654,11 @@ public class TraceResource { } ---- -The method `tracer.spanBuilder("custom-span").startSpan()` creates a span with a specific name allowing developers to define meaningful trace segments for better observability. Using `span.setAttribute("custom.key", "customValue")`, custom metadata can be attached to the span, enriching trace data with relevant contextual information. Finally, calling `span.end()` explicitly marks the completion of the span, ensuring accurate tracking of execution duration. The `SpanBuilder` approach is particularly useful when developers require fine-grained control over when spans start and end, as well as the ability to include detailed metadata for enhanced trace analysis. 
+The method `tracer.spanBuilder("custom-span").startSpan()` creates a span with a specific name, which allows developers to define meaningful trace segments for better observability. Using `span.setAttribute("custom.key", "customValue")`, custom metadata can be attached to the span to enrich trace data with relevant contextual information. Calling `span.end()` explicitly marks the completion of the span and ensures accurate tracking of execution duration. The `SpanBuilder` approach is useful when developers need fine-grained control over span start and end points and detailed metadata for trace analysis. === Manual Tracing in `PaymentService` -To manually instrument the processPayment method in the PaymentService, we use OpenTelemetry’s API to create a custom span, add attributes, and control the span lifecycle. +To manually instrument the `processPayment` method in `PaymentService`, use the OpenTelemetry API to create a custom span, add attributes, and control the span lifecycle. [source, java] ---- @@ -388,7 +685,7 @@ public class PaymentService { span.setAttribute("payment.status", "IN_PROGRESS"); // Business logic for processing the payment - System.out.println(“Processing Payment…); + System.out.println("Processing payment..."); // Update span attribute on successful completion span.setAttribute("payment.status", "SUCCESS"); @@ -408,7 +705,7 @@ The `payment.process` span is manually created using `tracer.spanBuilder()`, all In the event of an error, the span captures and records the exception, ensuring failure details are logged for debugging. The span lifecycle is carefully managed, starting before the business logic executes and ending only after the process is completed in the `finally` block. This structured approach guarantees accurate performance monitoring and trace completeness, improving visibility into how payments are processed in a distributed system. 
-== Agent Instrumentation +== Agent Instrumentation Agent Instrumentation enables telemetry data collection without modifying application code by attaching a Java agent at runtime. This approach is particularly useful for legacy applications or scenarios where modifying source code is not feasible. The OpenTelemetry Java Agent dynamically instruments applications, automatically detecting and tracing interactions within commonly used frameworks such as Jakarta RESTful Web Services, database connections, and messaging systems. @@ -417,15 +714,109 @@ One of the key advantages of agent-based instrumentation is that it requires no Refer to the https://opentelemetry.io/docs/zero-code/java/agent/getting-started/[OpenTelemetry Java Agent Getting Started page] for step-by-step instructions on enabling it for your application. Once enabled, the agent automatically instruments the application, seamlessly integrating with distributed tracing systems without requiring developer intervention. This makes it an efficient and non-intrusive way to implement observability in MicroProfile applications. -Once enabled, the agent automatically instruments the application, seamlessly integrating with distributed tracing systems without requiring developer intervention. This makes it an efficient and non-intrusive way to implement observability in MicroProfile applications. +== Metrics + +Metrics are measurements of application and runtime behavior. Applications can define custom metrics in addition to the required metrics provided by the runtime. 
+ +=== Access to the OpenTelemetry Metrics API + +MicroProfile Telemetry MUST provide the following CDI bean for supporting contextual instance injection: + +* `io.opentelemetry.api.metrics.Meter` + +Inject the `Meter` to define and record custom metrics: + +[source, java] +---- +import io.opentelemetry.api.metrics.LongCounter; +import io.opentelemetry.api.metrics.Meter; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.common.AttributeKey; +import jakarta.annotation.PostConstruct; +import jakarta.enterprise.context.ApplicationScoped; +import jakarta.inject.Inject; + +@ApplicationScoped +public class SubscriptionService { + + @Inject + Meter meter; + + private LongCounter subscriptionCounter; + + @PostConstruct + public void init() { + subscriptionCounter = meter + .counterBuilder("new_subscriptions") + .setDescription("Number of new subscriptions") + .setUnit("1") + .build(); + } + + public void subscribe(String plan) { + subscriptionCounter.add(1, + Attributes.of(AttributeKey.stringKey("plan"), plan)); + } +} +---- + +The `Meter` instance creates instruments such as counters and histograms. The runtime computes separate aggregations for each unique combination of attributes. + +=== Required Metrics + +Runtimes MUST provide the following metrics, as defined in the OpenTelemetry Semantic Conventions. 
+ +.Required HTTP server metric +[options="header"] +|=== +|Metric Name |Type +|`http.server.request.duration` |Histogram +|=== + +.Required JVM metrics +[options="header"] +|=== +|Metric Name |Type +|`jvm.memory.used` |UpDownCounter +|`jvm.memory.committed` |UpDownCounter +|`jvm.memory.limit` |UpDownCounter +|`jvm.memory.used_after_last_gc` |UpDownCounter +|`jvm.gc.duration` |Histogram +|`jvm.thread.count` |UpDownCounter +|`jvm.class.loaded` |Counter +|`jvm.class.unloaded` |Counter +|`jvm.class.count` |UpDownCounter +|`jvm.cpu.time` |Counter +|`jvm.cpu.count` |UpDownCounter +|`jvm.cpu.recent_utilization` |Gauge +|=== + +Metrics are activated whenever MicroProfile Telemetry is enabled with `otel.sdk.disabled=false`. + +== Logs + +The OpenTelemetry Logs bridge API enables existing log frameworks (such as SLF4J, Log4j, JUL, and Logback) to emit logs through OpenTelemetry. This specification does not define new Log APIs. The Logs bridge API is used by runtimes, not directly by application code. Therefore, this specification does not expose any Log APIs to applications. + +Log output from an application is automatically bridged to the configured OpenTelemetry SDK instance when MicroProfile Telemetry is enabled. Configure the logs exporter in `microprofile-config.properties`: + +[source, properties] +---- +otel.sdk.disabled=false +otel.logs.exporter=otlp +otel.exporter.otlp.endpoint=http://localhost:4317 +---- + +When a log record is emitted from an application, the runtime bridges it to the configured OpenTelemetry SDK instance, which then exports it using the configured log exporter (for example, via OTLP). When an active trace context exists, the log record automatically includes the `traceId` and `spanId`, enabling correlation between logs and traces. + +Logs are activated whenever MicroProfile Telemetry is enabled with `otel.sdk.disabled=false`. 
== Analyzing Traces -Once trace data is collected and exported to a backend system, analyzing these traces becomes a crucial step in understanding the behavior of your distributed microservices architecture. By examining traces, you can gain insights into system performance, identify bottlenecks, and detect failures or anomalies. +Once trace data is collected and exported to a backend system, analyzing these traces becomes a crucial step in understanding the behavior of distributed microservices architectures. By examining traces, developers can gain insights into system performance, identify bottlenecks, and detect failures or anomalies. === Visualizing Traces -Tracing backends like *Jaeger*, *Zipkin*, or *Graphana Tempo* provide visual interfaces to explore and analyze traces. These tools display traces as timelines or dependency graphs, making it easier to: +Tracing backends like *Jaeger*, *Zipkin*, or *Grafana Tempo* provide visual interfaces to explore and analyze traces. These tools display traces as timelines or dependency graphs, making it easier to: * Understand the sequence of operations. * Identify the services and components involved in a request. @@ -443,7 +834,7 @@ Traces highlight spans with long durations or repeated retries, which often poin Traces provide valuable information for diagnosing failures, including: -* *Error Codes*: Look for spans with error attributes, such as `http.status_code=500`. +* *Error Codes*: Look for spans with error attributes, such as `http.response.status_code=500` or `error.type`. * *Exception Details*: Many tracing systems capture stack traces or error messages in spans. * *Service Impact*: Identify which upstream and downstream services are affected by the failure. @@ -452,7 +843,7 @@ Traces provide valuable information for diagnosing failures, including: Dependency graphs generated from traces show the interactions between services. These graphs help: * Visualize which services depend on each other. 
-* Detects circular dependencies or excessive coupling. +* Detect circular dependencies or excessive coupling. * Plan optimizations by focusing on critical services. === Correlating Traces with Logs and Metrics @@ -460,8 +851,8 @@ Dependency graphs generated from traces show the interactions between services. Traces, when combined with logs and metrics, provide a comprehensive picture of the system: * *Logs*: Use trace IDs and span IDs in logs to correlate application logs with specific spans. -* *Metrics*: Correlate trace performance data with system metrics like CPU usage, memory consumption, or request rates. -Example: If a span indicates high latency, check corresponding logs and metrics to identify the underlying cause, such as a resource constraint or network delay. +* *Metrics*: Correlate trace performance data with system metrics, such as CPU usage, memory consumption, or request rates. +*Example:* If a span indicates high latency, check corresponding logs and metrics to identify the underlying cause, such as a resource constraint or network delay. === Best Practices for Analyzing Traces @@ -471,7 +862,7 @@ Example: If a span indicates high latency, check corresponding logs and metrics . *Automate Alerts*: Set up alerts for abnormal patterns in traces, such as increased latency or failure rates. . *Collaborate Across Teams*: Share trace insights with development, operations, and QA teams to improve system reliability. -By analyzing traces effectively, you can identify opportunities to optimize your microservices, ensure smoother operations, and enhance the overall user experience. Tracing tools provide a powerful way to visualize and understand the intricate dynamics of distributed systems. + +By analyzing traces effectively, developers can identify opportunities to optimize their microservices, ensure smoother operations, and enhance the overall user experience. 
Tracing tools provide a powerful way to visualize and understand the intricate dynamics of distributed systems. When analyzing traces, developers should look for the following: * *Long spans:* Spans that take a long time to complete may indicate a performance issue. @@ -479,11 +870,11 @@ When analyzing traces, developers should look for the following: * *Errors:* Errors can indicate problems with a service or a request. * *High latency:* High latency can indicate a problem with the network or a service. -By analyzing traces, developers can identify and troubleshoot problems with their microservices applications. This can help developers improve the performance and reliability of their applications. +By analyzing traces, developers can identify and troubleshoot problems in microservices applications. This improves performance and reliability. -Here are some tips for analyzing traces: +The following tips can help developers analyze traces: -* *Use a trace viewer:* A trace viewer is a tool that can help you visualize and analyze traces. +* *Use a trace viewer:* A trace viewer helps developers visualize and analyze traces. * *Look for patterns:* Look for patterns in the traces that may indicate a problem. * *Correlate traces with metrics:* Correlate traces with metrics to get a better understanding of the performance of your application. * *Use sampling:* Use sampling to reduce the number of traces that are collected. This can improve the performance of your tracing system. @@ -521,14 +912,13 @@ span.setAttribute("credit.card.last4", "****1234"); === Encrypt Trace Data To prevent unauthorized access during transmission, ensure that telemetry data is encrypted. Use secure protocols such as HTTPS or TLS for exporting trace data to a backend. 
- - *Example:* + +*Example:* * Configure the tracing provider to use encrypted connections: [source, properties] ---- -otel.exporter.jaeger.endpoint=https://secure-jaeger-collector.example.com otel.exporter.otlp.endpoint=https://secure-collector.example.com ---- @@ -558,12 +948,12 @@ Sampling reduces the volume of traces collected and limits the exposure of sensi *Example:* -Random sampling to limiting the amount of trace data collected: +Use random sampling to limit the amount of trace data collected: [source, properties] ---- otel.traces.sampler=traceidratio -otel.traces.sampler.traceidratio=0.1 +otel.traces.sampler.arg=0.1 ---- === Compliance with Regulations @@ -576,7 +966,7 @@ Ensure that your tracing practices comply with data protection and privacy regul === Isolate Tracing Infrastructure -The tracing infrastructure, such as Jaeger or OpenTelemetry Collector, should be isolated from the public internet and accessible only within secure networks. +The tracing infrastructure, such as Jaeger or OpenTelemetry Collector, should be isolated from the public internet and accessible only within secure networks. *Best Practice:* @@ -590,12 +980,27 @@ Tracing can help detect potential security incidents. Monitor traces for unusual * Unexpected spikes in requests. * Requests from unknown or unauthorized sources. * Abnormal response times indicating possible exploits. -Set up alerts for these anomalies to investigate and mitigate potential issues. + -By following these security considerations, you can leverage the benefits of distributed tracing without compromising the security of your system or the privacy of your users. Careful handling of trace data, coupled with robust encryption, access controls, and compliance practices, ensures that tracing remains a valuable yet secure component of your observability strategy. +Set up alerts for these anomalies to investigate and mitigate potential issues. 
+By following these security considerations, developers can leverage the benefits of distributed tracing without compromising the security of their systems or the privacy of their users. Careful handling of trace data, coupled with robust encryption, access controls, and compliance practices, ensures that tracing remains a valuable yet secure component of observability strategies. + +== What's New in MicroProfile Telemetry 2.1 + +MicroProfile Telemetry 2.1 is aligned with MicroProfile 7.1. The following changes are delivered in this release. + +* MicroProfile Telemetry 2.1 consumes https://github.com/open-telemetry/opentelemetry-java/releases/tag/v1.48.0[OpenTelemetry Java v1.48.0]. +* If migrating from an earlier version of MicroProfile Telemetry, update the `microprofile-telemetry-api` dependency version to `2.1`. +* Verify that your deployment environment provides the OpenTelemetry Java v1.48.0 libraries or a later patch version. +* The stabilization of HTTP semantic conventions (attributes such as `http.method` have been renamed to `http.request.method`). +* The introduction of a single shared OpenTelemetry SDK instance when `otel.sdk.disabled=false` is configured at runtime initialization time. +* The addition of metrics and logs support. + +=== Impact on Existing Applications + +Applications that do not use JVM metrics are unaffected by the 2.1 changes. Applications relying on JVM metrics should update their `microprofile-telemetry-api` dependency version to 2.1 to benefit from the corrected JVM metrics configuration. == Conclusion -MicroProfile Telemetry provides a robust foundation for observability in Java-based microservices, enabling developers to implement distributed tracing seamlessly. By leveraging this specification, you can gain deep insights into the flow of requests, identify bottlenecks, and enhance the reliability and performance of your applications. 
The integration of standardized tracing concepts like spans, traces, and context propagation ensures that developers can maintain a cohesive understanding of their system's behavior across service boundaries. +MicroProfile Telemetry provides a robust foundation for observability in Java-based microservices, enabling developers to implement distributed tracing, metrics collection, and log bridging seamlessly. By leveraging this specification, developers can gain deep insights into the flow of requests, identify bottlenecks, and enhance the reliability and performance of their applications. The integration of standardized concepts such as spans, traces, context propagation, metrics instruments, and log correlation ensures that developers can maintain a cohesive understanding of their system's behavior across service boundaries. Through instrumentation, context propagation, and effective trace analysis, MicroProfile Telemetry simplifies the complexities of monitoring and debugging distributed systems. It empowers teams to proactively address issues, optimize performance, and improve the user experience. Moreover, by adhering to security best practices, developers can ensure that telemetry data is protected, compliant with regulations, and free of sensitive information.