diff --git a/code/chapter09/docker-compose.yml b/code/chapter09/docker-compose.yml new file mode 100644 index 00000000..508ad8d3 --- /dev/null +++ b/code/chapter09/docker-compose.yml @@ -0,0 +1,88 @@ +version: '3.8' + +services: + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "13000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana-storage:/var/lib/grafana + depends_on: + - prometheus + - loki + - tempo + dns_search: [] + networks: + - observability + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - "19090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-storage:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + dns_search: [] + networks: + - observability + + loki: + image: grafana/loki:latest + container_name: loki + ports: + - "13100:3100" + volumes: + - loki-storage:/loki + dns_search: [] + networks: + - observability + + tempo: + image: grafana/tempo:latest + container_name: tempo + ports: + - "13200:3200" + - "14317:4317" + - "14318:4318" + volumes: + - tempo-storage:/var/tempo + - ./tempo-config.yaml:/etc/tempo/tempo-config.yaml + command: [ "-config.file=/etc/tempo/tempo-config.yaml" ] + dns_search: [] + networks: + - observability + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + ports: + - "24317:4317" + - "24318:4318" + - "19411:9411" + volumes: + - ./otel-collector-config.yml:/etc/otel-collector-config.yml + command: [ "--config=/etc/otel-collector-config.yml" ] + depends_on: + - loki + - prometheus + - tempo + dns_search: [] + networks: + - observability + +volumes: + grafana-storage: + prometheus-storage: + loki-storage: + tempo-storage: + +networks: + observability: + driver: bridge diff --git a/code/chapter09/images/grafana-loki-payment-service-logs.png 
b/code/chapter09/images/grafana-loki-payment-service-logs.png new file mode 100644 index 00000000..c8156b38 Binary files /dev/null and b/code/chapter09/images/grafana-loki-payment-service-logs.png differ diff --git a/code/chapter09/images/grafana-prometheus-payment-attempts-metric.png b/code/chapter09/images/grafana-prometheus-payment-attempts-metric.png new file mode 100644 index 00000000..e880caaf Binary files /dev/null and b/code/chapter09/images/grafana-prometheus-payment-attempts-metric.png differ diff --git a/code/chapter09/images/grafana-tempo-all-payment-traces.png b/code/chapter09/images/grafana-tempo-all-payment-traces.png new file mode 100644 index 00000000..d5f16ff9 Binary files /dev/null and b/code/chapter09/images/grafana-tempo-all-payment-traces.png differ diff --git a/code/chapter09/images/grafana-tempo-authorize-trace-detail.png b/code/chapter09/images/grafana-tempo-authorize-trace-detail.png new file mode 100644 index 00000000..56c31414 Binary files /dev/null and b/code/chapter09/images/grafana-tempo-authorize-trace-detail.png differ diff --git a/code/chapter09/images/grafana-tempo-gateway-health-traces.png b/code/chapter09/images/grafana-tempo-gateway-health-traces.png new file mode 100644 index 00000000..f26b195e Binary files /dev/null and b/code/chapter09/images/grafana-tempo-gateway-health-traces.png differ diff --git a/code/chapter09/otel-collector-config.yml b/code/chapter09/otel-collector-config.yml new file mode 100644 index 00000000..57f06d51 --- /dev/null +++ b/code/chapter09/otel-collector-config.yml @@ -0,0 +1,46 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 10s + send_batch_size: 1024 + +exporters: + debug: + verbosity: detailed + + otlp_grpc/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + otlp_http/loki: + endpoint: http://loki:3100/otlp + tls: + insecure: true + + prometheus: + endpoint: "0.0.0.0:8889" + +service: + pipelines: + 
traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp_grpc/tempo, debug] + + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus, debug] + + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlp_http/loki, debug] diff --git a/code/chapter09/payment/README.adoc b/code/chapter09/payment/README.adoc index e3b65582..1b029852 100644 --- a/code/chapter09/payment/README.adoc +++ b/code/chapter09/payment/README.adoc @@ -1,4 +1,4 @@ -= Payment Service += Payment Service — MicroProfile Telemetry 2.1 :toc: macro :toclevels: 3 :icons: font @@ -7,937 +7,741 @@ toc::[] -This microservice is part of the Jakarta EE 10 and MicroProfile 6.1-based e-commerce application. It handles payment processing and transaction management. +This microservice demonstrates MicroProfile Telemetry 2.1 integrated with the LGTM observability stack (Loki, Grafana, Tempo, Prometheus) via an OpenTelemetry Collector. It is part of the Chapter 09 MicroProfile tutorial. -== Features +The service processes payments with full fault tolerance (retry, fallback, circuit breaker, bulkhead, timeout) and exports **traces**, **metrics**, and **logs** over OTLP to the LGTM stack. 
-* Payment transaction processing -* Dynamic configuration management via MicroProfile Config -* RESTful API endpoints with JSON support -* Custom ConfigSource implementation -* OpenAPI documentation -* **MicroProfile Fault Tolerance with Retry Policies** -* **Circuit Breaker protection for external services** -* **Fallback mechanisms for service resilience** -* **Bulkhead pattern for concurrency control** -* **Timeout protection for long-running operations** +== Architecture -== MicroProfile Fault Tolerance Implementation - -The Payment Service implements comprehensive fault tolerance patterns using MicroProfile Fault Tolerance annotations: - -=== Retry Policies - -The service implements different retry strategies based on operation criticality: - -==== Payment Authorization Retry (@Retry) -* **Max Retries**: 3 attempts -* **Delay**: 1000ms with 500ms jitter -* **Max Duration**: 10 seconds -* **Retry On**: RuntimeException, WebApplicationException -* **Use Case**: Standard payment authorization with exponential backoff - -[source,java] ----- -@Retry( - maxRetries = 3, - delay = 2000, - maxDuration = 10000 - jitter = 500, - retryOn = {RuntimeException.class, WebApplicationException.class} -) ----- - -=== Circuit Breaker Protection - -Payment capture operations use circuit breaker pattern: - -[source,java] ---- -@CircuitBreaker( - failureRatio = 0.5, - requestVolumeThreshold = 4, - delay = 5000 -) + Payment Service (Open Liberty) + │ MicroProfile Telemetry 2.1 + │ OTLP gRPC → port 24317 (Codespaces) + ▼ port 4317 (local) + OpenTelemetry Collector + ├──► Tempo (traces) — port 14317 → Tempo:4317 + ├──► Loki (logs) — HTTP → Loki:3100/otlp + └──► Prometheus (metrics) — scrape endpoint :8889 + │ + ▼ + Grafana (unified dashboard) — port 13000 ---- -* **Failure Ratio**: 50% failure rate triggers circuit opening -* **Request Volume**: Minimum 4 requests for evaluation -* **Recovery Delay**: 5 seconds before attempting recovery +== Prerequisites -=== Timeout Protection +* 
JDK 21 or higher +* Maven 3.9.0 or higher +* Docker and Docker Compose -Operations with potential long delays are protected with timeouts: - -[source,java] ----- -@Timeout(value = 3000) ----- - -=== Bulkhead Pattern - -The bulkhead pattern limits concurrent requests to prevent system overload: +== Project Structure -[source,java] ---- -@Bulkhead(value = 5) +code/chapter09/ +├── docker-compose.yml # LGTM + OTel Collector stack +├── otel-collector-config.yml # OTel Collector pipeline config +├── prometheus.yml # Prometheus scrape config +├── tempo-config.yaml # Grafana Tempo v3 config +└── payment/ + ├── pom.xml + └── src/main/ + ├── java/io/microprofile/tutorial/store/payment/ + │ ├── entity/PaymentDetails.java + │ ├── exception/ + │ ├── resource/PaymentResource.java # REST endpoints + │ └── service/PaymentService.java # Business logic + telemetry + ├── liberty/config/server.xml # Open Liberty features + └── resources/META-INF/ + └── microprofile-config.properties # OTLP / sampler config ---- -* **Concurrent Requests**: Limited to 5 concurrent requests -* **Excess Requests**: Rejected immediately instead of queuing -* **Use Case**: Protect service from traffic spikes and cascading failures - -=== Fallback Mechanisms - -All critical operations have fallback methods that provide graceful degradation: - -* **Payment Authorization Fallback**: Returns service unavailable with retry instructions +== LGTM Observability Stack -== Endpoints +All observability infrastructure lives in `code/chapter09/`. Start from that directory. 
-=== GET /payment/api/payment-config -* Returns all current payment configuration values -* Example: `GET http://localhost:9080/payment/api/payment-config` -* Response: `{"gateway.endpoint":"https://api.paymentgateway.com"}` +=== docker-compose.yml -=== POST /payment/api/payment-config -* Updates a payment configuration value -* Example: `POST http://localhost:9080/payment/api/payment-config` -* Request body: `{"key": "payment.gateway.endpoint", "value": "https://new-api.paymentgateway.com"}` -* Response: `{"key":"payment.gateway.endpoint","value":"https://new-api.paymentgateway.com","message":"Configuration updated successfully"}` +The stack exposes ports offset from the defaults to avoid conflicts in GitHub Codespaces: -=== POST /payment/api/authorize -* Processes a payment authorization with retry policy -* **Retry Configuration**: 3 attempts, 1s delay, 500ms jitter -* **Fallback**: Service unavailable response -* Example: `POST http://localhost:9080/payment/api/authorize` -* Request body: `{"cardNumber":"4111111111111111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":100.00}` -* Response: `{"status":"success", "message":"Payment authorized successfully", "transactionId":"TXN1234567890", "amount":100.00}` -* Fallback Response: `{"status":"failed", "message":"Payment gateway unavailable. 
Please try again later.", "fallback":true}` - -=== POST /payment/api/payment-config/process-example -* Example endpoint demonstrating payment processing with configuration -* Example: `POST http://localhost:9080/payment/api/payment-config/process-example` -* Request body: `{"cardNumber":"4111111111111111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":100.00}` -* Response: `{"amount":100.00,"message":"Payment processed successfully","status":"success","configUsed":{"gatewayEndpoint":"https://new-api.paymentgateway.com"}}` - -== Building and Running the Service - -=== Prerequisites +[options="header"] +|=== +|Service |Image |Host Port |Container Port |Purpose +|Grafana |grafana/grafana:latest |13000 |3000 |Unified dashboard +|Prometheus |prom/prometheus:latest |19090 |9090 |Metrics storage +|Loki |grafana/loki:latest |13100 |3100 |Log storage +|Tempo |grafana/tempo:latest |13200, 14317, 14318 |3200, 4317, 4318 |Trace storage +|OTel Collector |otel/opentelemetry-collector-contrib:latest |24317, 24318 |4317, 4318 |Telemetry gateway +|=== -* JDK 17 or higher -* Maven 3.6.0 or higher +IMPORTANT: `dns_search: []` is set on every service. Without it, Docker's embedded DNS in GitHub Codespaces appends an Azure search domain (e.g. `gjgsbu0qvyie5dgfl5s3ewuwyc.ax.internal.cloudapp.net`) to short hostnames like `tempo`, causing `no such host` errors when Grafana tries to connect to its data sources. 
+ +[source,yaml] +---- +version: '3.8' + +services: + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "13000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana-storage:/var/lib/grafana + depends_on: + - prometheus + - loki + - tempo + dns_search: [] + networks: + - observability + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - "19090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-storage:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + dns_search: [] + networks: + - observability + + loki: + image: grafana/loki:latest + container_name: loki + ports: + - "13100:3100" + volumes: + - loki-storage:/loki + dns_search: [] + networks: + - observability + + tempo: + image: grafana/tempo:latest + container_name: tempo + ports: + - "13200:3200" + - "14317:4317" + - "14318:4318" + volumes: + - tempo-storage:/var/tempo + - ./tempo-config.yaml:/etc/tempo/tempo-config.yaml + command: [ "-config.file=/etc/tempo/tempo-config.yaml" ] + dns_search: [] + networks: + - observability + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + ports: + - "24317:4317" + - "24318:4318" + - "19411:9411" + volumes: + - ./otel-collector-config.yml:/etc/otel-collector-config.yml + command: [ "--config=/etc/otel-collector-config.yml" ] + depends_on: + - loki + - prometheus + - tempo + dns_search: [] + networks: + - observability + +volumes: + grafana-storage: + prometheus-storage: + loki-storage: + tempo-storage: + +networks: + observability: + driver: bridge +---- + +=== otel-collector-config.yml + +The OTel Collector receives all three signals on OTLP ports and fans them out to the appropriate backend. 
+ +[source,yaml] +---- +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 10s + send_batch_size: 1024 + +exporters: + debug: + verbosity: detailed + + otlp_grpc/tempo: + endpoint: tempo:4317 + tls: + insecure: true + + otlp_http/loki: + endpoint: http://loki:3100/otlp + tls: + insecure: true + + prometheus: + endpoint: "0.0.0.0:8889" + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp_grpc/tempo, debug] + + metrics: + receivers: [otlp] + processors: [batch] + exporters: [prometheus, debug] + + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlp_http/loki, debug] +---- + +NOTE: The `debug` exporter prints received telemetry to the collector's stdout. Use `docker compose logs otel-collector` to confirm data is flowing. Remove it in production to reduce noise. + +=== tempo-config.yaml + +Grafana Tempo v3 removed the top-level `ingester` field. Use this minimal v3-compatible configuration: + +[source,yaml] +---- +server: + http_listen_port: 3200 + log_level: info -=== Local Development +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 -[source,bash] +storage: + trace: + backend: local + wal: + path: /var/tempo/wal + local: + path: /var/tempo/blocks ---- -# Build the application -mvn clean package -# Run the application with Liberty -mvn liberty:run ----- +=== prometheus.yml -The server will start on port 9080 (HTTP) and 9081 (HTTPS). 
+Prometheus scrapes itself and the OTel Collector's Prometheus exporter endpoint: -=== Docker - -[source,bash] +[source,yaml] ---- -# Build and run with Docker -./run-docker.sh ----- - -== Project Structure - -* `src/main/java/io/microprofile/tutorial/PaymentRestApplication.java` - Jakarta Restful web service application class -* `src/main/java/io/microprofile/tutorial/store/payment/config/` - Configuration classes -* `src/main/java/io/microprofile/tutorial/store/payment/resource/` - REST resource endpoints -* `src/main/java/io/microprofile/tutorial/store/payment/service/` - Business logic services -* `src/main/java/io/microprofile/tutorial/store/payment/entity/` - Data models -* `src/main/resources/META-INF/services/` - Service provider configuration -* `src/main/liberty/config/` - Liberty server configuration +global: + scrape_interval: 15s + evaluation_interval: 15s -== Custom ConfigSource +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] -The Payment Service implements a custom MicroProfile ConfigSource named `PaymentServiceConfigSource` that provides payment-specific configuration with high priority (ordinal: 600). 
- -=== Available Configuration Properties + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:8889'] +---- -[cols="1,2,2", options="header"] -|=== -|Property -|Description -|Default Value +== Payment Service Configuration -|payment.gateway.endpoint -|Payment gateway endpoint URL -|https://api.paymentgateway.com -|=== +=== server.xml — Open Liberty Features -=== Testing ConfigSource Endpoints +The service runs on the `microProfile-7.1` platform, which includes MicroProfile Telemetry 2.1: -You can test the ConfigSource endpoints using curl or any REST client: - -[source,bash] +[source,xml] ---- -# Get current configuration -curl -s http://localhost:9080/payment/api/payment-config | json_pp + + + microProfile-7.1 + jakartaEE-10.0 + restfulWS + jsonp + jsonb + cdi + mpConfig + mpOpenAPI + mpHealth + mpMetrics + mpTelemetry + mpFaultTolerance + + + + + + +---- + +=== microprofile-config.properties — Telemetry Settings -# Update configuration property -curl -s -X POST -H "Content-Type: application/json" \ - -d '{"key":"payment.gateway.endpoint", "value":"https://new-api.paymentgateway.com"}' \ - http://localhost:9080/payment/api/payment-config | json_pp +[source,properties] +---- +# MicroProfile Telemetry Configuration +otel.service.name=payment-service +otel.sdk.disabled=false -# Test payment processing with the configuration -curl -s -X POST -H "Content-Type: application/json" \ - -d '{"cardNumber":"4111111111111111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":100.00}' \ - http://localhost:9080/payment/api/payment-config/process-example | json_pp +# OTLP Exporter Configuration +# Use port 4317 locally; use port 24317 when running in GitHub Codespaces +otel.exporter.otlp.endpoint=http://localhost:24317 +otel.traces.exporter=otlp +otel.metrics.exporter=otlp +otel.logs.exporter=otlp -# Test basic payment authorization -curl -s -X POST -H "Content-Type: application/json" \ - 
http://localhost:9080/payment/api/authorize | json_pp +# Sampling — always sample, respecting parent decision +otel.traces.sampler=parentbased_always_on ---- -=== Implementation Details +IMPORTANT: The endpoint port must match the host port on which the OTel Collector's OTLP gRPC receiver is published. The `docker-compose.yml` in this chapter maps host port `24317` to the collector's port `4317`, so `http://localhost:24317` applies whenever the collector runs from that Compose stack, including in Codespaces. Change the endpoint to `http://localhost:4317` only when the collector is published on its standard OTLP gRPC port. -The custom ConfigSource is implemented in the following classes: +== Code Implementation -* `PaymentServiceConfigSource.java` - Implements the MicroProfile ConfigSource interface -* `PaymentConfig.java` - Utility class for accessing configuration properties +=== PaymentService.java — CDI Telemetry Injection -Example usage in application code: +MicroProfile Telemetry 2.1 exposes `Tracer` and `Meter` as CDI beans. Inject them directly — do not use `GlobalOpenTelemetry.get*()`: [source,java] ---- -// Inject standard MicroProfile Config -@Inject -@ConfigProperty(name="payment.gateway.endpoint") -private String endpoint; - -// Or use the utility class -String gatewayUrl = PaymentConfig.getConfigProperty("payment.gateway.endpoint"); ----- - -The custom ConfigSource provides a higher priority (ordinal: 600) than system properties and environment variables, allowing for service-specific defaults while still enabling override via standard mechanisms. - -=== MicroProfile Config Sources - -MicroProfile Config uses a prioritized set of configuration sources. The payment service uses the following configuration sources in order of priority (highest to lowest): - -1. Custom ConfigSource (`PaymentServiceConfigSource`) - Ordinal: 600 -2. System properties - Ordinal: 400 -3. Environment variables - Ordinal: 300 -4. microprofile-config.properties file - Ordinal: 100 - -==== Updating Configuration Values - -You can update configuration properties through different methods: - -===== 1. 
Using the REST API (runtime) +@ApplicationScoped +public class PaymentService { -This uses the custom ConfigSource and persists only for the current server session: + @Inject + Tracer tracer; // io.opentelemetry.api.trace.Tracer -[source,bash] ----- -curl -X POST -H "Content-Type: application/json" \ - -d '{"key":"payment.gateway.endpoint", "value":"https://test-api.paymentgateway.com"}' \ - http://localhost:9080/payment/api/payment-config ----- + @Inject + Meter meter; // io.opentelemetry.api.metrics.Meter -===== 2. Using System Properties (startup) + private LongCounter paymentAttemptsCounter; -[source,bash] + @PostConstruct + public void init() { + paymentAttemptsCounter = meter + .counterBuilder("payment.attempts") + .setDescription("Number of payment attempts by result") + .setUnit("1") + .build(); + } +} ---- -# Linux/macOS -mvn liberty:run -Dpayment.gateway.endpoint=https://sys-api.paymentgateway.com -# Windows -mvn liberty:run "-Dpayment.gateway.endpoint=https://sys-api.paymentgateway.com" ----- +NOTE: The `Meter` is used only inside the `@PostConstruct` method `init()` to build instruments, whereas the `Tracer` is used per request to create spans. Building instruments in `@PostConstruct` ensures they are registered once at startup. -===== 3. 
Using Environment Variables (startup) +=== Manual Span Creation -Environment variable names must follow the MicroProfile Config convention (uppercase with underscores): +The `processPayment` method creates a child span with business attributes: -[source,bash] +[source,java] ---- -# Linux/macOS -export PAYMENT_GATEWAY_ENDPOINT=https://env-api.paymentgateway.com -mvn liberty:run - -# Windows PowerShell -$env:PAYMENT_GATEWAY_ENDPOINT="https://env-api.paymentgateway.com" -mvn liberty:run - -# Windows CMD -set PAYMENT_GATEWAY_ENDPOINT=https://env-api.paymentgateway.com -mvn liberty:run +@Asynchronous +@Timeout(3000) +@Retry(maxRetries = 3, delay = 2000, jitter = 500, + retryOn = PaymentProcessingException.class, + abortOn = CriticalPaymentException.class) +@Fallback(fallbackMethod = "fallbackProcessPayment") +@Bulkhead(value = 5) +public CompletionStage processPayment(PaymentDetails paymentDetails) + throws PaymentProcessingException { + + Span span = tracer.spanBuilder("payment.process") + .setAttribute("payment.amount", paymentDetails.getAmount().toString()) + .setAttribute("payment.method", "credit_card") + .setAttribute("payment.service", "payment-service") + .startSpan(); + + try (Scope scope = span.makeCurrent()) { + span.setAttribute("payment.status", "IN_PROGRESS"); + span.addEvent("Starting payment processing"); + + // ... business logic ... + + paymentAttemptsCounter.add(1, + Attributes.of(AttributeKey.stringKey("result"), "success")); + span.setAttribute("payment.status", "SUCCESS"); + span.setStatus(StatusCode.OK); + span.addEvent("Payment processed successfully"); + return CompletableFuture.completedFuture("{\"status\":\"success\",...}"); + } finally { + span.end(); + } +} ---- -===== 4. 
Using microprofile-config.properties File - -Edit the file at `src/main/resources/META-INF/microprofile-config.properties`: - -[source,properties] ----- -# Update the endpoint -payment.gateway.endpoint=https://config-api.paymentgateway.com ----- +The `payment.attempts` counter is incremented with a `result` attribute on every outcome: `success`, `failed`, or `fallback`. This enables per-outcome metrics in Prometheus. -Then rebuild and restart the application: +=== Circuit Breaker — checkGatewayHealth -[source,bash] +[source,java] ---- -mvn clean package liberty:run +@CircuitBreaker(requestVolumeThreshold = 4, failureRatio = 0.75, + delay = 1000, successThreshold = 2) +public boolean checkGatewayHealth() { + // Simulates a network call to the payment gateway + simulateNetworkCall(200); + if (Math.random() > 0.9) { + throw new RuntimeException("Payment gateway not responding"); + } + return true; +} ---- -==== Testing Configuration Changes - -After changing a configuration property, you can verify it was updated by calling: +=== Async Notification — sendPaymentNotification -[source,bash] +[source,java] ---- -curl http://localhost:9080/payment/api/payment-config +@Asynchronous +@Bulkhead(5) +public CompletionStage sendPaymentNotification( + String paymentId, String recipient) { + simulateNetworkCall(300); + return CompletableFuture.completedFuture( + "Notification sent to " + recipient + " for payment " + paymentId); +} ---- -== Documentation - -=== OpenAPI - -The payment service automatically generates OpenAPI documentation using MicroProfile OpenAPI annotations. 
+== API Endpoints -* OpenAPI UI: `http://localhost:9080/payment/api/openapi-ui/` -* OpenAPI JSON: `http://localhost:9080/payment/api/openapi` +Base URL: `http://localhost:9080/payment` -=== MicroProfile Config Specification - -For more information about MicroProfile Config, refer to the official documentation: - -* https://download.eclipse.org/microprofile/microprofile-config-3.1/microprofile-config-spec-3.1.html - -=== Related Resources - -* MicroProfile: https://microprofile.io/ -* Jakarta EE: https://jakarta.ee/ -* Open Liberty: https://openliberty.io/ - -== Troubleshooting +[options="header"] +|=== +|Method |Path |Description |Fault Tolerance +|POST |`/authorize?amount={value}` |Process payment by amount |`@Retry`, `@Fallback`, `@Bulkhead`, `@Timeout`, `@Asynchronous` +|POST |`/payments` |Process payment with full card details |`@Retry`, `@Fallback`, `@Bulkhead`, `@Timeout`, `@Asynchronous` +|POST |`/verify` |Verify payment — runs validation, fraud check, funds check |`@Asynchronous` +|GET |`/health/gateway` |Check gateway health |`@CircuitBreaker` +|POST |`/notify/{paymentId}?recipient={email}` |Send async payment notification |`@Asynchronous`, `@Bulkhead` +|=== -=== Common Issues +OpenAPI UI: http://localhost:9080/openapi/ui/ -==== Port Conflicts +=== PaymentDetails request body -If you encounter a port conflict when starting the server, you can change the ports in the `pom.xml` file: +Used by `POST /payments` and `POST /verify`: -[source,xml] +[source,json] ---- -9080 -9081 +{ + "cardNumber": "4111111111111111", + "cardHolderName": "Test User", + "expiryDate": "12/25", + "securityCode": "123", + "amount": 99.99 +} ---- -==== ConfigSource Not Loading - -If the custom ConfigSource is not loading, check the following: - -1. Verify the service provider configuration file exists at: - `src/main/resources/META-INF/services/org.eclipse.microprofile.config.spi.ConfigSource` - -2. 
Ensure it contains the correct fully qualified class name: - `io.microprofile.tutorial.store.payment.config.PaymentServiceConfigSource` - -==== Deployment Errors - -For CWWKZ0004E deployment errors, check the server logs at: -`target/liberty/wlp/usr/servers/mpServer/logs/messages.log` +NOTE: Card numbers ending in `0000` trigger a fraud-check failure in the verification flow. Amounts above `1000` trigger an insufficient-funds failure. -== Testing Fault Tolerance Features +== Building and Running -=== Automated Test Scripts +=== Step 1 — Start the LGTM observability stack -The Payment Service includes several test scripts to demonstrate and validate fault tolerance features: - -==== test-payment-basic.sh - -Basic functionality test to verify core payment operations: - -* Configuration retrieval -* Simple payment processing -* Error handling +Open a terminal and run from the `code/chapter09/` directory: [source,bash] ---- -# Test basic payment operations -chmod +x test-payment-basic.sh -./test-payment-basic.sh +cd code/chapter09 +docker compose up -d +docker compose ps ---- -==== test-payment-retry.sh -Tests various retry scenarios with different triggers: +Expected: all five services (`grafana`, `prometheus`, `loki`, `tempo`, `otel-collector`) show status `running`. -* Normal payment processing (successful) -* Failed payment with retry (card ending in "0000") -* Verification with random failures -* Invalid input handling +Verify each backend is healthy: [source,bash] ---- -# Test retry scenarios -chmod +x test-payment-retry.sh -./test-payment-retry.sh +curl -s http://localhost:13200/ready # Tempo → "ready" +curl -s http://localhost:13100/ready # Loki → "ready" +curl -s http://localhost:19090/-/healthy # Prometheus → "Prometheus Server is Healthy." ---- -==== test-payment-concurrent-load.sh +=== Step 2 — Configure Grafana data sources -Tests the service under concurrent load: +Open Grafana at http://localhost:13000 (login: `admin` / `admin`). 
-* Multiple simultaneous requests -* Observing thread handling -* Response time analysis +Go to *Connections → Data sources → Add data source* and add: -[source,bash] ----- -# Test service under concurrent load -chmod +x test-payment-concurrent-load.sh -./test-payment-concurrent-load.sh ----- +[options="header"] +|=== +|Type |Name |URL +|Prometheus |Prometheus |`http://prometheus:9090` +|Loki |Loki |`http://loki:3100` +|Tempo |Tempo |`http://tempo:3200` +|=== -==== test-payment-async.sh +Click *Save & test* for each. All three should show a success message. -Analyzes asynchronous processing behavior: +NOTE: Use the Docker service names (`prometheus`, `loki`, `tempo`) as hostnames — not `localhost`. Grafana runs inside the same Docker network as the backends, so service-name DNS resolution works. -* Response time measurement -* Thread utilization -* Future completion patterns +=== Step 3 — Build and start the payment service + +Open a second terminal from the `code/chapter09/payment/` directory: [source,bash] ---- -# Analyze asynchronous processing -chmod +x test-payment-async.sh -./test-payment-async.sh +cd code/chapter09/payment +mvn clean package +mvn liberty:run ---- -==== test-payment-bulkhead.sh -Demonstrates the bulkhead pattern by sending concurrent requests: - -* Concurrent request handling -* Bulkhead limit verification (5 requests) -* Rejection of excess requests -* Service recovery after load reduction +Wait for the message: -[source,bash] ---- -# Test bulkhead functionality with concurrent requests -chmod +x test-payment-bulkhead.sh -./test-payment-bulkhead.sh +[AUDIT] CWWKF0011I: The server mpServer is ready to run a smarter planet. ---- -==== test-payment-async-analysis.sh +The service is now available at http://localhost:9080/payment. 
-Analyzes asynchronous processing behavior: +=== Step 4 — Generate telemetry traffic -* Response time measurement -* Thread utilization -* Future completion patterns +Run the following to exercise all telemetry paths: [source,bash] ---- -# Analyze asynchronous processing -chmod +x test-payment-async-analysis.sh -./test-payment-async-analysis.sh ----- - -=== Running the Tests +# 1. Simple authorize (retry + fallback path) +curl -s -X POST "http://localhost:9080/payment/authorize?amount=75.50" -To run any of these test scripts: +# 2. Full payment with card details (creates payment.process span) +curl -s -X POST "http://localhost:9080/payment/payments" \ + -H "Content-Type: application/json" \ + -d '{"cardNumber":"4111111111111111","cardHolderName":"Test User","expiryDate":"12/25","securityCode":"123","amount":99.99}' -[source,bash] ----- -# Make the script executable -chmod +x test-payment-bulkhead.sh +# 3. Verification flow (validation → fraud check → funds check sub-spans) +curl -s -X POST "http://localhost:9080/payment/verify" \ + -H "Content-Type: application/json" \ + -d '{"cardNumber":"4111111111111111","cardHolderName":"Test User","expiryDate":"12/25","securityCode":"123","amount":150.00}' -# Run the script -./test-payment-bulkhead.sh ----- +# 4. Trigger fraud check failure (card ending 0000) +curl -s -X POST "http://localhost:9080/payment/verify" \ + -H "Content-Type: application/json" \ + -d '{"cardNumber":"4111111110000","cardHolderName":"Test User","expiryDate":"12/25","securityCode":"123","amount":50.00}' -You can also run all test scripts in sequence: +# 5. Gateway health check (circuit breaker path) +curl -s "http://localhost:9080/payment/health/gateway" -[source,bash] ----- -# Run all test scripts -for script in test-payment-*.sh; do - echo "Running $script..." - chmod +x $script - ./$script - echo "----------------------------------------" - sleep 2 -done +# 6. 
Async notification +curl -s -X POST "http://localhost:9080/payment/notify/PAY-12345?recipient=ops@example.com" ---- -== Configuration Properties +Run each command several times to produce enough data for Grafana to display. -=== Fault Tolerance Configuration +== Verifying Telemetry in Grafana -The following properties can be configured via MicroProfile Config: +=== Traces in Tempo -[cols="1,2,2", options="header"] -|=== -|Property -|Description -|Default Value +1. Open Grafana → *Explore* → select *Tempo* +2. Switch to the *Search* tab +3. Set *Service Name* = `payment-service` +4. Click *Run query* -|payment.gateway.endpoint -|Payment gateway endpoint URL -|https://api.paymentgateway.com +You should see recent traces. Click any trace to expand the span tree. Look for: -|payment.retry.maxRetries -|Maximum retry attempts for payment operations -|3 +* Root HTTP span: `POST /payment/payments` or `POST /payment/verify` +* Child span: `payment.process` with attributes `payment.amount`, `payment.method`, `payment.status` +* For the verify flow: `validatePaymentDetails`, `performFraudCheck`, `verifyFundsAvailability`, `recordTransaction` steps appear as log events on the span -|payment.retry.delay -|Delay between retry attempts (milliseconds) -|1000 +The following screenshot shows all payment-service traces listed in Tempo after generating traffic: -|payment.circuitbreaker.failureRatio -|Circuit breaker failure ratio threshold -|0.5 +image::../images/grafana-tempo-all-payment-traces.png[Grafana Tempo — all payment-service traces,role="related thumb right"] -|payment.circuitbreaker.requestVolumeThreshold -|Minimum requests for circuit breaker evaluation -|4 +You can filter by span name (e.g. 
`GET /payment/api/health/gateway`) to narrow down traces for a specific endpoint: -|payment.timeout.duration -|Timeout duration for payment operations (milliseconds) -|3000 +image::../images/grafana-tempo-gateway-health-traces.png[Grafana Tempo — gateway health check traces filtered by span name,role="related thumb right"] -|payment.bulkhead.value -|Maximum concurrent requests for bulkhead -|5 -|=== +Click any row to open the trace detail panel. The example below shows the full span tree for a `POST /payment/api/authorize` request: -== Fault Tolerance Implementation Details +image::../images/grafana-tempo-authorize-trace-detail.png[Grafana Tempo — POST /payment/api/authorize trace detail with span tree,role="related thumb right"] -=== Server Configuration +=== Logs in Loki -The MicroProfile Fault Tolerance feature is enabled in the Liberty server configuration: +The following screenshot shows Loki logs for the payment-service, including the log volume histogram and individual log lines with `traceId` links back to Tempo: -[source,xml] +image::../images/grafana-loki-payment-service-logs.png[Grafana Loki — payment-service logs with volume histogram and trace correlation,role="related thumb right"] + +1. Open Grafana → *Explore* → select *Loki* +2. Enter one of these LogQL queries: ++ +[source] +---- +{exporter="OTLP"} ---- -mpFaultTolerance ++ +or filter by service: ++ +[source] ---- +{service_name="payment-service"} +---- ++ +3. Look for log lines that contain `traceId` — click the trace link to jump directly to the correlated trace in Tempo. 
-=== Code Implementation +=== Metrics in Prometheus -==== PaymentService Class +The following screenshot shows the `payment_attempts_total` counter plotted over time, broken down by `result` label (`failed` and `success`): -The PaymentService class is annotated with `@ApplicationScoped` to ensure proper fault tolerance behavior: +image::../images/grafana-prometheus-payment-attempts-metric.png[Grafana Prometheus — payment_attempts_total metric time series by result label,role="related thumb right"] -[source,java] +1. Open Grafana → *Explore* → select *Prometheus* (or use http://localhost:19090 directly) +2. Query the custom payment counter: ++ +[source] ---- -@ApplicationScoped -public class PaymentService { - // ... -} +payment_attempts_total ---- - -==== Authorization Method - -[source,java] ++ +3. Break it down by result label: ++ +[source] ---- -@Retry( - maxRetries = 3, - delay = 1000, - jitter = 500, - maxDuration = 10000, - retryOn = {RuntimeException.class, WebApplicationException.class} -) -@Fallback(fallbackMethod = "fallbackPaymentAuthorization") -public PaymentResponse processPayment(PaymentRequest request) { - // Payment processing logic -} - -public PaymentResponse fallbackPaymentAuthorization(PaymentRequest request) { - // Fallback logic for payment authorization - return new PaymentResponse("failed", "Payment gateway unavailable. Please try again later.", true); -} +payment_attempts_total{result="success"} +payment_attempts_total{result="failed"} +payment_attempts_total{result="fallback"} +---- ++ +4. Also query HTTP server metrics emitted automatically by MicroProfile Telemetry: ++ +[source] +---- +http_server_request_duration_seconds_count +---- ++ +5. Fault tolerance metrics from MicroProfile Fault Tolerance: ++ +[source] +---- +ft_retry_calls_total +ft_circuitbreaker_state_total +ft_bulkhead_calls_total ---- -=== Key Implementation Benefits - -==== 1. 
Resilience -- Service continues operating despite external service failures -- Automatic recovery from transient failures -- Protection against cascading failures - -==== 2. User Experience -- Reduced timeout errors through retry mechanisms -- Graceful degradation with meaningful error messages -- Improved service availability - -==== 3. Operational Excellence -- Configurable fault tolerance parameters -- Comprehensive logging and monitoring -- Clear separation of concerns between business logic and resilience - -==== 4. Enterprise Readiness -- Production-ready fault tolerance patterns -- Compliance with microservices best practices -- Integration with MicroProfile ecosystem - -== MicroProfile Fault Tolerance Patterns - -=== Retry Pattern - -The retry pattern allows the service to automatically retry failed operations: - -* **@Retry**: Automatically retries failed operations -* **Parameters**: maxRetries, delay, jitter, maxDuration, retryOn, abortOn -* **Use Case**: Transient failures in external service calls - -=== Circuit Breaker Pattern - -The circuit breaker pattern prevents cascading failures: - -* **@CircuitBreaker**: Tracks failure rates and opens circuit when threshold is reached -* **Parameters**: failureRatio, requestVolumeThreshold, delay -* **States**: Closed (normal), Open (failing), Half-Open (testing recovery) -* **Use Case**: Protect against downstream service failures - -=== Timeout Pattern - -The timeout pattern prevents operations from hanging indefinitely: - -* **@Timeout**: Sets maximum duration for operations -* **Parameters**: value, unit -* **Use Case**: Prevent indefinite waiting for slow external services - -=== Bulkhead Pattern - -The bulkhead pattern limits concurrent requests: - -* **@Bulkhead**: Sets maximum concurrent executions -* **Parameters**: value, waitingTaskQueue (for async) -* **Use Case**: Prevent system overload during traffic spikes - -=== Fallback Pattern - -The fallback pattern provides alternatives when operations 
fail: - -* **@Fallback**: Specifies alternative method when operation fails -* **Parameters**: fallbackMethod, applyOn, skipOn -* **Use Case**: Graceful degradation for failed operations - -== Fault Tolerance Best Practices - -=== Configuring Retry Policies - -When configuring retry policies, consider these best practices: - -* **Operation Criticality**: Use more aggressive retry policies for critical operations -* **Retry Delay**: Implement exponential backoff for external service calls -* **Jitter**: Add random jitter to prevent thundering herd problems -* **Max Duration**: Set an overall timeout to prevent excessive retries -* **Abort Conditions**: Define specific exceptions that should abort retry attempts - -=== Circuit Breaker Configuration - -For effective circuit breaker implementation: - -* **Failure Ratio**: Set appropriate threshold based on expected error rates (typically 0.3-0.5) -* **Request Volume**: Set minimum request count to prevent premature circuit opening -* **Recovery Delay**: Allow sufficient time for downstream services to recover -* **Monitoring**: Track circuit state transitions for operational visibility - -=== Bulkhead Strategies - -Choose the appropriate bulkhead strategy: - -* **Synchronous Bulkhead**: Limits concurrent executions for thread-constrained systems -* **Asynchronous Bulkhead**: Provides a waiting queue for manageable load spikes -* **Isolation Levels**: Consider using separate bulkheads for different types of operations - -=== Fallback Implementation - -Implement effective fallback mechanisms: - -* **Graceful Degradation**: Return partial results when possible -* **Meaningful Responses**: Provide clear error messages to clients -* **Operation Queuing**: Queue failed operations for later processing -* **Fallback Chain**: Implement multiple fallback levels for critical operations - -=== Combining Fault Tolerance Annotations - -When combining multiple fault tolerance annotations: - -* **Execution Order**: Understand the 
execution order (Fallback → Retry → CircuitBreaker → Timeout → Bulkhead) -* **Compatibility**: Ensure annotations work together as expected -* **Resource Impact**: Consider the resource impact of combined annotations -* **Testing**: Test all combinations of annotation behaviors - -== Troubleshooting Fault Tolerance Issues - -=== Common Fault Tolerance Issues - -==== 1. Ineffective Retry Policies - -**Symptoms**: -* Operations fail without retrying -* Excessive retries causing performance issues - -**Solutions**: -* Verify exceptions match retryOn parameter -* Check that delay and jitter are appropriate -* Ensure maxDuration allows sufficient time for retries - -==== 2. Circuit Breaker Problems - -**Symptoms**: -* Circuit opens too frequently -* Circuit never opens despite failures -* Circuit remains open indefinitely - -**Solutions**: -* Adjust failureRatio based on expected error rates -* Increase requestVolumeThreshold if premature opening occurs -* Verify that delay allows sufficient recovery time -* Ensure exceptions are properly handled - -==== 3. Timeout Issues - -**Symptoms**: -* Operations timeout too quickly -* Timeouts not triggering as expected +== Troubleshooting -**Solutions**: -* Adjust timeout duration based on operation complexity -* Ensure timeout is shorter than upstream timeouts -* Verify that timeout unit is properly specified +=== Payment service cannot reach OTel Collector -==== 4. Bulkhead Restrictions +Symptom in Liberty logs: -**Symptoms**: -* Too many rejections during normal load -* Service overloaded despite bulkhead +---- +Failed to export logs. The request could not be executed. +Failed to connect to localhost/[0:0:0:0:0:0:0:1]:24317 +---- -**Solutions**: -* Adjust bulkhead value based on resource capacity -* Consider using asynchronous bulkheads with waiting queue -* Implement client-side load balancing for better distribution +Cause: The `otel.exporter.otlp.endpoint` in `microprofile-config.properties` points to the wrong port. 
-==== 5. Fallback Failures +Fix: Point the endpoint at the host port on which the collector's OTLP gRPC receiver is published. The `docker-compose.yml` in this chapter maps it as `24317:4317`, so use port `24317` in GitHub Codespaces and with that Compose file; use `4317` only when your local setup publishes the collector directly on that port: -**Symptoms**: -* Fallbacks not triggering despite failures -* Fallbacks throwing unexpected exceptions +[source,properties] +---- +# Codespaces (or the chapter's docker-compose.yml, host port 24317) +otel.exporter.otlp.endpoint=http://localhost:24317 -**Solutions**: -* Verify fallback method signature matches original method -* Ensure fallback method handles exceptions properly -* Check that fallback logic is fully tested +# Local Docker (collector published on host port 4317) +otel.exporter.otlp.endpoint=http://localhost:4317 +---- -=== Diagnosing with Metrics +=== Grafana data source "Failed to connect to Tempo/Loki/Prometheus" -MicroProfile Metrics provides valuable insight into fault tolerance behavior: +Symptom: -[source,bash] ---- -# Total number of retry attempts -curl https://localhost:9080/metrics?name=ft_retry_retries_total - -# Bulkhead calls total -curl http://localhost:9080/metrics?name=ft_bulkhead_calls_total - -# Timeout execution duration -curl http://localhost:9080/payment/metrics/application?name=ft_timeout_executionDuration_nanoseconds +Get "http://tempo:3200/api/echo": dial tcp: lookup tempo on 127.0.0.11:53: no such host ---- -=== Server Log Analysis +Cause: GitHub Codespaces injects an Azure DNS search domain. Docker's embedded DNS (127.0.0.11) appends it to short names like `tempo`, producing an NXDOMAIN. -Liberty server logs provide detailed information about fault tolerance operations: +Fix: The `docker-compose.yml` already sets `dns_search: []` on every service to suppress the search domain. 
If you see this error, ensure you are using the latest `docker-compose.yml` and recreate all containers: [source,bash] ---- -tail -f target/liberty/wlp/usr/servers/mpServer/logs/messages.log | grep -E "Retry|CircuitBreaker|Timeout|Bulkhead|Fallback" +cd code/chapter09 +docker compose down +docker compose up -d ---- -Look for messages indicating: -* Retry attempts and success/failure -* Circuit breaker state transitions -* Timeout exceptions -* Bulkhead rejections -* Fallback method invocations - -== Resources and References - -=== MicroProfile Fault Tolerance Specification - -For detailed information about MicroProfile Fault Tolerance, refer to: - -* https://download.eclipse.org/microprofile/microprofile-fault-tolerance-4.0/microprofile-fault-tolerance-spec-4.0.html - -=== API Documentation - -* https://download.eclipse.org/microprofile/microprofile-fault-tolerance-4.0/apidocs/ - -=== Fault Tolerance Guides - -* https://openliberty.io/guides/microprofile-fallback.html -* https://openliberty.io/guides/retry-timeout.html -* https://openliberty.io/guides/circuit-breaker.html -* https://openliberty.io/guides/bulkhead.html - -=== Best Practices Resources - -* https://microprofile.io/ -* https://www.ibm.com/docs/en/was-liberty/base?topic=liberty-microprofile-fault-tolerance - -== MicroProfile Telemetry Implementation +=== Tempo container exits immediately -The Payment Service implements distributed tracing using MicroProfile Telemetry 1.1, which is based on OpenTelemetry standards. This enables end-to-end visibility of payment transactions across microservices and external dependencies. 
+Symptom in `docker compose logs tempo`: -=== Telemetry Configuration - -The service is configured to send telemetry data to Jaeger, enabling comprehensive transaction monitoring: - -==== Application Configuration (microprofile-config.properties) - -[source,properties] ---- -# MicroProfile Telemetry Configuration -otel.service.name=payment-service -otel.sdk.disabled=false -otel.metrics.exporter=none -otel.logs.exporter=none +field ingester not found in type app.Config ---- -=== Automatic Instrumentation - -MicroProfile Telemetry provides automatic instrumentation for: +Cause: Tempo v3 removed the `ingester` top-level key. Ensure `tempo-config.yaml` does not contain an `ingester:` section. The configuration in this repo is already v3-compatible. -* Jakarta Restful Web Services endpoints (inbound and outbound HTTP requests) -* CDI method invocations -* MicroProfile Rest Client calls +=== Loki container exits immediately -This enables tracing without modifying application code, capturing: +Symptom in `docker compose logs loki`: -* HTTP request information (method, URL, status code) -* Transaction timing and duration -* Service dependencies and call hierarchy - -=== Manual Instrumentation - -For enhanced visibility, the Payment Service also implements manual instrumentation: - -[source,java] ---- -private Tracer tracer; // Injected tracer for OpenTelemetry - -@PostConstruct -public void init() { - // Programmatic tracer access - the correct approach - this.tracer = GlobalOpenTelemetry.getTracer("payment-service", "1.0.0"); - logger.info("Tracer initialized successfully"); -} - -// Create explicit span with business context -Span span = tracer.spanBuilder("payment.process") - .setAttribute("payment.amount", paymentDetails.getAmount().toString()) - .setAttribute("payment.method", "credit_card") - .setAttribute("payment.service", "payment-service") - .startSpan(); - -try (io.opentelemetry.context.Scope scope = span.makeCurrent()) { - // Business logic here - 
span.addEvent("Starting payment processing"); - - // Add result information - span.setStatus(StatusCode.OK); -} catch (Exception e) { - // Record error details - span.recordException(e); - span.setStatus(StatusCode.ERROR, e.getMessage()); - throw e; -} finally { - span.end(); // Always end the span -} +/etc/loki/local-config.yml does not exist ---- -=== Key Telemetry Points - -The service captures telemetry at critical transaction points: - -1. **Payment Authorization**: Complete trace of payment authorization flow -2. **Payment Verification**: Detailed verification steps with fraud check results -3. **External Service Calls**: Timing of gateway communications -4. **Retry Operations**: Visibility into retry attempts and fallbacks -5. **Error Handling**: Detailed error context and fault tolerance behavior - -=== Business Context Enrichment +Cause: An override `command:` was pointing to a `.yml` file but the image default uses `.yaml`. The `docker-compose.yml` in this repo does not override the Loki command, so the image default applies and the container starts correctly. -Traces are enriched with business context to enable business-oriented analysis: - -* **Payment Amounts**: Track transaction values for business insights -* **Payment Methods**: Categorize by payment method for pattern analysis -* **Transaction IDs**: Correlate with order management systems -* **Processing Time**: Measure critical business SLAs -* **Error Categories**: Classify errors for targeted improvements - -=== Viewing Telemetry Data - -Telemetry data can be viewed in Jaeger UI: +=== No traces appear in Tempo +1. 
Confirm the OTel Collector is receiving data: ++ [source,bash] ---- -# Start Jaeger container (if not already running) -docker run --rm --name jaeger \ - -p 16686:16686 \ - -p 4317:4317 \ - -p 4318:4318 \ - -p 5778:5778 \ - -p 9411:9411 \ - jaegertracing/jaeger:2.7.0 - -# Access Jaeger UI -open http://localhost:16686 +docker compose logs --tail=30 otel-collector ---- ++ +You should see `ResourceSpans` logged by the `debug` exporter. -In the Jaeger UI: -1. Select "payment-service" from the Service dropdown -2. Choose an operation or search by transaction attributes -3. Explore the full transaction trace across services +2. Confirm `otel.sdk.disabled=false` in `microprofile-config.properties`. -=== Troubleshooting Telemetry - -If telemetry data is not appearing in Jaeger: - -1. **Verify Jaeger is running** with OTLP ports exposed (4317, 4318) -2. **Check Liberty server configuration** in server.xml -3. **Validate application configuration** in microprofile-config.properties -4. **Ensure trace application is enabled** with `` -5. **Check network connectivity** between the service and Jaeger -6. **Inspect Liberty server logs** for telemetry-related messages - -=== Testing Telemetry +3. 
Confirm the service restarted after the config change: ++ +[source,bash] +---- +# Stop (Ctrl+C in the mvn liberty:run terminal), then restart +mvn liberty:run +---- -To generate and verify telemetry data: +== Clean Up [source,bash] ---- -# Generate sample telemetry with payment request -curl -X POST -H "Content-Type: application/json" \ - -d '{"cardNumber":"4111-1111-1111-1111", "cardHolderName":"Test User", "expiryDate":"12/25", "securityCode":"123", "amount":75.50}' \ - http://localhost:9080/payment/api/payments +# Stop the payment service (Ctrl+C in the mvn liberty:run terminal) +mvn liberty:stop -# Check for payment service in Jaeger UI services dropdown -curl -s http://localhost:16686/api/services +# Stop and remove all LGTM containers and volumes +cd code/chapter09 +docker compose down -v ---- -=== Benefits of Telemetry Implementation +== Related Resources -1. **End-to-End Transaction Visibility**: Follow payment flows across services -2. **Performance Monitoring**: Identify bottlenecks and optimization opportunities -3. **Error Detection**: Quickly locate and diagnose failures -4. **Dependency Analysis**: Understand service dependencies and impacts -5. **Business Insights**: Correlate technical metrics with business outcomes -6. 
**Operational Excellence**: Improve MTTR and system reliability \ No newline at end of file +* https://microprofile.io/specifications/microprofile-telemetry/[MicroProfile Telemetry 2.1 Specification] +* https://opentelemetry.io/docs/[OpenTelemetry Documentation] +* https://grafana.com/docs/tempo/[Grafana Tempo] +* https://grafana.com/docs/loki/[Grafana Loki] +* https://openliberty.io/docs/latest/microprofile-telemetry.html[Open Liberty MicroProfile Telemetry] diff --git a/code/chapter09/payment/pom.xml b/code/chapter09/payment/pom.xml index 65a1c5b5..60db6035 100644 --- a/code/chapter09/payment/pom.xml +++ b/code/chapter09/payment/pom.xml @@ -12,8 +12,8 @@ UTF-8 - 17 - 17 + 21 + 21 UTF-8 UTF-8 @@ -33,7 +33,7 @@ org.projectlombok lombok - 1.18.26 + 1.18.36 provided @@ -49,7 +49,7 @@ org.eclipse.microprofile microprofile - 6.1 + 7.1 pom provided @@ -58,8 +58,8 @@ io.opentelemetry opentelemetry-api - 1.32.0 - compile + 1.48.0 + provided @@ -73,6 +73,16 @@ ${project.artifactId} + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 21 + + + io.openliberty.tools @@ -90,4 +100,4 @@ - \ No newline at end of file + diff --git a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java index ae9258fb..b48c0c07 100644 --- a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java +++ b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/resource/PaymentResource.java @@ -11,8 +11,10 @@ import io.microprofile.tutorial.store.payment.service.PaymentService; import jakarta.enterprise.context.RequestScoped; import jakarta.inject.Inject; +import jakarta.ws.rs.GET; import jakarta.ws.rs.POST; import jakarta.ws.rs.Path; +import jakarta.ws.rs.PathParam; import jakarta.ws.rs.Produces; import jakarta.ws.rs.QueryParam; import jakarta.ws.rs.Consumes; 
@@ -22,10 +24,13 @@ import java.math.BigDecimal; import java.util.concurrent.CompletionStage; import java.util.UUID; +import java.util.logging.Logger; @RequestScoped @Path("/") public class PaymentResource { + + private static final Logger logger = Logger.getLogger(PaymentResource.class.getName()); @Inject @ConfigProperty(name = "payment.gateway.endpoint") @@ -131,4 +136,72 @@ public Response verifyPaymentWithTelemetry(PaymentDetails paymentDetails) } } + @GET + @Path("/health/gateway") + @Produces(MediaType.APPLICATION_JSON) + @Operation(summary = "Check gateway health", description = "Check payment gateway health with circuit breaker protection") + @APIResponses(value = { + @APIResponse(responseCode = "200", description = "Gateway is healthy"), + @APIResponse(responseCode = "503", description = "Gateway is unavailable") + }) + public Response checkGatewayHealth() { + try { + boolean healthy = paymentService.checkGatewayHealth(); + + if (healthy) { + return Response.ok() + .entity("{\"status\":\"healthy\",\"message\":\"Payment gateway is operational\"}") + .build(); + } + + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"status\":\"unhealthy\",\"message\":\"Payment gateway is not responding\"}") + .build(); + } catch (Exception e) { + logger.warning("Gateway health check failed: " + e.getMessage()); + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"status\":\"circuit_open\",\"message\":\"Circuit breaker is open - gateway appears to be down\"}") + .build(); + } + } + + @POST + @Path("/notify/{paymentId}") + @Produces(MediaType.APPLICATION_JSON) + @Operation(summary = "Send payment notification", description = "Send asynchronous payment notification with bulkhead protection") + @APIResponses(value = { + @APIResponse(responseCode = "200", description = "Notification sent or queued"), + @APIResponse(responseCode = "503", description = "Bulkhead rejected request") + }) + public CompletionStage sendNotification( + 
@PathParam("paymentId") String paymentId, + @QueryParam("recipient") String recipient + ) { + + if (recipient == null || recipient.isEmpty()) { + recipient = "default@example.com"; + } + + return paymentService.sendPaymentNotification(paymentId, recipient) + .thenApply(result -> { + logger.info("Notification result: " + result); + return Response.ok() + .entity("{\"status\":\"success\",\"message\":\"" + result + "\"}") + .build(); + }) + .exceptionally(ex -> { + logger.warning("Notification failed: " + ex.getMessage()); + + if (ex.getMessage() != null && ex.getMessage().contains("BulkheadException")) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"status\":\"rejected\",\"message\":\"Too many concurrent notifications - please try again later\"}") + .build(); + } + + return Response.status(Response.Status.INTERNAL_SERVER_ERROR) + .entity("{\"status\":\"error\",\"message\":\"Notification processing failed\"}") + .build(); + }); + } + } diff --git a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java index da3ed2ba..92d971f3 100644 --- a/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java +++ b/code/chapter09/payment/src/main/java/io/microprofile/tutorial/store/payment/service/PaymentService.java @@ -1,21 +1,27 @@ package io.microprofile.tutorial.store.payment.service; -import io.microprofile.tutorial.store.payment.exception.PaymentProcessingException; -import io.opentelemetry.api.GlobalOpenTelemetry; -import io.opentelemetry.api.trace.Tracer; -import io.opentelemetry.api.trace.Span; import io.microprofile.tutorial.store.payment.entity.PaymentDetails; import io.microprofile.tutorial.store.payment.exception.CriticalPaymentException; +import io.microprofile.tutorial.store.payment.exception.PaymentProcessingException; + +import 
io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.LongCounter; +import io.opentelemetry.api.metrics.Meter; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.StatusCode; +import io.opentelemetry.api.trace.Tracer; import org.eclipse.microprofile.faulttolerance.Asynchronous; import org.eclipse.microprofile.faulttolerance.Bulkhead; +import org.eclipse.microprofile.faulttolerance.CircuitBreaker; import org.eclipse.microprofile.faulttolerance.Fallback; import org.eclipse.microprofile.faulttolerance.Retry; import org.eclipse.microprofile.faulttolerance.Timeout; -import jakarta.enterprise.context.ApplicationScoped; import jakarta.annotation.PostConstruct; - +import jakarta.enterprise.context.ApplicationScoped; +import jakarta.inject.Inject; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionStage; @@ -26,56 +32,56 @@ public class PaymentService { private static final Logger logger = Logger.getLogger(PaymentService.class.getName()); - private Tracer tracer; // Injected tracer for OpenTelemetry + @Inject + Tracer tracer; + + @Inject + Meter meter; + + private LongCounter paymentAttemptsCounter; @PostConstruct public void init() { - // Programmatic tracer access - the correct approach - this.tracer = GlobalOpenTelemetry.getTracer("payment-service", "1.0.0"); - logger.info("Tracer initialized successfully"); + paymentAttemptsCounter = meter + .counterBuilder("payment.attempts") + .setDescription("Number of payment attempts by result") + .setUnit("1") + .build(); + logger.info("PaymentService initialized with telemetry instrumentation"); } - /** - * Process the payment request with automatic tracing via MicroProfile Telemetry. - * The mpTelemetry feature automatically creates spans for this method. 
- * - * @param paymentDetails details of the payment - * @return response message indicating success or failure - * @throws PaymentProcessingException if a transient issue occurs - */ @Asynchronous @Timeout(3000) @Retry(maxRetries = 3, delay = 2000, jitter = 500, retryOn = PaymentProcessingException.class, abortOn = CriticalPaymentException.class) @Fallback(fallbackMethod = "fallbackProcessPayment") - @Bulkhead(value=5) + @Bulkhead(value = 5) public CompletionStage processPayment(PaymentDetails paymentDetails) throws PaymentProcessingException { - // Create explicit span for payment processing to help with debugging Span span = tracer.spanBuilder("payment.process") - .setAttribute("payment.amount", paymentDetails.getAmount().toString()) - .setAttribute("payment.method", "credit_card") - .setAttribute("payment.service", "payment-service") - .startSpan(); - + .setAttribute("payment.amount", paymentDetails.getAmount().toString()) + .setAttribute("payment.method", "credit_card") + .setAttribute("payment.service", "payment-service") + .startSpan(); + try (io.opentelemetry.context.Scope scope = span.makeCurrent()) { - // MicroProfile Telemetry automatically traces this method String maskedCardNumber = maskCardNumber(paymentDetails.getCardNumber()); - - logger.info(String.format("Processing payment - Amount: %s, Card: %s", + logger.info(String.format("Processing payment - Amount: %s, Card: %s", paymentDetails.getAmount(), maskedCardNumber)); - + + span.setAttribute("payment.status", "IN_PROGRESS"); span.addEvent("Starting payment processing"); simulateDelay(); - // Simulating a transient failure if (Math.random() > 0.7) { - span.setStatus(io.opentelemetry.api.trace.StatusCode.ERROR, "Payment processing failed"); + paymentAttemptsCounter.add(1, Attributes.of(AttributeKey.stringKey("result"), "failed")); + span.setStatus(StatusCode.ERROR, "Payment processing failed"); span.addEvent("Payment processing failed due to transient error"); logger.warning("Payment processing 
failed due to transient error"); throw new PaymentProcessingException("Temporary payment processing failure"); } - // Simulating successful processing - span.setStatus(io.opentelemetry.api.trace.StatusCode.OK); + paymentAttemptsCounter.add(1, Attributes.of(AttributeKey.stringKey("result"), "success")); + span.setAttribute("payment.status", "SUCCESS"); + span.setStatus(StatusCode.OK); span.addEvent("Payment processed successfully"); logger.info("Payment processed successfully"); return CompletableFuture.completedFuture("{\"status\":\"success\", \"message\":\"Payment processed successfully.\"}"); @@ -84,51 +90,74 @@ public CompletionStage processPayment(PaymentDetails paymentDetails) thr } } - /** - * Fallback method when payment processing fails. - * Automatically traced by MicroProfile Telemetry. - * - * @param paymentDetails details of the payment - * @return response message for fallback - */ public CompletionStage fallbackProcessPayment(PaymentDetails paymentDetails) { - logger.warning(() -> String.format("Fallback invoked for payment - Amount: %s", + paymentAttemptsCounter.add(1, Attributes.of(AttributeKey.stringKey("result"), "fallback")); + logger.warning(() -> String.format("Fallback invoked for payment - Amount: %s", paymentDetails.getAmount())); - return CompletableFuture.completedFuture("{\"status\":\"failed\", \"message\":\"Payment service is currently unavailable.\"}"); } - /** - * Masks a credit card number for security in logs and traces. - * Only the last 4 digits are shown, all others are replaced with 'X'. 
- * - * @param cardNumber The full card number - * @return A masked card number (e.g., "XXXXXXXXXXXX1234") - */ + @CircuitBreaker(requestVolumeThreshold = 4, failureRatio = 0.75, delay = 1000, successThreshold = 2) + public boolean checkGatewayHealth() { + logger.info("Checking payment gateway health"); + simulateNetworkCall(200); + if (Math.random() > 0.9) { + logger.warning("Gateway health check failed"); + throw new RuntimeException("Payment gateway not responding"); + } + logger.info("Gateway is healthy"); + return true; + } + + @Asynchronous + @Bulkhead(5) + public CompletionStage sendPaymentNotification(String paymentId, String recipient) { + logger.info(() -> String.format("Sending payment notification - Payment ID: %s, Recipient: %s", + paymentId, recipient)); + simulateNetworkCall(300); + logger.info("Payment notification sent successfully"); + return CompletableFuture.completedFuture( + String.format("Notification sent to %s for payment %s", recipient, paymentId)); + } + + @Asynchronous + public CompletionStage verifyPaymentWithTelemetry(PaymentDetails paymentDetails, String transactionId) + throws PaymentProcessingException { + logger.info(() -> String.format("Starting payment verification - Transaction ID: %s", transactionId)); + + try { + validatePaymentDetails(paymentDetails); + performFraudCheck(paymentDetails, transactionId); + verifyFundsAvailability(paymentDetails); + recordTransaction(paymentDetails, transactionId); + + logger.info("Payment verification completed successfully"); + return CompletableFuture.completedFuture( + String.format("{\"status\":\"verified\", \"transaction_id\":\"%s\", \"message\":\"Payment verification complete.\"}", + transactionId)); + } catch (Exception e) { + logger.severe(() -> String.format("Payment verification failed: %s", e.getMessage())); + throw e; + } + } + private String maskCardNumber(String cardNumber) { if (cardNumber == null || cardNumber.length() < 4) { return "INVALID_CARD"; } - int visibleDigits = 4; int 
length = cardNumber.length(); - StringBuilder masked = new StringBuilder(); - for (int i = 0; i < length - visibleDigits; i++) { + for (int i = 0; i < length - 4; i++) { masked.append('X'); } - masked.append(cardNumber.substring(length - visibleDigits)); - + masked.append(cardNumber.substring(length - 4)); return masked.toString(); } - /** - * Simulate a delay in processing to demonstrate timeout. - * This method will be automatically traced by MicroProfile Telemetry. - */ private void simulateDelay() { try { logger.fine("Starting payment processing delay simulation"); - Thread.sleep(1500); // Simulated long-running task + Thread.sleep(1500); logger.fine("Payment processing delay simulation completed"); } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -137,119 +166,48 @@ private void simulateDelay() { } } - /** - * Processes a comprehensive payment verification with multiple steps. - * Each method call will be automatically traced by MicroProfile Telemetry. - * - * @param paymentDetails The payment details to verify - * @param transactionId The unique transaction ID - * @return A detailed verification result - * @throws PaymentProcessingException if verification fails - */ - @Asynchronous - public CompletionStage verifyPaymentWithTelemetry(PaymentDetails paymentDetails, String transactionId) - throws PaymentProcessingException { - - logger.info(() -> String.format("Starting payment verification - Transaction ID: %s", transactionId)); - - try { - // Step 1: Validate payment details - validatePaymentDetails(paymentDetails); - - // Step 2: Check for fraud indicators - performFraudCheck(paymentDetails, transactionId); - - // Step 3: Verify funds with bank - verifyFundsAvailability(paymentDetails); - - // Step 4: Record transaction - recordTransaction(paymentDetails, transactionId); - - logger.info("Payment verification completed successfully"); - return CompletableFuture.completedFuture( - String.format("{\"status\":\"verified\", 
\"transaction_id\":\"%s\", \"message\":\"Payment verification complete.\"}", - transactionId)); - } catch (Exception e) { - logger.severe(() -> String.format("Payment verification failed: %s", e.getMessage())); - throw e; - } - } - - /** - * Validates payment details - automatically traced - */ private void validatePaymentDetails(PaymentDetails details) throws PaymentProcessingException { logger.info("Validating payment details"); - - boolean isValid = details.getCardNumber() != null && - details.getCardNumber().length() >= 15 && - details.getExpiryDate() != null && - details.getAmount() != null && - details.getAmount().doubleValue() > 0; - + boolean isValid = details.getCardNumber() != null && + details.getCardNumber().length() >= 15 && + details.getExpiryDate() != null && + details.getAmount() != null && + details.getAmount().doubleValue() > 0; if (!isValid) { logger.warning("Payment details validation failed"); throw new PaymentProcessingException("Payment details validation failed"); } - logger.info("Payment details validation successful"); } - - /** - * Performs fraud check - automatically traced - */ + private void performFraudCheck(PaymentDetails details, String transactionId) throws PaymentProcessingException { logger.info(() -> String.format("Performing fraud check for transaction: %s", transactionId)); - - // Simulate external service call simulateNetworkCall(300); - - // Simulate fraud detection (cards ending with "0000" are flagged) boolean isSafe = !details.getCardNumber().endsWith("0000"); - if (!isSafe) { logger.warning("Potential fraud detected"); throw new PaymentProcessingException("Fraud check failed"); } - logger.info("Fraud check passed"); } - - /** - * Verifies funds availability - automatically traced - */ + private void verifyFundsAvailability(PaymentDetails details) throws PaymentProcessingException { logger.info(() -> String.format("Verifying funds availability - Amount: %s", details.getAmount())); - - // Simulate banking service call 
simulateNetworkCall(500); - - // Simulate funds verification (amounts over 1000 fail) boolean hasFunds = details.getAmount().doubleValue() <= 1000; - if (!hasFunds) { logger.warning("Insufficient funds detected"); throw new PaymentProcessingException("Insufficient funds"); } - logger.info("Sufficient funds verified"); } - - /** - * Records transaction - automatically traced - */ + private void recordTransaction(PaymentDetails details, String transactionId) { logger.info(() -> String.format("Recording transaction: %s", transactionId)); - - // Simulate database operation simulateNetworkCall(200); - logger.info("Transaction recorded successfully"); } - - /** - * Simulates network calls or database operations - automatically traced - */ + private void simulateNetworkCall(int milliseconds) { try { logger.fine(() -> String.format("Simulating network call - Duration: %dms", milliseconds)); diff --git a/code/chapter09/payment/src/main/liberty/config/server.xml b/code/chapter09/payment/src/main/liberty/config/server.xml index 2d5c7290..c680bc91 100644 --- a/code/chapter09/payment/src/main/liberty/config/server.xml +++ b/code/chapter09/payment/src/main/liberty/config/server.xml @@ -1,7 +1,7 @@ + microProfile-7.1 jakartaEE-10.0 - microProfile-6.1 restfulWS jsonp jsonb @@ -11,7 +11,6 @@ mpHealth mpMetrics mpTelemetry - mpOpenTracing mpFaultTolerance diff --git a/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties b/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties index d41c5ed4..f7fcce07 100644 --- a/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties +++ b/code/chapter09/payment/src/main/resources/META-INF/microprofile-config.properties @@ -13,5 +13,14 @@ io.microprofile.tutorial.store.payment.service.PaymentService/processPayment/Ret # MicroProfile Telemetry Configuration otel.service.name=payment-service otel.sdk.disabled=false -otel.metrics.exporter=none -otel.logs.exporter=none \ 
No newline at end of file + +# OTLP Exporter Configuration +# Override with the forwarded port when running in GitHub Codespaces: http://localhost:24317 +otel.exporter.otlp.endpoint=http://localhost:24317 +otel.traces.exporter=otlp +otel.metrics.exporter=otlp +otel.logs.exporter=otlp + +# Sampling +otel.traces.sampler=parentbased_always_on + diff --git a/code/chapter09/payment/src/main/webapp/index.html b/code/chapter09/payment/src/main/webapp/index.html index f7ba4adc..7d20f937 100644 --- a/code/chapter09/payment/src/main/webapp/index.html +++ b/code/chapter09/payment/src/main/webapp/index.html @@ -262,7 +262,7 @@ Links