diff --git a/infrastructure/terraform/components/api/README.md b/infrastructure/terraform/components/api/README.md index 8184dc84d..e0282eb0d 100644 --- a/infrastructure/terraform/components/api/README.md +++ b/infrastructure/terraform/components/api/README.md @@ -45,7 +45,11 @@ No requirements. |------|--------|---------| | [amendment\_event\_transformer](#module\_amendment\_event\_transformer) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [amendments\_queue](#module\_amendments\_queue) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.24/terraform-sqs.zip | n/a | +| [apigw\_alarms](#module\_apigw\_alarms) | ../../modules/alarms-apigw | n/a | | [authorizer\_lambda](#module\_authorizer\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | +| [ddb\_alarms\_letters](#module\_ddb\_alarms\_letters) | ../../modules/alarms-ddb | n/a | +| [ddb\_alarms\_mi](#module\_ddb\_alarms\_mi) | ../../modules/alarms-ddb | n/a | +| [ddb\_alarms\_suppliers](#module\_ddb\_alarms\_suppliers) | ../../modules/alarms-ddb | n/a | | [domain\_truststore](#module\_domain\_truststore) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | | [eventpub](#module\_eventpub) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.31/terraform-eventpub.zip | n/a | | [eventsub](#module\_eventsub) | ../../modules/eventsub | n/a | @@ -54,6 +58,7 @@ No requirements. 
| [get\_letters](#module\_get\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [get\_status](#module\_get\_status) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-kms.zip | n/a | +| [lambda\_alarms](#module\_lambda\_alarms) | ../../modules/alarms-lambda | n/a | | [letter\_status\_updates\_queue](#module\_letter\_status\_updates\_queue) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.24/terraform-sqs.zip | n/a | | [letter\_updates\_transformer](#module\_letter\_updates\_transformer) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [logging\_bucket](#module\_logging\_bucket) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | @@ -62,6 +67,7 @@ No requirements. 
| [post\_letters](#module\_post\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [post\_mi](#module\_post\_mi) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | | [s3bucket\_test\_letters](#module\_s3bucket\_test\_letters) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-s3bucket.zip | n/a | +| [sqs\_alarms](#module\_sqs\_alarms) | ../../modules/alarms-sqs | n/a | | [sqs\_letter\_updates](#module\_sqs\_letter\_updates) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-sqs.zip | n/a | | [sqs\_supplier\_allocator](#module\_sqs\_supplier\_allocator) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.26/terraform-sqs.zip | n/a | | [supplier\_allocator](#module\_supplier\_allocator) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/v2.0.29/terraform-lambda.zip | n/a | diff --git a/infrastructure/terraform/components/api/locals_alarms.tf b/infrastructure/terraform/components/api/locals_alarms.tf new file mode 100644 index 000000000..63d56d776 --- /dev/null +++ b/infrastructure/terraform/components/api/locals_alarms.tf @@ -0,0 +1,24 @@ +locals { + lambda_alarm_targets = { + authorizer_lambda = module.authorizer_lambda.function_name + get_letter = module.get_letter.function_name + get_letters = module.get_letters.function_name + get_letter_data = module.get_letter_data.function_name + get_status = module.get_status.function_name + patch_letter = module.patch_letter.function_name + post_letters = module.post_letters.function_name + post_mi = module.post_mi.function_name + upsert_letter = module.upsert_letter.function_name + amendment_event_transformer = module.amendment_event_transformer.function_name + letter_updates_transformer = module.letter_updates_transformer.function_name + 
mi_updates_transformer = module.mi_updates_transformer.function_name + supplier_allocator = module.supplier_allocator.function_name + } + + sqs_alarm_targets = { + sqs_letter_updates = module.sqs_letter_updates.sqs_queue_name + amendments_queue = module.amendments_queue.sqs_queue_name + letter_status_updates_queue = module.letter_status_updates_queue.sqs_queue_name + sqs_supplier_allocator = module.sqs_supplier_allocator.sqs_queue_name + } +} diff --git a/infrastructure/terraform/components/api/module_apigw_alarms.tf b/infrastructure/terraform/components/api/module_apigw_alarms.tf new file mode 100644 index 000000000..ee48ed4e6 --- /dev/null +++ b/infrastructure/terraform/components/api/module_apigw_alarms.tf @@ -0,0 +1,7 @@ +module "apigw_alarms" { + source = "../../modules/alarms-apigw" + alarm_prefix = local.csi + api_name = aws_api_gateway_rest_api.main.name + stage_name = aws_api_gateway_stage.main.stage_name + tags = local.default_tags +} diff --git a/infrastructure/terraform/components/api/module_authorizer_lambda.tf b/infrastructure/terraform/components/api/module_authorizer_lambda.tf index 7e3c94b8b..c90a7d303 100644 --- a/infrastructure/terraform/components/api/module_authorizer_lambda.tf +++ b/infrastructure/terraform/components/api/module_authorizer_lambda.tf @@ -36,7 +36,7 @@ module "authorizer_lambda" { lambda_env_vars = { CLOUDWATCH_NAMESPACE = "/aws/api-gateway/supplier/alarms", - CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS = 14, + CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS = 30, APIM_SUPPLIER_ID_HEADER = "NHSD-Supplier-ID", SUPPLIERS_TABLE_NAME = aws_dynamodb_table.suppliers.name } diff --git a/infrastructure/terraform/components/api/module_ddb_alarms_letters.tf b/infrastructure/terraform/components/api/module_ddb_alarms_letters.tf new file mode 100644 index 000000000..1e3c270f7 --- /dev/null +++ b/infrastructure/terraform/components/api/module_ddb_alarms_letters.tf @@ -0,0 +1,6 @@ +module "ddb_alarms_letters" { + source = "../../modules/alarms-ddb" + 
alarm_prefix = local.csi + table_name = aws_dynamodb_table.letters.name + tags = local.default_tags +} diff --git a/infrastructure/terraform/components/api/module_ddb_alarms_mi.tf b/infrastructure/terraform/components/api/module_ddb_alarms_mi.tf new file mode 100644 index 000000000..c6af98217 --- /dev/null +++ b/infrastructure/terraform/components/api/module_ddb_alarms_mi.tf @@ -0,0 +1,6 @@ +module "ddb_alarms_mi" { + source = "../../modules/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.mi.name + tags = local.default_tags +} diff --git a/infrastructure/terraform/components/api/module_ddb_alarms_suppliers.tf b/infrastructure/terraform/components/api/module_ddb_alarms_suppliers.tf new file mode 100644 index 000000000..a5a2d5396 --- /dev/null +++ b/infrastructure/terraform/components/api/module_ddb_alarms_suppliers.tf @@ -0,0 +1,6 @@ +module "ddb_alarms_suppliers" { + source = "../../modules/alarms-ddb" + alarm_prefix = local.csi + table_name = aws_dynamodb_table.suppliers.name + tags = local.default_tags +} diff --git a/infrastructure/terraform/components/api/module_lambda_alarms.tf b/infrastructure/terraform/components/api/module_lambda_alarms.tf new file mode 100644 index 000000000..4467c5001 --- /dev/null +++ b/infrastructure/terraform/components/api/module_lambda_alarms.tf @@ -0,0 +1,9 @@ +module "lambda_alarms" { + for_each = local.lambda_alarm_targets + source = "../../modules/alarms-lambda" + + alarm_prefix = local.csi + function_name = each.value + log_group_name = "/aws/lambda/${each.value}" + tags = local.default_tags +} diff --git a/infrastructure/terraform/components/api/module_sqs_alarms.tf b/infrastructure/terraform/components/api/module_sqs_alarms.tf new file mode 100644 index 000000000..62a524554 --- /dev/null +++ b/infrastructure/terraform/components/api/module_sqs_alarms.tf @@ -0,0 +1,9 @@ +module "sqs_alarms" { + for_each = local.sqs_alarm_targets + source = "../../modules/alarms-sqs" + + alarm_prefix = local.csi + 
queue_name = each.value + dlq_queue_name = replace(each.value, "-queue", "-dlq") + tags = local.default_tags +} diff --git a/infrastructure/terraform/modules/alarms-apigw/README.md b/infrastructure/terraform/modules/alarms-apigw/README.md new file mode 100644 index 000000000..d1de73b57 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/README.md @@ -0,0 +1,34 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [api\_name](#input\_api\_name) | n/a | `string` | n/a | yes | +| [error\_5xx\_evaluation\_periods](#input\_error\_5xx\_evaluation\_periods) | n/a | `number` | `1` | no | +| [error\_5xx\_period\_seconds](#input\_error\_5xx\_period\_seconds) | n/a | `number` | `60` | no | +| [error\_5xx\_threshold](#input\_error\_5xx\_threshold) | n/a | `number` | `0` | no | +| [latency\_anomaly\_sensitivity](#input\_latency\_anomaly\_sensitivity) | n/a | `number` | `2` | no | +| [latency\_datapoints\_to\_alarm](#input\_latency\_datapoints\_to\_alarm) | n/a | `number` | `3` | no | +| [latency\_evaluation\_periods](#input\_latency\_evaluation\_periods) | n/a | `number` | `5` | no | +| [latency\_period\_seconds](#input\_latency\_period\_seconds) | n/a | `number` | `60` | no | +| [latency\_threshold\_ms](#input\_latency\_threshold\_ms) | n/a | `number` | `29000` | no | +| [stage\_name](#input\_stage\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +## Modules + +No modules. +## Outputs + +No outputs. 
+ + + diff --git a/infrastructure/terraform/modules/alarms-apigw/alarm-5xx.tf b/infrastructure/terraform/modules/alarms-apigw/alarm-5xx.tf new file mode 100644 index 000000000..2035bba5b --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/alarm-5xx.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "five_xx" { + alarm_name = "${var.alarm_prefix}-apigw-5xx" + alarm_description = "RELIABILITY: API Gateway 5xx responses" + + namespace = "AWS/ApiGateway" + metric_name = "5XXError" + statistic = "Sum" + period = var.error_5xx_period_seconds + + evaluation_periods = var.error_5xx_evaluation_periods + threshold = var.error_5xx_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = local.api_dimensions + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-apigw/alarm-latency-anomaly.tf b/infrastructure/terraform/modules/alarms-apigw/alarm-latency-anomaly.tf new file mode 100644 index 000000000..ae64a72d9 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/alarm-latency-anomaly.tf @@ -0,0 +1,34 @@ +resource "aws_cloudwatch_metric_alarm" "latency_anomaly" { + alarm_name = "${var.alarm_prefix}-apigw-latency-anomaly" + alarm_description = "RELIABILITY: API Gateway latency anomaly" + comparison_operator = "GreaterThanUpperThreshold" + evaluation_periods = var.latency_evaluation_periods + datapoints_to_alarm = var.latency_datapoints_to_alarm + threshold_metric_id = "ad1" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags + + metric_query { + id = "m1" + metric { + metric_name = "Latency" + namespace = "AWS/ApiGateway" + stat = "Average" + period = var.latency_period_seconds + dimensions = local.api_dimensions + } + return_data = true + } + + metric_query { + id = 
"ad1" + expression = "ANOMALY_DETECTION_BAND(m1, ${var.latency_anomaly_sensitivity})" + label = "Latency (expected)" + return_data = true + } +} diff --git a/infrastructure/terraform/modules/alarms-apigw/alarm-latency-threshold.tf b/infrastructure/terraform/modules/alarms-apigw/alarm-latency-threshold.tf new file mode 100644 index 000000000..5372bb1ab --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/alarm-latency-threshold.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "latency_threshold" { + alarm_name = "${var.alarm_prefix}-apigw-latency-threshold" + alarm_description = "RELIABILITY: API Gateway latency above threshold" + + namespace = "AWS/ApiGateway" + metric_name = "Latency" + statistic = "Average" + period = var.latency_period_seconds + + evaluation_periods = var.latency_evaluation_periods + threshold = var.latency_threshold_ms + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = local.api_dimensions + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-apigw/locals.tf b/infrastructure/terraform/modules/alarms-apigw/locals.tf new file mode 100644 index 000000000..e05765be3 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/locals.tf @@ -0,0 +1,6 @@ +locals { + api_dimensions = { + ApiName = var.api_name + Stage = var.stage_name + } +} diff --git a/infrastructure/terraform/modules/alarms-apigw/variables.tf b/infrastructure/terraform/modules/alarms-apigw/variables.tf new file mode 100644 index 000000000..70909ad7f --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/variables.tf @@ -0,0 +1,56 @@ +variable "alarm_prefix" { + type = string +} + +variable "api_name" { + type = string +} + +variable "stage_name" { + type = string +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "error_5xx_threshold" { + type = number 
+ default = 0 +} + +variable "error_5xx_period_seconds" { + type = number + default = 60 +} + +variable "error_5xx_evaluation_periods" { + type = number + default = 1 +} + +variable "latency_threshold_ms" { + type = number + default = 29000 +} + +variable "latency_period_seconds" { + type = number + default = 60 +} + +variable "latency_evaluation_periods" { + type = number + default = 5 +} + +variable "latency_datapoints_to_alarm" { + type = number + default = 3 +} + +variable "latency_anomaly_sensitivity" { + type = number + default = 2 +} diff --git a/infrastructure/terraform/modules/alarms-apigw/versions.tf b/infrastructure/terraform/modules/alarms-apigw/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-apigw/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms-ddb/README.md b/infrastructure/terraform/modules/alarms-ddb/README.md new file mode 100644 index 000000000..b9c3b0c12 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/README.md @@ -0,0 +1,29 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no | +| [period\_seconds](#input\_period\_seconds) | n/a | `number` | `60` | no | +| [read\_throttle\_threshold](#input\_read\_throttle\_threshold) | n/a | `number` | `0` | no | +| [table\_name](#input\_table\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +| [write\_throttle\_threshold](#input\_write\_throttle\_threshold) | n/a | `number` | `0` | no | +## 
Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-ddb/alarm-read-throttle.tf b/infrastructure/terraform/modules/alarms-ddb/alarm-read-throttle.tf new file mode 100644 index 000000000..b48686b18 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/alarm-read-throttle.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "read_throttle" { + alarm_name = "${var.alarm_prefix}-ddb-${var.table_name}-read-throttle" + alarm_description = "RELIABILITY: DynamoDB read throttling" + + namespace = "AWS/DynamoDB" + metric_name = "ReadThrottleEvents" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.read_throttle_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { TableName = var.table_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-ddb/alarm-write-throttle.tf b/infrastructure/terraform/modules/alarms-ddb/alarm-write-throttle.tf new file mode 100644 index 000000000..8975e1efa --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/alarm-write-throttle.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "write_throttle" { + alarm_name = "${var.alarm_prefix}-ddb-${var.table_name}-write-throttle" + alarm_description = "RELIABILITY: DynamoDB write throttling" + + namespace = "AWS/DynamoDB" + metric_name = "WriteThrottleEvents" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.write_throttle_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { TableName = var.table_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git 
a/infrastructure/terraform/modules/alarms-ddb/variables.tf b/infrastructure/terraform/modules/alarms-ddb/variables.tf new file mode 100644 index 000000000..3895d21eb --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/variables.tf @@ -0,0 +1,32 @@ +variable "alarm_prefix" { + type = string +} + +variable "table_name" { + type = string +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "period_seconds" { + type = number + default = 60 +} + +variable "evaluation_periods" { + type = number + default = 1 +} + +variable "read_throttle_threshold" { + type = number + default = 0 +} + +variable "write_throttle_threshold" { + type = number + default = 0 +} diff --git a/infrastructure/terraform/modules/alarms-ddb/versions.tf b/infrastructure/terraform/modules/alarms-ddb/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-ddb/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms-lambda/README.md b/infrastructure/terraform/modules/alarms-lambda/README.md new file mode 100644 index 000000000..5c0074b5f --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/README.md @@ -0,0 +1,36 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [enable\_error\_log\_metric](#input\_enable\_error\_log\_metric) | n/a | `bool` | `true` | no | +| [error\_log\_evaluation\_periods](#input\_error\_log\_evaluation\_periods) | n/a | `number` | `1` | no | +| [error\_log\_metric\_filter\_pattern](#input\_error\_log\_metric\_filter\_pattern) | n/a | `string` | `"{ ($.level = \"50\" || 
$.level = \"error\") && $.environment = * }"` | no | +| [error\_log\_metric\_name\_prefix](#input\_error\_log\_metric\_name\_prefix) | n/a | `string` | `"LambdaErrorLogs-"` | no | +| [error\_log\_metric\_namespace](#input\_error\_log\_metric\_namespace) | n/a | `string` | `"Custom/LambdaErrorLogs"` | no | +| [error\_log\_threshold](#input\_error\_log\_threshold) | n/a | `number` | `0` | no | +| [errors\_threshold](#input\_errors\_threshold) | n/a | `number` | `0` | no | +| [evaluation\_periods](#input\_evaluation\_periods) | n/a | `number` | `1` | no | +| [function\_name](#input\_function\_name) | n/a | `string` | n/a | yes | +| [log\_group\_name](#input\_log\_group\_name) | n/a | `string` | `""` | no | +| [period\_seconds](#input\_period\_seconds) | n/a | `number` | `300` | no | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +| [throttles\_threshold](#input\_throttles\_threshold) | n/a | `number` | `0` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-lambda/alarm-error-logs.tf b/infrastructure/terraform/modules/alarms-lambda/alarm-error-logs.tf new file mode 100644 index 000000000..36d15ddda --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/alarm-error-logs.tf @@ -0,0 +1,21 @@ +resource "aws_cloudwatch_metric_alarm" "error_logs" { + count = var.enable_error_log_metric ? 
1 : 0 + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + alarm_description = "ERROR: Lambda error logs detected" + + namespace = var.error_log_metric_namespace + metric_name = "${var.error_log_metric_name_prefix}${var.function_name}" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.error_log_evaluation_periods + threshold = var.error_log_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-lambda/alarm-errors.tf b/infrastructure/terraform/modules/alarms-lambda/alarm-errors.tf new file mode 100644 index 000000000..662715893 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/alarm-errors.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "errors" { + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-errors" + alarm_description = "ERROR: Lambda errors" + + namespace = "AWS/Lambda" + metric_name = "Errors" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.errors_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { FunctionName = var.function_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-lambda/alarm-throttles.tf b/infrastructure/terraform/modules/alarms-lambda/alarm-throttles.tf new file mode 100644 index 000000000..89c2b0cdf --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/alarm-throttles.tf @@ -0,0 +1,22 @@ +resource "aws_cloudwatch_metric_alarm" "throttles" { + alarm_name = "${var.alarm_prefix}-lambda-${var.function_name}-throttles" + alarm_description = "RELIABILITY: Lambda throttles" + 
+ namespace = "AWS/Lambda" + metric_name = "Throttles" + statistic = "Sum" + period = var.period_seconds + + evaluation_periods = var.evaluation_periods + threshold = var.throttles_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { FunctionName = var.function_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-lambda/metric-filter-error-logs.tf b/infrastructure/terraform/modules/alarms-lambda/metric-filter-error-logs.tf new file mode 100644 index 000000000..0090adc8d --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/metric-filter-error-logs.tf @@ -0,0 +1,12 @@ +resource "aws_cloudwatch_log_metric_filter" "error_logs" { + count = var.enable_error_log_metric ? 1 : 0 + name = "${var.alarm_prefix}-lambda-${var.function_name}-error-logs" + log_group_name = var.log_group_name + pattern = var.error_log_metric_filter_pattern + + metric_transformation { + name = "${var.error_log_metric_name_prefix}${var.function_name}" + namespace = var.error_log_metric_namespace + value = "1" + } +} diff --git a/infrastructure/terraform/modules/alarms-lambda/variables.tf b/infrastructure/terraform/modules/alarms-lambda/variables.tf new file mode 100644 index 000000000..5da36fd53 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/variables.tf @@ -0,0 +1,72 @@ +variable "alarm_prefix" { + type = string +} + +variable "function_name" { + type = string +} + +variable "log_group_name" { + type = string + default = "" + + validation { + condition = !var.enable_error_log_metric || (var.log_group_name != null && var.log_group_name != "") + error_message = "log_group_name must be set when enable_error_log_metric is true." + } +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "period_seconds" { + type = number + default = 300 +} + +variable "evaluation_periods" { + type = number + default = 1 +} + +variable "errors_threshold" { + type = number + default = 0 +} + +variable "throttles_threshold" { + type = number + default = 0 +} + +variable 
"enable_error_log_metric" { + type = bool + default = true +} + +variable "error_log_metric_namespace" { + type = string + default = "Custom/LambdaErrorLogs" +} + +variable "error_log_metric_name_prefix" { + type = string + default = "LambdaErrorLogs-" +} + +variable "error_log_metric_filter_pattern" { + type = string + default = "{ ($.level = \"50\" || $.level = \"error\") && $.environment = * }" +} + +variable "error_log_threshold" { + type = number + default = 0 +} + +variable "error_log_evaluation_periods" { + type = number + default = 1 +} diff --git a/infrastructure/terraform/modules/alarms-lambda/versions.tf b/infrastructure/terraform/modules/alarms-lambda/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-lambda/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/infrastructure/terraform/modules/alarms-sqs/README.md b/infrastructure/terraform/modules/alarms-sqs/README.md new file mode 100644 index 000000000..6ddbea4ca --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/README.md @@ -0,0 +1,31 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.9.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [age\_anomaly\_datapoints\_to\_alarm](#input\_age\_anomaly\_datapoints\_to\_alarm) | n/a | `number` | `3` | no | +| [age\_anomaly\_evaluation\_periods](#input\_age\_anomaly\_evaluation\_periods) | n/a | `number` | `5` | no | +| [age\_anomaly\_sensitivity](#input\_age\_anomaly\_sensitivity) | n/a | `number` | `3` | no | +| [age\_period\_seconds](#input\_age\_period\_seconds) | n/a | `number` | `900` | no | +| [alarm\_prefix](#input\_alarm\_prefix) | n/a | `string` | n/a | yes | +| [dlq\_queue\_name](#input\_dlq\_queue\_name) | n/a | `string` | 
`null` | no | +| [dlq\_visible\_threshold](#input\_dlq\_visible\_threshold) | n/a | `number` | `0` | no | +| [queue\_name](#input\_queue\_name) | n/a | `string` | n/a | yes | +| [tags](#input\_tags) | n/a | `map(string)` | `{}` | no | +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/modules/alarms-sqs/alarm-age-anomaly.tf b/infrastructure/terraform/modules/alarms-sqs/alarm-age-anomaly.tf new file mode 100644 index 000000000..e236ce905 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/alarm-age-anomaly.tf @@ -0,0 +1,34 @@ +resource "aws_cloudwatch_metric_alarm" "age_anomaly" { + alarm_name = "${var.alarm_prefix}-sqs-${var.queue_name}-age-anomaly" + alarm_description = "RELIABILITY: SQS oldest message age anomaly" + comparison_operator = "GreaterThanUpperThreshold" + evaluation_periods = var.age_anomaly_evaluation_periods + datapoints_to_alarm = var.age_anomaly_datapoints_to_alarm + threshold_metric_id = "ad1" + treat_missing_data = "notBreaching" + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags + + metric_query { + id = "m1" + metric { + metric_name = "ApproximateAgeOfOldestMessage" + namespace = "AWS/SQS" + stat = "Maximum" + period = var.age_period_seconds + dimensions = local.queue_dimensions + } + return_data = true + } + + metric_query { + id = "ad1" + expression = "ANOMALY_DETECTION_BAND(m1, ${var.age_anomaly_sensitivity})" + label = "AgeOfOldestMessage (expected)" + return_data = true + } +} diff --git a/infrastructure/terraform/modules/alarms-sqs/alarm-dlq-depth.tf b/infrastructure/terraform/modules/alarms-sqs/alarm-dlq-depth.tf new file mode 100644 index 000000000..41b1251ba --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/alarm-dlq-depth.tf @@ -0,0 +1,23 @@ +resource "aws_cloudwatch_metric_alarm" "dlq_depth" { + count = var.dlq_queue_name == null ? 
0 : 1 + alarm_name = "${var.alarm_prefix}-sqs-${var.dlq_queue_name}-dlq-depth" + alarm_description = "RELIABILITY: SQS DLQ has messages" + + namespace = "AWS/SQS" + metric_name = "ApproximateNumberOfMessagesVisible" + statistic = "Sum" + period = 60 + + evaluation_periods = 1 + threshold = var.dlq_visible_threshold + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + + dimensions = { QueueName = var.dlq_queue_name } + + actions_enabled = false + alarm_actions = [] + ok_actions = [] + insufficient_data_actions = [] + tags = var.tags +} diff --git a/infrastructure/terraform/modules/alarms-sqs/locals.tf b/infrastructure/terraform/modules/alarms-sqs/locals.tf new file mode 100644 index 000000000..38eac3152 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/locals.tf @@ -0,0 +1,3 @@ +locals { + queue_dimensions = { QueueName = var.queue_name } +} diff --git a/infrastructure/terraform/modules/alarms-sqs/variables.tf b/infrastructure/terraform/modules/alarms-sqs/variables.tf new file mode 100644 index 000000000..364fc6153 --- /dev/null +++ b/infrastructure/terraform/modules/alarms-sqs/variables.tf @@ -0,0 +1,42 @@ +variable "alarm_prefix" { + type = string +} + +variable "queue_name" { + type = string +} + +variable "dlq_queue_name" { + type = string + default = null +} + +variable "tags" { + type = map(string) + default = {} +} + +variable "age_period_seconds" { + type = number + default = 900 +} + +variable "age_anomaly_sensitivity" { + type = number + default = 3 +} + +variable "age_anomaly_evaluation_periods" { + type = number + default = 5 +} + +variable "age_anomaly_datapoints_to_alarm" { + type = number + default = 3 +} + +variable "dlq_visible_threshold" { + type = number + default = 0 +} diff --git a/infrastructure/terraform/modules/alarms-sqs/versions.tf b/infrastructure/terraform/modules/alarms-sqs/versions.tf new file mode 100644 index 000000000..f8dc86e97 --- /dev/null +++ 
b/infrastructure/terraform/modules/alarms-sqs/versions.tf @@ -0,0 +1,9 @@ + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 1.9.0" +} diff --git a/lambdas/authorizer/src/__tests__/index.test.ts b/lambdas/authorizer/src/__tests__/index.test.ts index cce287b30..878f692b3 100644 --- a/lambdas/authorizer/src/__tests__/index.test.ts +++ b/lambdas/authorizer/src/__tests__/index.test.ts @@ -35,7 +35,7 @@ const mockedDeps: jest.Mocked = { } as unknown as pino.Logger, env: { CLOUDWATCH_NAMESPACE: "cloudwatch-namespace", - CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS: 14, + CLIENT_CERTIFICATE_EXPIRATION_ALERT_DAYS: 30, APIM_SUPPLIER_ID_HEADER: "NHSD-Supplier-ID", } as unknown as EnvVars, supplierRepo: { @@ -74,10 +74,11 @@ describe("Authorizer Lambda Function", () => { }); describe("Certificate expiry check", () => { + const currentDate = new Date("2025-11-01T14:19:00Z"); beforeEach(() => { jest .useFakeTimers({ doNotFake: ["nextTick"] }) - .setSystemTime(new Date("2025-11-03T14:19:00Z")); + .setSystemTime(currentDate); (metricScope as jest.Mock).mockClear(); (mockedDeps.logger.warn as jest.Mock).mockClear(); const metricsMock = jest.requireMock( @@ -103,7 +104,7 @@ describe("Authorizer Lambda Function", () => { it("Should log CloudWatch metric when the certificate expiry threshold is reached", async () => { mockEvent.requestContext.identity.clientCert = buildCertWithExpiry( - "2025-11-17T14:19:00Z", + "2025-12-01T14:19:00Z", ); const handler = createAuthorizerHandler(mockedDeps); @@ -117,19 +118,19 @@ describe("Authorizer Lambda Function", () => { expect(metricScope).toHaveBeenCalledTimes(1); expect(mockedDeps.logger.warn).toHaveBeenCalledWith({ description: "APIM Certificate expiry", - days: 14, + days: 30, }); expect(metricsMock.setNamespace).toHaveBeenCalledWith("authorizer"); expect(metricsMock.putMetric).toHaveBeenCalledWith( "apim-client-certificate-near-expiry", - 14, + 30, "Count", ); }); it("Should not log 
CloudWatch metric when the certificate expiry threshold is not yet reached", async () => { mockEvent.requestContext.identity.clientCert = buildCertWithExpiry( - "2025-11-18T14:19:00Z", + "2026-01-01T14:19:00Z", ); const handler = createAuthorizerHandler(mockedDeps); diff --git a/tests/component-tests/apiGateway-tests/update-letter-status.spec.ts b/tests/component-tests/apiGateway-tests/update-letter-status.spec.ts index bd90cb267..3fe0bc1ca 100644 --- a/tests/component-tests/apiGateway-tests/update-letter-status.spec.ts +++ b/tests/component-tests/apiGateway-tests/update-letter-status.spec.ts @@ -10,7 +10,6 @@ import { } from "./testCases/update-letter-status"; import { createTestData, - deleteLettersBySupplier, getLettersBySupplier, } from "../../helpers/generate-fetch-test-data"; import { createInvalidRequestHeaders } from "../../constants/request-headers"; @@ -20,22 +19,27 @@ import { } from "../../helpers/common-types"; let baseUrl: string; +let letters: Awaited<ReturnType<typeof getLettersBySupplier>>; test.beforeAll(async () => { baseUrl = await getRestApiGatewayBaseUrl(); }); test.describe("API Gateway Tests to Verify Patch Status Endpoint", () => { + test.beforeAll(async () => { + await createTestData(SUPPLIERID, 2); + letters = await getLettersBySupplier(SUPPLIERID, "PENDING", 2); + + if (!letters || letters.length < 2) { + throw new Error( + `Expected 2 PENDING letters for supplier ${SUPPLIERID}, got ${letters?.length ??
0}.`, + ); + } + }); + test(`Patch /letters returns 202 and status is updated to ACCEPTED`, async ({ request, }) => { - await createTestData(SUPPLIERID); - const letters = await getLettersBySupplier(SUPPLIERID, "PENDING", 1); - - if (!letters?.length) { - test.fail(true, `No PENDING letters found for supplier ${SUPPLIERID}`); - return; - } const letter = letters[0]; const headers = patchRequestHeaders(); const body = patchValidRequestBody(letter.id, "ACCEPTED"); @@ -49,21 +53,12 @@ test.describe("API Gateway Tests to Verify Patch Status Endpoint", () => { ); expect(response.status()).toBe(202); - - await deleteLettersBySupplier(letter.id); }); test(`Patch /letters returns 202 and status is updated to REJECTED`, async ({ request, }) => { - await createTestData(SUPPLIERID); - const letters = await getLettersBySupplier(SUPPLIERID, "PENDING", 1); - - if (!letters?.length) { - test.fail(true, `No PENDING letters found for supplier ${SUPPLIERID}`); - return; - } - const letter = letters[0]; + const letter = letters[1]; const headers = patchRequestHeaders(); const body = patchFailureRequestBody(letter.id, "REJECTED"); @@ -76,8 +71,6 @@ test.describe("API Gateway Tests to Verify Patch Status Endpoint", () => { ); expect(response.status()).toBe(202); - - await deleteLettersBySupplier(letter.id); }); test(`Patch /letters returns 400 if request Body is invalid`, async ({ diff --git a/tests/helpers/generate-fetch-test-data.ts b/tests/helpers/generate-fetch-test-data.ts index 3d0f230e1..86746ff9b 100644 --- a/tests/helpers/generate-fetch-test-data.ts +++ b/tests/helpers/generate-fetch-test-data.ts @@ -74,11 +74,14 @@ export const getLettersBySupplier = async ( return Items as SupplierApiLetters[]; }; -export const deleteLettersBySupplier = async (id: string) => { +export const deleteLettersBySupplier = async ( + supplierId: string, + id: string, +) => { const resp = await docClient.send( new DeleteCommand({ TableName: LETTERSTABLENAME, - Key: { supplierId: SUPPLIERID, id }, + 
Key: { id, supplierId }, ReturnValues: "ALL_OLD", }), );