From 9babf8525c42c06dc41be57b44350cc699f90203 Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:42:13 +0000 Subject: [PATCH 01/23] docs: add least-privilege deployment roles and deployment guide Add DEPLOYMENT_ROLES.md with least-privilege IAM policy for the CloudFormation execution role (IaCRole-ABCA), derived from analysis of all CDK constructs and handler code in the current single-stack architecture. Includes optional ECS statements when Fargate is enabled. Add DEPLOYMENT_GUIDE.md covering compute backend choices (AgentCore vs opt-in ECS Fargate via ComputeStrategy), scale-to-zero analysis, and complete AWS services inventory. Update COST_MODEL.md with scale-to-zero characteristics section, corrected baseline to ~$85-95/month, and updated references. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/COST_MODEL.md | 15 +- docs/design/DEPLOYMENT_ROLES.md | 585 ++++++++++++++++++++++++++++++++ docs/guides/DEPLOYMENT_GUIDE.md | 123 +++++++ 3 files changed, 718 insertions(+), 5 deletions(-) create mode 100644 docs/design/DEPLOYMENT_ROLES.md create mode 100644 docs/guides/DEPLOYMENT_GUIDE.md diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index 68220ad..a4039b3 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -16,7 +16,11 @@ These costs are incurred regardless of task volume: | DynamoDB (on-demand, idle) | ~$0/month | Pay-per-request; no cost when idle. | | CloudWatch Logs retention | ~$1–5/month | Depends on log volume. 90-day retention. | | API Gateway (idle) | ~$0/month | Pay-per-request. 
| -| **Total baseline** | **~$85–90/month** | | +| **Total baseline** | **~$85–95/month** | | + +### Scale-to-zero characteristics + +Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$85–95/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) for the full scale-to-zero breakdown. ## Per-task variable costs @@ -85,7 +89,8 @@ For multi-user deployments, cost should be attributable to individual users and ## Reference -- [COMPUTE.md - Network architecture](./COMPUTE.md) - VPC infrastructure cost breakdown. -- [ORCHESTRATOR.md](./ORCHESTRATOR.md) - Polling cost analysis. -- [COMPUTE.md](./COMPUTE.md) - Compute option billing models. -- [OBSERVABILITY.md](./OBSERVABILITY.md) - Cost-related metrics (`agent.cost_usd`, token usage). +- [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. +- [DEPLOYMENT_ROLES.md](./DEPLOYMENT_ROLES.md) -- Least-privilege IAM policies for deployment. +- [COMPUTE.md](./COMPUTE.md) -- VPC infrastructure cost breakdown, compute option billing models. +- [ORCHESTRATOR.md](./ORCHESTRATOR.md) -- Polling cost analysis. +- [OBSERVABILITY.md](./OBSERVABILITY.md) -- Cost-related metrics (`agent.cost_usd`, token usage). diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md new file mode 100644 index 0000000..bff0689 --- /dev/null +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -0,0 +1,585 @@ +# Deployment roles + +This document defines least-privilege IAM policies for the CloudFormation execution role used during `cdk deploy`. 
The default CDK bootstrap grants `AdministratorAccess` to this role; the policies below scope it to only what ABCA needs. + +> **Origin**: These IAM policies were generated from a thorough review of the repository's CDK constructs, stacks, and handler code. Each permission was derived by analyzing what CloudFormation needs to create, update, and delete every resource defined in the CDK stack. They have not yet been validated against a live deployment and should be treated as a starting point for iterative tightening. + +## How CDK deployment roles work + +`cdk bootstrap` creates five roles; these **four** take part in a deployment: + +1. **CDK Deploy Role** -- assumed by the CLI user to initiate deployment +2. **CDK File Publishing Role** -- uploads Lambda zip assets to S3 +3. **CDK Image Publishing Role** -- pushes Docker images to ECR +4. **CloudFormation Execution Role** -- assumed by CloudFormation to create/modify/delete resources + +The policy below is a **CloudFormation Execution Role** replacement. The other three deployment roles, and the fifth bootstrap role (the lookup role, used for context lookups), are scoped by the bootstrap template and do not need modification for least-privilege deployment. + +## Using this role + +```bash +# Option 1: Re-bootstrap with custom execution policy +# First, create the IAM policy in your account, then: +cdk bootstrap aws://ACCOUNT/REGION \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Policy" + +# Option 2: For CI/CD pipelines, configure the execution role in the pipeline definition +``` + +## Trust policy + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "cloudformation.amazonaws.com" + }, + "Action": "sts:AssumeRole" + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::ACCOUNT_ID:root" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "cdk-hnb659fds" + } + } + } + ] +} +``` + +## IaCRole-ABCA + +For deploying the `backgroundagent-dev` stack.
This single stack contains all platform resources including the AgentCore runtime, ECS compute (when enabled), API Gateway, Cognito, DynamoDB tables, VPC, DNS Firewall, and observability infrastructure. + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudFormationSelf", + "Effect": "Allow", + "Action": [ + "cloudformation:CreateStack", + "cloudformation:UpdateStack", + "cloudformation:DeleteStack", + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackEvents", + "cloudformation:DescribeStackResources", + "cloudformation:GetTemplate", + "cloudformation:GetTemplateSummary", + "cloudformation:ListStackResources", + "cloudformation:CreateChangeSet", + "cloudformation:DeleteChangeSet", + "cloudformation:DescribeChangeSet", + "cloudformation:ExecuteChangeSet", + "cloudformation:SetStackPolicy", + "cloudformation:ValidateTemplate", + "cloudformation:ListChangeSets" + ], + "Resource": [ + "arn:aws:cloudformation:*:*:stack/backgroundagent-dev/*", + "arn:aws:cloudformation:*:*:stack/CDKToolkit/*" + ] + }, + { + "Sid": "IAMRolesAndPolicies", + "Effect": "Allow", + "Action": [ + "iam:CreateRole", + "iam:DeleteRole", + "iam:GetRole", + "iam:PassRole", + "iam:UpdateRole", + "iam:TagRole", + "iam:UntagRole", + "iam:ListRoleTags", + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + "iam:PutRolePolicy", + "iam:DeleteRolePolicy", + "iam:GetRolePolicy", + "iam:ListRolePolicies", + "iam:ListAttachedRolePolicies", + "iam:CreatePolicy", + "iam:DeletePolicy", + "iam:GetPolicy", + "iam:GetPolicyVersion", + "iam:CreatePolicyVersion", + "iam:DeletePolicyVersion", + "iam:ListPolicyVersions", + "iam:TagPolicy", + "iam:CreateServiceLinkedRole", + "iam:ListInstanceProfilesForRole" + ], + "Resource": [ + "arn:aws:iam::*:role/backgroundagent-dev-*", + "arn:aws:iam::*:policy/backgroundagent-dev-*", + "arn:aws:iam::*:role/aws-service-role/*" + ] + }, + { + "Sid": "DynamoDB", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DeleteTable", 
+ "dynamodb:DescribeTable", + "dynamodb:DescribeTimeToLive", + "dynamodb:UpdateTimeToLive", + "dynamodb:UpdateTable", + "dynamodb:UpdateContinuousBackups", + "dynamodb:DescribeContinuousBackups", + "dynamodb:TagResource", + "dynamodb:UntagResource", + "dynamodb:ListTagsOfResource", + "dynamodb:PutItem", + "dynamodb:UpdateItem" + ], + "Resource": "arn:aws:dynamodb:*:*:table/backgroundagent-dev-*" + }, + { + "Sid": "Lambda", + "Effect": "Allow", + "Action": [ + "lambda:CreateFunction", + "lambda:DeleteFunction", + "lambda:GetFunction", + "lambda:GetFunctionConfiguration", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:GetPolicy", + "lambda:TagResource", + "lambda:UntagResource", + "lambda:ListTags", + "lambda:PublishVersion", + "lambda:CreateAlias", + "lambda:DeleteAlias", + "lambda:GetAlias", + "lambda:UpdateAlias", + "lambda:PutFunctionEventInvokeConfig", + "lambda:DeleteFunctionEventInvokeConfig", + "lambda:GetFunctionEventInvokeConfig", + "lambda:PutFunctionConcurrency", + "lambda:DeleteFunctionConcurrency" + ], + "Resource": "arn:aws:lambda:*:*:function:backgroundagent-dev-*" + }, + { + "Sid": "APIGateway", + "Effect": "Allow", + "Action": [ + "apigateway:POST", + "apigateway:GET", + "apigateway:PUT", + "apigateway:PATCH", + "apigateway:DELETE", + "apigateway:TagResource", + "apigateway:UntagResource", + "apigateway:SetWebACL", + "apigateway:UpdateRestApiPolicy" + ], + "Resource": [ + "arn:aws:apigateway:*::/restapis", + "arn:aws:apigateway:*::/restapis/*", + "arn:aws:apigateway:*::/account", + "arn:aws:apigateway:*::/tags/*" + ] + }, + { + "Sid": "Cognito", + "Effect": "Allow", + "Action": [ + "cognito-idp:CreateUserPool", + "cognito-idp:DeleteUserPool", + "cognito-idp:DescribeUserPool", + "cognito-idp:UpdateUserPool", + "cognito-idp:CreateUserPoolClient", + "cognito-idp:DeleteUserPoolClient", + "cognito-idp:DescribeUserPoolClient", + "cognito-idp:UpdateUserPoolClient", + 
"cognito-idp:TagResource", + "cognito-idp:UntagResource", + "cognito-idp:ListTagsForResource" + ], + "Resource": "arn:aws:cognito-idp:*:*:userpool/*" + }, + { + "Sid": "WAFv2", + "Effect": "Allow", + "Action": [ + "wafv2:CreateWebACL", + "wafv2:DeleteWebACL", + "wafv2:GetWebACL", + "wafv2:UpdateWebACL", + "wafv2:AssociateWebACL", + "wafv2:DisassociateWebACL", + "wafv2:ListTagsForResource", + "wafv2:TagResource", + "wafv2:UntagResource" + ], + "Resource": [ + "arn:aws:wafv2:*:*:regional/webacl/*", + "arn:aws:wafv2:*:*:regional/managedruleset/*" + ] + }, + { + "Sid": "VPCNetworking", + "Effect": "Allow", + "Action": [ + "ec2:CreateVpc", + "ec2:DeleteVpc", + "ec2:DescribeVpcs", + "ec2:ModifyVpcAttribute", + "ec2:CreateSubnet", + "ec2:DeleteSubnet", + "ec2:DescribeSubnets", + "ec2:CreateInternetGateway", + "ec2:DeleteInternetGateway", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:DescribeInternetGateways", + "ec2:AllocateAddress", + "ec2:ReleaseAddress", + "ec2:DescribeAddresses", + "ec2:CreateNatGateway", + "ec2:DeleteNatGateway", + "ec2:DescribeNatGateways", + "ec2:CreateRouteTable", + "ec2:DeleteRouteTable", + "ec2:DescribeRouteTables", + "ec2:AssociateRouteTable", + "ec2:DisassociateRouteTable", + "ec2:CreateRoute", + "ec2:DeleteRoute", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:DescribeSecurityGroups", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:RevokeSecurityGroupEgress", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:CreateVpcEndpoint", + "ec2:DeleteVpcEndpoints", + "ec2:DescribeVpcEndpoints", + "ec2:ModifyVpcEndpoint", + "ec2:CreateFlowLogs", + "ec2:DeleteFlowLogs", + "ec2:DescribeFlowLogs", + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:DescribeTags", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribePrefixLists" + ], + "Resource": "*" + }, + { + "Sid": "Route53ResolverDNSFirewall", + "Effect": "Allow", + "Action": [ + 
"route53resolver:CreateFirewallRuleGroup", + "route53resolver:DeleteFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroup", + "route53resolver:CreateFirewallRule", + "route53resolver:DeleteFirewallRule", + "route53resolver:ListFirewallRules", + "route53resolver:UpdateFirewallRule", + "route53resolver:CreateFirewallDomainList", + "route53resolver:DeleteFirewallDomainList", + "route53resolver:GetFirewallDomainList", + "route53resolver:UpdateFirewallDomains", + "route53resolver:AssociateFirewallRuleGroup", + "route53resolver:DisassociateFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroupAssociation", + "route53resolver:ListFirewallRuleGroupAssociations", + "route53resolver:UpdateFirewallConfig", + "route53resolver:GetFirewallConfig", + "route53resolver:TagResource", + "route53resolver:UntagResource", + "route53resolver:ListTagsForResource", + "route53resolver:CreateResolverQueryLogConfig", + "route53resolver:DeleteResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfig", + "route53resolver:AssociateResolverQueryLogConfig", + "route53resolver:DisassociateResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfigAssociation" + ], + "Resource": "*" + }, + { + "Sid": "SecretsManager", + "Effect": "Allow", + "Action": [ + "secretsmanager:CreateSecret", + "secretsmanager:DeleteSecret", + "secretsmanager:DescribeSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue", + "secretsmanager:UpdateSecret", + "secretsmanager:TagResource", + "secretsmanager:UntagResource", + "secretsmanager:GetResourcePolicy", + "secretsmanager:PutResourcePolicy", + "secretsmanager:DeleteResourcePolicy" + ], + "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" + }, + { + "Sid": "BedrockAgentCore", + "Effect": "Allow", + "Action": [ + "bedrock-agentcore:CreateRuntime", + "bedrock-agentcore:DeleteRuntime", + "bedrock-agentcore:GetRuntime", + "bedrock-agentcore:UpdateRuntime", + "bedrock-agentcore:CreateMemory", + 
"bedrock-agentcore:DeleteMemory", + "bedrock-agentcore:GetMemory", + "bedrock-agentcore:UpdateMemory", + "bedrock-agentcore:TagResource", + "bedrock-agentcore:UntagResource", + "bedrock-agentcore:ListTagsForResource" + ], + "Resource": "*" + }, + { + "Sid": "BedrockGuardrailsAndLogging", + "Effect": "Allow", + "Action": [ + "bedrock:CreateGuardrail", + "bedrock:DeleteGuardrail", + "bedrock:GetGuardrail", + "bedrock:UpdateGuardrail", + "bedrock:CreateGuardrailVersion", + "bedrock:ListGuardrails", + "bedrock:TagResource", + "bedrock:UntagResource", + "bedrock:ListTagsForResource", + "bedrock:PutModelInvocationLoggingConfiguration", + "bedrock:DeleteModelInvocationLoggingConfiguration", + "bedrock:GetModelInvocationLoggingConfiguration" + ], + "Resource": "*" + }, + { + "Sid": "CloudWatchLogsAndDashboards", + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:DeleteLogGroup", + "logs:DescribeLogGroups", + "logs:PutRetentionPolicy", + "logs:DeleteRetentionPolicy", + "logs:TagLogGroup", + "logs:UntagLogGroup", + "logs:TagResource", + "logs:UntagResource", + "logs:ListTagsForResource", + "logs:ListTagsLogGroup", + "logs:PutResourcePolicy", + "logs:DeleteResourcePolicy", + "logs:DescribeResourcePolicies", + "cloudwatch:PutDashboard", + "cloudwatch:DeleteDashboards", + "cloudwatch:GetDashboard", + "cloudwatch:PutMetricAlarm", + "cloudwatch:DeleteAlarms", + "cloudwatch:DescribeAlarms", + "cloudwatch:TagResource", + "cloudwatch:UntagResource" + ], + "Resource": "*" + }, + { + "Sid": "EventBridge", + "Effect": "Allow", + "Action": [ + "events:PutRule", + "events:DeleteRule", + "events:DescribeRule", + "events:PutTargets", + "events:RemoveTargets", + "events:ListTargetsByRule", + "events:TagResource", + "events:UntagResource" + ], + "Resource": "arn:aws:events:*:*:rule/backgroundagent-dev-*" + }, + { + "Sid": "S3CDKAssets", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:GetBucketLocation", + "s3:ListBucket" + ], + "Resource": [ + 
"arn:aws:s3:::cdk-hnb659fds-assets-*", + "arn:aws:s3:::cdk-hnb659fds-assets-*/*" + ] + }, + { + "Sid": "ECRForDockerAssets", + "Effect": "Allow", + "Action": [ + "ecr:CreateRepository", + "ecr:DescribeRepositories", + "ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "ecr:PutImage", + "ecr:InitiateLayerUpload", + "ecr:UploadLayerPart", + "ecr:CompleteLayerUpload", + "ecr:SetRepositoryPolicy", + "ecr:GetRepositoryPolicy", + "ecr:DeleteRepository", + "ecr:ListTagsForResource", + "ecr:TagResource" + ], + "Resource": [ + "arn:aws:ecr:*:*:repository/cdk-hnb659fds-container-assets-*", + "arn:aws:ecr:*:*:repository/backgroundagent-*" + ] + }, + { + "Sid": "ECRAuthToken", + "Effect": "Allow", + "Action": "ecr:GetAuthorizationToken", + "Resource": "*" + }, + { + "Sid": "XRay", + "Effect": "Allow", + "Action": [ + "xray:UpdateTraceSegmentDestination", + "xray:GetTraceSegmentDestination" + ], + "Resource": "*" + }, + { + "Sid": "SSMParameterStoreForCDK", + "Effect": "Allow", + "Action": [ + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:PutParameter", + "ssm:DeleteParameter" + ], + "Resource": "arn:aws:ssm:*:*:parameter/cdk-bootstrap/*" + }, + { + "Sid": "STSForCDK", + "Effect": "Allow", + "Action": [ + "sts:AssumeRole", + "sts:GetCallerIdentity" + ], + "Resource": [ + "arn:aws:iam::*:role/cdk-hnb659fds-*" + ] + } + ] +} +``` + +### When ECS compute is enabled + +If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the policy: + +```json +{ + "Sid": "ECS", + "Effect": "Allow", + "Action": [ + "ecs:CreateCluster", + "ecs:DeleteCluster", + "ecs:DescribeClusters", + "ecs:UpdateCluster", + "ecs:UpdateClusterSettings", + "ecs:PutClusterCapacityProviders", + "ecs:RegisterTaskDefinition", + "ecs:DeregisterTaskDefinition", + "ecs:DescribeTaskDefinition", + "ecs:ListTaskDefinitions", + "ecs:TagResource", + "ecs:UntagResource", + 
"ecs:ListTagsForResource", + "ecs:PutAccountSetting" + ], + "Resource": "*" +} +``` + +## Runtime IAM roles (created by the stack) + +These roles are created inside the CloudFormation stack at deploy time, not by the deployer. They are documented here for a complete picture of the IAM footprint. + +| Role | Assumed By | Purpose | +|------|-----------|---------| +| AgentCore Runtime execution role | AgentCore Runtime | Runs MicroVM containers; DynamoDB, Secrets Manager, CloudWatch Logs, Bedrock, AgentCore Memory access | +| BedrockLoggingRole | `bedrock.amazonaws.com` | Writes model invocation logs to CloudWatch | +| TaskOrchestrator Lambda role | Lambda | Durable orchestrator; DynamoDB, Secrets Manager, AgentCore Runtime invocation, AgentCore Memory | +| ConcurrencyReconciler Lambda role | Lambda | Scheduled reconciliation; DynamoDB scan + conditional updates | +| TaskApi Lambda roles (9-10) | Lambda | API handler functions; DynamoDB, Secrets Manager (webhook handlers), Bedrock Guardrail, Lambda invoke | +| AwsCustomResource Lambda role | Lambda | Blueprint DDB writes, Bedrock logging config, DNS firewall config | +| API Gateway CloudWatch role | API Gateway | Pushes API Gateway access logs | +| VPC Flow Log role | VPC Flow Logs | Writes flow logs to CloudWatch | +| ECS task execution role (when enabled) | ECS (pull images) | ECR image pull, CloudWatch Logs write | +| ECS task role (when enabled) | ECS (container runtime) | DynamoDB, Secrets Manager, Bedrock InvokeModel | + +### CDK bootstrap roles + +| Role | Purpose | +|------|---------| +| `cdk-hnb659fds-deploy-role-*` | Assumed by CDK CLI to initiate deployments | +| `cdk-hnb659fds-cfn-exec-role-*` | Assumed by CloudFormation to create resources (**this is what IaCRole-ABCA replaces**) | +| `cdk-hnb659fds-file-publish-role-*` | Uploads Lambda zip assets to S3 | +| `cdk-hnb659fds-image-publish-role-*` | Pushes Docker images to ECR | +| `cdk-hnb659fds-lookup-role-*` | Context lookups (VPC, AZs, etc.) 
| + +## Resource-level permission constraints + +Several statements use `Resource: "*"`, either because the service does not support resource-level permissions for the listed actions or because the resource ARNs are not known until after the first deploy: + +| Service | Actions Requiring `"*"` | Reason | +|---------|------------------------|--------| +| EC2 (VPC) | `Create*`, `Describe*`, `Allocate*` | VPC resource ARNs unknown at policy creation time | +| Route 53 Resolver | All DNS Firewall actions | No resource-level ARN support for firewall rule groups | +| Bedrock | Guardrail + logging config actions | Account-level APIs (`PutModelInvocationLoggingConfiguration`) | +| Bedrock AgentCore | Runtime + Memory CRUD | Resource ARN patterns may not be fully supported in IAM yet | +| CloudWatch Logs | `CreateLogGroup`, `PutResourcePolicy` | Log group ARNs unknown at policy creation; resource policies are account-scoped | +| ECS | Cluster + task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | +| ECR | `GetAuthorizationToken` | Account-level operation | +| X-Ray | `UpdateTraceSegmentDestination` | Account-level operation | + +These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. + +## Iterative tightening + +These policies are conservative-but-scoped starting points. To tighten further: + +1. **Deploy once with CloudTrail enabled**, then use [iamlive](https://github.com/iann0036/iamlive) or CloudTrail Lake to identify the exact API calls made during deployment. +2. **Replace `*` resources** with actual ARNs after the first deploy (e.g., once you know the VPC ID, scope EC2 actions to that VPC). +3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. +4. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. +5. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions.
+ +## Reference + +- [SECURITY.md](./SECURITY.md) -- Runtime IAM, memory isolation, custom step trust boundaries. +- [COMPUTE.md](./COMPUTE.md) -- Compute backend options (AgentCore vs ECS Fargate). +- [COST_MODEL.md](./COST_MODEL.md) -- Infrastructure baseline costs and scale-to-zero analysis. diff --git a/docs/guides/DEPLOYMENT_GUIDE.md b/docs/guides/DEPLOYMENT_GUIDE.md new file mode 100644 index 0000000..725b7a5 --- /dev/null +++ b/docs/guides/DEPLOYMENT_GUIDE.md @@ -0,0 +1,123 @@ +# Deployment guide + +This guide covers deploying ABCA into an AWS account, including compute backend choices, scale-to-zero characteristics, and the complete AWS service inventory. For day-to-day development workflow, see the [Developer guide](./DEVELOPER_GUIDE.md). For a quick first deployment, see the [Quick start](./QUICK_START.md). For least-privilege IAM deployment roles, see [DEPLOYMENT_ROLES.md](../design/DEPLOYMENT_ROLES.md). + +## Architecture overview + +ABCA deploys as a **single CDK stack** (`backgroundagent-dev`) containing all platform resources. The stack uses a `ComputeStrategy` interface to support two compute backends within the same stack: + +| Aspect | AgentCore (default) | ECS Fargate (opt-in) | +|--------|--------------------|--------------------| +| **Compute** | Bedrock AgentCore Runtime (Firecracker MicroVMs) | ECS Fargate containers | +| **Resources** | 2 vCPU, 8 GB RAM, 2 GB max image size | 2 vCPU, 4 GB RAM | +| **Orchestration** | Durable Lambda (checkpoint/replay) | Same durable Lambda via `ComputeStrategy` | +| **Agent mode** | FastAPI server (HTTP invocation) | Batch (run-to-completion) | +| **Startup** | ~10s (warm MicroVM) | ~60-180s (Fargate cold start) | +| **Max duration** | 8 hours (AgentCore session) | Limited by orchestrator timeout (9 hours) | + +Both backends are orchestrated by the same durable Lambda function. 
The `ComputeStrategy` interface abstracts `startSession()`, `pollSession()`, and `stopSession()` -- the ECS strategy calls `ecs:RunTask` / `ecs:DescribeTasks` / `ecs:StopTask` directly from the Lambda. No Step Functions are used. + +ECS Fargate is currently **opt-in** -- the `EcsAgentCluster` construct is present in the stack code but commented out. To enable it, uncomment the ECS blocks in `cdk/src/stacks/agent.ts`. + +## Scale-to-zero analysis + +### Components that scale to zero (pay-per-use) + +| Component | Billing Model | Idle Cost | +|-----------|--------------|-----------| +| DynamoDB (5 tables) | PAY_PER_REQUEST | $0 | +| Lambda (all functions) | Per invocation | $0 | +| API Gateway REST | Per request | $0 | +| ECS Fargate tasks (when enabled) | Per running task | $0 (cluster is free) | +| AgentCore Runtime | Per session | $0 | +| Bedrock inference | Per token | $0 | +| AgentCore Memory | Proportional to usage | ~$0 | +| Cognito | Free tier (50K MAU) | $0 | + +### Components that do not scale to zero (always-on) + +| Component | Est. Monthly Idle Cost | Why | +|-----------|----------------------|-----| +| NAT Gateway (1x) | ~$32 | $0.045/hr fixed charge | +| VPC Interface Endpoints (7x, 2 AZs) | ~$50 | $0.01/hr per endpoint per AZ | +| WAF v2 Web ACL | ~$5 | Base monthly charge | +| CloudWatch Dashboard | ~$3 | Per-dashboard charge | +| Secrets Manager (1+ secrets) | ~$0.40/secret | Per-secret monthly | +| CloudWatch Alarms | ~$0.10/alarm | Per standard alarm | +| CloudWatch Logs retention | ~$1-5 | Storage for retained logs | +| **Total always-on baseline** | **~$85-95/month** | | + +The dominant idle cost is VPC networking: 7 interface endpoints (~$50/month) plus the NAT Gateway (~$32/month). + +For the full cost model including per-task costs, see [COST_MODEL.md](../design/COST_MODEL.md). 
+ +## AWS services inventory + +### Compute + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| Bedrock AgentCore Runtime (MicroVMs) | Agent sessions (default) | Yes | +| ECS Fargate (when enabled) | Agent sessions (opt-in) | Yes | +| Lambda (Node.js 24, ARM64) | Orchestrator, API handlers, reconciler, custom resources | Yes | + +### AI/ML + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| Bedrock (Claude Sonnet 4.6, Opus 4, Haiku 4.5) | Agent reasoning, cross-region inference profiles | Yes | +| Bedrock Guardrails | Prompt injection detection on task input | Yes | +| Bedrock AgentCore Memory | Semantic + episodic extraction strategies | Yes | + +### Networking + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| VPC (public + private subnets, 2 AZs) | All compute | N/A (no direct cost) | +| NAT Gateway (1x) | Private subnet internet egress | **No** (~$32/mo) | +| VPC Interface Endpoints (7x) | AWS service connectivity from private subnets | **No** (~$50/mo) | +| VPC Gateway Endpoints (2x: S3, DynamoDB) | S3 and DynamoDB connectivity | Yes (free) | +| Security Groups | HTTPS-only egress | N/A | +| Route 53 Resolver DNS Firewall | Domain allowlisting for agent egress | Minimal | + +### Storage / Database + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| DynamoDB (5 tables, PAY_PER_REQUEST) | Task state, events, concurrency, webhooks, repo config | Yes | +| S3 | CDK asset bucket, ECR image layers, FUSE session storage | Minimal | +| Secrets Manager | GitHub PAT, webhook HMAC secrets | **No** (~$0.40/secret/mo) | + +### API / Auth + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| API Gateway (REST) | Task REST API | Yes | +| Cognito User Pool | CLI/API authentication | Yes (free tier) | +| WAF v2 | API Gateway protection (managed rules + rate limiting) | **No** (~$5/mo base) | + +### Observability + +| 
Service | Used By | Scales to Zero | +|---------|---------|---------------| +| CloudWatch Logs (multiple log groups) | Application, usage, model invocation, VPC flow, DNS query logs | **No** (storage) | +| CloudWatch Dashboard | Operational metrics visualization | **No** (~$3/mo) | +| CloudWatch Alarms | Orchestrator error alerting | **No** (~$0.10/alarm) | +| X-Ray | AgentCore Runtime tracing | Yes | + +### Infrastructure / Deployment + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| CloudFormation | Stack deployment, custom resources | N/A | +| ECR | Container image storage | Minimal | +| IAM | Roles and policies for all components | N/A | + +## Reference + +- [Quick start](./QUICK_START.md) -- Zero-to-first-PR in 6 steps. +- [Developer guide](./DEVELOPER_GUIDE.md) -- Local development, testing, repository onboarding. +- [User guide](./USER_GUIDE.md) -- API reference, CLI usage, task management. +- [DEPLOYMENT_ROLES.md](../design/DEPLOYMENT_ROLES.md) -- Least-privilege IAM policies for CloudFormation execution. +- [COST_MODEL.md](../design/COST_MODEL.md) -- Per-task costs, cost guardrails, cost at scale. +- [COMPUTE.md](../design/COMPUTE.md) -- Compute backend architecture and trade-offs. From 94e557958d02f891f125b34edc705b908080908c Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:46:04 +0000 Subject: [PATCH 02/23] fix(docs): preserve original reference order in COST_MODEL.md Append new references at the bottom instead of reordering the existing list. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/COST_MODEL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index a4039b3..8b68d77 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -89,8 +89,8 @@ For multi-user deployments, cost should be attributable to individual users and ## Reference -- [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. -- [DEPLOYMENT_ROLES.md](./DEPLOYMENT_ROLES.md) -- Least-privilege IAM policies for deployment. - [COMPUTE.md](./COMPUTE.md) -- VPC infrastructure cost breakdown, compute option billing models. - [ORCHESTRATOR.md](./ORCHESTRATOR.md) -- Polling cost analysis. - [OBSERVABILITY.md](./OBSERVABILITY.md) -- Cost-related metrics (`agent.cost_usd`, token usage). +- [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. +- [DEPLOYMENT_ROLES.md](./DEPLOYMENT_ROLES.md) -- Least-privilege IAM policies for deployment. From ee32d47abbbedf46cddd5245916b8717bcae40a2 Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:48:06 +0000 Subject: [PATCH 03/23] fix(docs): restore dual COMPUTE.md references in COST_MODEL.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original had COMPUTE.md listed twice intentionally — once for the network architecture section and once for compute billing. Restore this pattern instead of merging into one entry. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/COST_MODEL.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index 8b68d77..eb4fc3e 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -89,8 +89,9 @@ For multi-user deployments, cost should be attributable to individual users and ## Reference -- [COMPUTE.md](./COMPUTE.md) -- VPC infrastructure cost breakdown, compute option billing models. +- [COMPUTE.md - Network architecture](./COMPUTE.md) -- VPC infrastructure cost breakdown. - [ORCHESTRATOR.md](./ORCHESTRATOR.md) -- Polling cost analysis. +- [COMPUTE.md](./COMPUTE.md) -- Compute option billing models. - [OBSERVABILITY.md](./OBSERVABILITY.md) -- Cost-related metrics (`agent.cost_usd`, token usage). - [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. - [DEPLOYMENT_ROLES.md](./DEPLOYMENT_ROLES.md) -- Least-privilege IAM policies for deployment. From e0789d8d27e549a82d2a14bd1d2f6e66f7a7f1c2 Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:51:57 +0000 Subject: [PATCH 04/23] fix(docs): consolidate COMPUTE.md references with section anchor Single entry with anchor link to the network architecture section instead of listing the same file twice. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/COST_MODEL.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index eb4fc3e..68e50d0 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -89,9 +89,8 @@ For multi-user deployments, cost should be attributable to individual users and ## Reference -- [COMPUTE.md - Network architecture](./COMPUTE.md) -- VPC infrastructure cost breakdown. 
+- [COMPUTE.md](./COMPUTE.md) -- Compute option billing models and [network architecture](./COMPUTE.md#network-architecture). - [ORCHESTRATOR.md](./ORCHESTRATOR.md) -- Polling cost analysis. -- [COMPUTE.md](./COMPUTE.md) -- Compute option billing models. - [OBSERVABILITY.md](./OBSERVABILITY.md) -- Cost-related metrics (`agent.cost_usd`, token usage). - [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. - [DEPLOYMENT_ROLES.md](./DEPLOYMENT_ROLES.md) -- Least-privilege IAM policies for deployment. From 5b8e5601ba76ac72a5c48875146bc5fdb799b7c5 Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:59:40 +0000 Subject: [PATCH 05/23] fix(docs): replace iamlive with IAM Access Analyzer recommendation Use AWS-native IAM Access Analyzer policy generation instead of third-party tooling for iterative policy tightening. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index bff0689..2252a83 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -572,7 +572,7 @@ These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the These policies are conservative-but-scoped starting points. To tighten further: -1. **Deploy once with CloudTrail enabled**, then use [iamlive](https://github.com/iann0036/iamlive) or CloudTrail Lake to identify the exact API calls made during deployment. +1. **Deploy once with CloudTrail enabled**, then use [IAM Access Analyzer policy generation](https://docs.aws.amazon.com/IAM/latest/UserGuide/access-analyzer-policy-generation.html) to generate a least-privilege policy based on the actual API calls recorded in CloudTrail. 2. 
**Replace `*` resources** with actual ARNs after the first deploy (e.g., once you know the VPC ID, scope EC2 actions to that VPC). 3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. 4. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. From f506b4e7c7fcdfc99d518a56430eaa89f37db29a Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust Date: Thu, 23 Apr 2026 09:45:02 -0700 Subject: [PATCH 06/23] fix: remove sub section --- docs/design/COST_MODEL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index 68e50d0..9e4a88b 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -89,7 +89,7 @@ For multi-user deployments, cost should be attributable to individual users and ## Reference -- [COMPUTE.md](./COMPUTE.md) -- Compute option billing models and [network architecture](./COMPUTE.md#network-architecture). +- [COMPUTE.md](./COMPUTE.md) -- Compute option billing models and network architecture. - [ORCHESTRATOR.md](./ORCHESTRATOR.md) -- Polling cost analysis. - [OBSERVABILITY.md](./OBSERVABILITY.md) -- Cost-related metrics (`agent.cost_usd`, token usage). - [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. From 19c188d55ba234f482537589078f35d571475b73 Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:55:05 +0000 Subject: [PATCH 07/23] fix(docs): add generated Starlight mirrors for new and modified docs The sync-starlight.mjs script generates mirror files under docs/src/content/docs/ from source docs. These generated files were missing from prior commits, causing the CI mutation check to fail. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../content/docs/architecture/Cost-model.md | 15 +- .../docs/architecture/Deployment-roles.md | 589 ++++++++++++++++++ 2 files changed, 599 insertions(+), 5 deletions(-) create mode 100644 docs/src/content/docs/architecture/Deployment-roles.md diff --git a/docs/src/content/docs/architecture/Cost-model.md b/docs/src/content/docs/architecture/Cost-model.md index 2a742f6..20a33f3 100644 --- a/docs/src/content/docs/architecture/Cost-model.md +++ b/docs/src/content/docs/architecture/Cost-model.md @@ -20,7 +20,11 @@ These costs are incurred regardless of task volume: | DynamoDB (on-demand, idle) | ~$0/month | Pay-per-request; no cost when idle. | | CloudWatch Logs retention | ~$1–5/month | Depends on log volume. 90-day retention. | | API Gateway (idle) | ~$0/month | Pay-per-request. | -| **Total baseline** | **~$85–90/month** | | +| **Total baseline** | **~$85–95/month** | | + +### Scale-to-zero characteristics + +Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$85–95/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) for the full scale-to-zero breakdown. ## Per-task variable costs @@ -89,7 +93,8 @@ For multi-user deployments, cost should be attributable to individual users and ## Reference -- [COMPUTE.md - Network architecture](/architecture/compute) - VPC infrastructure cost breakdown. -- [ORCHESTRATOR.md](/architecture/orchestrator) - Polling cost analysis. -- [COMPUTE.md](/architecture/compute) - Compute option billing models. 
-- [OBSERVABILITY.md](/architecture/observability) - Cost-related metrics (`agent.cost_usd`, token usage). +- [COMPUTE.md](/architecture/compute) -- Compute option billing models and network architecture. +- [ORCHESTRATOR.md](/architecture/orchestrator) -- Polling cost analysis. +- [OBSERVABILITY.md](/architecture/observability) -- Cost-related metrics (`agent.cost_usd`, token usage). +- [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. +- [DEPLOYMENT_ROLES.md](/architecture/deployment-roles) -- Least-privilege IAM policies for deployment. diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md new file mode 100644 index 0000000..775ed70 --- /dev/null +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -0,0 +1,589 @@ +--- +title: Deployment roles +--- + +# Deployment roles + +This document defines least-privilege IAM policies for the CloudFormation execution role used during `cdk deploy`. The default CDK bootstrap grants `AdministratorAccess` to this role; the policies below scope it to only what ABCA needs. + +> **Origin**: These IAM policies were generated from a thorough review of the repository's CDK constructs, stacks, and handler code. Each permission was derived by analyzing what CloudFormation needs to create, update, and delete every resource defined in the CDK stack. They have not yet been validated against a live deployment and should be treated as a starting point for iterative tightening. + +## How CDK deployment roles work + +CDK uses a **four-role model** created during `cdk bootstrap`: + +1. **CDK Deploy Role** -- assumed by the CLI user to initiate deployment +2. **CDK File Publishing Role** -- uploads Lambda zip assets to S3 +3. **CDK Image Publishing Role** -- pushes Docker images to ECR +4. 
**CloudFormation Execution Role** -- assumed by CloudFormation to create/modify/delete resources + +The policy below is a **CloudFormation Execution Role** replacement. The other three roles are scoped by the bootstrap template and do not need modification for least-privilege deployment. + +## Using this role + +```bash +# Option 1: Re-bootstrap with custom execution policy +# First, create the IAM policy in your account, then: +cdk bootstrap aws://ACCOUNT/REGION \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Policy" + +# Option 2: For CI/CD pipelines, configure the execution role in the pipeline definition +``` + +## Trust policy + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "cloudformation.amazonaws.com" + }, + "Action": "sts:AssumeRole" + }, + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::ACCOUNT_ID:root" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "cdk-hnb659fds" + } + } + } + ] +} +``` + +## IaCRole-ABCA + +For deploying the `backgroundagent-dev` stack. This single stack contains all platform resources including the AgentCore runtime, ECS compute (when enabled), API Gateway, Cognito, DynamoDB tables, VPC, DNS Firewall, and observability infrastructure. 
+ +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudFormationSelf", + "Effect": "Allow", + "Action": [ + "cloudformation:CreateStack", + "cloudformation:UpdateStack", + "cloudformation:DeleteStack", + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackEvents", + "cloudformation:DescribeStackResources", + "cloudformation:GetTemplate", + "cloudformation:GetTemplateSummary", + "cloudformation:ListStackResources", + "cloudformation:CreateChangeSet", + "cloudformation:DeleteChangeSet", + "cloudformation:DescribeChangeSet", + "cloudformation:ExecuteChangeSet", + "cloudformation:SetStackPolicy", + "cloudformation:ValidateTemplate", + "cloudformation:ListChangeSets" + ], + "Resource": [ + "arn:aws:cloudformation:*:*:stack/backgroundagent-dev/*", + "arn:aws:cloudformation:*:*:stack/CDKToolkit/*" + ] + }, + { + "Sid": "IAMRolesAndPolicies", + "Effect": "Allow", + "Action": [ + "iam:CreateRole", + "iam:DeleteRole", + "iam:GetRole", + "iam:PassRole", + "iam:UpdateRole", + "iam:TagRole", + "iam:UntagRole", + "iam:ListRoleTags", + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + "iam:PutRolePolicy", + "iam:DeleteRolePolicy", + "iam:GetRolePolicy", + "iam:ListRolePolicies", + "iam:ListAttachedRolePolicies", + "iam:CreatePolicy", + "iam:DeletePolicy", + "iam:GetPolicy", + "iam:GetPolicyVersion", + "iam:CreatePolicyVersion", + "iam:DeletePolicyVersion", + "iam:ListPolicyVersions", + "iam:TagPolicy", + "iam:CreateServiceLinkedRole", + "iam:ListInstanceProfilesForRole" + ], + "Resource": [ + "arn:aws:iam::*:role/backgroundagent-dev-*", + "arn:aws:iam::*:policy/backgroundagent-dev-*", + "arn:aws:iam::*:role/aws-service-role/*" + ] + }, + { + "Sid": "DynamoDB", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DeleteTable", + "dynamodb:DescribeTable", + "dynamodb:DescribeTimeToLive", + "dynamodb:UpdateTimeToLive", + "dynamodb:UpdateTable", + "dynamodb:UpdateContinuousBackups", + "dynamodb:DescribeContinuousBackups", + 
"dynamodb:TagResource", + "dynamodb:UntagResource", + "dynamodb:ListTagsOfResource", + "dynamodb:PutItem", + "dynamodb:UpdateItem" + ], + "Resource": "arn:aws:dynamodb:*:*:table/backgroundagent-dev-*" + }, + { + "Sid": "Lambda", + "Effect": "Allow", + "Action": [ + "lambda:CreateFunction", + "lambda:DeleteFunction", + "lambda:GetFunction", + "lambda:GetFunctionConfiguration", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:GetPolicy", + "lambda:TagResource", + "lambda:UntagResource", + "lambda:ListTags", + "lambda:PublishVersion", + "lambda:CreateAlias", + "lambda:DeleteAlias", + "lambda:GetAlias", + "lambda:UpdateAlias", + "lambda:PutFunctionEventInvokeConfig", + "lambda:DeleteFunctionEventInvokeConfig", + "lambda:GetFunctionEventInvokeConfig", + "lambda:PutFunctionConcurrency", + "lambda:DeleteFunctionConcurrency" + ], + "Resource": "arn:aws:lambda:*:*:function:backgroundagent-dev-*" + }, + { + "Sid": "APIGateway", + "Effect": "Allow", + "Action": [ + "apigateway:POST", + "apigateway:GET", + "apigateway:PUT", + "apigateway:PATCH", + "apigateway:DELETE", + "apigateway:TagResource", + "apigateway:UntagResource", + "apigateway:SetWebACL", + "apigateway:UpdateRestApiPolicy" + ], + "Resource": [ + "arn:aws:apigateway:*::/restapis", + "arn:aws:apigateway:*::/restapis/*", + "arn:aws:apigateway:*::/account", + "arn:aws:apigateway:*::/tags/*" + ] + }, + { + "Sid": "Cognito", + "Effect": "Allow", + "Action": [ + "cognito-idp:CreateUserPool", + "cognito-idp:DeleteUserPool", + "cognito-idp:DescribeUserPool", + "cognito-idp:UpdateUserPool", + "cognito-idp:CreateUserPoolClient", + "cognito-idp:DeleteUserPoolClient", + "cognito-idp:DescribeUserPoolClient", + "cognito-idp:UpdateUserPoolClient", + "cognito-idp:TagResource", + "cognito-idp:UntagResource", + "cognito-idp:ListTagsForResource" + ], + "Resource": "arn:aws:cognito-idp:*:*:userpool/*" + }, + { + "Sid": "WAFv2", + "Effect": "Allow", + 
"Action": [ + "wafv2:CreateWebACL", + "wafv2:DeleteWebACL", + "wafv2:GetWebACL", + "wafv2:UpdateWebACL", + "wafv2:AssociateWebACL", + "wafv2:DisassociateWebACL", + "wafv2:ListTagsForResource", + "wafv2:TagResource", + "wafv2:UntagResource" + ], + "Resource": [ + "arn:aws:wafv2:*:*:regional/webacl/*", + "arn:aws:wafv2:*:*:regional/managedruleset/*" + ] + }, + { + "Sid": "VPCNetworking", + "Effect": "Allow", + "Action": [ + "ec2:CreateVpc", + "ec2:DeleteVpc", + "ec2:DescribeVpcs", + "ec2:ModifyVpcAttribute", + "ec2:CreateSubnet", + "ec2:DeleteSubnet", + "ec2:DescribeSubnets", + "ec2:CreateInternetGateway", + "ec2:DeleteInternetGateway", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:DescribeInternetGateways", + "ec2:AllocateAddress", + "ec2:ReleaseAddress", + "ec2:DescribeAddresses", + "ec2:CreateNatGateway", + "ec2:DeleteNatGateway", + "ec2:DescribeNatGateways", + "ec2:CreateRouteTable", + "ec2:DeleteRouteTable", + "ec2:DescribeRouteTables", + "ec2:AssociateRouteTable", + "ec2:DisassociateRouteTable", + "ec2:CreateRoute", + "ec2:DeleteRoute", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:DescribeSecurityGroups", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:RevokeSecurityGroupEgress", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:CreateVpcEndpoint", + "ec2:DeleteVpcEndpoints", + "ec2:DescribeVpcEndpoints", + "ec2:ModifyVpcEndpoint", + "ec2:CreateFlowLogs", + "ec2:DeleteFlowLogs", + "ec2:DescribeFlowLogs", + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:DescribeTags", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribePrefixLists" + ], + "Resource": "*" + }, + { + "Sid": "Route53ResolverDNSFirewall", + "Effect": "Allow", + "Action": [ + "route53resolver:CreateFirewallRuleGroup", + "route53resolver:DeleteFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroup", + "route53resolver:CreateFirewallRule", + "route53resolver:DeleteFirewallRule", + 
"route53resolver:ListFirewallRules", + "route53resolver:UpdateFirewallRule", + "route53resolver:CreateFirewallDomainList", + "route53resolver:DeleteFirewallDomainList", + "route53resolver:GetFirewallDomainList", + "route53resolver:UpdateFirewallDomains", + "route53resolver:AssociateFirewallRuleGroup", + "route53resolver:DisassociateFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroupAssociation", + "route53resolver:ListFirewallRuleGroupAssociations", + "route53resolver:UpdateFirewallConfig", + "route53resolver:GetFirewallConfig", + "route53resolver:TagResource", + "route53resolver:UntagResource", + "route53resolver:ListTagsForResource", + "route53resolver:CreateResolverQueryLogConfig", + "route53resolver:DeleteResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfig", + "route53resolver:AssociateResolverQueryLogConfig", + "route53resolver:DisassociateResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfigAssociation" + ], + "Resource": "*" + }, + { + "Sid": "SecretsManager", + "Effect": "Allow", + "Action": [ + "secretsmanager:CreateSecret", + "secretsmanager:DeleteSecret", + "secretsmanager:DescribeSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue", + "secretsmanager:UpdateSecret", + "secretsmanager:TagResource", + "secretsmanager:UntagResource", + "secretsmanager:GetResourcePolicy", + "secretsmanager:PutResourcePolicy", + "secretsmanager:DeleteResourcePolicy" + ], + "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" + }, + { + "Sid": "BedrockAgentCore", + "Effect": "Allow", + "Action": [ + "bedrock-agentcore:CreateRuntime", + "bedrock-agentcore:DeleteRuntime", + "bedrock-agentcore:GetRuntime", + "bedrock-agentcore:UpdateRuntime", + "bedrock-agentcore:CreateMemory", + "bedrock-agentcore:DeleteMemory", + "bedrock-agentcore:GetMemory", + "bedrock-agentcore:UpdateMemory", + "bedrock-agentcore:TagResource", + "bedrock-agentcore:UntagResource", + "bedrock-agentcore:ListTagsForResource" + ], + 
"Resource": "*" + }, + { + "Sid": "BedrockGuardrailsAndLogging", + "Effect": "Allow", + "Action": [ + "bedrock:CreateGuardrail", + "bedrock:DeleteGuardrail", + "bedrock:GetGuardrail", + "bedrock:UpdateGuardrail", + "bedrock:CreateGuardrailVersion", + "bedrock:ListGuardrails", + "bedrock:TagResource", + "bedrock:UntagResource", + "bedrock:ListTagsForResource", + "bedrock:PutModelInvocationLoggingConfiguration", + "bedrock:DeleteModelInvocationLoggingConfiguration", + "bedrock:GetModelInvocationLoggingConfiguration" + ], + "Resource": "*" + }, + { + "Sid": "CloudWatchLogsAndDashboards", + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:DeleteLogGroup", + "logs:DescribeLogGroups", + "logs:PutRetentionPolicy", + "logs:DeleteRetentionPolicy", + "logs:TagLogGroup", + "logs:UntagLogGroup", + "logs:TagResource", + "logs:UntagResource", + "logs:ListTagsForResource", + "logs:ListTagsLogGroup", + "logs:PutResourcePolicy", + "logs:DeleteResourcePolicy", + "logs:DescribeResourcePolicies", + "cloudwatch:PutDashboard", + "cloudwatch:DeleteDashboards", + "cloudwatch:GetDashboard", + "cloudwatch:PutMetricAlarm", + "cloudwatch:DeleteAlarms", + "cloudwatch:DescribeAlarms", + "cloudwatch:TagResource", + "cloudwatch:UntagResource" + ], + "Resource": "*" + }, + { + "Sid": "EventBridge", + "Effect": "Allow", + "Action": [ + "events:PutRule", + "events:DeleteRule", + "events:DescribeRule", + "events:PutTargets", + "events:RemoveTargets", + "events:ListTargetsByRule", + "events:TagResource", + "events:UntagResource" + ], + "Resource": "arn:aws:events:*:*:rule/backgroundagent-dev-*" + }, + { + "Sid": "S3CDKAssets", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:GetBucketLocation", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::cdk-hnb659fds-assets-*", + "arn:aws:s3:::cdk-hnb659fds-assets-*/*" + ] + }, + { + "Sid": "ECRForDockerAssets", + "Effect": "Allow", + "Action": [ + "ecr:CreateRepository", + "ecr:DescribeRepositories", + 
"ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "ecr:PutImage", + "ecr:InitiateLayerUpload", + "ecr:UploadLayerPart", + "ecr:CompleteLayerUpload", + "ecr:SetRepositoryPolicy", + "ecr:GetRepositoryPolicy", + "ecr:DeleteRepository", + "ecr:ListTagsForResource", + "ecr:TagResource" + ], + "Resource": [ + "arn:aws:ecr:*:*:repository/cdk-hnb659fds-container-assets-*", + "arn:aws:ecr:*:*:repository/backgroundagent-*" + ] + }, + { + "Sid": "ECRAuthToken", + "Effect": "Allow", + "Action": "ecr:GetAuthorizationToken", + "Resource": "*" + }, + { + "Sid": "XRay", + "Effect": "Allow", + "Action": [ + "xray:UpdateTraceSegmentDestination", + "xray:GetTraceSegmentDestination" + ], + "Resource": "*" + }, + { + "Sid": "SSMParameterStoreForCDK", + "Effect": "Allow", + "Action": [ + "ssm:GetParameter", + "ssm:GetParameters", + "ssm:PutParameter", + "ssm:DeleteParameter" + ], + "Resource": "arn:aws:ssm:*:*:parameter/cdk-bootstrap/*" + }, + { + "Sid": "STSForCDK", + "Effect": "Allow", + "Action": [ + "sts:AssumeRole", + "sts:GetCallerIdentity" + ], + "Resource": [ + "arn:aws:iam::*:role/cdk-hnb659fds-*" + ] + } + ] +} +``` + +### When ECS compute is enabled + +If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the policy: + +```json +{ + "Sid": "ECS", + "Effect": "Allow", + "Action": [ + "ecs:CreateCluster", + "ecs:DeleteCluster", + "ecs:DescribeClusters", + "ecs:UpdateCluster", + "ecs:UpdateClusterSettings", + "ecs:PutClusterCapacityProviders", + "ecs:RegisterTaskDefinition", + "ecs:DeregisterTaskDefinition", + "ecs:DescribeTaskDefinition", + "ecs:ListTaskDefinitions", + "ecs:TagResource", + "ecs:UntagResource", + "ecs:ListTagsForResource", + "ecs:PutAccountSetting" + ], + "Resource": "*" +} +``` + +## Runtime IAM roles (created by the stack) + +These roles are created inside the CloudFormation stack at deploy time, not by the 
deployer. They are documented here for a complete picture of the IAM footprint. + +| Role | Assumed By | Purpose | +|------|-----------|---------| +| AgentCore Runtime execution role | AgentCore Runtime | Runs MicroVM containers; DynamoDB, Secrets Manager, CloudWatch Logs, Bedrock, AgentCore Memory access | +| BedrockLoggingRole | `bedrock.amazonaws.com` | Writes model invocation logs to CloudWatch | +| TaskOrchestrator Lambda role | Lambda | Durable orchestrator; DynamoDB, Secrets Manager, AgentCore Runtime invocation, AgentCore Memory | +| ConcurrencyReconciler Lambda role | Lambda | Scheduled reconciliation; DynamoDB scan + conditional updates | +| TaskApi Lambda roles (9-10) | Lambda | API handler functions; DynamoDB, Secrets Manager (webhook handlers), Bedrock Guardrail, Lambda invoke | +| AwsCustomResource Lambda role | Lambda | Blueprint DDB writes, Bedrock logging config, DNS firewall config | +| API Gateway CloudWatch role | API Gateway | Pushes API Gateway access logs | +| VPC Flow Log role | VPC Flow Logs | Writes flow logs to CloudWatch | +| ECS task execution role (when enabled) | ECS (pull images) | ECR image pull, CloudWatch Logs write | +| ECS task role (when enabled) | ECS (container runtime) | DynamoDB, Secrets Manager, Bedrock InvokeModel | + +### CDK bootstrap roles + +| Role | Purpose | +|------|---------| +| `cdk-hnb659fds-deploy-role-*` | Assumed by CDK CLI to initiate deployments | +| `cdk-hnb659fds-cfn-exec-role-*` | Assumed by CloudFormation to create resources (**this is what IaCRole-ABCA replaces**) | +| `cdk-hnb659fds-file-publish-role-*` | Uploads Lambda zip assets to S3 | +| `cdk-hnb659fds-image-publish-role-*` | Pushes Docker images to ECR | +| `cdk-hnb659fds-lookup-role-*` | Context lookups (VPC, AZs, etc.) 
| + +## Resource-level permission constraints + +Several services require `Resource: "*"` because they do not support resource-level permissions for create/describe operations: + +| Service | Actions Requiring `"*"` | Reason | +|---------|------------------------|--------| +| EC2 (VPC) | `Create*`, `Describe*`, `Allocate*` | VPC resource ARNs unknown at policy creation time | +| Route 53 Resolver | All DNS Firewall actions | No resource-level ARN support for firewall rule groups | +| Bedrock | Guardrail + logging config actions | Account-level APIs (`PutModelInvocationLoggingConfiguration`) | +| Bedrock AgentCore | Runtime + Memory CRUD | Resource ARN patterns may not be fully supported in IAM yet | +| CloudWatch Logs | `CreateLogGroup`, `PutResourcePolicy` | Log group ARNs unknown at policy creation; resource policies are account-scoped | +| ECS | Cluster + task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | +| ECR | `GetAuthorizationToken` | Account-level operation | +| X-Ray | `UpdateTraceSegmentDestination` | Account-level operation | + +These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. + +## Iterative tightening + +These policies are conservative-but-scoped starting points. To tighten further: + +1. **Deploy once with CloudTrail enabled**, then use [IAM Access Analyzer policy generation](https://docs.aws.amazon.com/IAM/latest/UserGuide/access-analyzer-policy-generation.html) to generate a least-privilege policy based on the actual API calls recorded in CloudTrail. +2. **Replace `*` resources** with actual ARNs after the first deploy (e.g., once you know the VPC ID, scope EC2 actions to that VPC). +3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. +4. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. +5. 
**Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. + +## Reference + +- [SECURITY.md](/architecture/security) -- Runtime IAM, memory isolation, custom step trust boundaries. +- [COMPUTE.md](/architecture/compute) -- Compute backend options (AgentCore vs ECS Fargate). +- [COST_MODEL.md](/architecture/cost-model) -- Infrastructure baseline costs and scale-to-zero analysis. From 0a48dbae17d263bb53fe2431762524f2d3809723 Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:59:06 +0000 Subject: [PATCH 08/23] fix: add docs-sync pre-commit hook and strengthen agentic instructions The PR#46 build failed because Starlight mirror files under docs/src/content/docs/ were not regenerated after editing source docs. The pre-commit hooks had no step to catch this locally. - Add `docs-sync` pre-commit hook that auto-runs sync-starlight.mjs and stages the generated mirrors when docs sources change - Strengthen AGENTS.md boundary and common-mistakes sections to explicitly warn that CI rejects stale mirrors and name the exact command to regenerate them Co-Authored-By: Claude Opus 4.6 (1M context) --- .pre-commit-config.yaml | 8 ++++++++ AGENTS.md | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 04cbdfd..613657f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,6 +54,14 @@ repos: files: ^agent/.*\.py$ stages: [pre-commit] + - id: docs-sync + name: sync docs → Starlight mirrors + entry: bash -lc 'cd "$(git rev-parse --show-toplevel)/docs" && node scripts/sync-starlight.mjs && git add src/content/docs/' + language: system + pass_filenames: false + files: ^(docs/(design|guides)/.*\.md$|CONTRIBUTING\.md$) + stages: [pre-commit] + - id: docs-astro-check name: astro check (docs) entry: bash -lc 'cd "$(git rev-parse --show-toplevel)/docs" 
&& ./node_modules/.bin/astro check' diff --git a/AGENTS.md b/AGENTS.md index a466d6c..b254c35 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -38,6 +38,7 @@ Handler entry tests: `cdk/test/handlers/orchestrate-task.test.ts`, `create-task. ### Common mistakes - Editing **`docs/src/content/docs/`** instead of **`docs/guides/`** or **`docs/design/`** — content is generated; sync from sources. +- Adding or editing files in **`docs/design/`** or **`docs/guides/`** (or **`CONTRIBUTING.md`**) without running **`cd docs && node scripts/sync-starlight.mjs`** — CI will reject the change ("Fail build on mutation") because the Starlight mirror files in `docs/src/content/docs/` are stale. Always commit the regenerated mirrors alongside source changes. - Changing **`cdk/.../types.ts`** without updating **`cli/src/types.ts`** — CLI and API drift. - Running raw **`jest`/`tsc`/`cdk`** from muscle memory — prefer **`mise //cdk:test`**, **`mise //cdk:compile`**, **`mise //cdk:synth`** (see [Commands you can use](#commands-you-can-use)). - **`MISE_EXPERIMENTAL=1`** — required for namespaced tasks like **`mise //cdk:build`** (see [CONTRIBUTING.md](./CONTRIBUTING.md)). @@ -120,7 +121,7 @@ To build or test only the CLI subproject: ## Boundaries -- **Generated docs** — If you change docs sources (`docs/guides/`, `docs/design/`, `CONTRIBUTING.md`), run `mise //docs:sync` or `mise //docs:build`. +- **Generated docs (CI will reject if stale)** — Editing files in `docs/guides/`, `docs/design/`, or `CONTRIBUTING.md` requires regenerating Starlight mirrors under `docs/src/content/docs/`. Run **`cd docs && node scripts/sync-starlight.mjs`** (fast, <1 s) or **`mise //docs:sync`**, then commit the updated mirrors alongside your source changes. The pre-commit hook `docs-sync` does this automatically when prek hooks are installed, but if you bypass hooks (e.g. `--no-verify`), CI's "Fail build on mutation" step will catch it.
- **Dependencies** — Add dependencies to the owning package `package.json` (`cdk/`, `cli/`, or `docs/`), then install via workspace/root install. -- **Build before commit** — Run a full build (`mise run build`) when done so tests/synth/docs/security checks stay in sync. +- **Build before commit** — Run a full build (`mise run build`) when done so tests/synth/docs/security checks stay in sync. This is especially critical for docs changes — the build includes `//docs:sync` which regenerates Starlight mirrors, and CI will fail if the committed mirrors don't match what the build produces. - **Major changes** — Before modifying existing files in a major way (large refactors, new stacks, changing the agent contract), ask first. From 55f04a0451831b9fda8cd2658c6f6512f72bacee Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 17:16:07 +0000 Subject: [PATCH 09/23] fix(docs): correct session timeout and concurrency defaults in COST_MODEL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Session timeout: 8 hours → 9 hours (matches task-orchestrator.ts:173) - Concurrency limit: 2 → 3 (matches task-orchestrator.ts:163 default) Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/COST_MODEL.md | 4 ++-- docs/src/content/docs/architecture/Cost-model.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index 9e4a88b..f1eeb52 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -76,8 +76,8 @@ For multi-user deployments, cost should be attributable to individual users and |---|---|---| | Turn limit | `max_turns` per task | 100 | | Cost budget | `max_budget_usd` per task | None (unlimited) | -| Session timeout | Orchestrator timeout | 8 hours | -| Concurrency limit | Per-user atomic counter | 2 concurrent tasks | +| Session timeout | Orchestrator timeout | 9 hours | 
+| Concurrency limit | Per-user atomic counter | 3 concurrent tasks | | System concurrency | System-wide counter | Account-level AgentCore quota | ## Additional guardrails diff --git a/docs/src/content/docs/architecture/Cost-model.md b/docs/src/content/docs/architecture/Cost-model.md index 20a33f3..bb83b5e 100644 --- a/docs/src/content/docs/architecture/Cost-model.md +++ b/docs/src/content/docs/architecture/Cost-model.md @@ -80,8 +80,8 @@ For multi-user deployments, cost should be attributable to individual users and |---|---|---| | Turn limit | `max_turns` per task | 100 | | Cost budget | `max_budget_usd` per task | None (unlimited) | -| Session timeout | Orchestrator timeout | 8 hours | -| Concurrency limit | Per-user atomic counter | 2 concurrent tasks | +| Session timeout | Orchestrator timeout | 9 hours | +| Concurrency limit | Per-user atomic counter | 3 concurrent tasks | | System concurrency | System-wide counter | Account-level AgentCore quota | ## Additional guardrails From c7df38461692cf459ff0d84a12b3b8904f866fdc Mon Sep 17 00:00:00 2001 From: Scott Schreckengaust <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 18:01:14 +0000 Subject: [PATCH 10/23] chore: gitignore Claude Code plugin artifacts (.mcp.json, .remember/) Prevents local plugin state from the remember and MCP plugins from being tracked in version control. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index d4e5bb9..b643784 100644 --- a/.gitignore +++ b/.gitignore @@ -90,6 +90,12 @@ agent/gitleaks-report.json .env.* .claude/settings.local.json +# ────────────────────────────────────────────── +# Claude Code plugins +# ────────────────────────────────────────────── +.mcp.json +.remember/ + # ────────────────────────────────────────────── # Misc # ────────────────────────────────────────────── From dbdc12677a9033b72462703a39511911ee4dabd0 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:56:49 +0000 Subject: [PATCH 11/23] fix(docs): add X-Ray resource policy prerequisite and build credential notes On a fresh AWS account, `aws xray update-trace-segment-destination` fails with AccessDeniedException because X-Ray needs a CloudWatch Logs resource policy before it can write spans. Added the prerequisite `aws logs put-resource-policy` command to Quick Start Step 3. Also documented that `mise run build` requires AWS credentials with ec2:DescribeAvailabilityZones for CDK synthesis, and added common error table entries for the X-Ray, build credential, and non-TTY deploy issues. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/guides/QUICK_START.md | 18 ++++++++++++++---- .../docs/getting-started/Quick-start.md | 18 ++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/docs/guides/QUICK_START.md b/docs/guides/QUICK_START.md index d81feca..1c38b1e 100644 --- a/docs/guides/QUICK_START.md +++ b/docs/guides/QUICK_START.md @@ -6,7 +6,7 @@ Go from zero to your first agent-created pull request in about 30 minutes. This Install these before you begin: -- **AWS account** with credentials configured (`aws configure`) +- **AWS account** with credentials configured (`aws configure`). 
If you use named profiles, set `AWS_PROFILE` before running any commands in this guide. - **Docker** - for building the agent container image - **mise** - task runner ([install guide](https://mise.jdx.dev/getting-started.html)) - **AWS CDK CLI** - `npm install -g aws-cdk` (after mise is active) @@ -35,6 +35,8 @@ mise run build `mise run install` installs all JavaScript and Python dependencies across the monorepo. `mise run build` compiles the CDK app, the CLI, the agent image, and the docs site. A successful build means you are ready to deploy. +> **Note:** `mise run build` includes CDK synthesis, which queries AWS for availability zones. Your active AWS credentials must have at least `ec2:DescribeAvailabilityZones` permission, or the build will fail. If you use named profiles, make sure `AWS_PROFILE` is set before running the build. + ## Step 2 - Prepare a repository The agent works by cloning a GitHub repository, creating a branch, making code changes, running the build and tests, and opening a pull request. This means it needs **write access** to a real repository. @@ -75,7 +77,12 @@ The `repo` value must match **exactly** what you will pass to the CLI later (`ow The CDK stack deploys the full platform: API Gateway, Lambda functions (orchestrator, task CRUD, webhooks), DynamoDB tables, AgentCore Runtime, VPC with network isolation, Cognito user pool, and CloudWatch dashboards. ```bash -# One-time account setup (X-Ray destination) +# One-time account setup: allow X-Ray to write spans to CloudWatch Logs. +# On a fresh account, X-Ray needs a resource policy before the destination can be set. 
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) +aws logs put-resource-policy \ + --policy-name xray-spans-policy \ + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":\"*\"}]}" aws xray update-trace-segment-destination --destination CloudWatchLogs # Bootstrap CDK (first time only) @@ -85,7 +92,7 @@ mise run //cdk:bootstrap mise run //cdk:deploy ``` -The X-Ray command is a one-time per-account setup. CDK bootstrap provisions the staging resources CDK needs (S3 bucket, IAM roles). The deploy itself takes around 10 minutes - most of the time is spent building the Docker image and provisioning the AgentCore Runtime. +The X-Ray commands are a one-time per-account setup. On a fresh account the `put-resource-policy` call is required first — without it, the `update-trace-segment-destination` command fails with an `AccessDeniedException` because X-Ray cannot write to the `aws/spans` log group. CDK bootstrap provisions the staging resources CDK needs (S3 bucket, IAM roles). The deploy itself takes around 10 minutes - most of the time is spent building the Docker image and provisioning the AgentCore Runtime. ## Step 4 - Store the GitHub token @@ -183,7 +190,10 @@ Here is what the platform did after you ran `bgagent submit`: |---|---|---| | `yarn: command not found` | Corepack not enabled or mise not activated in your shell | Run `eval "$(mise activate zsh)"`, then `corepack enable && corepack prepare yarn@1.22.22 --activate` | | `MISE_EXPERIMENTAL required` | Namespaced tasks need the experimental flag | `export MISE_EXPERIMENTAL=1` | -| CDK deploy fails with "X-Ray Delivery Destination..." 
| Missing one-time account setup | `aws xray update-trace-segment-destination --destination CloudWatchLogs` | +| `AccessDeniedException` on `update-trace-segment-destination` | Fresh account missing CloudWatch Logs resource policy for X-Ray | Run `aws logs put-resource-policy` first (see Step 3) | +| CDK deploy fails with "X-Ray Delivery Destination..." | Missing one-time account setup | Run both X-Ray commands in Step 3 | +| `mise run build` fails with `ec2:DescribeAvailabilityZones` error | AWS credentials missing or insufficient for CDK synth | Set `AWS_PROFILE` or configure credentials with at least EC2 read access | +| CDK deploy prompts for approval and hangs | Non-interactive terminal (CI/CD, scripts) | Pass `--require-approval never` to `cdk deploy` or use an interactive terminal | | `put-secret-value` returns double-dot endpoint | `REGION` variable is empty | Set `REGION=us-east-1` (or your actual region) before running the command | | `REPO_NOT_ONBOARDED` on task submit | Blueprint `repo` does not match what you passed to the CLI | Check `cdk/src/stacks/agent.ts` - the `repo` value must be exactly `owner/repo` matching your fork | | `INSUFFICIENT_GITHUB_REPO_PERMISSIONS` | PAT is missing required permissions or is scoped to the wrong repo | Regenerate the PAT with Contents (read/write) and Pull requests (read/write) scoped to your fork, then update Secrets Manager | diff --git a/docs/src/content/docs/getting-started/Quick-start.md b/docs/src/content/docs/getting-started/Quick-start.md index 46e965b..7bc48cc 100644 --- a/docs/src/content/docs/getting-started/Quick-start.md +++ b/docs/src/content/docs/getting-started/Quick-start.md @@ -10,7 +10,7 @@ Go from zero to your first agent-created pull request in about 30 minutes. This Install these before you begin: -- **AWS account** with credentials configured (`aws configure`) +- **AWS account** with credentials configured (`aws configure`). 
If you use named profiles, set `AWS_PROFILE` before running any commands in this guide. - **Docker** - for building the agent container image - **mise** - task runner ([install guide](https://mise.jdx.dev/getting-started.html)) - **AWS CDK CLI** - `npm install -g aws-cdk` (after mise is active) @@ -39,6 +39,8 @@ mise run build `mise run install` installs all JavaScript and Python dependencies across the monorepo. `mise run build` compiles the CDK app, the CLI, the agent image, and the docs site. A successful build means you are ready to deploy. +> **Note:** `mise run build` includes CDK synthesis, which queries AWS for availability zones. Your active AWS credentials must have at least `ec2:DescribeAvailabilityZones` permission, or the build will fail. If you use named profiles, make sure `AWS_PROFILE` is set before running the build. + ## Step 2 - Prepare a repository The agent works by cloning a GitHub repository, creating a branch, making code changes, running the build and tests, and opening a pull request. This means it needs **write access** to a real repository. @@ -79,7 +81,12 @@ The `repo` value must match **exactly** what you will pass to the CLI later (`ow The CDK stack deploys the full platform: API Gateway, Lambda functions (orchestrator, task CRUD, webhooks), DynamoDB tables, AgentCore Runtime, VPC with network isolation, Cognito user pool, and CloudWatch dashboards. ```bash -# One-time account setup (X-Ray destination) +# One-time account setup: allow X-Ray to write spans to CloudWatch Logs. +# On a fresh account, X-Ray needs a resource policy before the destination can be set. 
+ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) +aws logs put-resource-policy \ + --policy-name xray-spans-policy \ + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":\"*\"}]}" aws xray update-trace-segment-destination --destination CloudWatchLogs # Bootstrap CDK (first time only) @@ -89,7 +96,7 @@ mise run //cdk:bootstrap mise run //cdk:deploy ``` -The X-Ray command is a one-time per-account setup. CDK bootstrap provisions the staging resources CDK needs (S3 bucket, IAM roles). The deploy itself takes around 10 minutes - most of the time is spent building the Docker image and provisioning the AgentCore Runtime. +The X-Ray commands are a one-time per-account setup. On a fresh account the `put-resource-policy` call is required first — without it, the `update-trace-segment-destination` command fails with an `AccessDeniedException` because X-Ray cannot write to the `aws/spans` log group. CDK bootstrap provisions the staging resources CDK needs (S3 bucket, IAM roles). The deploy itself takes around 10 minutes - most of the time is spent building the Docker image and provisioning the AgentCore Runtime. ## Step 4 - Store the GitHub token @@ -187,7 +194,10 @@ Here is what the platform did after you ran `bgagent submit`: |---|---|---| | `yarn: command not found` | Corepack not enabled or mise not activated in your shell | Run `eval "$(mise activate zsh)"`, then `corepack enable && corepack prepare yarn@1.22.22 --activate` | | `MISE_EXPERIMENTAL required` | Namespaced tasks need the experimental flag | `export MISE_EXPERIMENTAL=1` | -| CDK deploy fails with "X-Ray Delivery Destination..." 
| Missing one-time account setup | `aws xray update-trace-segment-destination --destination CloudWatchLogs` | +| `AccessDeniedException` on `update-trace-segment-destination` | Fresh account missing CloudWatch Logs resource policy for X-Ray | Run `aws logs put-resource-policy` first (see Step 3) | +| CDK deploy fails with "X-Ray Delivery Destination..." | Missing one-time account setup | Run both X-Ray commands in Step 3 | +| `mise run build` fails with `ec2:DescribeAvailabilityZones` error | AWS credentials missing or insufficient for CDK synth | Set `AWS_PROFILE` or configure credentials with at least EC2 read access | +| CDK deploy prompts for approval and hangs | Non-interactive terminal (CI/CD, scripts) | Pass `--require-approval never` to `cdk deploy` or use an interactive terminal | | `put-secret-value` returns double-dot endpoint | `REGION` variable is empty | Set `REGION=us-east-1` (or your actual region) before running the command | | `REPO_NOT_ONBOARDED` on task submit | Blueprint `repo` does not match what you passed to the CLI | Check `cdk/src/stacks/agent.ts` - the `repo` value must be exactly `owner/repo` matching your fork | | `INSUFFICIENT_GITHUB_REPO_PERMISSIONS` | PAT is missing required permissions or is scoped to the wrong repo | Regenerate the PAT with Contents (read/write) and Pull requests (read/write) scoped to your fork, then update Secrets Manager | From 544cbf2253578253e8f3fad676112f37b4cb82b5 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 20:32:14 +0000 Subject: [PATCH 12/23] fix(plugin): add X-Ray resource policy to /setup and least-privilege ref to /deploy The /setup skill's Phase 3 only ran `aws xray update-trace-segment-destination` which fails with AccessDeniedException on fresh accounts. Added the prerequisite `aws logs put-resource-policy` command. 
Added a "Least-Privilege Deployment" section to the /deploy skill linking to DEPLOYMENT_ROLES.md with the re-bootstrap command for scoped execution policies. Updated CLAUDE.md to reference the abca-plugin and its available skills so Claude Code sessions discover the guided workflows without requiring --plugin-dir. Co-Authored-By: Claude Opus 4.6 (1M context) --- CLAUDE.md | 2 ++ docs/abca-plugin/skills/deploy/SKILL.md | 11 +++++++++++ docs/abca-plugin/skills/setup/SKILL.md | 8 +++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index 43c994c..f84c43e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1 +1,3 @@ @AGENTS.md + +See also [README.md](./README.md) for the Claude Code plugin (`docs/abca-plugin/`), which provides interactive guided workflows for setup, deployment, repository onboarding, task submission, and troubleshooting via `/setup`, `/deploy`, `/onboard-repo`, `/submit-task`, `/status`, and `/troubleshoot` skills. Run Claude Code with `claude --plugin-dir docs/abca-plugin` to activate it. diff --git a/docs/abca-plugin/skills/deploy/SKILL.md b/docs/abca-plugin/skills/deploy/SKILL.md index ca0471c..e2e0463 100644 --- a/docs/abca-plugin/skills/deploy/SKILL.md +++ b/docs/abca-plugin/skills/deploy/SKILL.md @@ -81,3 +81,14 @@ After a successful deploy, remind the user to: - Store/update the GitHub PAT in Secrets Manager if this is a fresh deployment - Onboard repositories via Blueprint constructs if needed - Run a smoke test: `curl -s -H "Authorization: $TOKEN" $API_URL/tasks` + +## Least-Privilege Deployment + +By default, CDK bootstrap grants `AdministratorAccess` to the CloudFormation execution role. 
For production or security-sensitive accounts, re-bootstrap with a scoped execution policy: + +```bash +cdk bootstrap aws://ACCOUNT/REGION \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Policy" +``` + +See `docs/design/DEPLOYMENT_ROLES.md` in the repo root for the complete least-privilege IAM policy, trust policy, runtime role inventory, and iterative tightening recommendations. diff --git a/docs/abca-plugin/skills/setup/SKILL.md b/docs/abca-plugin/skills/setup/SKILL.md index a0a86b6..1a5f01b 100644 --- a/docs/abca-plugin/skills/setup/SKILL.md +++ b/docs/abca-plugin/skills/setup/SKILL.md @@ -52,11 +52,17 @@ If `mise run install` fails with "yarn: command not found", Corepack wasn't acti ## Phase 3: One-Time AWS Setup +On a fresh AWS account, X-Ray needs a CloudWatch Logs resource policy before it can write spans. Run both commands — the first creates the policy, the second sets the destination: + ```bash +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) +aws logs put-resource-policy \ + --policy-name xray-spans-policy \ + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":\"*\"}]}" aws xray update-trace-segment-destination --destination CloudWatchLogs ``` -This must be run once per AWS account before first deployment. +These must be run once per AWS account before first deployment. If the `put-resource-policy` step is skipped, the `update-trace-segment-destination` command fails with `AccessDeniedException`. 
## Phase 4: First Deployment From ad18fcc39d77c8351d2639eceae935833e2d3917 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:03:10 +0000 Subject: [PATCH 13/23] fix(docs): replace IaCRole-ABCA with validated 3-way policy split Replace the single monolithic IAM policy (which exceeded the 6,144-char IAM managed policy limit) with three validated policies: - IaCRole-ABCA-Infrastructure (CFN, IAM, VPC, DNS Firewall) - IaCRole-ABCA-Application (DDB, Lambda, APIGW, Cognito, WAF, EB, SM) - IaCRole-ABCA-Observability (Bedrock, CW, X-Ray, S3, ECR, KMS, SSM, STS) All three policies were validated against a live deployment in us-east-1 (create, update, task execution, and destroy). CloudTrail analysis found 36 additional actions beyond the initial code review, and 7 deployment iterations refined the policies. Key additions: - KMS (entirely missing from original) - lambda:InvokeFunction for AwsCustomResource - bedrock-agentcore:* (CFN handler uses internal action names) - Legacy CW Logs delivery actions for Route53 Resolver - Various Describe/List/Get actions for read-only CFN operations Updated the origin disclaimer, Resource-level permission constraints table, and ECS section to reference the Application policy. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 338 +++++++++++------- .../docs/architecture/Deployment-roles.md | 338 +++++++++++------- 2 files changed, 424 insertions(+), 252 deletions(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index 2252a83..d793ccc 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -2,7 +2,7 @@ This document defines least-privilege IAM policies for the CloudFormation execution role used during `cdk deploy`. The default CDK bootstrap grants `AdministratorAccess` to this role; the policies below scope it to only what ABCA needs. 
-> **Origin**: These IAM policies were generated from a thorough review of the repository's CDK constructs, stacks, and handler code. Each permission was derived by analyzing what CloudFormation needs to create, update, and delete every resource defined in the CDK stack. They have not yet been validated against a live deployment and should be treated as a starting point for iterative tightening. +> **Origin**: These IAM policies were derived from a thorough review of the repository's CDK constructs, stacks, and handler code, then **validated against a live deployment** in `us-east-1` (create, update, task execution, and destroy). CloudTrail analysis identified 36 additional actions beyond the initial code review, and 7 deployment iterations refined the policies to their current form. The policies are split into three managed policies to stay under the IAM 6,144-character limit. ## How CDK deployment roles work @@ -15,17 +15,26 @@ CDK uses a **four-role model** created during `cdk bootstrap`: The policy below is a **CloudFormation Execution Role** replacement. The other three roles are scoped by the bootstrap template and do not need modification for least-privilege deployment. 
-## Using this role +## Using these policies + +The policies are split into three IAM managed policies (each under the 6,144-character limit): + +| Policy Name | Scope | +|-------------|-------| +| `IaCRole-ABCA-Infrastructure` | CloudFormation, IAM, VPC networking, Route 53 Resolver DNS Firewall | +| `IaCRole-ABCA-Application` | DynamoDB, Lambda, API Gateway, Cognito, WAFv2, EventBridge, Secrets Manager | +| `IaCRole-ABCA-Observability` | Bedrock AgentCore, Bedrock Guardrails, CloudWatch, X-Ray, S3, ECR, KMS, SSM, STS | ```bash -# Option 1: Re-bootstrap with custom execution policy -# First, create the IAM policy in your account, then: +# Create all three policies in your account, then re-bootstrap: cdk bootstrap aws://ACCOUNT/REGION \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Policy" - -# Option 2: For CI/CD pipelines, configure the execution role in the pipeline definition + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Infrastructure" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Application" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Observability" ``` +The `--cloudformation-execution-policies` flag can be repeated to attach multiple policies to the CloudFormation execution role. + ## Trust policy ```json @@ -59,6 +68,12 @@ cdk bootstrap aws://ACCOUNT/REGION \ For deploying the `backgroundagent-dev` stack. This single stack contains all platform resources including the AgentCore runtime, ECS compute (when enabled), API Gateway, Cognito, DynamoDB tables, VPC, DNS Firewall, and observability infrastructure. +> **IAM managed policy size limit**: A single managed policy cannot exceed 6,144 characters. The permissions below are split into three policies to stay under this limit. Use all three when re-bootstrapping (see [Using these policies](#using-these-policies)). 
+ +### IaCRole-ABCA-Infrastructure + +CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 53 Resolver DNS Firewall. + ```json { "Version": "2012-10-17", @@ -125,6 +140,108 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "arn:aws:iam::*:role/aws-service-role/*" ] }, + { + "Sid": "VPCNetworking", + "Effect": "Allow", + "Action": [ + "ec2:CreateVpc", + "ec2:DeleteVpc", + "ec2:DescribeVpcs", + "ec2:ModifyVpcAttribute", + "ec2:CreateSubnet", + "ec2:DeleteSubnet", + "ec2:DescribeSubnets", + "ec2:CreateInternetGateway", + "ec2:DeleteInternetGateway", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:DescribeInternetGateways", + "ec2:AllocateAddress", + "ec2:ReleaseAddress", + "ec2:DescribeAddresses", + "ec2:CreateNatGateway", + "ec2:DeleteNatGateway", + "ec2:DescribeNatGateways", + "ec2:CreateRouteTable", + "ec2:DeleteRouteTable", + "ec2:DescribeRouteTables", + "ec2:AssociateRouteTable", + "ec2:DisassociateRouteTable", + "ec2:CreateRoute", + "ec2:DeleteRoute", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:DescribeSecurityGroups", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:RevokeSecurityGroupEgress", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:CreateVpcEndpoint", + "ec2:DeleteVpcEndpoints", + "ec2:DescribeVpcEndpoints", + "ec2:ModifyVpcEndpoint", + "ec2:CreateFlowLogs", + "ec2:DeleteFlowLogs", + "ec2:DescribeFlowLogs", + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:DescribeTags", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribePrefixLists", + "ec2:DescribeNetworkAcls", + "ec2:DescribeVpcAttribute", + "ec2:ModifySubnetAttribute" + ], + "Resource": "*" + }, + { + "Sid": "Route53ResolverDNSFirewall", + "Effect": "Allow", + "Action": [ + "route53resolver:CreateFirewallRuleGroup", + "route53resolver:DeleteFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroup", + 
"route53resolver:CreateFirewallRule", + "route53resolver:DeleteFirewallRule", + "route53resolver:ListFirewallRules", + "route53resolver:UpdateFirewallRule", + "route53resolver:CreateFirewallDomainList", + "route53resolver:DeleteFirewallDomainList", + "route53resolver:GetFirewallDomainList", + "route53resolver:UpdateFirewallDomains", + "route53resolver:AssociateFirewallRuleGroup", + "route53resolver:DisassociateFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroupAssociation", + "route53resolver:ListFirewallRuleGroupAssociations", + "route53resolver:UpdateFirewallConfig", + "route53resolver:GetFirewallConfig", + "route53resolver:TagResource", + "route53resolver:UntagResource", + "route53resolver:ListTagsForResource", + "route53resolver:CreateResolverQueryLogConfig", + "route53resolver:DeleteResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfig", + "route53resolver:AssociateResolverQueryLogConfig", + "route53resolver:DisassociateResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfigAssociation", + "route53resolver:ListResolverQueryLogConfigAssociations", + "route53resolver:ListResolverQueryLogConfigs" + ], + "Resource": "*" + } + ] +} +``` + +### IaCRole-ABCA-Application + +DynamoDB tables, Lambda functions, API Gateway, Cognito, WAFv2, EventBridge, and Secrets Manager. When ECS Fargate compute is enabled, add the ECS statement below to this policy. + +```json +{ + "Version": "2012-10-17", + "Statement": [ { "Sid": "DynamoDB", "Effect": "Allow", @@ -141,7 +258,10 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "dynamodb:UntagResource", "dynamodb:ListTagsOfResource", "dynamodb:PutItem", - "dynamodb:UpdateItem" + "dynamodb:UpdateItem", + "dynamodb:DescribeContributorInsights", + "dynamodb:DescribeKinesisStreamingDestination", + "dynamodb:GetResourcePolicy" ], "Resource": "arn:aws:dynamodb:*:*:table/backgroundagent-dev-*" }, @@ -170,9 +290,18 @@ For deploying the `backgroundagent-dev` stack. 
This single stack contains all pl "lambda:DeleteFunctionEventInvokeConfig", "lambda:GetFunctionEventInvokeConfig", "lambda:PutFunctionConcurrency", - "lambda:DeleteFunctionConcurrency" + "lambda:DeleteFunctionConcurrency", + "lambda:GetFunctionCodeSigningConfig", + "lambda:GetFunctionRecursionConfig", + "lambda:GetProvisionedConcurrencyConfig", + "lambda:GetRuntimeManagementConfig", + "lambda:ListVersionsByFunction", + "lambda:InvokeFunction" ], - "Resource": "arn:aws:lambda:*:*:function:backgroundagent-dev-*" + "Resource": [ + "arn:aws:lambda:*:*:function:backgroundagent-dev-*", + "arn:aws:lambda:*:*:function:backgroundagent-dev-AWS*" + ] }, { "Sid": "APIGateway", @@ -209,7 +338,8 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "cognito-idp:UpdateUserPoolClient", "cognito-idp:TagResource", "cognito-idp:UntagResource", - "cognito-idp:ListTagsForResource" + "cognito-idp:ListTagsForResource", + "cognito-idp:GetUserPoolMfaConfig" ], "Resource": "arn:aws:cognito-idp:*:*:userpool/*" }, @@ -225,7 +355,8 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "wafv2:DisassociateWebACL", "wafv2:ListTagsForResource", "wafv2:TagResource", - "wafv2:UntagResource" + "wafv2:UntagResource", + "wafv2:GetWebACLForResource" ], "Resource": [ "arn:aws:wafv2:*:*:regional/webacl/*", @@ -233,89 +364,20 @@ For deploying the `backgroundagent-dev` stack. 
This single stack contains all pl ] }, { - "Sid": "VPCNetworking", - "Effect": "Allow", - "Action": [ - "ec2:CreateVpc", - "ec2:DeleteVpc", - "ec2:DescribeVpcs", - "ec2:ModifyVpcAttribute", - "ec2:CreateSubnet", - "ec2:DeleteSubnet", - "ec2:DescribeSubnets", - "ec2:CreateInternetGateway", - "ec2:DeleteInternetGateway", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:DescribeInternetGateways", - "ec2:AllocateAddress", - "ec2:ReleaseAddress", - "ec2:DescribeAddresses", - "ec2:CreateNatGateway", - "ec2:DeleteNatGateway", - "ec2:DescribeNatGateways", - "ec2:CreateRouteTable", - "ec2:DeleteRouteTable", - "ec2:DescribeRouteTables", - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable", - "ec2:CreateRoute", - "ec2:DeleteRoute", - "ec2:CreateSecurityGroup", - "ec2:DeleteSecurityGroup", - "ec2:DescribeSecurityGroups", - "ec2:AuthorizeSecurityGroupEgress", - "ec2:RevokeSecurityGroupEgress", - "ec2:AuthorizeSecurityGroupIngress", - "ec2:RevokeSecurityGroupIngress", - "ec2:CreateVpcEndpoint", - "ec2:DeleteVpcEndpoints", - "ec2:DescribeVpcEndpoints", - "ec2:ModifyVpcEndpoint", - "ec2:CreateFlowLogs", - "ec2:DeleteFlowLogs", - "ec2:DescribeFlowLogs", - "ec2:CreateTags", - "ec2:DeleteTags", - "ec2:DescribeTags", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribePrefixLists" - ], - "Resource": "*" - }, - { - "Sid": "Route53ResolverDNSFirewall", + "Sid": "EventBridge", "Effect": "Allow", "Action": [ - "route53resolver:CreateFirewallRuleGroup", - "route53resolver:DeleteFirewallRuleGroup", - "route53resolver:GetFirewallRuleGroup", - "route53resolver:CreateFirewallRule", - "route53resolver:DeleteFirewallRule", - "route53resolver:ListFirewallRules", - "route53resolver:UpdateFirewallRule", - "route53resolver:CreateFirewallDomainList", - "route53resolver:DeleteFirewallDomainList", - "route53resolver:GetFirewallDomainList", - "route53resolver:UpdateFirewallDomains", - "route53resolver:AssociateFirewallRuleGroup", - 
"route53resolver:DisassociateFirewallRuleGroup", - "route53resolver:GetFirewallRuleGroupAssociation", - "route53resolver:ListFirewallRuleGroupAssociations", - "route53resolver:UpdateFirewallConfig", - "route53resolver:GetFirewallConfig", - "route53resolver:TagResource", - "route53resolver:UntagResource", - "route53resolver:ListTagsForResource", - "route53resolver:CreateResolverQueryLogConfig", - "route53resolver:DeleteResolverQueryLogConfig", - "route53resolver:GetResolverQueryLogConfig", - "route53resolver:AssociateResolverQueryLogConfig", - "route53resolver:DisassociateResolverQueryLogConfig", - "route53resolver:GetResolverQueryLogConfigAssociation" + "events:PutRule", + "events:DeleteRule", + "events:DescribeRule", + "events:PutTargets", + "events:RemoveTargets", + "events:ListTargetsByRule", + "events:TagResource", + "events:UntagResource", + "events:ListTagsForResource" ], - "Resource": "*" + "Resource": "arn:aws:events:*:*:rule/backgroundagent-dev-*" }, { "Sid": "SecretsManager", @@ -331,25 +393,31 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "secretsmanager:UntagResource", "secretsmanager:GetResourcePolicy", "secretsmanager:PutResourcePolicy", - "secretsmanager:DeleteResourcePolicy" + "secretsmanager:DeleteResourcePolicy", + "secretsmanager:GetRandomPassword" ], - "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" - }, + "Resource": [ + "arn:aws:secretsmanager:*:*:secret:backgroundagent-*", + "*" + ] + } + ] +} +``` + +### IaCRole-ABCA-Observability + +Bedrock AgentCore, Bedrock Guardrails, CloudWatch Logs/Dashboards/Alarms, X-Ray, S3 (CDK assets), KMS, ECR, SSM, and STS. 
+ +```json +{ + "Version": "2012-10-17", + "Statement": [ { "Sid": "BedrockAgentCore", "Effect": "Allow", "Action": [ - "bedrock-agentcore:CreateRuntime", - "bedrock-agentcore:DeleteRuntime", - "bedrock-agentcore:GetRuntime", - "bedrock-agentcore:UpdateRuntime", - "bedrock-agentcore:CreateMemory", - "bedrock-agentcore:DeleteMemory", - "bedrock-agentcore:GetMemory", - "bedrock-agentcore:UpdateMemory", - "bedrock-agentcore:TagResource", - "bedrock-agentcore:UntagResource", - "bedrock-agentcore:ListTagsForResource" + "bedrock-agentcore:*" ], "Resource": "*" }, @@ -397,25 +465,28 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "cloudwatch:DeleteAlarms", "cloudwatch:DescribeAlarms", "cloudwatch:TagResource", - "cloudwatch:UntagResource" + "cloudwatch:UntagResource", + "logs:CreateDelivery", + "logs:DescribeDeliveries", + "logs:GetDelivery", + "logs:GetDeliveryDestination", + "logs:GetDeliveryDestinationPolicy", + "logs:GetDeliverySource", + "logs:PutDeliveryDestination", + "logs:PutDeliverySource", + "logs:DescribeIndexPolicies", + "cloudwatch:ListTagsForResource", + "logs:CreateLogDelivery", + "logs:DeleteLogDelivery", + "logs:GetLogDelivery", + "logs:UpdateLogDelivery", + "logs:ListLogDeliveries", + "logs:DeleteDelivery", + "logs:DeleteDeliverySource", + "logs:DeleteDeliveryDestination" ], "Resource": "*" }, - { - "Sid": "EventBridge", - "Effect": "Allow", - "Action": [ - "events:PutRule", - "events:DeleteRule", - "events:DescribeRule", - "events:PutTargets", - "events:RemoveTargets", - "events:ListTargetsByRule", - "events:TagResource", - "events:UntagResource" - ], - "Resource": "arn:aws:events:*:*:rule/backgroundagent-dev-*" - }, { "Sid": "S3CDKAssets", "Effect": "Allow", @@ -430,6 +501,18 @@ For deploying the `backgroundagent-dev` stack. 
This single stack contains all pl "arn:aws:s3:::cdk-hnb659fds-assets-*/*" ] }, + { + "Sid": "KMSForCDKAssets", + "Effect": "Allow", + "Action": [ + "kms:CreateGrant", + "kms:Decrypt", + "kms:DescribeKey", + "kms:Encrypt", + "kms:GenerateDataKey" + ], + "Resource": "*" + }, { "Sid": "ECRForDockerAssets", "Effect": "Allow", @@ -466,7 +549,9 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "Effect": "Allow", "Action": [ "xray:UpdateTraceSegmentDestination", - "xray:GetTraceSegmentDestination" + "xray:GetTraceSegmentDestination", + "xray:ListResourcePolicies", + "xray:PutResourcePolicy" ], "Resource": "*" }, @@ -498,7 +583,7 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl ### When ECS compute is enabled -If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the policy: +If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the `IaCRole-ABCA-Application` policy: ```json { @@ -560,11 +645,12 @@ Several services require `Resource: "*"` because they do not support resource-le | EC2 (VPC) | `Create*`, `Describe*`, `Allocate*` | VPC resource ARNs unknown at policy creation time | | Route 53 Resolver | All DNS Firewall actions | No resource-level ARN support for firewall rule groups | | Bedrock | Guardrail + logging config actions | Account-level APIs (`PutModelInvocationLoggingConfiguration`) | -| Bedrock AgentCore | Runtime + Memory CRUD | Resource ARN patterns may not be fully supported in IAM yet | +| Bedrock AgentCore | All actions (`bedrock-agentcore:*`) | CloudFormation resource handler uses internal action names that differ from the public API; wildcard required for reliable deployment | | CloudWatch Logs | `CreateLogGroup`, `PutResourcePolicy` | Log group ARNs unknown at policy creation; resource policies are account-scoped | | ECS | Cluster + 
task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | | ECR | `GetAuthorizationToken` | Account-level operation | -| X-Ray | `UpdateTraceSegmentDestination` | Account-level operation | +| KMS | `CreateGrant`, `Decrypt`, `Encrypt`, `GenerateDataKey` | CDK asset encryption keys; key ARNs unknown at policy time | +| X-Ray | `UpdateTraceSegmentDestination`, `PutResourcePolicy` | Account-level operations | These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index 775ed70..0b19241 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -6,7 +6,7 @@ title: Deployment roles This document defines least-privilege IAM policies for the CloudFormation execution role used during `cdk deploy`. The default CDK bootstrap grants `AdministratorAccess` to this role; the policies below scope it to only what ABCA needs. -> **Origin**: These IAM policies were generated from a thorough review of the repository's CDK constructs, stacks, and handler code. Each permission was derived by analyzing what CloudFormation needs to create, update, and delete every resource defined in the CDK stack. They have not yet been validated against a live deployment and should be treated as a starting point for iterative tightening. +> **Origin**: These IAM policies were derived from a thorough review of the repository's CDK constructs, stacks, and handler code, then **validated against a live deployment** in `us-east-1` (create, update, task execution, and destroy). CloudTrail analysis identified 36 additional actions beyond the initial code review, and 7 deployment iterations refined the policies to their current form. The policies are split into three managed policies to stay under the IAM 6,144-character limit. 
## How CDK deployment roles work @@ -19,17 +19,26 @@ CDK uses a **four-role model** created during `cdk bootstrap`: The policy below is a **CloudFormation Execution Role** replacement. The other three roles are scoped by the bootstrap template and do not need modification for least-privilege deployment. -## Using this role +## Using these policies + +The policies are split into three IAM managed policies (each under the 6,144-character limit): + +| Policy Name | Scope | +|-------------|-------| +| `IaCRole-ABCA-Infrastructure` | CloudFormation, IAM, VPC networking, Route 53 Resolver DNS Firewall | +| `IaCRole-ABCA-Application` | DynamoDB, Lambda, API Gateway, Cognito, WAFv2, EventBridge, Secrets Manager | +| `IaCRole-ABCA-Observability` | Bedrock AgentCore, Bedrock Guardrails, CloudWatch, X-Ray, S3, ECR, KMS, SSM, STS | ```bash -# Option 1: Re-bootstrap with custom execution policy -# First, create the IAM policy in your account, then: +# Create all three policies in your account, then re-bootstrap: cdk bootstrap aws://ACCOUNT/REGION \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Policy" - -# Option 2: For CI/CD pipelines, configure the execution role in the pipeline definition + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Infrastructure" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Application" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Observability" ``` +The `--cloudformation-execution-policies` flag can be repeated to attach multiple policies to the CloudFormation execution role. + ## Trust policy ```json @@ -63,6 +72,12 @@ cdk bootstrap aws://ACCOUNT/REGION \ For deploying the `backgroundagent-dev` stack. This single stack contains all platform resources including the AgentCore runtime, ECS compute (when enabled), API Gateway, Cognito, DynamoDB tables, VPC, DNS Firewall, and observability infrastructure. 
+> **IAM managed policy size limit**: A single managed policy cannot exceed 6,144 characters. The permissions below are split into three policies to stay under this limit. Use all three when re-bootstrapping (see [Using these policies](#using-these-policies)). + +### IaCRole-ABCA-Infrastructure + +CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 53 Resolver DNS Firewall. + ```json { "Version": "2012-10-17", @@ -129,6 +144,108 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "arn:aws:iam::*:role/aws-service-role/*" ] }, + { + "Sid": "VPCNetworking", + "Effect": "Allow", + "Action": [ + "ec2:CreateVpc", + "ec2:DeleteVpc", + "ec2:DescribeVpcs", + "ec2:ModifyVpcAttribute", + "ec2:CreateSubnet", + "ec2:DeleteSubnet", + "ec2:DescribeSubnets", + "ec2:CreateInternetGateway", + "ec2:DeleteInternetGateway", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:DescribeInternetGateways", + "ec2:AllocateAddress", + "ec2:ReleaseAddress", + "ec2:DescribeAddresses", + "ec2:CreateNatGateway", + "ec2:DeleteNatGateway", + "ec2:DescribeNatGateways", + "ec2:CreateRouteTable", + "ec2:DeleteRouteTable", + "ec2:DescribeRouteTables", + "ec2:AssociateRouteTable", + "ec2:DisassociateRouteTable", + "ec2:CreateRoute", + "ec2:DeleteRoute", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:DescribeSecurityGroups", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:RevokeSecurityGroupEgress", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:CreateVpcEndpoint", + "ec2:DeleteVpcEndpoints", + "ec2:DescribeVpcEndpoints", + "ec2:ModifyVpcEndpoint", + "ec2:CreateFlowLogs", + "ec2:DeleteFlowLogs", + "ec2:DescribeFlowLogs", + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:DescribeTags", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribePrefixLists", + "ec2:DescribeNetworkAcls", + "ec2:DescribeVpcAttribute", + "ec2:ModifySubnetAttribute" + ], + 
"Resource": "*" + }, + { + "Sid": "Route53ResolverDNSFirewall", + "Effect": "Allow", + "Action": [ + "route53resolver:CreateFirewallRuleGroup", + "route53resolver:DeleteFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroup", + "route53resolver:CreateFirewallRule", + "route53resolver:DeleteFirewallRule", + "route53resolver:ListFirewallRules", + "route53resolver:UpdateFirewallRule", + "route53resolver:CreateFirewallDomainList", + "route53resolver:DeleteFirewallDomainList", + "route53resolver:GetFirewallDomainList", + "route53resolver:UpdateFirewallDomains", + "route53resolver:AssociateFirewallRuleGroup", + "route53resolver:DisassociateFirewallRuleGroup", + "route53resolver:GetFirewallRuleGroupAssociation", + "route53resolver:ListFirewallRuleGroupAssociations", + "route53resolver:UpdateFirewallConfig", + "route53resolver:GetFirewallConfig", + "route53resolver:TagResource", + "route53resolver:UntagResource", + "route53resolver:ListTagsForResource", + "route53resolver:CreateResolverQueryLogConfig", + "route53resolver:DeleteResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfig", + "route53resolver:AssociateResolverQueryLogConfig", + "route53resolver:DisassociateResolverQueryLogConfig", + "route53resolver:GetResolverQueryLogConfigAssociation", + "route53resolver:ListResolverQueryLogConfigAssociations", + "route53resolver:ListResolverQueryLogConfigs" + ], + "Resource": "*" + } + ] +} +``` + +### IaCRole-ABCA-Application + +DynamoDB tables, Lambda functions, API Gateway, Cognito, WAFv2, EventBridge, and Secrets Manager. When ECS Fargate compute is enabled, add the ECS statement below to this policy. + +```json +{ + "Version": "2012-10-17", + "Statement": [ { "Sid": "DynamoDB", "Effect": "Allow", @@ -145,7 +262,10 @@ For deploying the `backgroundagent-dev` stack. 
This single stack contains all pl "dynamodb:UntagResource", "dynamodb:ListTagsOfResource", "dynamodb:PutItem", - "dynamodb:UpdateItem" + "dynamodb:UpdateItem", + "dynamodb:DescribeContributorInsights", + "dynamodb:DescribeKinesisStreamingDestination", + "dynamodb:GetResourcePolicy" ], "Resource": "arn:aws:dynamodb:*:*:table/backgroundagent-dev-*" }, @@ -174,9 +294,18 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "lambda:DeleteFunctionEventInvokeConfig", "lambda:GetFunctionEventInvokeConfig", "lambda:PutFunctionConcurrency", - "lambda:DeleteFunctionConcurrency" + "lambda:DeleteFunctionConcurrency", + "lambda:GetFunctionCodeSigningConfig", + "lambda:GetFunctionRecursionConfig", + "lambda:GetProvisionedConcurrencyConfig", + "lambda:GetRuntimeManagementConfig", + "lambda:ListVersionsByFunction", + "lambda:InvokeFunction" ], - "Resource": "arn:aws:lambda:*:*:function:backgroundagent-dev-*" + "Resource": [ + "arn:aws:lambda:*:*:function:backgroundagent-dev-*", + "arn:aws:lambda:*:*:function:backgroundagent-dev-AWS*" + ] }, { "Sid": "APIGateway", @@ -213,7 +342,8 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "cognito-idp:UpdateUserPoolClient", "cognito-idp:TagResource", "cognito-idp:UntagResource", - "cognito-idp:ListTagsForResource" + "cognito-idp:ListTagsForResource", + "cognito-idp:GetUserPoolMfaConfig" ], "Resource": "arn:aws:cognito-idp:*:*:userpool/*" }, @@ -229,7 +359,8 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "wafv2:DisassociateWebACL", "wafv2:ListTagsForResource", "wafv2:TagResource", - "wafv2:UntagResource" + "wafv2:UntagResource", + "wafv2:GetWebACLForResource" ], "Resource": [ "arn:aws:wafv2:*:*:regional/webacl/*", @@ -237,89 +368,20 @@ For deploying the `backgroundagent-dev` stack. 
This single stack contains all pl ] }, { - "Sid": "VPCNetworking", - "Effect": "Allow", - "Action": [ - "ec2:CreateVpc", - "ec2:DeleteVpc", - "ec2:DescribeVpcs", - "ec2:ModifyVpcAttribute", - "ec2:CreateSubnet", - "ec2:DeleteSubnet", - "ec2:DescribeSubnets", - "ec2:CreateInternetGateway", - "ec2:DeleteInternetGateway", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:DescribeInternetGateways", - "ec2:AllocateAddress", - "ec2:ReleaseAddress", - "ec2:DescribeAddresses", - "ec2:CreateNatGateway", - "ec2:DeleteNatGateway", - "ec2:DescribeNatGateways", - "ec2:CreateRouteTable", - "ec2:DeleteRouteTable", - "ec2:DescribeRouteTables", - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable", - "ec2:CreateRoute", - "ec2:DeleteRoute", - "ec2:CreateSecurityGroup", - "ec2:DeleteSecurityGroup", - "ec2:DescribeSecurityGroups", - "ec2:AuthorizeSecurityGroupEgress", - "ec2:RevokeSecurityGroupEgress", - "ec2:AuthorizeSecurityGroupIngress", - "ec2:RevokeSecurityGroupIngress", - "ec2:CreateVpcEndpoint", - "ec2:DeleteVpcEndpoints", - "ec2:DescribeVpcEndpoints", - "ec2:ModifyVpcEndpoint", - "ec2:CreateFlowLogs", - "ec2:DeleteFlowLogs", - "ec2:DescribeFlowLogs", - "ec2:CreateTags", - "ec2:DeleteTags", - "ec2:DescribeTags", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribePrefixLists" - ], - "Resource": "*" - }, - { - "Sid": "Route53ResolverDNSFirewall", + "Sid": "EventBridge", "Effect": "Allow", "Action": [ - "route53resolver:CreateFirewallRuleGroup", - "route53resolver:DeleteFirewallRuleGroup", - "route53resolver:GetFirewallRuleGroup", - "route53resolver:CreateFirewallRule", - "route53resolver:DeleteFirewallRule", - "route53resolver:ListFirewallRules", - "route53resolver:UpdateFirewallRule", - "route53resolver:CreateFirewallDomainList", - "route53resolver:DeleteFirewallDomainList", - "route53resolver:GetFirewallDomainList", - "route53resolver:UpdateFirewallDomains", - "route53resolver:AssociateFirewallRuleGroup", - 
"route53resolver:DisassociateFirewallRuleGroup", - "route53resolver:GetFirewallRuleGroupAssociation", - "route53resolver:ListFirewallRuleGroupAssociations", - "route53resolver:UpdateFirewallConfig", - "route53resolver:GetFirewallConfig", - "route53resolver:TagResource", - "route53resolver:UntagResource", - "route53resolver:ListTagsForResource", - "route53resolver:CreateResolverQueryLogConfig", - "route53resolver:DeleteResolverQueryLogConfig", - "route53resolver:GetResolverQueryLogConfig", - "route53resolver:AssociateResolverQueryLogConfig", - "route53resolver:DisassociateResolverQueryLogConfig", - "route53resolver:GetResolverQueryLogConfigAssociation" + "events:PutRule", + "events:DeleteRule", + "events:DescribeRule", + "events:PutTargets", + "events:RemoveTargets", + "events:ListTargetsByRule", + "events:TagResource", + "events:UntagResource", + "events:ListTagsForResource" ], - "Resource": "*" + "Resource": "arn:aws:events:*:*:rule/backgroundagent-dev-*" }, { "Sid": "SecretsManager", @@ -335,25 +397,31 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "secretsmanager:UntagResource", "secretsmanager:GetResourcePolicy", "secretsmanager:PutResourcePolicy", - "secretsmanager:DeleteResourcePolicy" + "secretsmanager:DeleteResourcePolicy", + "secretsmanager:GetRandomPassword" ], - "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" - }, + "Resource": [ + "arn:aws:secretsmanager:*:*:secret:backgroundagent-*", + "*" + ] + } + ] +} +``` + +### IaCRole-ABCA-Observability + +Bedrock AgentCore, Bedrock Guardrails, CloudWatch Logs/Dashboards/Alarms, X-Ray, S3 (CDK assets), KMS, ECR, SSM, and STS. 
+ +```json +{ + "Version": "2012-10-17", + "Statement": [ { "Sid": "BedrockAgentCore", "Effect": "Allow", "Action": [ - "bedrock-agentcore:CreateRuntime", - "bedrock-agentcore:DeleteRuntime", - "bedrock-agentcore:GetRuntime", - "bedrock-agentcore:UpdateRuntime", - "bedrock-agentcore:CreateMemory", - "bedrock-agentcore:DeleteMemory", - "bedrock-agentcore:GetMemory", - "bedrock-agentcore:UpdateMemory", - "bedrock-agentcore:TagResource", - "bedrock-agentcore:UntagResource", - "bedrock-agentcore:ListTagsForResource" + "bedrock-agentcore:*" ], "Resource": "*" }, @@ -401,25 +469,28 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "cloudwatch:DeleteAlarms", "cloudwatch:DescribeAlarms", "cloudwatch:TagResource", - "cloudwatch:UntagResource" + "cloudwatch:UntagResource", + "logs:CreateDelivery", + "logs:DescribeDeliveries", + "logs:GetDelivery", + "logs:GetDeliveryDestination", + "logs:GetDeliveryDestinationPolicy", + "logs:GetDeliverySource", + "logs:PutDeliveryDestination", + "logs:PutDeliverySource", + "logs:DescribeIndexPolicies", + "cloudwatch:ListTagsForResource", + "logs:CreateLogDelivery", + "logs:DeleteLogDelivery", + "logs:GetLogDelivery", + "logs:UpdateLogDelivery", + "logs:ListLogDeliveries", + "logs:DeleteDelivery", + "logs:DeleteDeliverySource", + "logs:DeleteDeliveryDestination" ], "Resource": "*" }, - { - "Sid": "EventBridge", - "Effect": "Allow", - "Action": [ - "events:PutRule", - "events:DeleteRule", - "events:DescribeRule", - "events:PutTargets", - "events:RemoveTargets", - "events:ListTargetsByRule", - "events:TagResource", - "events:UntagResource" - ], - "Resource": "arn:aws:events:*:*:rule/backgroundagent-dev-*" - }, { "Sid": "S3CDKAssets", "Effect": "Allow", @@ -434,6 +505,18 @@ For deploying the `backgroundagent-dev` stack. 
This single stack contains all pl "arn:aws:s3:::cdk-hnb659fds-assets-*/*" ] }, + { + "Sid": "KMSForCDKAssets", + "Effect": "Allow", + "Action": [ + "kms:CreateGrant", + "kms:Decrypt", + "kms:DescribeKey", + "kms:Encrypt", + "kms:GenerateDataKey" + ], + "Resource": "*" + }, { "Sid": "ECRForDockerAssets", "Effect": "Allow", @@ -470,7 +553,9 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl "Effect": "Allow", "Action": [ "xray:UpdateTraceSegmentDestination", - "xray:GetTraceSegmentDestination" + "xray:GetTraceSegmentDestination", + "xray:ListResourcePolicies", + "xray:PutResourcePolicy" ], "Resource": "*" }, @@ -502,7 +587,7 @@ For deploying the `backgroundagent-dev` stack. This single stack contains all pl ### When ECS compute is enabled -If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the policy: +If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the `IaCRole-ABCA-Application` policy: ```json { @@ -564,11 +649,12 @@ Several services require `Resource: "*"` because they do not support resource-le | EC2 (VPC) | `Create*`, `Describe*`, `Allocate*` | VPC resource ARNs unknown at policy creation time | | Route 53 Resolver | All DNS Firewall actions | No resource-level ARN support for firewall rule groups | | Bedrock | Guardrail + logging config actions | Account-level APIs (`PutModelInvocationLoggingConfiguration`) | -| Bedrock AgentCore | Runtime + Memory CRUD | Resource ARN patterns may not be fully supported in IAM yet | +| Bedrock AgentCore | All actions (`bedrock-agentcore:*`) | CloudFormation resource handler uses internal action names that differ from the public API; wildcard required for reliable deployment | | CloudWatch Logs | `CreateLogGroup`, `PutResourcePolicy` | Log group ARNs unknown at policy creation; resource policies are account-scoped | | ECS | Cluster + 
task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | | ECR | `GetAuthorizationToken` | Account-level operation | -| X-Ray | `UpdateTraceSegmentDestination` | Account-level operation | +| KMS | `CreateGrant`, `Decrypt`, `Encrypt`, `GenerateDataKey` | CDK asset encryption keys; key ARNs unknown at policy time | +| X-Ray | `UpdateTraceSegmentDestination`, `PutResourcePolicy` | Account-level operations | These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. From 060da61eb405a7bc303945dbacb34f9cf9912191 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:22:02 +0000 Subject: [PATCH 14/23] fix(docs): note ECS policy fits under IAM size limit Clarify in the ECS section that adding the ECS statement to IaCRole-ABCA-Application keeps the combined policy under the 6,144-character IAM managed policy limit (4,212 of 6,144 chars). Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 2 +- docs/src/content/docs/architecture/Deployment-roles.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index d793ccc..e309eaf 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -583,7 +583,7 @@ Bedrock AgentCore, Bedrock Guardrails, CloudWatch Logs/Dashboards/Alarms, X-Ray, ### When ECS compute is enabled -If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the `IaCRole-ABCA-Application` policy: +If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the `IaCRole-ABCA-Application` policy (the combined policy remains under the 6,144-character IAM limit): ```json { diff --git a/docs/src/content/docs/architecture/Deployment-roles.md 
b/docs/src/content/docs/architecture/Deployment-roles.md index 0b19241..b2665e8 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -587,7 +587,7 @@ Bedrock AgentCore, Bedrock Guardrails, CloudWatch Logs/Dashboards/Alarms, X-Ray, ### When ECS compute is enabled -If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the `IaCRole-ABCA-Application` policy: +If you uncomment the ECS blocks in `cdk/src/stacks/agent.ts` to enable the Fargate compute backend, add the following statement to the `IaCRole-ABCA-Application` policy (the combined policy remains under the 6,144-character IAM limit): ```json { From 14a197badc6ccfed92c7601183923a21f883949a Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 20:45:00 +0000 Subject: [PATCH 15/23] fix(docs): document SecretsManager GetRandomPassword Resource:"*" in constraints table GetRandomPassword is an account-level API with no secret ARN, so it requires Resource:"*". Document this in the Resource-level permission constraints table alongside other services that require "*". 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 1 + docs/src/content/docs/architecture/Deployment-roles.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index e309eaf..b343b92 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -650,6 +650,7 @@ Several services require `Resource: "*"` because they do not support resource-le | ECS | Cluster + task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | | ECR | `GetAuthorizationToken` | Account-level operation | | KMS | `CreateGrant`, `Decrypt`, `Encrypt`, `GenerateDataKey` | CDK asset encryption keys; key ARNs unknown at policy time | +| Secrets Manager | `GetRandomPassword` | Account-level API (no secret ARN) — combined in a single statement to stay under the IAM 6,144-character policy limit; see [Iterative tightening](#iterative-tightening) | | X-Ray | `UpdateTraceSegmentDestination`, `PutResourcePolicy` | Account-level operations | These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. 
diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index b2665e8..b58d6da 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -654,6 +654,7 @@ Several services require `Resource: "*"` because they do not support resource-le | ECS | Cluster + task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | | ECR | `GetAuthorizationToken` | Account-level operation | | KMS | `CreateGrant`, `Decrypt`, `Encrypt`, `GenerateDataKey` | CDK asset encryption keys; key ARNs unknown at policy time | +| Secrets Manager | `GetRandomPassword` | Account-level API (no secret ARN) — combined in a single statement to stay under the IAM 6,144-character policy limit; see [Iterative tightening](#iterative-tightening) | | X-Ray | `UpdateTraceSegmentDestination`, `PutResourcePolicy` | Account-level operations | These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. From 80ff90ce902d2c575f481bd4f080349a704604cc Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 20:45:24 +0000 Subject: [PATCH 16/23] fix(plugin): update /deploy skill to reference 3-way policy split The skill referenced a non-existent IaCRole-ABCA-Policy. Update to the three actual policy names (Infrastructure, Application, Observability) matching DEPLOYMENT_ROLES.md. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/abca-plugin/skills/deploy/SKILL.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/abca-plugin/skills/deploy/SKILL.md b/docs/abca-plugin/skills/deploy/SKILL.md index e2e0463..ca7e92f 100644 --- a/docs/abca-plugin/skills/deploy/SKILL.md +++ b/docs/abca-plugin/skills/deploy/SKILL.md @@ -88,7 +88,9 @@ By default, CDK bootstrap grants `AdministratorAccess` to the CloudFormation exe ```bash cdk bootstrap aws://ACCOUNT/REGION \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Policy" + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Infrastructure" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Application" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Observability" ``` -See `docs/design/DEPLOYMENT_ROLES.md` in the repo root for the complete least-privilege IAM policy, trust policy, runtime role inventory, and iterative tightening recommendations. +See `docs/design/DEPLOYMENT_ROLES.md` in the repo root for the complete least-privilege IAM policies, trust policy, runtime role inventory, and iterative tightening recommendations. From 00113755a0a695650adbf0fb6d81d7c4d02bef1d Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 20:46:21 +0000 Subject: [PATCH 17/23] fix(docs): add DEPLOYMENT_GUIDE.md Starlight mirror and sidebar entry Add explicit route mapping, mirrorMarkdownFile call, and sidebar entry so the Deployment Guide renders on the docs site and cross-doc links from COST_MODEL.md resolve correctly. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/astro.config.mjs | 5 +- docs/scripts/sync-starlight.mjs | 7 + .../content/docs/architecture/Cost-model.md | 4 +- .../docs/getting-started/Deployment-guide.md | 127 ++++++++++++++++++ 4 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 docs/src/content/docs/getting-started/Deployment-guide.md diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs index d5c8885..9c50391 100644 --- a/docs/astro.config.mjs +++ b/docs/astro.config.mjs @@ -40,7 +40,10 @@ export default defineConfig({ { label: 'Introduction', slug: 'index' }, { label: 'Getting Started', - items: [{ label: 'Quick Start', slug: 'getting-started/quick-start' }], + items: [ + { label: 'Quick Start', slug: 'getting-started/quick-start' }, + { label: 'Deployment Guide', slug: 'getting-started/deployment-guide' }, + ], }, { label: 'Using the Platform', diff --git a/docs/scripts/sync-starlight.mjs b/docs/scripts/sync-starlight.mjs index f9a5519..c7b81f6 100644 --- a/docs/scripts/sync-starlight.mjs +++ b/docs/scripts/sync-starlight.mjs @@ -43,6 +43,7 @@ function rewriteDocsLinkTarget(target) { DEVELOPER_GUIDE: '/developer-guide/introduction', USER_GUIDE: '/using/overview', CONTRIBUTING: '/developer-guide/contributing', + DEPLOYMENT_GUIDE: '/getting-started/deployment-guide', }; /** `splitGuide` emits each `##` from DEVELOPER_GUIDE as its own page — map #anchors to those routes. 
*/ @@ -210,6 +211,12 @@ mirrorMarkdownFile( path.join('src', 'content', 'docs', 'getting-started', 'Quick-start.md'), ); +// --- Deployment Guide: mirror to getting-started/ --- +mirrorMarkdownFile( + path.join(docsRoot, 'guides', 'DEPLOYMENT_GUIDE.md'), + path.join('src', 'content', 'docs', 'getting-started', 'Deployment-guide.md'), +); + // --- Prompt Guide: mirror to customizing/ --- mirrorMarkdownFile( path.join(docsRoot, 'guides', 'PROMPT_GUIDE.md'), diff --git a/docs/src/content/docs/architecture/Cost-model.md b/docs/src/content/docs/architecture/Cost-model.md index bb83b5e..bf08f74 100644 --- a/docs/src/content/docs/architecture/Cost-model.md +++ b/docs/src/content/docs/architecture/Cost-model.md @@ -24,7 +24,7 @@ These costs are incurred regardless of task volume: ### Scale-to-zero characteristics -Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$85–95/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) for the full scale-to-zero breakdown. +Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$85–95/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](/getting-started/deployment-guide) for the full scale-to-zero breakdown. 
## Per-task variable costs @@ -96,5 +96,5 @@ For multi-user deployments, cost should be attributable to individual users and - [COMPUTE.md](/architecture/compute) -- Compute option billing models and network architecture. - [ORCHESTRATOR.md](/architecture/orchestrator) -- Polling cost analysis. - [OBSERVABILITY.md](/architecture/observability) -- Cost-related metrics (`agent.cost_usd`, token usage). -- [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) -- Deployment choices, scale-to-zero analysis, AWS services inventory. +- [Deployment guide](/getting-started/deployment-guide) -- Deployment choices, scale-to-zero analysis, AWS services inventory. - [DEPLOYMENT_ROLES.md](/architecture/deployment-roles) -- Least-privilege IAM policies for deployment. diff --git a/docs/src/content/docs/getting-started/Deployment-guide.md b/docs/src/content/docs/getting-started/Deployment-guide.md new file mode 100644 index 0000000..940410a --- /dev/null +++ b/docs/src/content/docs/getting-started/Deployment-guide.md @@ -0,0 +1,127 @@ +--- +title: Deployment guide +--- + +# Deployment guide + +This guide covers deploying ABCA into an AWS account, including compute backend choices, scale-to-zero characteristics, and the complete AWS service inventory. For day-to-day development workflow, see the [Developer guide](/developer-guide/introduction). For a quick first deployment, see the [Quick start](/getting-started/quick-start). For least-privilege IAM deployment roles, see [DEPLOYMENT_ROLES.md](/architecture/deployment-roles). + +## Architecture overview + +ABCA deploys as a **single CDK stack** (`backgroundagent-dev`) containing all platform resources. 
The stack uses a `ComputeStrategy` interface to support two compute backends within the same stack: + +| Aspect | AgentCore (default) | ECS Fargate (opt-in) | +|--------|--------------------|--------------------| +| **Compute** | Bedrock AgentCore Runtime (Firecracker MicroVMs) | ECS Fargate containers | +| **Resources** | 2 vCPU, 8 GB RAM, 2 GB max image size | 2 vCPU, 4 GB RAM | +| **Orchestration** | Durable Lambda (checkpoint/replay) | Same durable Lambda via `ComputeStrategy` | +| **Agent mode** | FastAPI server (HTTP invocation) | Batch (run-to-completion) | +| **Startup** | ~10s (warm MicroVM) | ~60-180s (Fargate cold start) | +| **Max duration** | 8 hours (AgentCore session) | Limited by orchestrator timeout (9 hours) | + +Both backends are orchestrated by the same durable Lambda function. The `ComputeStrategy` interface abstracts `startSession()`, `pollSession()`, and `stopSession()` -- the ECS strategy calls `ecs:RunTask` / `ecs:DescribeTasks` / `ecs:StopTask` directly from the Lambda. No Step Functions are used. + +ECS Fargate is currently **opt-in** -- the `EcsAgentCluster` construct is present in the stack code but commented out. To enable it, uncomment the ECS blocks in `cdk/src/stacks/agent.ts`. + +## Scale-to-zero analysis + +### Components that scale to zero (pay-per-use) + +| Component | Billing Model | Idle Cost | +|-----------|--------------|-----------| +| DynamoDB (5 tables) | PAY_PER_REQUEST | $0 | +| Lambda (all functions) | Per invocation | $0 | +| API Gateway REST | Per request | $0 | +| ECS Fargate tasks (when enabled) | Per running task | $0 (cluster is free) | +| AgentCore Runtime | Per session | $0 | +| Bedrock inference | Per token | $0 | +| AgentCore Memory | Proportional to usage | ~$0 | +| Cognito | Free tier (50K MAU) | $0 | + +### Components that do not scale to zero (always-on) + +| Component | Est. 
Monthly Idle Cost | Why | +|-----------|----------------------|-----| +| NAT Gateway (1x) | ~$32 | $0.045/hr fixed charge | +| VPC Interface Endpoints (7x, 2 AZs) | ~$50 | $0.01/hr per endpoint per AZ | +| WAF v2 Web ACL | ~$5 | Base monthly charge | +| CloudWatch Dashboard | ~$3 | Per-dashboard charge | +| Secrets Manager (1+ secrets) | ~$0.40/secret | Per-secret monthly | +| CloudWatch Alarms | ~$0.10/alarm | Per standard alarm | +| CloudWatch Logs retention | ~$1-5 | Storage for retained logs | +| **Total always-on baseline** | **~$85-95/month** | | + +The dominant idle cost is VPC networking: 7 interface endpoints (~$50/month) plus the NAT Gateway (~$32/month). + +For the full cost model including per-task costs, see [COST_MODEL.md](/architecture/cost-model). + +## AWS services inventory + +### Compute + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| Bedrock AgentCore Runtime (MicroVMs) | Agent sessions (default) | Yes | +| ECS Fargate (when enabled) | Agent sessions (opt-in) | Yes | +| Lambda (Node.js 24, ARM64) | Orchestrator, API handlers, reconciler, custom resources | Yes | + +### AI/ML + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| Bedrock (Claude Sonnet 4.6, Opus 4, Haiku 4.5) | Agent reasoning, cross-region inference profiles | Yes | +| Bedrock Guardrails | Prompt injection detection on task input | Yes | +| Bedrock AgentCore Memory | Semantic + episodic extraction strategies | Yes | + +### Networking + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| VPC (public + private subnets, 2 AZs) | All compute | N/A (no direct cost) | +| NAT Gateway (1x) | Private subnet internet egress | **No** (~$32/mo) | +| VPC Interface Endpoints (7x) | AWS service connectivity from private subnets | **No** (~$50/mo) | +| VPC Gateway Endpoints (2x: S3, DynamoDB) | S3 and DynamoDB connectivity | Yes (free) | +| Security Groups | HTTPS-only egress | N/A | +| Route 53 
Resolver DNS Firewall | Domain allowlisting for agent egress | Minimal | + +### Storage / Database + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| DynamoDB (5 tables, PAY_PER_REQUEST) | Task state, events, concurrency, webhooks, repo config | Yes | +| S3 | CDK asset bucket, ECR image layers, FUSE session storage | Minimal | +| Secrets Manager | GitHub PAT, webhook HMAC secrets | **No** (~$0.40/secret/mo) | + +### API / Auth + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| API Gateway (REST) | Task REST API | Yes | +| Cognito User Pool | CLI/API authentication | Yes (free tier) | +| WAF v2 | API Gateway protection (managed rules + rate limiting) | **No** (~$5/mo base) | + +### Observability + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| CloudWatch Logs (multiple log groups) | Application, usage, model invocation, VPC flow, DNS query logs | **No** (storage) | +| CloudWatch Dashboard | Operational metrics visualization | **No** (~$3/mo) | +| CloudWatch Alarms | Orchestrator error alerting | **No** (~$0.10/alarm) | +| X-Ray | AgentCore Runtime tracing | Yes | + +### Infrastructure / Deployment + +| Service | Used By | Scales to Zero | +|---------|---------|---------------| +| CloudFormation | Stack deployment, custom resources | N/A | +| ECR | Container image storage | Minimal | +| IAM | Roles and policies for all components | N/A | + +## Reference + +- [Quick start](/getting-started/quick-start) -- Zero-to-first-PR in 6 steps. +- [Developer guide](/developer-guide/introduction) -- Local development, testing, repository onboarding. +- [User guide](/using/overview) -- API reference, CLI usage, task management. +- [DEPLOYMENT_ROLES.md](/architecture/deployment-roles) -- Least-privilege IAM policies for CloudFormation execution. +- [COST_MODEL.md](/architecture/cost-model) -- Per-task costs, cost guardrails, cost at scale. 
+- [COMPUTE.md](/architecture/compute) -- Compute backend architecture and trade-offs. From d02311f535620a2742161b1943215b3d2f362860 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:09:30 +0000 Subject: [PATCH 18/23] fix(docs): split SecretsManager GetRandomPassword into own statement Isolate the account-level GetRandomPassword action (which requires Resource:*) from the scoped SecretsManager statement. With ECS the Application policy is still only ~4K of the 6,144-char IAM limit, leaving ~2K headroom for future services. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 16 +++++++++------- .../docs/architecture/Deployment-roles.md | 16 +++++++++------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index b343b92..cb0f41b 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -393,13 +393,15 @@ DynamoDB tables, Lambda functions, API Gateway, Cognito, WAFv2, EventBridge, and "secretsmanager:UntagResource", "secretsmanager:GetResourcePolicy", "secretsmanager:PutResourcePolicy", - "secretsmanager:DeleteResourcePolicy", - "secretsmanager:GetRandomPassword" + "secretsmanager:DeleteResourcePolicy" ], - "Resource": [ - "arn:aws:secretsmanager:*:*:secret:backgroundagent-*", - "*" - ] + "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" + }, + { + "Sid": "SecretsManagerAccountLevel", + "Effect": "Allow", + "Action": "secretsmanager:GetRandomPassword", + "Resource": "*" } ] } @@ -650,7 +652,7 @@ Several services require `Resource: "*"` because they do not support resource-le | ECS | Cluster + task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | | ECR | `GetAuthorizationToken` | Account-level operation | | KMS | `CreateGrant`, `Decrypt`, `Encrypt`, `GenerateDataKey` | CDK asset encryption keys; key ARNs 
unknown at policy time | -| Secrets Manager | `GetRandomPassword` | Account-level API (no secret ARN) — combined in a single statement to stay under the IAM 6,144-character policy limit; see [Iterative tightening](#iterative-tightening) | +| Secrets Manager | `GetRandomPassword` | Account-level API (no secret ARN); isolated in its own statement with `Resource: "*"` | | X-Ray | `UpdateTraceSegmentDestination`, `PutResourcePolicy` | Account-level operations | These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index b58d6da..9522deb 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -397,13 +397,15 @@ DynamoDB tables, Lambda functions, API Gateway, Cognito, WAFv2, EventBridge, and "secretsmanager:UntagResource", "secretsmanager:GetResourcePolicy", "secretsmanager:PutResourcePolicy", - "secretsmanager:DeleteResourcePolicy", - "secretsmanager:GetRandomPassword" + "secretsmanager:DeleteResourcePolicy" ], - "Resource": [ - "arn:aws:secretsmanager:*:*:secret:backgroundagent-*", - "*" - ] + "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" + }, + { + "Sid": "SecretsManagerAccountLevel", + "Effect": "Allow", + "Action": "secretsmanager:GetRandomPassword", + "Resource": "*" } ] } @@ -654,7 +656,7 @@ Several services require `Resource: "*"` because they do not support resource-le | ECS | Cluster + task definition actions | `RegisterTaskDefinition` doesn't support resource-level permissions | | ECR | `GetAuthorizationToken` | Account-level operation | | KMS | `CreateGrant`, `Decrypt`, `Encrypt`, `GenerateDataKey` | CDK asset encryption keys; key ARNs unknown at policy time | -| Secrets Manager | `GetRandomPassword` | Account-level API (no secret ARN) — combined in a single statement to stay under the IAM 6,144-character 
policy limit; see [Iterative tightening](#iterative-tightening) | +| Secrets Manager | `GetRandomPassword` | Account-level API (no secret ARN); isolated in its own statement with `Resource: "*"` | | X-Ray | `UpdateTraceSegmentDestination`, `PutResourcePolicy` | Account-level operations | These constraints align with the CDK Nag `AwsSolutions-IAM5` suppressions in the codebase. From ca33f4733f67c5c47012f928c459dfebae763f99 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:30:22 +0000 Subject: [PATCH 19/23] fix(docs): add PassedToService condition to PassRole and tightening notes Separate iam:PassRole into its own statement with iam:PassedToService condition limiting to the 7 services ABCA passes roles to. Add iterative tightening items for AttachRolePolicy (iam:PolicyARN) and CreateServiceLinkedRole (iam:AWSServiceName) conditions. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 26 ++++++++++++++++--- .../docs/architecture/Deployment-roles.md | 26 ++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index cb0f41b..7c30017 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -111,7 +111,6 @@ CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 5 "iam:CreateRole", "iam:DeleteRole", "iam:GetRole", - "iam:PassRole", "iam:UpdateRole", "iam:TagRole", "iam:UntagRole", @@ -140,6 +139,25 @@ CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 5 "arn:aws:iam::*:role/aws-service-role/*" ] }, + { + "Sid": "IAMPassRole", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::*:role/backgroundagent-dev-*", + "Condition": { + "StringEquals": { + "iam:PassedToService": [ + "lambda.amazonaws.com", + "ecs-tasks.amazonaws.com", + "ecs.amazonaws.com", + "apigateway.amazonaws.com", + 
"logs.amazonaws.com", + "bedrock.amazonaws.com", + "events.amazonaws.com" + ] + } + } + }, { "Sid": "VPCNetworking", "Effect": "Allow", @@ -664,8 +682,10 @@ These policies are conservative-but-scoped starting points. To tighten further: 1. **Deploy once with CloudTrail enabled**, then use [IAM Access Analyzer policy generation](https://docs.aws.amazon.com/IAM/latest/UserGuide/access-analyzer-policy-generation.html) to generate a least-privilege policy based on the actual API calls recorded in CloudTrail. 2. **Replace `*` resources** with actual ARNs after the first deploy (e.g., once you know the VPC ID, scope EC2 actions to that VPC). 3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. -4. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. -5. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. +4. **Restrict `iam:AttachRolePolicy`** with an `iam:PolicyARN` condition to limit which policies can be attached to `backgroundagent-dev-*` roles. This requires enumerating the AWS managed policies CDK attaches (e.g., `service-role/AWSLambdaBasicExecutionRole`) from a synthesized template, so it is deferred to a post-deployment tightening pass. +5. **Scope `iam:CreateServiceLinkedRole`** with an `iam:AWSServiceName` condition to limit which AWS services can have service-linked roles created. After a first deploy, check CloudTrail for which service-linked roles were actually created and restrict accordingly. +6. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. +7. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. 
## Reference diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index 9522deb..f330587 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -115,7 +115,6 @@ CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 5 "iam:CreateRole", "iam:DeleteRole", "iam:GetRole", - "iam:PassRole", "iam:UpdateRole", "iam:TagRole", "iam:UntagRole", @@ -144,6 +143,25 @@ CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 5 "arn:aws:iam::*:role/aws-service-role/*" ] }, + { + "Sid": "IAMPassRole", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::*:role/backgroundagent-dev-*", + "Condition": { + "StringEquals": { + "iam:PassedToService": [ + "lambda.amazonaws.com", + "ecs-tasks.amazonaws.com", + "ecs.amazonaws.com", + "apigateway.amazonaws.com", + "logs.amazonaws.com", + "bedrock.amazonaws.com", + "events.amazonaws.com" + ] + } + } + }, { "Sid": "VPCNetworking", "Effect": "Allow", @@ -668,8 +686,10 @@ These policies are conservative-but-scoped starting points. To tighten further: 1. **Deploy once with CloudTrail enabled**, then use [IAM Access Analyzer policy generation](https://docs.aws.amazon.com/IAM/latest/UserGuide/access-analyzer-policy-generation.html) to generate a least-privilege policy based on the actual API calls recorded in CloudTrail. 2. **Replace `*` resources** with actual ARNs after the first deploy (e.g., once you know the VPC ID, scope EC2 actions to that VPC). 3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. -4. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. -5. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. +4. 
**Restrict `iam:AttachRolePolicy`** with an `iam:PolicyARN` condition to limit which policies can be attached to `backgroundagent-dev-*` roles. This requires enumerating the AWS managed policies CDK attaches (e.g., `service-role/AWSLambdaBasicExecutionRole`) from a synthesized template, so it is deferred to a post-deployment tightening pass. +5. **Scope `iam:CreateServiceLinkedRole`** with an `iam:AWSServiceName` condition to limit which AWS services can have service-linked roles created. After a first deploy, check CloudTrail for which service-linked roles were actually created and restrict accordingly. +6. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. +7. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. ## Reference From 6df700d9273201459cf8112aa3e4bc61996703ae Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:31:36 +0000 Subject: [PATCH 20/23] fix(docs): scope X-Ray resource policy, add KMS tightening, unify placeholders - Scope X-Ray resource policy Resource from * to arn:aws:logs:*:ACCOUNT_ID:log-group:aws/spans in QUICK_START.md and setup SKILL.md (item 7) - Add KMS kms:ResourceAliases tightening recommendation (item 6) - Unify placeholder to ACCOUNT_ID everywhere with substitution note (item 8) Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/abca-plugin/skills/setup/SKILL.md | 2 +- docs/design/DEPLOYMENT_ROLES.md | 15 +++++++++------ docs/guides/QUICK_START.md | 2 +- .../content/docs/architecture/Deployment-roles.md | 15 +++++++++------ .../content/docs/getting-started/Quick-start.md | 2 +- 5 files changed, 21 insertions(+), 15 deletions(-) diff --git a/docs/abca-plugin/skills/setup/SKILL.md b/docs/abca-plugin/skills/setup/SKILL.md index 1a5f01b..fa786a2 100644 --- a/docs/abca-plugin/skills/setup/SKILL.md +++ b/docs/abca-plugin/skills/setup/SKILL.md @@ 
-58,7 +58,7 @@ On a fresh AWS account, X-Ray needs a CloudWatch Logs resource policy before it ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) aws logs put-resource-policy \ --policy-name xray-spans-policy \ - --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":\"*\"}]}" + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":[\"arn:aws:logs:*:${ACCOUNT_ID}:log-group:aws/spans\",\"arn:aws:logs:*:${ACCOUNT_ID}:log-group:aws/spans:*\"]}]}" aws xray update-trace-segment-destination --destination CloudWatchLogs ``` diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index 7c30017..3731435 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -25,12 +25,14 @@ The policies are split into three IAM managed policies (each under the 6,144-cha | `IaCRole-ABCA-Application` | DynamoDB, Lambda, API Gateway, Cognito, WAFv2, EventBridge, Secrets Manager | | `IaCRole-ABCA-Observability` | Bedrock AgentCore, Bedrock Guardrails, CloudWatch, X-Ray, S3, ECR, KMS, SSM, STS | +> **Placeholder substitution**: Replace `ACCOUNT_ID` with your 12-digit AWS account ID and `REGION` with your deployment region (e.g., `us-east-1`) throughout this document. 
+ ```bash # Create all three policies in your account, then re-bootstrap: -cdk bootstrap aws://ACCOUNT/REGION \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Infrastructure" \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Application" \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Observability" +cdk bootstrap aws://ACCOUNT_ID/REGION \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT_ID:policy/IaCRole-ABCA-Infrastructure" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT_ID:policy/IaCRole-ABCA-Application" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT_ID:policy/IaCRole-ABCA-Observability" ``` The `--cloudformation-execution-policies` flag can be repeated to attach multiple policies to the CloudFormation execution role. @@ -684,8 +686,9 @@ These policies are conservative-but-scoped starting points. To tighten further: 3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. 4. **Restrict `iam:AttachRolePolicy`** with an `iam:PolicyARN` condition to limit which policies can be attached to `backgroundagent-dev-*` roles. This requires enumerating the AWS managed policies CDK attaches (e.g., `service-role/AWSLambdaBasicExecutionRole`) from a synthesized template, so it is deferred to a post-deployment tightening pass. 5. **Scope `iam:CreateServiceLinkedRole`** with an `iam:AWSServiceName` condition to limit which AWS services can have service-linked roles created. After a first deploy, check CloudTrail for which service-linked roles were actually created and restrict accordingly. -6. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. -7. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. +6. 
**Scope KMS actions** with a `kms:ResourceAliases` condition (e.g., `"ForAnyValue:StringLike": {"kms:ResourceAliases": "alias/cdk-hnb659fds-*"}` -- the key is multi-valued, so it needs a set operator, and the wildcard requires `StringLike`) to limit `CreateGrant`, `Decrypt`, `Encrypt`, and `GenerateDataKey` to the deterministic CDK bootstrap key. +7. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. +8. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. ## Reference diff --git a/docs/guides/QUICK_START.md b/docs/guides/QUICK_START.md index 1c38b1e..808ff02 100644 --- a/docs/guides/QUICK_START.md +++ b/docs/guides/QUICK_START.md @@ -82,7 +82,7 @@ The CDK stack deploys the full platform: API Gateway, Lambda functions (orchestr ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) aws logs put-resource-policy \ --policy-name xray-spans-policy \ - --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":\"*\"}]}" + --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":[\"arn:aws:logs:*:${ACCOUNT_ID}:log-group:aws/spans\",\"arn:aws:logs:*:${ACCOUNT_ID}:log-group:aws/spans:*\"]}]}" aws xray update-trace-segment-destination --destination CloudWatchLogs # Bootstrap CDK (first time only) diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index f330587..449194b 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -29,12 +29,14 @@ The policies are split into three IAM managed policies (each under the
6,144-cha | `IaCRole-ABCA-Application` | DynamoDB, Lambda, API Gateway, Cognito, WAFv2, EventBridge, Secrets Manager | | `IaCRole-ABCA-Observability` | Bedrock AgentCore, Bedrock Guardrails, CloudWatch, X-Ray, S3, ECR, KMS, SSM, STS | +> **Placeholder substitution**: Replace `ACCOUNT_ID` with your 12-digit AWS account ID and `REGION` with your deployment region (e.g., `us-east-1`) throughout this document. + ```bash # Create all three policies in your account, then re-bootstrap: -cdk bootstrap aws://ACCOUNT/REGION \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Infrastructure" \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Application" \ - --cloudformation-execution-policies "arn:aws:iam::ACCOUNT:policy/IaCRole-ABCA-Observability" +cdk bootstrap aws://ACCOUNT_ID/REGION \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT_ID:policy/IaCRole-ABCA-Infrastructure" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT_ID:policy/IaCRole-ABCA-Application" \ + --cloudformation-execution-policies "arn:aws:iam::ACCOUNT_ID:policy/IaCRole-ABCA-Observability" ``` The `--cloudformation-execution-policies` flag can be repeated to attach multiple policies to the CloudFormation execution role. @@ -688,8 +690,9 @@ These policies are conservative-but-scoped starting points. To tighten further: 3. **Add region conditions** where possible (e.g., `"aws:RequestedRegion": "us-east-1"`) to prevent cross-region resource creation. 4. **Restrict `iam:AttachRolePolicy`** with an `iam:PolicyARN` condition to limit which policies can be attached to `backgroundagent-dev-*` roles. This requires enumerating the AWS managed policies CDK attaches (e.g., `service-role/AWSLambdaBasicExecutionRole`) from a synthesized template, so it is deferred to a post-deployment tightening pass. 5. 
**Scope `iam:CreateServiceLinkedRole`** with an `iam:AWSServiceName` condition to limit which AWS services can have service-linked roles created. After a first deploy, check CloudTrail for which service-linked roles were actually created and restrict accordingly. -6. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. -7. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. +6. **Scope KMS actions** with a `kms:ResourceAliases` condition (e.g., `"ForAnyValue:StringLike": {"kms:ResourceAliases": "alias/cdk-hnb659fds-*"}` -- the key is multi-valued, so it needs a set operator, and the wildcard requires `StringLike`) to limit `CreateGrant`, `Decrypt`, `Encrypt`, and `GenerateDataKey` to the deterministic CDK bootstrap key. +7. **Use permission boundaries** on the IaC role to set an outer limit even if the policy is too broad. +8. **Review after each CDK version upgrade** -- new CDK versions may add/remove custom resources that need different permissions. ## Reference diff --git a/docs/src/content/docs/getting-started/Quick-start.md b/docs/src/content/docs/getting-started/Quick-start.md index 7bc48cc..b4c78fd 100644 --- a/docs/src/content/docs/getting-started/Quick-start.md +++ b/docs/src/content/docs/getting-started/Quick-start.md @@ -86,7 +86,7 @@ The CDK stack deploys the full platform: API Gateway, Lambda functions (orchestr ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) aws logs put-resource-policy \ --policy-name xray-spans-policy \ - --policy-document "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":\"*\"}]}" + --policy-document
"{\"Version\":\"2012-10-17\",\"Statement\":[{\"Sid\":\"XRaySpansAccess\",\"Effect\":\"Allow\",\"Principal\":{\"Service\":\"xray.amazonaws.com\"},\"Action\":[\"logs:PutLogEvents\",\"logs:CreateLogGroup\",\"logs:CreateLogStream\"],\"Resource\":[\"arn:aws:logs:*:${ACCOUNT_ID}:log-group:aws/spans\",\"arn:aws:logs:*:${ACCOUNT_ID}:log-group:aws/spans:*\"]}]}" aws xray update-trace-segment-destination --destination CloudWatchLogs # Bootstrap CDK (first time only) From abe2a905c7b52dc6d0ec2025a7cef2d771c3f15a Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Fri, 24 Apr 2026 21:33:40 +0000 Subject: [PATCH 21/23] fix(docs): correct VPC endpoint cost to ~$102/mo and clarify session timeouts VPC endpoint cost was ~$50/mo (1 AZ math), actual is ~$102/mo (7 endpoints x 2 AZs x $0.01/hr x 730 hrs). Update baseline totals from ~$85-95 to ~$140-150 in COST_MODEL.md and DEPLOYMENT_GUIDE.md. Clarify the two distinct timeout limits: AgentCore 8-hour service limit vs orchestrator 9-hour executionTimeout. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/COST_MODEL.md | 12 ++++++------ docs/guides/DEPLOYMENT_GUIDE.md | 10 +++++----- docs/src/content/docs/architecture/Cost-model.md | 12 ++++++------ .../content/docs/getting-started/Deployment-guide.md | 10 +++++----- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/design/COST_MODEL.md b/docs/design/COST_MODEL.md index f1eeb52..ffb726d 100644 --- a/docs/design/COST_MODEL.md +++ b/docs/design/COST_MODEL.md @@ -11,16 +11,16 @@ These costs are incurred regardless of task volume: | Component | Estimated cost | Notes | |---|---|---| | NAT Gateway (1×) | ~$32/month | Fixed hourly cost + data processing. Single AZ (see [COMPUTE.md - Network architecture](./COMPUTE.md)). | -| VPC Interface Endpoints (7×) | ~$50/month | $0.01/hr per endpoint per AZ. | +| VPC Interface Endpoints (7×, 2 AZs) | ~$102/month | $0.01/hr × 7 endpoints × 2 AZs × 730 hrs. 
| | VPC Flow Logs | ~$3/month | CloudWatch ingestion. | | DynamoDB (on-demand, idle) | ~$0/month | Pay-per-request; no cost when idle. | | CloudWatch Logs retention | ~$1–5/month | Depends on log volume. 90-day retention. | | API Gateway (idle) | ~$0/month | Pay-per-request. | -| **Total baseline** | **~$85–95/month** | | +| **Total baseline** | **~$140–150/month** | | ### Scale-to-zero characteristics -Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$85–95/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) for the full scale-to-zero breakdown. +Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$140–150/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints across 2 AZs) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](../guides/DEPLOYMENT_GUIDE.md) for the full scale-to-zero breakdown. ## Per-task variable costs @@ -47,7 +47,7 @@ Assuming a typical task: 1–2 hours, Claude Sonnet, ~100K input tokens, ~20K ou | Model choice | 5–10× between Haiku and Opus | Default to Claude Sonnet; allow per-repo override. | | Turn count | Linear with turns | `max_turns` cap (default 100, configurable 1–500). | | Cost budget | Hard stop at budget | `max_budget_usd` cap (configurable $0.01–$100). Agent stops when budget is reached regardless of remaining turns. 
| -| Task duration | Sub-linear (compute is cheap; tokens dominate) | 8-hour max session timeout. | +| Task duration | Sub-linear (compute is cheap; tokens dominate) | AgentCore: 8-hour service limit; orchestrator: 9-hour `executionTimeout`. | | Prompt caching | 50–90% token cost reduction | Enable by default; cache system prompts and repo context. | | Concurrency | Linear with parallel tasks | Per-user and system-wide concurrency limits. | @@ -55,8 +55,8 @@ Assuming a typical task: 1–2 hours, Claude Sonnet, ~100K input tokens, ~20K ou | Scale | Tasks/month | Estimated monthly cost (infra + tasks) | |---|---|---| -| Low (1 developer) | 30–60 | $150–500 | -| Medium (small team) | 200–500 | $500–3,000 | +| Low (1 developer) | 30–60 | $200–550 | +| Medium (small team) | 200–500 | $550–3,000 | | High (org-wide) | 2,000–5,000 | $5,000–30,000 | These estimates assume Claude Sonnet with prompt caching enabled and average task complexity. diff --git a/docs/guides/DEPLOYMENT_GUIDE.md b/docs/guides/DEPLOYMENT_GUIDE.md index 725b7a5..795580e 100644 --- a/docs/guides/DEPLOYMENT_GUIDE.md +++ b/docs/guides/DEPLOYMENT_GUIDE.md @@ -13,7 +13,7 @@ ABCA deploys as a **single CDK stack** (`backgroundagent-dev`) containing all pl | **Orchestration** | Durable Lambda (checkpoint/replay) | Same durable Lambda via `ComputeStrategy` | | **Agent mode** | FastAPI server (HTTP invocation) | Batch (run-to-completion) | | **Startup** | ~10s (warm MicroVM) | ~60-180s (Fargate cold start) | -| **Max duration** | 8 hours (AgentCore session) | Limited by orchestrator timeout (9 hours) | +| **Max duration** | 8 hours (AgentCore service limit) | 9 hours (orchestrator `executionTimeout`) | Both backends are orchestrated by the same durable Lambda function. The `ComputeStrategy` interface abstracts `startSession()`, `pollSession()`, and `stopSession()` -- the ECS strategy calls `ecs:RunTask` / `ecs:DescribeTasks` / `ecs:StopTask` directly from the Lambda. No Step Functions are used. 
@@ -39,15 +39,15 @@ ECS Fargate is currently **opt-in** -- the `EcsAgentCluster` construct is presen | Component | Est. Monthly Idle Cost | Why | |-----------|----------------------|-----| | NAT Gateway (1x) | ~$32 | $0.045/hr fixed charge | -| VPC Interface Endpoints (7x, 2 AZs) | ~$50 | $0.01/hr per endpoint per AZ | +| VPC Interface Endpoints (7x, 2 AZs) | ~$102 | $0.01/hr × 7 endpoints × 2 AZs × 730 hrs | | WAF v2 Web ACL | ~$5 | Base monthly charge | | CloudWatch Dashboard | ~$3 | Per-dashboard charge | | Secrets Manager (1+ secrets) | ~$0.40/secret | Per-secret monthly | | CloudWatch Alarms | ~$0.10/alarm | Per standard alarm | | CloudWatch Logs retention | ~$1-5 | Storage for retained logs | -| **Total always-on baseline** | **~$85-95/month** | | +| **Total always-on baseline** | **~$140-150/month** | | -The dominant idle cost is VPC networking: 7 interface endpoints (~$50/month) plus the NAT Gateway (~$32/month). +The dominant idle cost is VPC networking: 7 interface endpoints across 2 AZs (~$102/month) plus the NAT Gateway (~$32/month). For the full cost model including per-task costs, see [COST_MODEL.md](../design/COST_MODEL.md). 
@@ -75,7 +75,7 @@ For the full cost model including per-task costs, see [COST_MODEL.md](../design/ |---------|---------|---------------| | VPC (public + private subnets, 2 AZs) | All compute | N/A (no direct cost) | | NAT Gateway (1x) | Private subnet internet egress | **No** (~$32/mo) | -| VPC Interface Endpoints (7x) | AWS service connectivity from private subnets | **No** (~$50/mo) | +| VPC Interface Endpoints (7x, 2 AZs) | AWS service connectivity from private subnets | **No** (~$102/mo) | | VPC Gateway Endpoints (2x: S3, DynamoDB) | S3 and DynamoDB connectivity | Yes (free) | | Security Groups | HTTPS-only egress | N/A | | Route 53 Resolver DNS Firewall | Domain allowlisting for agent egress | Minimal | diff --git a/docs/src/content/docs/architecture/Cost-model.md b/docs/src/content/docs/architecture/Cost-model.md index bf08f74..9006a8b 100644 --- a/docs/src/content/docs/architecture/Cost-model.md +++ b/docs/src/content/docs/architecture/Cost-model.md @@ -15,16 +15,16 @@ These costs are incurred regardless of task volume: | Component | Estimated cost | Notes | |---|---|---| | NAT Gateway (1×) | ~$32/month | Fixed hourly cost + data processing. Single AZ (see [COMPUTE.md - Network architecture](/architecture/compute)). | -| VPC Interface Endpoints (7×) | ~$50/month | $0.01/hr per endpoint per AZ. | +| VPC Interface Endpoints (7×, 2 AZs) | ~$102/month | $0.01/hr × 7 endpoints × 2 AZs × 730 hrs. | | VPC Flow Logs | ~$3/month | CloudWatch ingestion. | | DynamoDB (on-demand, idle) | ~$0/month | Pay-per-request; no cost when idle. | | CloudWatch Logs retention | ~$1–5/month | Depends on log volume. 90-day retention. | | API Gateway (idle) | ~$0/month | Pay-per-request. 
| -| **Total baseline** | **~$85–95/month** | | +| **Total baseline** | **~$140–150/month** | | ### Scale-to-zero characteristics -Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$85–95/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](/getting-started/deployment-guide) for the full scale-to-zero breakdown. +Most platform components are fully serverless and incur zero cost when idle: DynamoDB (PAY_PER_REQUEST), Lambda, API Gateway, ECS Fargate (cluster is free, when enabled), AgentCore Runtime (per-session), Bedrock (per-token), and Cognito (free tier). The always-on cost floor (~$140–150/month) is dominated by VPC networking infrastructure (NAT Gateway + 7 interface endpoints across 2 AZs) which is required for private subnet connectivity to AWS services and GitHub. See the [Deployment guide](/getting-started/deployment-guide) for the full scale-to-zero breakdown. ## Per-task variable costs @@ -51,7 +51,7 @@ Assuming a typical task: 1–2 hours, Claude Sonnet, ~100K input tokens, ~20K ou | Model choice | 5–10× between Haiku and Opus | Default to Claude Sonnet; allow per-repo override. | | Turn count | Linear with turns | `max_turns` cap (default 100, configurable 1–500). | | Cost budget | Hard stop at budget | `max_budget_usd` cap (configurable $0.01–$100). Agent stops when budget is reached regardless of remaining turns. | -| Task duration | Sub-linear (compute is cheap; tokens dominate) | 8-hour max session timeout. | +| Task duration | Sub-linear (compute is cheap; tokens dominate) | AgentCore: 8-hour service limit; orchestrator: 9-hour `executionTimeout`. 
| | Prompt caching | 50–90% token cost reduction | Enable by default; cache system prompts and repo context. | | Concurrency | Linear with parallel tasks | Per-user and system-wide concurrency limits. | @@ -59,8 +59,8 @@ Assuming a typical task: 1–2 hours, Claude Sonnet, ~100K input tokens, ~20K ou | Scale | Tasks/month | Estimated monthly cost (infra + tasks) | |---|---|---| -| Low (1 developer) | 30–60 | $150–500 | -| Medium (small team) | 200–500 | $500–3,000 | +| Low (1 developer) | 30–60 | $200–550 | +| Medium (small team) | 200–500 | $550–3,000 | | High (org-wide) | 2,000–5,000 | $5,000–30,000 | These estimates assume Claude Sonnet with prompt caching enabled and average task complexity. diff --git a/docs/src/content/docs/getting-started/Deployment-guide.md b/docs/src/content/docs/getting-started/Deployment-guide.md index 940410a..1ea3ee1 100644 --- a/docs/src/content/docs/getting-started/Deployment-guide.md +++ b/docs/src/content/docs/getting-started/Deployment-guide.md @@ -17,7 +17,7 @@ ABCA deploys as a **single CDK stack** (`backgroundagent-dev`) containing all pl | **Orchestration** | Durable Lambda (checkpoint/replay) | Same durable Lambda via `ComputeStrategy` | | **Agent mode** | FastAPI server (HTTP invocation) | Batch (run-to-completion) | | **Startup** | ~10s (warm MicroVM) | ~60-180s (Fargate cold start) | -| **Max duration** | 8 hours (AgentCore session) | Limited by orchestrator timeout (9 hours) | +| **Max duration** | 8 hours (AgentCore service limit) | 9 hours (orchestrator `executionTimeout`) | Both backends are orchestrated by the same durable Lambda function. The `ComputeStrategy` interface abstracts `startSession()`, `pollSession()`, and `stopSession()` -- the ECS strategy calls `ecs:RunTask` / `ecs:DescribeTasks` / `ecs:StopTask` directly from the Lambda. No Step Functions are used. @@ -43,15 +43,15 @@ ECS Fargate is currently **opt-in** -- the `EcsAgentCluster` construct is presen | Component | Est. 
Monthly Idle Cost | Why | |-----------|----------------------|-----| | NAT Gateway (1x) | ~$32 | $0.045/hr fixed charge | -| VPC Interface Endpoints (7x, 2 AZs) | ~$50 | $0.01/hr per endpoint per AZ | +| VPC Interface Endpoints (7x, 2 AZs) | ~$102 | $0.01/hr × 7 endpoints × 2 AZs × 730 hrs | | WAF v2 Web ACL | ~$5 | Base monthly charge | | CloudWatch Dashboard | ~$3 | Per-dashboard charge | | Secrets Manager (1+ secrets) | ~$0.40/secret | Per-secret monthly | | CloudWatch Alarms | ~$0.10/alarm | Per standard alarm | | CloudWatch Logs retention | ~$1-5 | Storage for retained logs | -| **Total always-on baseline** | **~$85-95/month** | | +| **Total always-on baseline** | **~$140-150/month** | | -The dominant idle cost is VPC networking: 7 interface endpoints (~$50/month) plus the NAT Gateway (~$32/month). +The dominant idle cost is VPC networking: 7 interface endpoints across 2 AZs (~$102/month) plus the NAT Gateway (~$32/month). For the full cost model including per-task costs, see [COST_MODEL.md](/architecture/cost-model). 
@@ -79,7 +79,7 @@ For the full cost model including per-task costs, see [COST_MODEL.md](/architect |---------|---------|---------------| | VPC (public + private subnets, 2 AZs) | All compute | N/A (no direct cost) | | NAT Gateway (1x) | Private subnet internet egress | **No** (~$32/mo) | -| VPC Interface Endpoints (7x) | AWS service connectivity from private subnets | **No** (~$50/mo) | +| VPC Interface Endpoints (7x, 2 AZs) | AWS service connectivity from private subnets | **No** (~$102/mo) | | VPC Gateway Endpoints (2x: S3, DynamoDB) | S3 and DynamoDB connectivity | Yes (free) | | Security Groups | HTTPS-only egress | N/A | | Route 53 Resolver DNS Firewall | Domain allowlisting for agent egress | Minimal | From 3adb6e23808d7a7a978c565e8483c7a4431d38e6 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Sat, 25 Apr 2026 00:56:02 +0000 Subject: [PATCH 22/23] fix(docs): add GitHubTokenSecret to SecretsManager resource scope CDK generates the GitHub token secret with construct ID hash (GitHubTokenSecret09BC4210-*), not the backgroundagent- prefix. Add this pattern to the SecretsManager statement Resource list. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 5 ++++- docs/src/content/docs/architecture/Deployment-roles.md | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index 3731435..a48016d 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -415,7 +415,10 @@ DynamoDB tables, Lambda functions, API Gateway, Cognito, WAFv2, EventBridge, and "secretsmanager:PutResourcePolicy", "secretsmanager:DeleteResourcePolicy" ], - "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" + "Resource": [ + "arn:aws:secretsmanager:*:*:secret:backgroundagent-*", + "arn:aws:secretsmanager:*:*:secret:GitHubTokenSecret*" + ] }, { "Sid": "SecretsManagerAccountLevel", diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index 449194b..00fde70 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -419,7 +419,10 @@ DynamoDB tables, Lambda functions, API Gateway, Cognito, WAFv2, EventBridge, and "secretsmanager:PutResourcePolicy", "secretsmanager:DeleteResourcePolicy" ], - "Resource": "arn:aws:secretsmanager:*:*:secret:backgroundagent-*" + "Resource": [ + "arn:aws:secretsmanager:*:*:secret:backgroundagent-*", + "arn:aws:secretsmanager:*:*:secret:GitHubTokenSecret*" + ] }, { "Sid": "SecretsManagerAccountLevel", From 6cd306bf1702b7b2216e57cc4f52ea2213608471 Mon Sep 17 00:00:00 2001 From: bgagent <345885+scottschreckengaust@users.noreply.github.com> Date: Sat, 25 Apr 2026 01:01:18 +0000 Subject: [PATCH 23/23] fix(docs): add vpc-flow-logs and bedrock-agentcore to PassedToService V3 least-privilege deploy found two missing services in the iam:PassedToService condition: vpc-flow-logs.amazonaws.com (VPC Flow Log role) and bedrock-agentcore.amazonaws.com (AgentMemory service role). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/DEPLOYMENT_ROLES.md | 4 +++- docs/src/content/docs/architecture/Deployment-roles.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/design/DEPLOYMENT_ROLES.md b/docs/design/DEPLOYMENT_ROLES.md index a48016d..b533c73 100644 --- a/docs/design/DEPLOYMENT_ROLES.md +++ b/docs/design/DEPLOYMENT_ROLES.md @@ -155,7 +155,9 @@ CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 5 "apigateway.amazonaws.com", "logs.amazonaws.com", "bedrock.amazonaws.com", - "events.amazonaws.com" + "bedrock-agentcore.amazonaws.com", + "events.amazonaws.com", + "vpc-flow-logs.amazonaws.com" ] } } diff --git a/docs/src/content/docs/architecture/Deployment-roles.md b/docs/src/content/docs/architecture/Deployment-roles.md index 00fde70..c021ce3 100644 --- a/docs/src/content/docs/architecture/Deployment-roles.md +++ b/docs/src/content/docs/architecture/Deployment-roles.md @@ -159,7 +159,9 @@ CloudFormation stack operations, IAM roles/policies, VPC networking, and Route 5 "apigateway.amazonaws.com", "logs.amazonaws.com", "bedrock.amazonaws.com", - "events.amazonaws.com" + "bedrock-agentcore.amazonaws.com", + "events.amazonaws.com", + "vpc-flow-logs.amazonaws.com" ] } }