From bdb89f44b8bc520a3c297fdcdc032bde1492570c Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 12:53:23 -0700 Subject: [PATCH 1/7] feat(compute): add EC2 fleet compute strategy with SSM dispatch Add a third compute backend (EC2 fleet with SSM Run Command) alongside the existing AgentCore and ECS strategies. This provides maximum flexibility with no image size limits, configurable instance types (including GPU), and full control over the compute environment. New files: - ec2-strategy.ts: ComputeStrategy implementation using EC2 tags for instance tracking and SSM RunShellScript for task dispatch - ec2-agent-fleet.ts: CDK construct with ASG, launch template, security group, S3 payload bucket, and IAM role - ec2-strategy.test.ts and ec2-agent-fleet.test.ts: full test coverage Wiring: - repo-config.ts: add 'ec2' to ComputeType, add instance_type field - compute-strategy.ts: add EC2 SessionHandle variant and resolver case - task-orchestrator.ts: add ec2Config prop with env vars and IAM grants - orchestrate-task.ts: enable compute polling for EC2 - cancel-task.ts: add SSM CancelCommand for EC2 tasks - task-api.ts: add ssm:CancelCommand permission for cancel Lambda - agent.ts: add commented-out EC2 fleet block (same pattern as ECS) --- cdk/package.json | 3 + cdk/src/constructs/blueprint.ts | 2 +- cdk/src/constructs/ec2-agent-fleet.ts | 221 ++++++++++ cdk/src/constructs/task-api.ts | 15 + cdk/src/constructs/task-orchestrator.ts | 55 ++- cdk/src/handlers/cancel-task.ts | 26 ++ cdk/src/handlers/orchestrate-task.ts | 6 +- cdk/src/handlers/shared/compute-strategy.ts | 6 +- cdk/src/handlers/shared/orchestrator.ts | 1 + cdk/src/handlers/shared/repo-config.ts | 4 +- .../shared/strategies/ec2-strategy.ts | 267 ++++++++++++ cdk/src/stacks/agent.ts | 26 ++ cdk/test/constructs/ec2-agent-fleet.test.ts | 213 ++++++++++ .../handlers/shared/compute-strategy.test.ts | 29 ++ .../shared/strategies/ec2-strategy.test.ts | 340 +++++++++++++++ yarn.lock | 397 +++++++++++++++++- 16 files changed, 1604 insertions(+), 7 deletions(-) create mode 100644 cdk/src/constructs/ec2-agent-fleet.ts create mode 100644 cdk/src/handlers/shared/strategies/ec2-strategy.ts create mode 100644 cdk/test/constructs/ec2-agent-fleet.test.ts create mode 100644 cdk/test/handlers/shared/strategies/ec2-strategy.test.ts diff --git a/cdk/package.json b/cdk/package.json index 0feaf5b..7faa142 100644 --- a/cdk/package.json +++ b/cdk/package.json @@ -18,10 +18,13 @@ "@aws-cdk/mixins-preview": "2.238.0-alpha.0", "@aws-sdk/client-bedrock-agentcore": "^3.1021.0", "@aws-sdk/client-bedrock-runtime": "^3.1021.0", + "@aws-sdk/client-ec2": "^3.1021.0", "@aws-sdk/client-ecs": "^3.1021.0", "@aws-sdk/client-dynamodb": "^3.1021.0", "@aws-sdk/client-lambda": "^3.1021.0", + "@aws-sdk/client-s3": "^3.1021.0", "@aws-sdk/client-secrets-manager": "^3.1021.0", + "@aws-sdk/client-ssm": "^3.1021.0", "@aws-sdk/lib-dynamodb": "^3.1021.0", "@aws/durable-execution-sdk-js": "^1.1.0", "aws-cdk-lib": "^2.238.0", diff --git a/cdk/src/constructs/blueprint.ts b/cdk/src/constructs/blueprint.ts index 55ce7ed..3adeb50 100644 --- a/cdk/src/constructs/blueprint.ts +++ b/cdk/src/constructs/blueprint.ts @@ -47,7 +47,7 @@ export interface BlueprintProps { * Compute strategy type. * @default 'agentcore' */ - readonly type?: 'agentcore' | 'ecs'; + readonly type?: 'agentcore' | 'ecs' | 'ec2'; /** * Override the default runtime ARN (agentcore strategy). diff --git a/cdk/src/constructs/ec2-agent-fleet.ts b/cdk/src/constructs/ec2-agent-fleet.ts new file mode 100644 index 0000000..ded688d --- /dev/null +++ b/cdk/src/constructs/ec2-agent-fleet.ts @@ -0,0 +1,221 @@ +/** + * MIT No Attribution + * + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +import { Duration, RemovalPolicy } from 'aws-cdk-lib'; +import * as autoscaling from 'aws-cdk-lib/aws-autoscaling'; +import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; +import * as ec2 from 'aws-cdk-lib/aws-ec2'; +import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets'; +import * as iam from 'aws-cdk-lib/aws-iam'; +import * as logs from 'aws-cdk-lib/aws-logs'; +import * as s3 from 'aws-cdk-lib/aws-s3'; +import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager'; +import { NagSuppressions } from 'cdk-nag'; +import { Construct } from 'constructs'; + +export interface Ec2AgentFleetProps { + readonly vpc: ec2.IVpc; + readonly agentImageAsset: ecr_assets.DockerImageAsset; + readonly taskTable: dynamodb.ITable; + readonly taskEventsTable: dynamodb.ITable; + readonly userConcurrencyTable: dynamodb.ITable; + readonly githubTokenSecret: secretsmanager.ISecret; + readonly memoryId?: string; + readonly instanceType?: ec2.InstanceType; + readonly desiredCapacity?: number; + readonly maxCapacity?: number; +} + +export class Ec2AgentFleet extends Construct { + public readonly securityGroup: ec2.SecurityGroup; + public readonly instanceRole: iam.Role; + public readonly payloadBucket: s3.Bucket; + public readonly autoScalingGroup: autoscaling.AutoScalingGroup; + public readonly fleetTagKey: string; + public readonly fleetTagValue: string; + + constructor(scope: Construct, id: string, props: Ec2AgentFleetProps) { + super(scope, id); + + this.fleetTagKey = 'bgagent:fleet'; + this.fleetTagValue = id; + + // Security group — egress TCP 443 only + this.securityGroup = new ec2.SecurityGroup(this, 'FleetSG', { + vpc: props.vpc, + description: 'EC2 Agent Fleet - egress TCP 443 only', + allowAllOutbound: false, + }); + + this.securityGroup.addEgressRule( + ec2.Peer.anyIpv4(), + ec2.Port.tcp(443), + 'Allow HTTPS egress (GitHub API, AWS services)', + ); + + // S3 bucket for payload overflow + this.payloadBucket = new s3.Bucket(this, 'PayloadBucket', { + removalPolicy: RemovalPolicy.DESTROY, + autoDeleteObjects: true, + encryption: s3.BucketEncryption.S3_MANAGED, + enforceSSL: true, + blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, + lifecycleRules: [ + { expiration: Duration.days(7) }, + ], + }); + + // CloudWatch log group + const logGroup = new logs.LogGroup(this, 'FleetLogGroup', { + retention: logs.RetentionDays.THREE_MONTHS, + removalPolicy: RemovalPolicy.DESTROY, + }); + + // IAM Role for instances + this.instanceRole = new iam.Role(this, 'InstanceRole', { + assumedBy: new iam.ServicePrincipal('ec2.amazonaws.com'), + managedPolicies: [ + iam.ManagedPolicy.fromAwsManagedPolicyName('AmazonSSMManagedInstanceCore'), + ], + }); + + // DynamoDB read/write on task tables + props.taskTable.grantReadWriteData(this.instanceRole); + props.taskEventsTable.grantReadWriteData(this.instanceRole); + props.userConcurrencyTable.grantReadWriteData(this.instanceRole); + + // Secrets Manager read for GitHub token + props.githubTokenSecret.grantRead(this.instanceRole); + + // Bedrock model invocation + this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({ + actions: [ + 'bedrock:InvokeModel', + 'bedrock:InvokeModelWithResponseStream', + ], + resources: ['*'], + })); + + // CloudWatch Logs write + logGroup.grantWrite(this.instanceRole); + + // ECR pull + this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({ + actions: [ + 'ecr:GetAuthorizationToken', + ], + resources: ['*'], + })); + this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({ + actions: [ + 'ecr:BatchGetImage', + 'ecr:GetDownloadUrlForLayer', + ], + resources: [props.agentImageAsset.repository.repositoryArn], + })); + + // S3 read on payload bucket + this.payloadBucket.grantRead(this.instanceRole); + + // EC2 tag management on self (conditioned on fleet tag) + this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({ + actions: ['ec2:CreateTags', 'ec2:DeleteTags'], + resources: ['*'], + conditions: { + StringEquals: { + [`ec2:ResourceTag/${this.fleetTagKey}`]: this.fleetTagValue, + }, + }, + })); + + const imageUri = props.agentImageAsset.imageUri; + + // User data: install Docker, pull image, tag as idle + const userData = ec2.UserData.forLinux(); + userData.addCommands( + '#!/bin/bash', + 'set -euo pipefail', + '', + '# Install Docker', + 'dnf install -y docker', + 'systemctl enable docker', + 'systemctl start docker', + '', + '# ECR login and pre-pull agent image', + 'REGION=$(ec2-metadata --availability-zone | cut -d" " -f2 | sed \'s/.$//\')', + `aws ecr get-login-password --region "$REGION" | docker login --username AWS --password-stdin $(echo '${imageUri}' | cut -d/ -f1)`, + `docker pull '${imageUri}'`, + '', + '# Tag self as idle', + 'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)', + 'aws ec2 create-tags --resources "$INSTANCE_ID" --region "$REGION" --tags Key=bgagent:status,Value=idle', + ); + + // Auto Scaling Group + this.autoScalingGroup = new autoscaling.AutoScalingGroup(this, 'ASG', { + vpc: props.vpc, + vpcSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS }, + instanceType: props.instanceType ?? new ec2.InstanceType('m7g.xlarge'), + machineImage: ec2.MachineImage.latestAmazonLinux2023({ + cpuType: ec2.AmazonLinuxCpuType.ARM_64, + }), + role: this.instanceRole, + securityGroup: this.securityGroup, + userData, + desiredCapacity: props.desiredCapacity ?? 1, + minCapacity: props.desiredCapacity ?? 1, + maxCapacity: props.maxCapacity ?? 3, + healthCheck: autoscaling.HealthCheck.ec2(), + }); + + // Tag the ASG instances for fleet identification + // CDK auto-propagates tags from the ASG to instances + this.autoScalingGroup.node.defaultChild; + this.autoScalingGroup.addUserData(`aws ec2 create-tags --resources "$(ec2-metadata -i | cut -d' ' -f2)" --region "$(ec2-metadata --availability-zone | cut -d' ' -f2 | sed 's/.$//')" --tags Key=${this.fleetTagKey},Value=${this.fleetTagValue}`); + + NagSuppressions.addResourceSuppressions(this.instanceRole, [ + { + id: 'AwsSolutions-IAM4', + reason: 'AmazonSSMManagedInstanceCore is the AWS-recommended managed policy for SSM-managed instances', + }, + { + id: 'AwsSolutions-IAM5', + reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; Bedrock InvokeModel requires * resource; Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite; ECR GetAuthorizationToken requires * resource; EC2 CreateTags/DeleteTags conditioned on fleet tag; S3 read wildcards from CDK grantRead', + }, + ], true); + + NagSuppressions.addResourceSuppressions(this.autoScalingGroup, [ + { + id: 'AwsSolutions-AS3', + reason: 'ASG scaling notifications are not required for this dev/preview compute backend', + }, + { + id: 'AwsSolutions-EC26', + reason: 'EBS encryption uses default AWS-managed key — sufficient for agent ephemeral workloads', + }, + ], true); + + NagSuppressions.addResourceSuppressions(this.payloadBucket, [ + { + id: 'AwsSolutions-S1', + reason: 'Server access logging not required for ephemeral payload overflow bucket with 7-day lifecycle', + }, + ], true); + } +} diff --git a/cdk/src/constructs/task-api.ts b/cdk/src/constructs/task-api.ts index a69b02a..159851f 100644 --- a/cdk/src/constructs/task-api.ts +++ b/cdk/src/constructs/task-api.ts @@ -106,6 +106,14 @@ export interface TaskApiProps { * When provided, the cancel Lambda gets `ECS_CLUSTER_ARN` env var and `ecs:StopTask` permission. */ readonly ecsClusterArn?: string; + + /** + * EC2 fleet configuration for cancel-task to stop EC2-backed tasks. + * When provided, the cancel Lambda gets `ssm:CancelCommand` permission. + */ + readonly ec2FleetConfig?: { + readonly instanceRoleArn: string; + }; } /** @@ -384,6 +392,13 @@ export class TaskApi extends Construct { })); } + if (props.ec2FleetConfig) { + cancelTaskFn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['ssm:CancelCommand'], + resources: ['*'], + })); + } + // Repo table read for onboarding gate if (props.repoTable) { props.repoTable.grantReadData(createTaskFn); diff --git a/cdk/src/constructs/task-orchestrator.ts b/cdk/src/constructs/task-orchestrator.ts index 7ebf432..1937021 100644 --- a/cdk/src/constructs/task-orchestrator.ts +++ b/cdk/src/constructs/task-orchestrator.ts @@ -18,7 +18,7 @@ */ import * as path from 'path'; -import { Duration, Stack } from 'aws-cdk-lib'; +import { Aws, Duration, Stack } from 'aws-cdk-lib'; import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch'; import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; import * as iam from 'aws-cdk-lib/aws-iam'; @@ -127,6 +127,18 @@ export interface TaskOrchestratorProps { readonly taskRoleArn: string; readonly executionRoleArn: string; }; + + /** + * EC2 fleet compute strategy configuration. + * When provided, EC2-related env vars and IAM policies are added to the orchestrator. + */ + readonly ec2Config?: { + readonly fleetTagKey: string; + readonly fleetTagValue: string; + readonly payloadBucketName: string; + readonly ecrImageUri: string; + readonly instanceRoleArn: string; + }; } /** @@ -195,6 +207,12 @@ export class TaskOrchestrator extends Construct { ECS_SECURITY_GROUP: props.ecsConfig.securityGroup, ECS_CONTAINER_NAME: props.ecsConfig.containerName, }), + ...(props.ec2Config && { + EC2_FLEET_TAG_KEY: props.ec2Config.fleetTagKey, + EC2_FLEET_TAG_VALUE: props.ec2Config.fleetTagValue, + EC2_PAYLOAD_BUCKET: props.ec2Config.payloadBucketName, + ECR_IMAGE_URI: props.ec2Config.ecrImageUri, + }), }, bundling: { externalModules: ['@aws-sdk/*'], @@ -262,6 +280,41 @@ export class TaskOrchestrator extends Construct { })); } + // EC2 fleet compute strategy permissions (only when EC2 is configured) + if (props.ec2Config) { + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: [ + 'ec2:DescribeInstances', + 'ec2:CreateTags', + ], + resources: ['*'], + })); + + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: [ + 'ssm:SendCommand', + 'ssm:GetCommandInvocation', + 'ssm:CancelCommand', + ], + resources: ['*'], + })); + + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['s3:PutObject'], + resources: [`arn:${Aws.PARTITION}:s3:::${props.ec2Config.payloadBucketName}/*`], + })); + + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['iam:PassRole'], + resources: [props.ec2Config.instanceRoleArn], + conditions: { + StringEquals: { + 'iam:PassedToService': 'ec2.amazonaws.com', + }, + }, + })); + } + // Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints) for (const [index, secretArn] of (props.additionalSecretArns ?? []).entries()) { const secret = secretsmanager.Secret.fromSecretCompleteArn( diff --git a/cdk/src/handlers/cancel-task.ts b/cdk/src/handlers/cancel-task.ts index 7da0812..f67d949 100644 --- a/cdk/src/handlers/cancel-task.ts +++ b/cdk/src/handlers/cancel-task.ts @@ -20,6 +20,7 @@ import { BedrockAgentCoreClient, StopRuntimeSessionCommand } from '@aws-sdk/client-bedrock-agentcore'; import { DynamoDBClient } from '@aws-sdk/client-dynamodb'; import { ECSClient, StopTaskCommand } from '@aws-sdk/client-ecs'; +import { SSMClient, CancelCommandCommand } from '@aws-sdk/client-ssm'; import { DynamoDBDocumentClient, GetCommand, PutCommand, UpdateCommand } from '@aws-sdk/lib-dynamodb'; import type { APIGatewayProxyEvent, APIGatewayProxyResult } from 'aws-lambda'; import { ulid } from 'ulid'; @@ -33,6 +34,7 @@ import { computeTtlEpoch } from './shared/validation'; const ddb = DynamoDBDocumentClient.from(new DynamoDBClient({})); const agentCoreClient = new BedrockAgentCoreClient({}); const ecsClient = new ECSClient({}); +const ssmClient = new SSMClient({}); const TABLE_NAME = process.env.TASK_TABLE_NAME!; const EVENTS_TABLE_NAME = process.env.TASK_EVENTS_TABLE_NAME!; const TASK_RETENTION_DAYS = Number(process.env.TASK_RETENTION_DAYS ?? '90'); @@ -140,6 +142,30 @@ export async function handler(event: APIGatewayProxyEvent): Promise = asyn // Build compute metadata for the task record so cancel-task can stop the right backend const computeMetadata: Record = handle.strategyType === 'ecs' ? { clusterArn: handle.clusterArn, taskArn: handle.taskArn } - : { runtimeArn: handle.runtimeArn }; + : handle.strategyType === 'ec2' + ? { instanceId: handle.instanceId, commandId: handle.commandId } + : { runtimeArn: handle.runtimeArn }; await transitionTask(taskId, TaskStatus.HYDRATING, TaskStatus.RUNNING, { session_id: handle.sessionId, @@ -159,7 +161,7 @@ const durableHandler: DurableExecutionHandler = asyn // Resolve the compute strategy once and reuse it across poll iterations // instead of constructing a new instance on every cycle. - const computeStrategy = blueprintConfig.compute_type === 'ecs' + const computeStrategy = (blueprintConfig.compute_type === 'ecs' || blueprintConfig.compute_type === 'ec2') ? resolveComputeStrategy(blueprintConfig) : undefined; diff --git a/cdk/src/handlers/shared/compute-strategy.ts b/cdk/src/handlers/shared/compute-strategy.ts index e3d3c1d..9b04b95 100644 --- a/cdk/src/handlers/shared/compute-strategy.ts +++ b/cdk/src/handlers/shared/compute-strategy.ts @@ -19,11 +19,13 @@ import type { BlueprintConfig, ComputeType } from './repo-config'; import { AgentCoreComputeStrategy } from './strategies/agentcore-strategy'; +import { Ec2ComputeStrategy } from './strategies/ec2-strategy'; import { EcsComputeStrategy } from './strategies/ecs-strategy'; export type SessionHandle = | { readonly sessionId: string; readonly strategyType: 'agentcore'; readonly runtimeArn: string } - | { readonly sessionId: string; readonly strategyType: 'ecs'; readonly clusterArn: string; readonly taskArn: string }; + | { readonly sessionId: string; readonly strategyType: 'ecs'; readonly clusterArn: string; readonly taskArn: string } + | { readonly sessionId: string; readonly strategyType: 'ec2'; readonly instanceId: string; readonly commandId: string }; export type SessionStatus = | { readonly status: 'running' } @@ -48,6 +50,8 @@ export function resolveComputeStrategy(blueprintConfig: BlueprintConfig): Comput return new AgentCoreComputeStrategy(); case 'ecs': return new EcsComputeStrategy(); + case 'ec2': + return new Ec2ComputeStrategy(); default: { const _exhaustive: never = computeType; throw new Error(`Unknown compute_type: '${_exhaustive}'`); diff --git a/cdk/src/handlers/shared/orchestrator.ts b/cdk/src/handlers/shared/orchestrator.ts index ec7982a..4e7758f 100644 --- a/cdk/src/handlers/shared/orchestrator.ts +++ b/cdk/src/handlers/shared/orchestrator.ts @@ -238,6 +238,7 @@ export async function loadBlueprintConfig(task: TaskRecord): Promise; + blueprintConfig: BlueprintConfig; + }): Promise { + if (!EC2_FLEET_TAG_KEY || !EC2_FLEET_TAG_VALUE || !EC2_PAYLOAD_BUCKET || !ECR_IMAGE_URI) { + throw new Error( + 'EC2 compute strategy requires EC2_FLEET_TAG_KEY, EC2_FLEET_TAG_VALUE, EC2_PAYLOAD_BUCKET, and ECR_IMAGE_URI environment variables', + ); + } + + const { taskId, payload, blueprintConfig } = input; + const payloadJson = JSON.stringify(payload); + + // 1. Upload payload to S3 + const payloadKey = `tasks/${taskId}/payload.json`; + await getS3Client().send(new PutObjectCommand({ + Bucket: EC2_PAYLOAD_BUCKET, + Key: payloadKey, + Body: payloadJson, + ContentType: 'application/json', + })); + + // 2. Find an idle instance + const describeResult = await getEc2Client().send(new DescribeInstancesCommand({ + Filters: [ + { Name: `tag:${EC2_FLEET_TAG_KEY}`, Values: [EC2_FLEET_TAG_VALUE] }, + { Name: 'instance-state-name', Values: ['running'] }, + { Name: 'tag:bgagent:status', Values: ['idle'] }, + ], + })); + + const instances = (describeResult.Reservations ?? []).flatMap(r => r.Instances ?? []); + if (instances.length === 0 || !instances[0]?.InstanceId) { + throw new Error('No idle EC2 instances available in fleet'); + } + + const instanceId = instances[0].InstanceId; + + // 3. Tag instance as busy + await getEc2Client().send(new CreateTagsCommand({ + Resources: [instanceId], + Tags: [ + { Key: 'bgagent:status', Value: 'busy' }, + { Key: 'bgagent:task-id', Value: taskId }, + ], + })); + + // 4. Build the boot command (mirrors ECS strategy env vars and Python boot command) + const envExports = [ + `export TASK_ID='${taskId}'`, + `export REPO_URL='${String(payload.repo_url ?? '')}'`, + ...(payload.prompt ? [`export TASK_DESCRIPTION='${String(payload.prompt).replace(/'/g, "'\\''")}'`] : []), + ...(payload.issue_number ? [`export ISSUE_NUMBER='${String(payload.issue_number)}'`] : []), + `export MAX_TURNS='${String(payload.max_turns ?? 100)}'`, + ...(payload.max_budget_usd !== undefined ? [`export MAX_BUDGET_USD='${String(payload.max_budget_usd)}'`] : []), + ...(blueprintConfig.model_id ? [`export ANTHROPIC_MODEL='${blueprintConfig.model_id}'`] : []), + ...(blueprintConfig.system_prompt_overrides ? [`export SYSTEM_PROMPT_OVERRIDES='${blueprintConfig.system_prompt_overrides.replace(/'/g, "'\\''")}'`] : []), + "export CLAUDE_CODE_USE_BEDROCK='1'", + ...(payload.github_token_secret_arn ? [`export GITHUB_TOKEN_SECRET_ARN='${String(payload.github_token_secret_arn)}'`] : []), + ...(payload.memory_id ? [`export MEMORY_ID='${String(payload.memory_id)}'`] : []), + ]; + + const bootScript = [ + '#!/bin/bash', + 'set -euo pipefail', + '', + '# Fetch payload from S3', + `aws s3 cp "s3://${EC2_PAYLOAD_BUCKET}/${payloadKey}" /tmp/payload.json`, + 'export AGENT_PAYLOAD=$(cat /tmp/payload.json)', + '', + '# Set environment variables', + ...envExports, + '', + '# ECR login and pull', + `aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $(echo '${ECR_IMAGE_URI}' | cut -d/ -f1)`, + `docker pull '${ECR_IMAGE_URI}'`, + '', + '# Run the agent container', + 'docker run --rm \\', + ' -e TASK_ID -e REPO_URL -e CLAUDE_CODE_USE_BEDROCK -e AGENT_PAYLOAD \\', + ' -e AWS_REGION -e AWS_DEFAULT_REGION \\', + ` ${payload.prompt ? '-e TASK_DESCRIPTION ' : ''}${payload.issue_number ? '-e ISSUE_NUMBER ' : ''}-e MAX_TURNS \\`, + ` ${payload.max_budget_usd !== undefined ? '-e MAX_BUDGET_USD ' : ''}${blueprintConfig.model_id ? '-e ANTHROPIC_MODEL ' : ''}${blueprintConfig.system_prompt_overrides ? '-e SYSTEM_PROMPT_OVERRIDES ' : ''}\\`, + ` ${payload.github_token_secret_arn ? '-e GITHUB_TOKEN_SECRET_ARN ' : ''}${payload.memory_id ? '-e MEMORY_ID ' : ''}\\`, + ` '${ECR_IMAGE_URI}' \\`, + ' python -c \'import json, os, sys; sys.path.insert(0, "/app"); from entrypoint import run_task; p = json.loads(os.environ["AGENT_PAYLOAD"]); r = run_task(repo_url=p.get("repo_url",""), task_description=p.get("prompt",""), issue_number=str(p.get("issue_number","")), github_token=p.get("github_token",""), anthropic_model=p.get("model_id",""), max_turns=int(p.get("max_turns",100)), max_budget_usd=p.get("max_budget_usd"), aws_region=os.environ.get("AWS_REGION",""), task_id=p.get("task_id",""), hydrated_context=p.get("hydrated_context"), system_prompt_overrides=p.get("system_prompt_overrides",""), prompt_version=p.get("prompt_version",""), memory_id=p.get("memory_id",""), task_type=p.get("task_type","new_task"), branch_name=p.get("branch_name",""), pr_number=str(p.get("pr_number",""))); sys.exit(0 if r.get("status")=="success" else 1)\'', + '', + '# Cleanup', + 'docker system prune -f', + 'rm -f /tmp/payload.json', + '', + '# Tag instance back to idle', + 'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)', + 'aws ec2 create-tags --resources "$INSTANCE_ID" --tags Key=bgagent:status,Value=idle', + 'aws ec2 delete-tags --resources "$INSTANCE_ID" --tags Key=bgagent:task-id', + ].join('\n'); + + // 5. Send SSM Run Command + const ssmResult = await getSsmClient().send(new SendCommandCommand({ + DocumentName: 'AWS-RunShellScript', + InstanceIds: [instanceId], + Parameters: { + commands: [bootScript], + }, + TimeoutSeconds: 32400, // 9 hours, matches orchestrator max + })); + + const commandId = ssmResult.Command?.CommandId; + if (!commandId) { + throw new Error('SSM SendCommand returned no CommandId'); + } + + logger.info('EC2 SSM command dispatched', { + task_id: taskId, + instance_id: instanceId, + command_id: commandId, + container_name: EC2_CONTAINER_NAME, + }); + + return { + sessionId: commandId, + strategyType: 'ec2', + instanceId, + commandId, + }; + } + + async pollSession(handle: SessionHandle): Promise { + if (handle.strategyType !== 'ec2') { + throw new Error('pollSession called with non-ec2 handle'); + } + const { commandId, instanceId } = handle; + + try { + const result = await getSsmClient().send(new GetCommandInvocationCommand({ + CommandId: commandId, + InstanceId: instanceId, + })); + + const status = result.Status; + + switch (status) { + case 'InProgress': + case 'Pending': + case 'Delayed': + return { status: 'running' }; + case 'Success': + return { status: 'completed' }; + case 'Failed': + case 'Cancelled': + case 'TimedOut': + case 'Cancelling': + return { status: 'failed', error: result.StatusDetails ?? `SSM command ${status}` }; + default: + // Covers any unexpected status values — treat as running to avoid + // premature failure on transient states. + return { status: 'running' }; + } + } catch (err) { + const errName = err instanceof Error ? err.name : undefined; + if (errName === 'InvocationDoesNotExist') { + return { status: 'failed', error: 'SSM command invocation not found' }; + } + throw err; + } + } + + async stopSession(handle: SessionHandle): Promise { + if (handle.strategyType !== 'ec2') { + throw new Error('stopSession called with non-ec2 handle'); + } + const { commandId, instanceId } = handle; + + try { + await getSsmClient().send(new CancelCommandCommand({ + CommandId: commandId, + InstanceIds: [instanceId], + })); + logger.info('EC2 SSM command cancelled', { command_id: commandId, instance_id: instanceId }); + } catch (err) { + const errName = err instanceof Error ? err.name : undefined; + if (errName === 'InvalidCommandId' || errName === 'InvalidInstanceId') { + logger.info('EC2 SSM command already cancelled or not found', { command_id: commandId, instance_id: instanceId }); + } else { + logger.error('Failed to cancel EC2 SSM command', { + command_id: commandId, + instance_id: instanceId, + error: err instanceof Error ? err.message : String(err), + }); + } + } + + // Best-effort: tag instance back to idle + try { + await getEc2Client().send(new CreateTagsCommand({ + Resources: [instanceId], + Tags: [{ Key: 'bgagent:status', Value: 'idle' }], + })); + await getEc2Client().send(new DeleteTagsCommand({ + Resources: [instanceId], + Tags: [{ Key: 'bgagent:task-id' }], + })); + } catch { + // Swallow — instance may already be terminated + } + } +} diff --git a/cdk/src/stacks/agent.ts b/cdk/src/stacks/agent.ts index 7d16690..b28f6be 100644 --- a/cdk/src/stacks/agent.ts +++ b/cdk/src/stacks/agent.ts @@ -37,6 +37,7 @@ import { Blueprint } from '../constructs/blueprint'; import { ConcurrencyReconciler } from '../constructs/concurrency-reconciler'; import { DnsFirewall } from '../constructs/dns-firewall'; // import { EcsAgentCluster } from '../constructs/ecs-agent-cluster'; +// import { Ec2AgentFleet } from '../constructs/ec2-agent-fleet'; import { RepoTable } from '../constructs/repo-table'; import { TaskApi } from '../constructs/task-api'; import { TaskDashboard } from '../constructs/task-dashboard'; @@ -296,6 +297,21 @@ export class AgentStack extends Stack { // memoryId: agentMemory.memory.memoryId, // }); + // --- EC2 fleet compute backend (optional) --- + // To enable EC2 as an alternative compute backend, uncomment the block below + // and the Ec2AgentFleet import at the top of this file. Repos can then use + // compute_type: 'ec2' in their blueprint config to route tasks to the EC2 fleet. + // + // const ec2Fleet = new Ec2AgentFleet(this, 'Ec2AgentFleet', { + // vpc: agentVpc.vpc, + // agentImageAsset, + // taskTable: taskTable.table, + // taskEventsTable: taskEventsTable.table, + // userConcurrencyTable: userConcurrencyTable.table, + // githubTokenSecret, + // memoryId: agentMemory.memory.memoryId, + // }); + // --- Task Orchestrator (durable Lambda function) --- const orchestrator = new TaskOrchestrator(this, 'TaskOrchestrator', { taskTable: taskTable.table, @@ -317,6 +333,14 @@ export class AgentStack extends Stack { // taskRoleArn: ecsCluster.taskRoleArn, // executionRoleArn: ecsCluster.executionRoleArn, // }, + // To wire EC2, uncomment the ec2Fleet block above and add: + // ec2Config: { + // fleetTagKey: ec2Fleet.fleetTagKey, + // fleetTagValue: ec2Fleet.fleetTagValue, + // payloadBucketName: ec2Fleet.payloadBucket.bucketName, + // ecrImageUri: agentImageAsset.imageUri, + // instanceRoleArn: ec2Fleet.instanceRole.roleArn, + // }, }); // Grant the orchestrator Lambda read+write access to memory @@ -341,6 +365,8 @@ export class AgentStack extends Stack { agentCoreStopSessionRuntimeArns: [runtime.agentRuntimeArn], // To allow cancel-task to stop ECS-backed tasks, uncomment: // ecsClusterArn: ecsCluster.cluster.clusterArn, + // To allow cancel-task to stop EC2-backed tasks, uncomment: + // ec2FleetConfig: { instanceRoleArn: ec2Fleet.instanceRole.roleArn }, }); // --- Operator dashboard --- diff --git a/cdk/test/constructs/ec2-agent-fleet.test.ts b/cdk/test/constructs/ec2-agent-fleet.test.ts new file mode 100644 index 0000000..30d009f --- /dev/null +++ b/cdk/test/constructs/ec2-agent-fleet.test.ts @@ -0,0 +1,213 @@ +/** + * MIT No Attribution + * + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +import * as path from 'path'; +import { App, Stack } from 'aws-cdk-lib'; +import { Template, Match } from 'aws-cdk-lib/assertions'; +import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; +import * as ec2 from 'aws-cdk-lib/aws-ec2'; +import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets'; +import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager'; +import { Ec2AgentFleet } from '../../src/constructs/ec2-agent-fleet'; + +function createStack(overrides?: { memoryId?: string }): { stack: Stack; template: Template } { + const app = new App(); + const stack = new Stack(app, 'TestStack'); + + const vpc = new ec2.Vpc(stack, 'Vpc', { maxAzs: 2 }); + + const agentImageAsset = new ecr_assets.DockerImageAsset(stack, 'AgentImage', { + directory: path.join(__dirname, '..', '..', '..', 'agent'), + }); + + const taskTable = new dynamodb.Table(stack, 'TaskTable', { + partitionKey: { name: 'task_id', type: dynamodb.AttributeType.STRING }, + }); + + const taskEventsTable = new dynamodb.Table(stack, 'TaskEventsTable', { + partitionKey: { name: 'task_id', type: dynamodb.AttributeType.STRING }, + sortKey: { name: 'event_id', type: dynamodb.AttributeType.STRING }, + }); + + const userConcurrencyTable = new dynamodb.Table(stack, 'UserConcurrencyTable', { + partitionKey: { name: 'user_id', type: dynamodb.AttributeType.STRING }, + }); + + const githubTokenSecret = new secretsmanager.Secret(stack, 'GitHubTokenSecret'); + + new Ec2AgentFleet(stack, 'Ec2AgentFleet', { + vpc, + agentImageAsset, + taskTable, + taskEventsTable, + userConcurrencyTable, + githubTokenSecret, + memoryId: overrides?.memoryId, + }); + + const template = Template.fromStack(stack); + return { stack, template }; +} + +describe('Ec2AgentFleet construct', () => { + test('creates an Auto Scaling Group with launch template', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::AutoScaling::AutoScalingGroup', { + MinSize: '1', + MaxSize: '3', + DesiredCapacity: '1', + }); + }); + + test('creates a security group with TCP 443 egress only', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::EC2::SecurityGroup', { + GroupDescription: 'EC2 Agent Fleet - egress TCP 443 only', + SecurityGroupEgress: Match.arrayWith([ + Match.objectLike({ + IpProtocol: 'tcp', + FromPort: 443, + ToPort: 443, + CidrIp: '0.0.0.0/0', + }), + ]), + }); + }); + + test('creates an S3 bucket with lifecycle rule', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::S3::Bucket', { + LifecycleConfiguration: { + Rules: Match.arrayWith([ + Match.objectLike({ + ExpirationInDays: 7, + Status: 'Enabled', + }), + ]), + }, + }); + }); + + test('instance role has DynamoDB read/write permissions', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: Match.arrayWith([ + Match.objectLike({ + Action: Match.arrayWith([ + 'dynamodb:PutItem', + 'dynamodb:UpdateItem', + ]), + Effect: 'Allow', + }), + ]), + }, + }); + }); + + test('instance role has Secrets Manager read permission', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: Match.arrayWith([ + Match.objectLike({ + Action: Match.arrayWith([ + 'secretsmanager:GetSecretValue', + ]), + Effect: 'Allow', + }), + ]), + }, + }); + }); + + test('instance role has Bedrock InvokeModel permissions', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: Match.arrayWith([ + Match.objectLike({ + Action: [ + 'bedrock:InvokeModel', + 'bedrock:InvokeModelWithResponseStream', + ], + Effect: 'Allow', + Resource: '*', + }), + ]), + }, + }); + }); + + test('instance role has SSM managed policy', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::IAM::Role', { + ManagedPolicyArns: Match.arrayWith([ + Match.objectLike({ + 'Fn::Join': Match.arrayWith([ + Match.arrayWith([ + Match.stringLikeRegexp('AmazonSSMManagedInstanceCore'), + ]), + ]), + }), + ]), + }); + }); + + test('creates a CloudWatch log group with 3-month retention', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::Logs::LogGroup', { + RetentionInDays: 90, + }); + }); + + test('instance role has ECR pull permissions', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: Match.arrayWith([ + Match.objectLike({ + Action: 'ecr:GetAuthorizationToken', + Effect: 'Allow', + Resource: '*', + }), + ]), + }, + }); + }); + + test('instance role has EC2 tag management permissions conditioned on fleet tag', () => { + const { template } = createStack(); + template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: Match.arrayWith([ + Match.objectLike({ + Action: ['ec2:CreateTags', 'ec2:DeleteTags'], + Effect: 'Allow', + Condition: { + StringEquals: { + 'ec2:ResourceTag/bgagent:fleet': 'Ec2AgentFleet', + }, + }, + }), + ]), + }, + }); + }); +}); diff --git a/cdk/test/handlers/shared/compute-strategy.test.ts b/cdk/test/handlers/shared/compute-strategy.test.ts index 1fac73d..ef15bea 100644 --- a/cdk/test/handlers/shared/compute-strategy.test.ts +++ b/cdk/test/handlers/shared/compute-strategy.test.ts @@ -30,8 +30,28 @@ jest.mock('@aws-sdk/client-ecs', () => ({ StopTaskCommand: jest.fn(), })); +jest.mock('@aws-sdk/client-ec2', () => ({ + EC2Client: jest.fn(() => ({ send: jest.fn() })), + DescribeInstancesCommand: jest.fn(), + CreateTagsCommand: jest.fn(), + DeleteTagsCommand: jest.fn(), +})); + +jest.mock('@aws-sdk/client-ssm', () => ({ + SSMClient: jest.fn(() => ({ send: jest.fn() })), + SendCommandCommand: jest.fn(), + GetCommandInvocationCommand: jest.fn(), + CancelCommandCommand: jest.fn(), +})); + +jest.mock('@aws-sdk/client-s3', () => ({ + S3Client: jest.fn(() => ({ send: jest.fn() })), + PutObjectCommand: jest.fn(), +})); + import { resolveComputeStrategy } from '../../../src/handlers/shared/compute-strategy'; import { AgentCoreComputeStrategy } from '../../../src/handlers/shared/strategies/agentcore-strategy'; +import { Ec2ComputeStrategy } from '../../../src/handlers/shared/strategies/ec2-strategy'; import { EcsComputeStrategy } from '../../../src/handlers/shared/strategies/ecs-strategy'; describe('resolveComputeStrategy', () => { @@ -52,4 +72,13 @@ describe('resolveComputeStrategy', () => { expect(strategy).toBeInstanceOf(EcsComputeStrategy); expect(strategy.type).toBe('ecs'); }); + + test('returns Ec2ComputeStrategy for compute_type ec2', () => { + const strategy = resolveComputeStrategy({ + compute_type: 'ec2', + runtime_arn: 'arn:test', + }); + expect(strategy).toBeInstanceOf(Ec2ComputeStrategy); + expect(strategy.type).toBe('ec2'); + }); }); diff --git a/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts new file mode 100644 index 0000000..6873722 --- /dev/null +++ b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts @@ -0,0 +1,340 @@ +/** + * MIT No Attribution + * + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + * the Software, and to permit persons to whom the Software is furnished to do so. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +const FLEET_TAG_KEY = 'bgagent:fleet'; +const FLEET_TAG_VALUE = 'test-fleet'; +const PAYLOAD_BUCKET = 'test-payload-bucket'; +const ECR_IMAGE = '123456789012.dkr.ecr.us-east-1.amazonaws.com/agent:latest'; +const INSTANCE_ID = 'i-0123456789abcdef0'; +const COMMAND_ID = 'cmd-0123456789abcdef0'; + +// Set env vars BEFORE import — Ec2ComputeStrategy reads them as module-level constants +process.env.EC2_FLEET_TAG_KEY = FLEET_TAG_KEY; +process.env.EC2_FLEET_TAG_VALUE = FLEET_TAG_VALUE; +process.env.EC2_PAYLOAD_BUCKET = PAYLOAD_BUCKET; +process.env.ECR_IMAGE_URI = ECR_IMAGE; + +const mockEc2Send = jest.fn(); +jest.mock('@aws-sdk/client-ec2', () => ({ + EC2Client: jest.fn(() => ({ send: mockEc2Send })), + DescribeInstancesCommand: jest.fn((input: unknown) => ({ _type: 'DescribeInstances', input })), + CreateTagsCommand: jest.fn((input: unknown) => ({ _type: 'CreateTags', input })), + DeleteTagsCommand: jest.fn((input: unknown) => ({ _type: 'DeleteTags', input })), +})); + +const mockSsmSend = jest.fn(); +jest.mock('@aws-sdk/client-ssm', () => ({ + SSMClient: jest.fn(() => ({ send: mockSsmSend })), + SendCommandCommand: jest.fn((input: unknown) => ({ _type: 'SendCommand', input })), + GetCommandInvocationCommand: jest.fn((input: unknown) => ({ _type: 'GetCommandInvocation', input })), + CancelCommandCommand: jest.fn((input: unknown) => ({ _type: 'CancelCommand', input })), +})); + +const mockS3Send = jest.fn(); +jest.mock('@aws-sdk/client-s3', () => ({ + S3Client: jest.fn(() => ({ send: mockS3Send })), + PutObjectCommand: jest.fn((input: unknown) => ({ _type: 'PutObject', input })), +})); + +import { Ec2ComputeStrategy } from '../../../../src/handlers/shared/strategies/ec2-strategy'; + +beforeEach(() => { + jest.clearAllMocks(); +}); + +describe('Ec2ComputeStrategy', () => { + test('type is ec2', () => { + const strategy = new Ec2ComputeStrategy(); + expect(strategy.type).toBe('ec2'); + }); + + describe('startSession', () => { + test('finds idle instance, tags as busy, uploads to S3, sends SSM command, returns handle', async () => { + // S3 upload + mockS3Send.mockResolvedValueOnce({}); + // DescribeInstances — return one idle instance + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID }] }], + }); + // CreateTags (mark busy) + mockEc2Send.mockResolvedValueOnce({}); + // SSM SendCommand + mockSsmSend.mockResolvedValueOnce({ + Command: { CommandId: COMMAND_ID }, + }); + + const strategy = new Ec2ComputeStrategy(); + const handle = await strategy.startSession({ + taskId: 'TASK001', + payload: { repo_url: 'org/repo', prompt: 'Fix the bug', issue_number: 42, max_turns: 50 }, + blueprintConfig: { compute_type: 'ec2', runtime_arn: '' }, + }); + + expect(handle.sessionId).toBe(COMMAND_ID); + expect(handle.strategyType).toBe('ec2'); + const ec2Handle = handle as Extract; + expect(ec2Handle.instanceId).toBe(INSTANCE_ID); + expect(ec2Handle.commandId).toBe(COMMAND_ID); + + // Verify S3 upload + expect(mockS3Send).toHaveBeenCalledTimes(1); + const s3Call = mockS3Send.mock.calls[0][0]; + expect(s3Call.input.Bucket).toBe(PAYLOAD_BUCKET); + expect(s3Call.input.Key).toBe('tasks/TASK001/payload.json'); + + // Verify DescribeInstances filter + expect(mockEc2Send).toHaveBeenCalledTimes(2); + const describeCall = mockEc2Send.mock.calls[0][0]; + expect(describeCall.input.Filters).toEqual(expect.arrayContaining([ + expect.objectContaining({ Name: `tag:${FLEET_TAG_KEY}`, Values: [FLEET_TAG_VALUE] }), + expect.objectContaining({ Name: 'instance-state-name', Values: ['running'] }), + expect.objectContaining({ Name: 'tag:bgagent:status', Values: ['idle'] }), + ])); + + // Verify CreateTags (busy) + const tagCall = mockEc2Send.mock.calls[1][0]; + expect(tagCall.input.Resources).toEqual([INSTANCE_ID]); + expect(tagCall.input.Tags).toEqual(expect.arrayContaining([ + { Key: 'bgagent:status', Value: 'busy' }, + { Key: 'bgagent:task-id', Value: 'TASK001' }, + ])); + + // Verify SSM SendCommand + expect(mockSsmSend).toHaveBeenCalledTimes(1); + const ssmCall = mockSsmSend.mock.calls[0][0]; + expect(ssmCall.input.DocumentName).toBe('AWS-RunShellScript'); + expect(ssmCall.input.InstanceIds).toEqual([INSTANCE_ID]); + expect(ssmCall.input.TimeoutSeconds).toBe(32400); + }); + + test('throws when no idle instances available', async () => { + // S3 upload + mockS3Send.mockResolvedValueOnce({}); + // DescribeInstances — return empty + mockEc2Send.mockResolvedValueOnce({ Reservations: [] }); + + const strategy = new Ec2ComputeStrategy(); + await expect( + strategy.startSession({ + taskId: 'TASK001', + payload: { repo_url: 'org/repo' }, + blueprintConfig: { compute_type: 'ec2', runtime_arn: '' }, + }), + ).rejects.toThrow('No idle EC2 instances available in fleet'); + }); + + test('throws when SSM SendCommand fails', async () => { + // S3 upload + mockS3Send.mockResolvedValueOnce({}); + // DescribeInstances + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID }] }], + }); + // CreateTags + mockEc2Send.mockResolvedValueOnce({}); + // SSM SendCommand — return no CommandId + mockSsmSend.mockResolvedValueOnce({ Command: {} }); + + const strategy = new Ec2ComputeStrategy(); + await expect( + strategy.startSession({ + taskId: 'TASK001', + payload: { repo_url: 'org/repo' }, + blueprintConfig: { compute_type: 'ec2', runtime_arn: '' }, + }), + ).rejects.toThrow('SSM SendCommand returned no CommandId'); + }); + }); + + describe('pollSession', () => { + const makeHandle = () => ({ + sessionId: COMMAND_ID, + strategyType: 'ec2' as const, + instanceId: INSTANCE_ID, + commandId: COMMAND_ID, + }); + + test('returns running for InProgress status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'InProgress' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'running' }); + }); + + test('returns running for Pending status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'Pending' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'running' }); + }); + + test('returns running for Delayed status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'Delayed' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'running' }); + }); + + test('returns completed for Success status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'Success' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'completed' }); + }); + + test('returns failed for Failed status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'Failed', StatusDetails: 'Script exited with code 1' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'failed', error: 'Script exited with code 1' }); + }); + + test('returns failed for Cancelled status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'Cancelled', StatusDetails: 'Cancelled by user' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'failed', error: 'Cancelled by user' }); + }); + + test('returns failed for TimedOut status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'TimedOut', StatusDetails: 'Command timed out' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'failed', error: 'Command timed out' }); + }); + + test('returns failed for Cancelling status', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'Cancelling', StatusDetails: 'Command is being cancelled' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'failed', error: 'Command is being cancelled' }); + }); + + test('returns running for unknown status (default case)', async () => { + mockSsmSend.mockResolvedValueOnce({ Status: 'SomeUnknownStatus' }); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'running' }); + }); + + test('returns failed when InvocationDoesNotExist', async () => { + const err = new Error('Invocation does not exist'); + err.name = 'InvocationDoesNotExist'; + mockSsmSend.mockRejectedValueOnce(err); + + const strategy = new Ec2ComputeStrategy(); + const result = await strategy.pollSession(makeHandle()); + expect(result).toEqual({ status: 'failed', error: 'SSM command invocation not found' }); + }); + + test('throws when handle is not ec2 type', async () => { + const strategy = new Ec2ComputeStrategy(); + await expect( + strategy.pollSession({ + sessionId: 'test', + strategyType: 'agentcore', + runtimeArn: 'arn:test', + }), + ).rejects.toThrow('pollSession called with non-ec2 handle'); + }); + }); + + describe('stopSession', () => { + test('cancels SSM command and tags instance idle', async () => { + // CancelCommand + mockSsmSend.mockResolvedValueOnce({}); + // CreateTags (idle) + mockEc2Send.mockResolvedValueOnce({}); + // DeleteTags (task-id) + mockEc2Send.mockResolvedValueOnce({}); + + const strategy = new Ec2ComputeStrategy(); + await strategy.stopSession({ + sessionId: COMMAND_ID, + strategyType: 'ec2', + instanceId: INSTANCE_ID, + commandId: COMMAND_ID, + }); + + expect(mockSsmSend).toHaveBeenCalledTimes(1); + const ssmCall = mockSsmSend.mock.calls[0][0]; + expect(ssmCall.input.CommandId).toBe(COMMAND_ID); + expect(ssmCall.input.InstanceIds).toEqual([INSTANCE_ID]); + + // Verify instance tagged back to idle + expect(mockEc2Send).toHaveBeenCalledTimes(2); + }); + + test('handles already-cancelled command gracefully', async () => { + const err = new Error('Invalid command'); + err.name = 'InvalidCommandId'; + mockSsmSend.mockRejectedValueOnce(err); + // Cleanup tags still attempted + mockEc2Send.mockResolvedValueOnce({}); + mockEc2Send.mockResolvedValueOnce({}); + + const strategy = new Ec2ComputeStrategy(); + await expect( + strategy.stopSession({ + sessionId: COMMAND_ID, + strategyType: 'ec2', + instanceId: INSTANCE_ID, + commandId: COMMAND_ID, + }), + ).resolves.toBeUndefined(); + }); + + test('throws when handle is not ec2 type', async () => { + const strategy = new Ec2ComputeStrategy(); + await expect( + strategy.stopSession({ + sessionId: 'test', + strategyType: 'agentcore', + runtimeArn: 'arn:test', + }), + ).rejects.toThrow('stopSession called with non-ec2 handle'); + }); + + test('swallows tag cleanup errors gracefully', async () => { + // CancelCommand succeeds + mockSsmSend.mockResolvedValueOnce({}); + // CreateTags fails (instance terminated) + mockEc2Send.mockRejectedValueOnce(new Error('Instance terminated')); + + const strategy = new Ec2ComputeStrategy(); + await expect( + strategy.stopSession({ + sessionId: COMMAND_ID, + strategyType: 'ec2', + instanceId: INSTANCE_ID, + commandId: COMMAND_ID, + }), + ).resolves.toBeUndefined(); + }); + }); +}); diff --git a/yarn.lock b/yarn.lock index 86a7e7e..9ede11c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -221,6 +221,27 @@ "@aws-sdk/types" "^3.222.0" tslib "^2.6.2" +"@aws-crypto/crc32c@5.2.0": + version "5.2.0" + resolved "https://registry.yarnpkg.com/@aws-crypto/crc32c/-/crc32c-5.2.0.tgz#4e34aab7f419307821509a98b9b08e84e0c1917e" + integrity sha512-+iWb8qaHLYKrNvGRbiYRHSdKRWhto5XlZUEBwDjYNf+ly5SVYG6zEoYIdxvf5R3zyeP16w4PLBn3rH1xc74Rag== + dependencies: + "@aws-crypto/util" "^5.2.0" + "@aws-sdk/types" "^3.222.0" + tslib "^2.6.2" + +"@aws-crypto/sha1-browser@5.2.0": + version "5.2.0" + resolved "https://registry.yarnpkg.com/@aws-crypto/sha1-browser/-/sha1-browser-5.2.0.tgz#b0ee2d2821d3861f017e965ef3b4cb38e3b6a0f4" + integrity sha512-OH6lveCFfcDjX4dbAvCFSYUjJZjDr/3XJ3xHtjn3Oj5b9RjojQo8npoLeA/bNwkOkrSQ0wgrHzXk4tDRxGKJeg== + dependencies: + "@aws-crypto/supports-web-crypto" "^5.2.0" + "@aws-crypto/util" "^5.2.0" + "@aws-sdk/types" "^3.222.0" + "@aws-sdk/util-locate-window" "^3.0.0" + "@smithy/util-utf8" "^2.0.0" + tslib "^2.6.2" + "@aws-crypto/sha256-browser@5.2.0": version "5.2.0" resolved "https://registry.yarnpkg.com/@aws-crypto/sha256-browser/-/sha256-browser-5.2.0.tgz#153895ef1dba6f9fce38af550e0ef58988eb649e" @@ -250,7 +271,7 @@ dependencies: tslib "^2.6.2" -"@aws-crypto/util@^5.2.0": +"@aws-crypto/util@5.2.0", "@aws-crypto/util@^5.2.0": version "5.2.0" resolved "https://registry.yarnpkg.com/@aws-crypto/util/-/util-5.2.0.tgz#71284c9cffe7927ddadac793c14f14886d3876da" integrity sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ== @@ -454,6 +475,53 @@ "@smithy/util-waiter" "^4.2.14" tslib "^2.6.2" +"@aws-sdk/client-ec2@^3.1021.0": + version "3.1030.0" + resolved "https://registry.yarnpkg.com/@aws-sdk/client-ec2/-/client-ec2-3.1030.0.tgz#1207c91571ec51d07e07f1c1454b6706d9fbc7e2" + integrity sha512-jvi++FA3GWdl0ryaT4AQo1yQaSiTzQzGZEKX+aNbHswBFTu/3sBOrIBUIsn+VrZmrbleUQ53h41ADaKrJ93NUw== + dependencies: + "@aws-crypto/sha256-browser" "5.2.0" + "@aws-crypto/sha256-js" "5.2.0" + "@aws-sdk/core" "^3.973.27" + "@aws-sdk/credential-provider-node" "^3.972.30" + "@aws-sdk/middleware-host-header" "^3.972.9" + "@aws-sdk/middleware-logger" "^3.972.9" + "@aws-sdk/middleware-recursion-detection" "^3.972.10" + "@aws-sdk/middleware-sdk-ec2" "^3.972.19" + "@aws-sdk/middleware-user-agent" "^3.972.29" + "@aws-sdk/region-config-resolver" "^3.972.11" + "@aws-sdk/types" "^3.973.7" + "@aws-sdk/util-endpoints" "^3.996.6" + "@aws-sdk/util-user-agent-browser" "^3.972.9" + "@aws-sdk/util-user-agent-node" "^3.973.15" + "@smithy/config-resolver" "^4.4.14" + "@smithy/core" "^3.23.14" + "@smithy/fetch-http-handler" "^5.3.16" + "@smithy/hash-node" "^4.2.13" + "@smithy/invalid-dependency" "^4.2.13" + "@smithy/middleware-content-length" "^4.2.13" + "@smithy/middleware-endpoint" "^4.4.29" + "@smithy/middleware-retry" "^4.5.0" + "@smithy/middleware-serde" "^4.2.17" + "@smithy/middleware-stack" "^4.2.13" + "@smithy/node-config-provider" "^4.3.13" + "@smithy/node-http-handler" "^4.5.2" + "@smithy/protocol-http" "^5.3.13" + "@smithy/smithy-client" "^4.12.9" + "@smithy/types" "^4.14.0" + "@smithy/url-parser" "^4.2.13" + "@smithy/util-base64" "^4.3.2" + "@smithy/util-body-length-browser" "^4.2.2" + "@smithy/util-body-length-node" "^4.2.3" + "@smithy/util-defaults-mode-browser" "^4.3.45" + "@smithy/util-defaults-mode-node" "^4.2.49" + "@smithy/util-endpoints" "^3.3.4" + "@smithy/util-middleware" "^4.2.13" + "@smithy/util-retry" "^4.3.0" + "@smithy/util-utf8" "^4.2.2" + "@smithy/util-waiter" "^4.2.15" + tslib "^2.6.2" + "@aws-sdk/client-ecs@^3.1021.0": version "3.1027.0" resolved "https://registry.yarnpkg.com/@aws-sdk/client-ecs/-/client-ecs-3.1027.0.tgz#fdc05b3c8a8d9457776791cb3ac4acb57da298a2" @@ -550,6 +618,67 @@ "@smithy/util-waiter" "^4.2.14" tslib "^2.6.2" +"@aws-sdk/client-s3@^3.1021.0": + version "3.1030.0" + resolved "https://registry.yarnpkg.com/@aws-sdk/client-s3/-/client-s3-3.1030.0.tgz#f5c593deb0e32fbd0a174d00feae9c69c0e7cccf" + integrity sha512-sgGb4ub0JXnHaXnok5td7A1KGwENFPwOrwgzvpkeWq9w16Sl7x2KhYtVl+Fdd/7LAvaEtm3HqrYtNmm2d0OXmQ== + dependencies: + "@aws-crypto/sha1-browser" "5.2.0" + "@aws-crypto/sha256-browser" "5.2.0" + "@aws-crypto/sha256-js" "5.2.0" + "@aws-sdk/core" "^3.973.27" + "@aws-sdk/credential-provider-node" "^3.972.30" + "@aws-sdk/middleware-bucket-endpoint" "^3.972.9" + "@aws-sdk/middleware-expect-continue" "^3.972.9" + "@aws-sdk/middleware-flexible-checksums" "^3.974.7" + "@aws-sdk/middleware-host-header" "^3.972.9" + "@aws-sdk/middleware-location-constraint" "^3.972.9" + "@aws-sdk/middleware-logger" "^3.972.9" + "@aws-sdk/middleware-recursion-detection" "^3.972.10" + "@aws-sdk/middleware-sdk-s3" "^3.972.28" + "@aws-sdk/middleware-ssec" "^3.972.9" + "@aws-sdk/middleware-user-agent" "^3.972.29" + "@aws-sdk/region-config-resolver" "^3.972.11" + "@aws-sdk/signature-v4-multi-region" "^3.996.16" + "@aws-sdk/types" "^3.973.7" + "@aws-sdk/util-endpoints" "^3.996.6" + "@aws-sdk/util-user-agent-browser" "^3.972.9" + "@aws-sdk/util-user-agent-node" "^3.973.15" + "@smithy/config-resolver" "^4.4.14" + "@smithy/core" "^3.23.14" + "@smithy/eventstream-serde-browser" "^4.2.13" + "@smithy/eventstream-serde-config-resolver" "^4.3.13" + "@smithy/eventstream-serde-node" "^4.2.13" + "@smithy/fetch-http-handler" "^5.3.16" + "@smithy/hash-blob-browser" "^4.2.14" + "@smithy/hash-node" "^4.2.13" + "@smithy/hash-stream-node" "^4.2.13" + "@smithy/invalid-dependency" "^4.2.13" + "@smithy/md5-js" "^4.2.13" + "@smithy/middleware-content-length" "^4.2.13" + "@smithy/middleware-endpoint" "^4.4.29" + "@smithy/middleware-retry" "^4.5.0" + "@smithy/middleware-serde" "^4.2.17" + "@smithy/middleware-stack" "^4.2.13" + "@smithy/node-config-provider" "^4.3.13" + "@smithy/node-http-handler" "^4.5.2" + "@smithy/protocol-http" "^5.3.13" + "@smithy/smithy-client" "^4.12.9" + "@smithy/types" "^4.14.0" + "@smithy/url-parser" "^4.2.13" + "@smithy/util-base64" "^4.3.2" + "@smithy/util-body-length-browser" "^4.2.2" + "@smithy/util-body-length-node" "^4.2.3" + "@smithy/util-defaults-mode-browser" "^4.3.45" + "@smithy/util-defaults-mode-node" "^4.2.49" + "@smithy/util-endpoints" "^3.3.4" + "@smithy/util-middleware" "^4.2.13" + "@smithy/util-retry" "^4.3.0" + "@smithy/util-stream" "^4.5.22" + "@smithy/util-utf8" "^4.2.2" + "@smithy/util-waiter" "^4.2.15" + tslib "^2.6.2" + "@aws-sdk/client-secrets-manager@^3.1021.0": version "3.1021.0" resolved "https://registry.yarnpkg.com/@aws-sdk/client-secrets-manager/-/client-secrets-manager-3.1021.0.tgz#57c6348c63146642132ffa7e885a2abba08c6ff4" @@ -595,6 +724,52 @@ "@smithy/util-utf8" "^4.2.2" tslib "^2.6.2" +"@aws-sdk/client-ssm@^3.1021.0": + version "3.1030.0" + resolved "https://registry.yarnpkg.com/@aws-sdk/client-ssm/-/client-ssm-3.1030.0.tgz#430b86d76add91913b220c2de3234a3af05b1f75" + integrity sha512-FKu4tINBafrEp6FfoJDaM+KvTqwwK5gnVTrc0ZYbAQ5L7oMuCx02MEQvRI6VLaNhuIqXMKijKo2lodyLY+00WA== + dependencies: + "@aws-crypto/sha256-browser" "5.2.0" + "@aws-crypto/sha256-js" "5.2.0" + "@aws-sdk/core" "^3.973.27" + "@aws-sdk/credential-provider-node" "^3.972.30" + "@aws-sdk/middleware-host-header" "^3.972.9" + "@aws-sdk/middleware-logger" "^3.972.9" + "@aws-sdk/middleware-recursion-detection" "^3.972.10" + "@aws-sdk/middleware-user-agent" "^3.972.29" + "@aws-sdk/region-config-resolver" "^3.972.11" + "@aws-sdk/types" "^3.973.7" + "@aws-sdk/util-endpoints" "^3.996.6" + "@aws-sdk/util-user-agent-browser" "^3.972.9" + "@aws-sdk/util-user-agent-node" "^3.973.15" + "@smithy/config-resolver" "^4.4.14" + "@smithy/core" "^3.23.14" + "@smithy/fetch-http-handler" "^5.3.16" + "@smithy/hash-node" "^4.2.13" + "@smithy/invalid-dependency" "^4.2.13" + "@smithy/middleware-content-length" "^4.2.13" + "@smithy/middleware-endpoint" "^4.4.29" + "@smithy/middleware-retry" "^4.5.0" + "@smithy/middleware-serde" "^4.2.17" + "@smithy/middleware-stack" "^4.2.13" + "@smithy/node-config-provider" "^4.3.13" + "@smithy/node-http-handler" "^4.5.2" + "@smithy/protocol-http" "^5.3.13" + "@smithy/smithy-client" "^4.12.9" + "@smithy/types" "^4.14.0" + "@smithy/url-parser" "^4.2.13" + "@smithy/util-base64" "^4.3.2" + "@smithy/util-body-length-browser" "^4.2.2" + "@smithy/util-body-length-node" "^4.2.3" + "@smithy/util-defaults-mode-browser" "^4.3.45" + "@smithy/util-defaults-mode-node" "^4.2.49" + "@smithy/util-endpoints" "^3.3.4" + "@smithy/util-middleware" "^4.2.13" + "@smithy/util-retry" "^4.3.0" + "@smithy/util-utf8" "^4.2.2" + "@smithy/util-waiter" "^4.2.15" + tslib "^2.6.2" + "@aws-sdk/core@^3.973.26": version "3.973.26" resolved "https://registry.yarnpkg.com/@aws-sdk/core/-/core-3.973.26.tgz#5989c5300f9da7ed57f34b88091c77b4fa5d7256" @@ -633,6 +808,14 @@ "@smithy/util-utf8" "^4.2.2" tslib "^2.6.2" +"@aws-sdk/crc64-nvme@^3.972.6": + version "3.972.6" + resolved "https://registry.yarnpkg.com/@aws-sdk/crc64-nvme/-/crc64-nvme-3.972.6.tgz#4e023b3e3b5f67d3129c97c5caa3e18699d3d550" + integrity sha512-NMbiqKdruhwwgI6nzBVe2jWMkXjaoQz2YOs3rFX+2F3gGyrJDkDPwMpV/RsTFeq2vAQ055wZNtOXFK4NYSkM8g== + dependencies: + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@aws-sdk/credential-provider-env@^3.972.24": version "3.972.24" resolved "https://registry.yarnpkg.com/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.24.tgz#bc33a34f15704d02552aa8b3994d17008b991f86" @@ -911,6 +1094,19 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@aws-sdk/middleware-bucket-endpoint@^3.972.9": + version "3.972.9" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-bucket-endpoint/-/middleware-bucket-endpoint-3.972.9.tgz#4dc1e7a155e612b447387c268740781c785d5810" + integrity sha512-COToYKgquDyligbcAep7ygs48RK+mwe/IYprq4+TSrVFzNOYmzWvHf6werpnKV5VYpRiwdn+Wa5ZXkPqLVwcTg== + dependencies: + "@aws-sdk/types" "^3.973.7" + "@aws-sdk/util-arn-parser" "^3.972.3" + "@smithy/node-config-provider" "^4.3.13" + "@smithy/protocol-http" "^5.3.13" + "@smithy/types" "^4.14.0" + "@smithy/util-config-provider" "^4.2.2" + tslib "^2.6.2" + "@aws-sdk/middleware-endpoint-discovery@^3.972.9": version "3.972.9" resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-endpoint-discovery/-/middleware-endpoint-discovery-3.972.9.tgz#664f9074b0017255680c200bd9b8b23a864c0ad5" @@ -933,6 +1129,36 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@aws-sdk/middleware-expect-continue@^3.972.9": + version "3.972.9" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-expect-continue/-/middleware-expect-continue-3.972.9.tgz#ad62cbc4c5f310a5d104b7fc1150eca13a3c07a4" + integrity sha512-V/FNCjFxnh4VGu+HdSiW4Yg5GELihA1MIDSAdsEPvuayXBVmr0Jaa6jdLAZLH38KYXl/vVjri9DQJWnTAujHEA== + dependencies: + "@aws-sdk/types" "^3.973.7" + "@smithy/protocol-http" "^5.3.13" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + +"@aws-sdk/middleware-flexible-checksums@^3.974.7": + version "3.974.7" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-flexible-checksums/-/middleware-flexible-checksums-3.974.7.tgz#cc2c8efc5932e7bb55d58d717fe60c45fbf21a41" + integrity sha512-uU4/ch2CLHB8Phu1oTKnnQ4e8Ujqi49zEnQYBhWYT53zfFvtJCdGsaOoypBr8Fm/pmCBssRmGoIQ4sixgdLP9w== + dependencies: + "@aws-crypto/crc32" "5.2.0" + "@aws-crypto/crc32c" "5.2.0" + "@aws-crypto/util" "5.2.0" + "@aws-sdk/core" "^3.973.27" + "@aws-sdk/crc64-nvme" "^3.972.6" + "@aws-sdk/types" "^3.973.7" + "@smithy/is-array-buffer" "^4.2.2" + "@smithy/node-config-provider" "^4.3.13" + "@smithy/protocol-http" "^5.3.13" + "@smithy/types" "^4.14.0" + "@smithy/util-middleware" "^4.2.13" + "@smithy/util-stream" "^4.5.22" + "@smithy/util-utf8" "^4.2.2" + tslib "^2.6.2" + "@aws-sdk/middleware-host-header@^3.972.8": version "3.972.8" resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-host-header/-/middleware-host-header-3.972.8.tgz#72186e96500b49b38fb5482d6b7bf95e5b985281" @@ -953,6 +1179,15 @@ "@smithy/types" "^4.14.0" tslib "^2.6.2" +"@aws-sdk/middleware-location-constraint@^3.972.9": + version "3.972.9" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-location-constraint/-/middleware-location-constraint-3.972.9.tgz#35a7a35b678d931970b146024078c509631861ad" + integrity sha512-TyfOi2XNdOZpNKeTJwRUsVAGa+14nkyMb2VVGG+eDgcWG/ed6+NUo72N3hT6QJioxym80NSinErD+LBRF0Ir1w== + dependencies: + "@aws-sdk/types" "^3.973.7" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@aws-sdk/middleware-logger@^3.972.8": version "3.972.8" resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-logger/-/middleware-logger-3.972.8.tgz#7fee4223afcb6f7828dbdf4ea745ce15027cf384" @@ -993,6 +1228,49 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@aws-sdk/middleware-sdk-ec2@^3.972.19": + version "3.972.19" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-sdk-ec2/-/middleware-sdk-ec2-3.972.19.tgz#9b66499bebae68321e50dbcf3f839faf30d28e60" + integrity sha512-eB73yVCMipYwoxiKzRAy4gt1FiAVl/EodfdMxvPomKZw+yWEWKiGhwrVhtLHhFRAM+QkMLnEslsbvsyFELHW+g== + dependencies: + "@aws-sdk/types" "^3.973.7" + "@aws-sdk/util-format-url" "^3.972.9" + "@smithy/middleware-endpoint" "^4.4.29" + "@smithy/protocol-http" "^5.3.13" + "@smithy/signature-v4" "^5.3.13" + "@smithy/smithy-client" "^4.12.9" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + +"@aws-sdk/middleware-sdk-s3@^3.972.28": + version "3.972.28" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-sdk-s3/-/middleware-sdk-s3-3.972.28.tgz#cfdcaab69da8870e039dc58499ac323cd7667242" + integrity sha512-qJHcJQH9UNPUrnPlRtCozKjtqAaypQ5IgQxTNoPsVYIQeuwNIA8Rwt3NvGij1vCDYDfCmZaPLpnJEHlZXeFqmg== + dependencies: + "@aws-sdk/core" "^3.973.27" + "@aws-sdk/types" "^3.973.7" + "@aws-sdk/util-arn-parser" "^3.972.3" + "@smithy/core" "^3.23.14" + "@smithy/node-config-provider" "^4.3.13" + "@smithy/protocol-http" "^5.3.13" + "@smithy/signature-v4" "^5.3.13" + "@smithy/smithy-client" "^4.12.9" + "@smithy/types" "^4.14.0" + "@smithy/util-config-provider" "^4.2.2" + "@smithy/util-middleware" "^4.2.13" + "@smithy/util-stream" "^4.5.22" + "@smithy/util-utf8" "^4.2.2" + tslib "^2.6.2" + +"@aws-sdk/middleware-ssec@^3.972.9": + version "3.972.9" + resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-ssec/-/middleware-ssec-3.972.9.tgz#3658fd92752682316c48b736d6c013a75cfcd7aa" + integrity sha512-wSA2BR7L0CyBNDJeSrleIIzC+DzL93YNTdfU0KPGLiocK6YsRv1nPAzPF+BFSdcs0Qa5ku5Kcf4KvQcWwKGenQ== + dependencies: + "@aws-sdk/types" "^3.973.7" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@aws-sdk/middleware-user-agent@^3.972.28": version "3.972.28" resolved "https://registry.yarnpkg.com/@aws-sdk/middleware-user-agent/-/middleware-user-agent-3.972.28.tgz#7f81d96d2fed0334ff601af62d77e14f67fb9d22" @@ -1149,6 +1427,18 @@ "@smithy/types" "^4.14.0" tslib "^2.6.2" +"@aws-sdk/signature-v4-multi-region@^3.996.16": + version "3.996.16" + resolved "https://registry.yarnpkg.com/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.16.tgz#a078e17caa4b94dad8add2e8b1be6f2362d4c83f" + integrity sha512-EMdXYB4r/k5RWq86fugjRhid5JA+Z6MpS7n4sij4u5/C+STrkvuf9aFu41rJA9MjUzxCLzv8U2XL8cH2GSRYpQ== + dependencies: + "@aws-sdk/middleware-sdk-s3" "^3.972.28" + "@aws-sdk/types" "^3.973.7" + "@smithy/protocol-http" "^5.3.13" + "@smithy/signature-v4" "^5.3.13" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@aws-sdk/token-providers@3.1021.0": version "3.1021.0" resolved "https://registry.yarnpkg.com/@aws-sdk/token-providers/-/token-providers-3.1021.0.tgz#90905a8def49f90e54a73849e25ad4bcc4dbea2a" @@ -1191,6 +1481,13 @@ "@smithy/types" "^4.14.0" tslib "^2.6.2" +"@aws-sdk/util-arn-parser@^3.972.3": + version "3.972.3" + resolved "https://registry.yarnpkg.com/@aws-sdk/util-arn-parser/-/util-arn-parser-3.972.3.tgz#ed989862bbb172ce16d9e1cd5790e5fe367219c2" + integrity sha512-HzSD8PMFrvgi2Kserxuff5VitNq2sgf3w9qxmskKDiDTThWfVteJxuCS9JXiPIPtmCrp+7N9asfIaVhBFORllA== + dependencies: + tslib "^2.6.2" + "@aws-sdk/util-dynamodb@^3.996.2": version "3.996.2" resolved "https://registry.yarnpkg.com/@aws-sdk/util-dynamodb/-/util-dynamodb-3.996.2.tgz#9521dfe84c031809f8cf2e32f03c58fd8a4bb84f" @@ -1230,6 +1527,16 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@aws-sdk/util-format-url@^3.972.9": + version "3.972.9" + resolved "https://registry.yarnpkg.com/@aws-sdk/util-format-url/-/util-format-url-3.972.9.tgz#a52e141dc7b8dcb954460e34fe4a0b9451734d7b" + integrity sha512-fNJXHrs0ZT7Wx0KGIqKv7zLxlDXt2vqjx9z6oKUQFmpE5o4xxnSryvVHfHpIifYHWKz94hFccIldJ0YSZjlCBw== + dependencies: + "@aws-sdk/types" "^3.973.7" + "@smithy/querystring-builder" "^4.2.13" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@aws-sdk/util-locate-window@^3.0.0": version "3.965.5" resolved "https://registry.yarnpkg.com/@aws-sdk/util-locate-window/-/util-locate-window-3.965.5.tgz#e30e6ff2aff6436209ed42c765dec2d2a48df7c0" @@ -2913,6 +3220,21 @@ dependencies: "@sinonjs/commons" "^3.0.1" +"@smithy/chunked-blob-reader-native@^4.2.3": + version "4.2.3" + resolved "https://registry.yarnpkg.com/@smithy/chunked-blob-reader-native/-/chunked-blob-reader-native-4.2.3.tgz#9e79a80d8d44798e7ce7a8f968cbbbaf5a40d950" + integrity sha512-jA5k5Udn7Y5717L86h4EIv06wIr3xn8GM1qHRi/Nf31annXcXHJjBKvgztnbn2TxH3xWrPBfgwHsOwZf0UmQWw== + dependencies: + "@smithy/util-base64" "^4.3.2" + tslib "^2.6.2" + +"@smithy/chunked-blob-reader@^5.2.2": + version "5.2.2" + resolved "https://registry.yarnpkg.com/@smithy/chunked-blob-reader/-/chunked-blob-reader-5.2.2.tgz#3af48e37b10e5afed478bb31d2b7bc03c81d196c" + integrity sha512-St+kVicSyayWQca+I1rGitaOEH6uKgE8IUWoYnnEX26SWdWQcL6LvMSD19Lg+vYHKdT9B2Zuu7rd3i6Wnyb/iw== + dependencies: + tslib "^2.6.2" + "@smithy/config-resolver@^4.4.13": version "4.4.13" resolved "https://registry.yarnpkg.com/@smithy/config-resolver/-/config-resolver-4.4.13.tgz#8bffd41de647ec349b4a74bf02bdd1b32452bacd" @@ -3001,6 +3323,16 @@ "@smithy/util-hex-encoding" "^4.2.2" tslib "^2.6.2" +"@smithy/eventstream-codec@^4.2.13": + version "4.2.13" + resolved "https://registry.yarnpkg.com/@smithy/eventstream-codec/-/eventstream-codec-4.2.13.tgz#7fcdf18bc1acaec395b5d387d65136973bd3e1cc" + integrity sha512-vYahwBAtRaAcFbOmE9aLr12z7RiHYDSLcnogSdxfm7kKfsNa3wH+NU5r7vTeB5rKvLsWyPjVX8iH94brP7umiQ== + dependencies: + "@aws-crypto/crc32" "5.2.0" + "@smithy/types" "^4.14.0" + "@smithy/util-hex-encoding" "^4.2.2" + tslib "^2.6.2" + "@smithy/eventstream-serde-browser@^4.2.12": version "4.2.12" resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-browser/-/eventstream-serde-browser-4.2.12.tgz#3ceb8743750edaf5d6e42cd1a2327e048f85ba4e" @@ -3010,6 +3342,15 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@smithy/eventstream-serde-browser@^4.2.13": + version "4.2.13" + resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-browser/-/eventstream-serde-browser-4.2.13.tgz#3b7f4fe380e022db489ca5eef0291b3835c369e6" + integrity sha512-wwybfcOX0tLqCcBP378TIU9IqrDuZq/tDV48LlZNydMpCnqnYr+hWBAYbRE+rFFf/p7IkDJySM3bgiMKP2ihPg== + dependencies: + "@smithy/eventstream-serde-universal" "^4.2.13" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@smithy/eventstream-serde-config-resolver@^4.3.12": version "4.3.12" resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-config-resolver/-/eventstream-serde-config-resolver-4.3.12.tgz#a29164bc5480d935ece9dbdca0f79924259e519a" @@ -3018,6 +3359,14 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@smithy/eventstream-serde-config-resolver@^4.3.13": + version "4.3.13" + resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-config-resolver/-/eventstream-serde-config-resolver-4.3.13.tgz#75477c75a5d8d4f2844319ba713b345a8b1615e0" + integrity sha512-ied1lO559PtAsMJzg2TKRlctLnEi1PfkNeMMpdwXDImk1zV9uvS/Oxoy/vcy9uv1GKZAjDAB5xT6ziE9fzm5wA== + dependencies: + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@smithy/eventstream-serde-node@^4.2.12": version "4.2.12" resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-node/-/eventstream-serde-node-4.2.12.tgz#2cc06a1ea1108f679d376aab81e95a6f69877b4a" @@ -3027,6 +3376,15 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@smithy/eventstream-serde-node@^4.2.13": + version "4.2.13" + resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-node/-/eventstream-serde-node-4.2.13.tgz#6ac8f2b06355ba15a3ccf84fc053fff9bd741e35" + integrity sha512-hFyK+ORJrxAN3RYoaD6+gsGDQjeix8HOEkosoajvXYZ4VeqonM3G4jd9IIRm/sWGXUKmudkY9KdYjzosUqdM8A== + dependencies: + "@smithy/eventstream-serde-universal" "^4.2.13" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@smithy/eventstream-serde-universal@^4.2.12": version "4.2.12" resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-universal/-/eventstream-serde-universal-4.2.12.tgz#a3640d1e7c3e348168360035661db8d21b51e078" @@ -3036,6 +3394,15 @@ "@smithy/types" "^4.13.1" tslib "^2.6.2" +"@smithy/eventstream-serde-universal@^4.2.13": + version "4.2.13" + resolved "https://registry.yarnpkg.com/@smithy/eventstream-serde-universal/-/eventstream-serde-universal-4.2.13.tgz#805c5dfea13bcffb72e3ea46a03de43ddb70843b" + integrity sha512-kRrq4EKLGeOxhC2CBEhRNcu1KSzNJzYY7RK3S7CxMPgB5dRrv55WqQOtRwQxQLC04xqORFLUgnDlc6xrNUULaA== + dependencies: + "@smithy/eventstream-codec" "^4.2.13" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@smithy/fetch-http-handler@^5.3.15": version "5.3.15" resolved "https://registry.yarnpkg.com/@smithy/fetch-http-handler/-/fetch-http-handler-5.3.15.tgz#acf69a8b3bab0396d2782fc901bad0b957c8c6a2" @@ -3058,6 +3425,16 @@ "@smithy/util-base64" "^4.3.2" tslib "^2.6.2" +"@smithy/hash-blob-browser@^4.2.14": + version "4.2.14" + resolved "https://registry.yarnpkg.com/@smithy/hash-blob-browser/-/hash-blob-browser-4.2.14.tgz#c32a6a5b70fa94e324f2ca04296e2355ddfe4c9b" + integrity sha512-rtQ5es8r/5v4rav7q5QTsfx9CtCyzrz/g7ZZZBH2xtMmd6G/KQrLOWfSHTvFOUPlVy59RQvxeBYJaLRoybMEyA== + dependencies: + "@smithy/chunked-blob-reader" "^5.2.2" + "@smithy/chunked-blob-reader-native" "^4.2.3" + "@smithy/types" "^4.14.0" + tslib "^2.6.2" + "@smithy/hash-node@^4.2.12": version "4.2.12" resolved "https://registry.yarnpkg.com/@smithy/hash-node/-/hash-node-4.2.12.tgz#0ee7f6a1d2958c313ee24b07159dcb9547792441" @@ -3078,6 +3455,15 @@ "@smithy/util-utf8" "^4.2.2" tslib "^2.6.2" +"@smithy/hash-stream-node@^4.2.13": + version "4.2.13" + resolved "https://registry.yarnpkg.com/@smithy/hash-stream-node/-/hash-stream-node-4.2.13.tgz#0e0912b12b8f11c360446812e2ada8fec3f6ddd1" + integrity sha512-WdQ7HwUjINXETeh6dqUeob1UHIYx8kAn9PSp1HhM2WWegiZBYVy2WXIs1lB07SZLan/udys9SBnQGt9MQbDpdg== + dependencies: + "@smithy/types" "^4.14.0" + "@smithy/util-utf8" "^4.2.2" + tslib "^2.6.2" + "@smithy/invalid-dependency@^4.2.12": version "4.2.12" resolved "https://registry.yarnpkg.com/@smithy/invalid-dependency/-/invalid-dependency-4.2.12.tgz#1a28c13fb33684b91848d4d6ec5104a1c1413e7f" @@ -3108,6 +3494,15 @@ dependencies: tslib "^2.6.2" +"@smithy/md5-js@^4.2.13": + version "4.2.13" + resolved "https://registry.yarnpkg.com/@smithy/md5-js/-/md5-js-4.2.13.tgz#4c96c41336d7d655758c3a7457439fabc9d4b6cd" + integrity sha512-cNm7I9NXolFxtS20ojROddOEpSAeI1Obq6pd1Kj5HtHws3s9Fkk8DdHDfQSs5KuxCewZuVK6UqrJnfJmiMzDuQ== + dependencies: + "@smithy/types" "^4.14.0" + "@smithy/util-utf8" "^4.2.2" + tslib "^2.6.2" + "@smithy/middleware-content-length@^4.2.12": version "4.2.12" resolved "https://registry.yarnpkg.com/@smithy/middleware-content-length/-/middleware-content-length-4.2.12.tgz#dec97ea1444b12e734156b764e9953b2b37c70fd" From 42566d06b7fff0fa95c5646d92946563faea09da Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 13:34:14 -0700 Subject: [PATCH 2/7] chore(pr): address Copilot review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unnecessary iam:PassRole from orchestrator (EC2 strategy never passes a role to any API) - Simplify ec2FleetConfig in task-api to empty object (instanceRoleArn was unused) - Use CDK Tags.of() for ASG fleet tag propagation instead of no-op user-data tagging — instances are now tagged at launch - Fix missing AWS_REGION in boot script by deriving from IMDS - Eliminate shell injection risk by reading all task data from S3 payload at runtime instead of interpolating into bash exports - Add cleanup trap in boot script to always retag instance as idle on exit (success, error, or signal) - Add try/catch rollback in startSession to retag instance as idle when SSM dispatch fails - Generalize ECS-specific log messages in poll loop to be compute-backend-agnostic (uses strategy type label) --- cdk/src/constructs/ec2-agent-fleet.ts | 9 +- cdk/src/constructs/task-api.ts | 4 +- cdk/src/constructs/task-orchestrator.ts | 13 +-- cdk/src/handlers/orchestrate-task.ts | 55 +++++----- cdk/src/handlers/shared/orchestrator.ts | 8 +- .../shared/strategies/ec2-strategy.ts | 102 +++++++++--------- cdk/src/stacks/agent.ts | 3 +- 7 files changed, 94 insertions(+), 100 deletions(-) diff --git a/cdk/src/constructs/ec2-agent-fleet.ts b/cdk/src/constructs/ec2-agent-fleet.ts index ded688d..5fc5b35 100644 --- a/cdk/src/constructs/ec2-agent-fleet.ts +++ b/cdk/src/constructs/ec2-agent-fleet.ts @@ -17,7 +17,7 @@ * SOFTWARE. */ -import { Duration, RemovalPolicy } from 'aws-cdk-lib'; +import { Duration, RemovalPolicy, Tags } from 'aws-cdk-lib'; import * as autoscaling from 'aws-cdk-lib/aws-autoscaling'; import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; import * as ec2 from 'aws-cdk-lib/aws-ec2'; @@ -184,10 +184,9 @@ export class Ec2AgentFleet extends Construct { healthCheck: autoscaling.HealthCheck.ec2(), }); - // Tag the ASG instances for fleet identification - // CDK auto-propagates tags from the ASG to instances - this.autoScalingGroup.node.defaultChild; - this.autoScalingGroup.addUserData(`aws ec2 create-tags --resources "$(ec2-metadata -i | cut -d' ' -f2)" --region "$(ec2-metadata --availability-zone | cut -d' ' -f2 | sed 's/.$//')" --tags Key=${this.fleetTagKey},Value=${this.fleetTagValue}`); + // Tag ASG instances for fleet identification — CDK propagates these at launch + Tags.of(this.autoScalingGroup).add(this.fleetTagKey, this.fleetTagValue); + Tags.of(this.autoScalingGroup).add('bgagent:status', 'idle'); NagSuppressions.addResourceSuppressions(this.instanceRole, [ { diff --git a/cdk/src/constructs/task-api.ts b/cdk/src/constructs/task-api.ts index 159851f..26bbd0f 100644 --- a/cdk/src/constructs/task-api.ts +++ b/cdk/src/constructs/task-api.ts @@ -111,9 +111,7 @@ export interface TaskApiProps { * EC2 fleet configuration for cancel-task to stop EC2-backed tasks. * When provided, the cancel Lambda gets `ssm:CancelCommand` permission. */ - readonly ec2FleetConfig?: { - readonly instanceRoleArn: string; - }; + readonly ec2FleetConfig?: Record; } /** diff --git a/cdk/src/constructs/task-orchestrator.ts b/cdk/src/constructs/task-orchestrator.ts index 1937021..91fb676 100644 --- a/cdk/src/constructs/task-orchestrator.ts +++ b/cdk/src/constructs/task-orchestrator.ts @@ -137,7 +137,6 @@ export interface TaskOrchestratorProps { readonly fleetTagValue: string; readonly payloadBucketName: string; readonly ecrImageUri: string; - readonly instanceRoleArn: string; }; } @@ -304,15 +303,9 @@ export class TaskOrchestrator extends Construct { resources: [`arn:${Aws.PARTITION}:s3:::${props.ec2Config.payloadBucketName}/*`], })); - this.fn.addToRolePolicy(new iam.PolicyStatement({ - actions: ['iam:PassRole'], - resources: [props.ec2Config.instanceRoleArn], - conditions: { - StringEquals: { - 'iam:PassedToService': 'ec2.amazonaws.com', - }, - }, - })); + // Note: iam:PassRole is not needed — the orchestrator does not pass the + // instance role to any EC2 API. The ASG launch template handles instance + // profile association at fleet creation time. } // Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints) diff --git a/cdk/src/handlers/orchestrate-task.ts b/cdk/src/handlers/orchestrate-task.ts index 02f372f..1fe924f 100644 --- a/cdk/src/handlers/orchestrate-task.ts +++ b/cdk/src/handlers/orchestrate-task.ts @@ -41,8 +41,8 @@ interface OrchestrateTaskEvent { const MAX_POLL_ATTEMPTS = 1020; // ~8.5h at 30s intervals const MAX_NON_RUNNING_POLLS = 10; // ~5min grace period for session to start -const MAX_CONSECUTIVE_ECS_POLL_FAILURES = 3; -const MAX_CONSECUTIVE_ECS_COMPLETED_POLLS = 5; +const MAX_CONSECUTIVE_COMPUTE_POLL_FAILURES = 3; +const MAX_CONSECUTIVE_COMPUTE_COMPLETED_POLLS = 5; const durableHandler: DurableExecutionHandler = async (event, context) => { const { task_id: taskId } = event; @@ -176,63 +176,62 @@ const durableHandler: DurableExecutionHandler = asyn 'await-agent-completion', async (state) => { const ddbState = await pollTaskStatus(taskId, state); - let consecutiveEcsPollFailures = 0; - let consecutiveEcsCompletedPolls = 0; + let consecutiveComputePollFailures = 0; + let consecutiveComputeCompletedPolls = 0; + const computeLabel = blueprintConfig.compute_type.toUpperCase(); - // ECS compute-level crash detection: if DDB is not terminal, check ECS task status + // Compute-level crash detection: if DDB is not terminal, check compute task status if ( ddbState.lastStatus && !TERMINAL_STATUSES.includes(ddbState.lastStatus) && computeStrategy ) { try { - const ecsStatus = await computeStrategy.pollSession(sessionHandle); - if (ecsStatus.status === 'failed') { - const errorMsg = 'error' in ecsStatus ? ecsStatus.error : 'ECS task failed'; - logger.warn('ECS task failed before DDB terminal write', { + const computeStatus = await computeStrategy.pollSession(sessionHandle); + if (computeStatus.status === 'failed') { + const errorMsg = 'error' in computeStatus ? computeStatus.error : `${computeLabel} task failed`; + logger.warn(`${computeLabel} task failed before DDB terminal write`, { task_id: taskId, error: errorMsg, }); - await failTask(taskId, ddbState.lastStatus, `ECS container failed: ${errorMsg}`, task.user_id, true); + await failTask(taskId, ddbState.lastStatus, `${computeLabel} compute failed: ${errorMsg}`, task.user_id, true); return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED }; } - if (ecsStatus.status === 'completed') { - consecutiveEcsCompletedPolls = (state.consecutiveEcsCompletedPolls ?? 0) + 1; - if (consecutiveEcsCompletedPolls >= MAX_CONSECUTIVE_ECS_COMPLETED_POLLS) { - // ECS task exited successfully but DDB never reached terminal — the agent - // likely crashed after container exit code 0 but before writing status. - logger.error('ECS task completed but DDB never caught up — failing task', { + if (computeStatus.status === 'completed') { + consecutiveComputeCompletedPolls = (state.consecutiveComputeCompletedPolls ?? 0) + 1; + if (consecutiveComputeCompletedPolls >= MAX_CONSECUTIVE_COMPUTE_COMPLETED_POLLS) { + logger.error(`${computeLabel} task completed but DDB never caught up — failing task`, { task_id: taskId, - consecutive_completed_polls: consecutiveEcsCompletedPolls, + consecutive_completed_polls: consecutiveComputeCompletedPolls, }); - await failTask(taskId, ddbState.lastStatus, `ECS task exited successfully but agent never wrote terminal status after ${consecutiveEcsCompletedPolls} polls`, task.user_id, true); + await failTask(taskId, ddbState.lastStatus, `${computeLabel} task exited successfully but agent never wrote terminal status after ${consecutiveComputeCompletedPolls} polls`, task.user_id, true); return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED }; } - logger.warn('ECS task completed but DDB not terminal — waiting for DDB catchup', { + logger.warn(`${computeLabel} task completed but DDB not terminal — waiting for DDB catchup`, { task_id: taskId, - consecutive_completed_polls: consecutiveEcsCompletedPolls, + consecutive_completed_polls: consecutiveComputeCompletedPolls, }); } } catch (err) { - consecutiveEcsPollFailures = (state.consecutiveEcsPollFailures ?? 0) + 1; - if (consecutiveEcsPollFailures >= MAX_CONSECUTIVE_ECS_POLL_FAILURES) { - logger.error('ECS pollSession failed repeatedly — failing task', { + consecutiveComputePollFailures = (state.consecutiveComputePollFailures ?? 0) + 1; + if (consecutiveComputePollFailures >= MAX_CONSECUTIVE_COMPUTE_POLL_FAILURES) { + logger.error(`${computeLabel} pollSession failed repeatedly — failing task`, { task_id: taskId, - consecutive_failures: consecutiveEcsPollFailures, + consecutive_failures: consecutiveComputePollFailures, error: err instanceof Error ? err.message : String(err), }); - await failTask(taskId, ddbState.lastStatus, `ECS poll failed ${consecutiveEcsPollFailures} consecutive times: ${err instanceof Error ? err.message : String(err)}`, task.user_id, true); + await failTask(taskId, ddbState.lastStatus, `${computeLabel} poll failed ${consecutiveComputePollFailures} consecutive times: ${err instanceof Error ? err.message : String(err)}`, task.user_id, true); return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED }; } - logger.warn('ECS pollSession check failed (non-fatal)', { + logger.warn(`${computeLabel} pollSession check failed (non-fatal)`, { task_id: taskId, - consecutive_failures: consecutiveEcsPollFailures, + consecutive_failures: consecutiveComputePollFailures, error: err instanceof Error ? err.message : String(err), }); } } - return { ...ddbState, consecutiveEcsPollFailures, consecutiveEcsCompletedPolls }; + return { ...ddbState, consecutiveComputePollFailures, consecutiveComputeCompletedPolls }; }, { initialState: { attempts: 0 }, diff --git a/cdk/src/handlers/shared/orchestrator.ts b/cdk/src/handlers/shared/orchestrator.ts index 4e7758f..0fbab93 100644 --- a/cdk/src/handlers/shared/orchestrator.ts +++ b/cdk/src/handlers/shared/orchestrator.ts @@ -47,10 +47,10 @@ export interface PollState { readonly lastStatus?: TaskStatusType; /** True when the agent stopped sending heartbeats while still RUNNING (likely crash/OOM). */ readonly sessionUnhealthy?: boolean; - /** Consecutive ECS poll failures — escalated to error after 3. */ - readonly consecutiveEcsPollFailures?: number; - /** Consecutive polls where ECS reports completed but DDB is not terminal — escalated after 5. */ - readonly consecutiveEcsCompletedPolls?: number; + /** Consecutive compute poll failures — escalated to error after 3. */ + readonly consecutiveComputePollFailures?: number; + /** Consecutive polls where compute reports completed but DDB is not terminal — escalated after 5. */ + readonly consecutiveComputeCompletedPolls?: number; } /** After RUNNING this long, we expect `agent_heartbeat_at` from the agent (if ever set). */ diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts index 8a6d8a9..ae2d859 100644 --- a/cdk/src/handlers/shared/strategies/ec2-strategy.ts +++ b/cdk/src/handlers/shared/strategies/ec2-strategy.ts @@ -105,69 +105,75 @@ export class Ec2ComputeStrategy implements ComputeStrategy { ], })); - // 4. Build the boot command (mirrors ECS strategy env vars and Python boot command) - const envExports = [ - `export TASK_ID='${taskId}'`, - `export REPO_URL='${String(payload.repo_url ?? '')}'`, - ...(payload.prompt ? [`export TASK_DESCRIPTION='${String(payload.prompt).replace(/'/g, "'\\''")}'`] : []), - ...(payload.issue_number ? [`export ISSUE_NUMBER='${String(payload.issue_number)}'`] : []), - `export MAX_TURNS='${String(payload.max_turns ?? 100)}'`, - ...(payload.max_budget_usd !== undefined ? [`export MAX_BUDGET_USD='${String(payload.max_budget_usd)}'`] : []), - ...(blueprintConfig.model_id ? [`export ANTHROPIC_MODEL='${blueprintConfig.model_id}'`] : []), - ...(blueprintConfig.system_prompt_overrides ? [`export SYSTEM_PROMPT_OVERRIDES='${blueprintConfig.system_prompt_overrides.replace(/'/g, "'\\''")}'`] : []), - "export CLAUDE_CODE_USE_BEDROCK='1'", - ...(payload.github_token_secret_arn ? [`export GITHUB_TOKEN_SECRET_ARN='${String(payload.github_token_secret_arn)}'`] : []), - ...(payload.memory_id ? [`export MEMORY_ID='${String(payload.memory_id)}'`] : []), - ]; - + // 4. Build the boot script + // All task data is read from the S3 payload at runtime to avoid shell + // injection — no untrusted values are interpolated into the script. + // Only infrastructure constants (bucket name, ECR URI) are embedded. const bootScript = [ '#!/bin/bash', 'set -euo pipefail', '', + '# Derive region from IMDS (SSM does not always set AWS_REGION)', + 'export AWS_REGION=$(ec2-metadata --availability-zone | cut -d" " -f2 | sed \'s/.$/\'\'/)\'', + 'export AWS_DEFAULT_REGION="$AWS_REGION"', + '', + '# Resolve instance ID for tag cleanup', + 'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)', + '', + '# Cleanup trap — always retag instance as idle on exit (success, error, or signal)', + 'cleanup() {', + ' docker system prune -f || true', + ' rm -f /tmp/payload.json', + ` aws ec2 create-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:status,Value=idle || true`, + ` aws ec2 delete-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:task-id || true`, + '}', + 'trap cleanup EXIT', + '', '# Fetch payload from S3', `aws s3 cp "s3://${EC2_PAYLOAD_BUCKET}/${payloadKey}" /tmp/payload.json`, 'export AGENT_PAYLOAD=$(cat /tmp/payload.json)', - '', - '# Set environment variables', - ...envExports, + 'export CLAUDE_CODE_USE_BEDROCK=1', '', '# ECR login and pull', - `aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $(echo '${ECR_IMAGE_URI}' | cut -d/ -f1)`, + `aws ecr get-login-password --region "$AWS_REGION" | docker login --username AWS --password-stdin $(echo '${ECR_IMAGE_URI}' | cut -d/ -f1)`, `docker pull '${ECR_IMAGE_URI}'`, '', - '# Run the agent container', - 'docker run --rm \\', - ' -e TASK_ID -e REPO_URL -e CLAUDE_CODE_USE_BEDROCK -e AGENT_PAYLOAD \\', - ' -e AWS_REGION -e AWS_DEFAULT_REGION \\', - ` ${payload.prompt ? '-e TASK_DESCRIPTION ' : ''}${payload.issue_number ? '-e ISSUE_NUMBER ' : ''}-e MAX_TURNS \\`, - ` ${payload.max_budget_usd !== undefined ? '-e MAX_BUDGET_USD ' : ''}${blueprintConfig.model_id ? '-e ANTHROPIC_MODEL ' : ''}${blueprintConfig.system_prompt_overrides ? '-e SYSTEM_PROMPT_OVERRIDES ' : ''}\\`, - ` ${payload.github_token_secret_arn ? '-e GITHUB_TOKEN_SECRET_ARN ' : ''}${payload.memory_id ? '-e MEMORY_ID ' : ''}\\`, - ` '${ECR_IMAGE_URI}' \\`, + '# Run the agent container — all config is read from AGENT_PAYLOAD inside the container', + `docker run --rm -e AGENT_PAYLOAD -e CLAUDE_CODE_USE_BEDROCK -e AWS_REGION -e AWS_DEFAULT_REGION '${ECR_IMAGE_URI}' \\`, ' python -c \'import json, os, sys; sys.path.insert(0, "/app"); from entrypoint import run_task; p = json.loads(os.environ["AGENT_PAYLOAD"]); r = run_task(repo_url=p.get("repo_url",""), task_description=p.get("prompt",""), issue_number=str(p.get("issue_number","")), github_token=p.get("github_token",""), anthropic_model=p.get("model_id",""), max_turns=int(p.get("max_turns",100)), max_budget_usd=p.get("max_budget_usd"), aws_region=os.environ.get("AWS_REGION",""), task_id=p.get("task_id",""), hydrated_context=p.get("hydrated_context"), system_prompt_overrides=p.get("system_prompt_overrides",""), prompt_version=p.get("prompt_version",""), memory_id=p.get("memory_id",""), task_type=p.get("task_type","new_task"), branch_name=p.get("branch_name",""), pr_number=str(p.get("pr_number",""))); sys.exit(0 if r.get("status")=="success" else 1)\'', - '', - '# Cleanup', - 'docker system prune -f', - 'rm -f /tmp/payload.json', - '', - '# Tag instance back to idle', - 'INSTANCE_ID=$(ec2-metadata -i | cut -d" " -f2)', - 'aws ec2 create-tags --resources "$INSTANCE_ID" --tags Key=bgagent:status,Value=idle', - 'aws ec2 delete-tags --resources "$INSTANCE_ID" --tags Key=bgagent:task-id', ].join('\n'); - // 5. Send SSM Run Command - const ssmResult = await getSsmClient().send(new SendCommandCommand({ - DocumentName: 'AWS-RunShellScript', - InstanceIds: [instanceId], - Parameters: { - commands: [bootScript], - }, - TimeoutSeconds: 32400, // 9 hours, matches orchestrator max - })); + // 5. Send SSM Run Command — rollback instance tags on failure + let commandId: string; + try { + const ssmResult = await getSsmClient().send(new SendCommandCommand({ + DocumentName: 'AWS-RunShellScript', + InstanceIds: [instanceId], + Parameters: { + commands: [bootScript], + }, + TimeoutSeconds: 32400, // 9 hours, matches orchestrator max + })); - const commandId = ssmResult.Command?.CommandId; - if (!commandId) { - throw new Error('SSM SendCommand returned no CommandId'); + if (!ssmResult.Command?.CommandId) { + throw new Error('SSM SendCommand returned no CommandId'); + } + commandId = ssmResult.Command.CommandId; + } catch (err) { + // Rollback: retag instance as idle so it's not stuck as busy + try { + await getEc2Client().send(new CreateTagsCommand({ + Resources: [instanceId], + Tags: [{ Key: 'bgagent:status', Value: 'idle' }], + })); + await getEc2Client().send(new DeleteTagsCommand({ + Resources: [instanceId], + Tags: [{ Key: 'bgagent:task-id' }], + })); + } catch { + logger.warn('Failed to rollback instance tags after dispatch failure', { instance_id: instanceId, task_id: taskId }); + } + throw err; } logger.info('EC2 SSM command dispatched', { diff --git a/cdk/src/stacks/agent.ts b/cdk/src/stacks/agent.ts index b28f6be..7fd89d1 100644 --- a/cdk/src/stacks/agent.ts +++ b/cdk/src/stacks/agent.ts @@ -339,7 +339,6 @@ export class AgentStack extends Stack { // fleetTagValue: ec2Fleet.fleetTagValue, // payloadBucketName: ec2Fleet.payloadBucket.bucketName, // ecrImageUri: agentImageAsset.imageUri, - // instanceRoleArn: ec2Fleet.instanceRole.roleArn, // }, }); @@ -366,7 +365,7 @@ export class AgentStack extends Stack { // To allow cancel-task to stop ECS-backed tasks, uncomment: // ecsClusterArn: ecsCluster.cluster.clusterArn, // To allow cancel-task to stop EC2-backed tasks, uncomment: - // ec2FleetConfig: { instanceRoleArn: ec2Fleet.instanceRole.roleArn }, + // ec2FleetConfig: {}, }); // --- Operator dashboard --- From a41fcb6d2c881083989c744548ef5d0875043eaa Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 14:05:50 -0700 Subject: [PATCH 3/7] chore(pr): address second round of review comments - Fix malformed sed quoting in AWS_REGION derivation (ec2-strategy.ts) - Remove unused blueprintConfig destructuring (ec2-strategy.ts) - Scope EC2/SSM IAM permissions: condition ec2:CreateTags on fleet tag, scope ssm:SendCommand to fleet-tagged instances and AWS-RunShellScript document, separate DescribeInstances (requires resource '*') --- cdk/src/constructs/task-orchestrator.ts | 38 ++++++++++++++++--- .../shared/strategies/ec2-strategy.ts | 4 +- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/cdk/src/constructs/task-orchestrator.ts b/cdk/src/constructs/task-orchestrator.ts index 91fb676..ed4c167 100644 --- a/cdk/src/constructs/task-orchestrator.ts +++ b/cdk/src/constructs/task-orchestrator.ts @@ -281,17 +281,45 @@ export class TaskOrchestrator extends Construct { // EC2 fleet compute strategy permissions (only when EC2 is configured) if (props.ec2Config) { + // DescribeInstances does not support resource-level permissions this.fn.addToRolePolicy(new iam.PolicyStatement({ - actions: [ - 'ec2:DescribeInstances', - 'ec2:CreateTags', - ], + actions: ['ec2:DescribeInstances'], + resources: ['*'], + })); + + // CreateTags/DeleteTags scoped to fleet instances only + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['ec2:CreateTags', 'ec2:DeleteTags'], resources: ['*'], + conditions: { + StringEquals: { + [`ec2:ResourceTag/${props.ec2Config.fleetTagKey}`]: props.ec2Config.fleetTagValue, + }, + }, + })); + + // SSM SendCommand scoped to fleet-tagged instances; Get/Cancel scoped to all commands + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['ssm:SendCommand'], + resources: [ + `arn:${Aws.PARTITION}:ec2:*:*:instance/*`, + ], + conditions: { + StringEquals: { + [`ssm:resourceTag/${props.ec2Config.fleetTagKey}`]: props.ec2Config.fleetTagValue, + }, + }, + })); + + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['ssm:SendCommand'], + resources: [ + `arn:${Aws.PARTITION}:ssm:*::document/AWS-RunShellScript`, + ], })); this.fn.addToRolePolicy(new iam.PolicyStatement({ actions: [ - 'ssm:SendCommand', 'ssm:GetCommandInvocation', 'ssm:CancelCommand', ], diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts index ae2d859..b58ba22 100644 --- a/cdk/src/handlers/shared/strategies/ec2-strategy.ts +++ b/cdk/src/handlers/shared/strategies/ec2-strategy.ts @@ -68,7 +68,7 @@ export class Ec2ComputeStrategy implements ComputeStrategy { ); } - const { taskId, payload, blueprintConfig } = input; + const { taskId, payload } = input; const payloadJson = JSON.stringify(payload); // 1. Upload payload to S3 @@ -114,7 +114,7 @@ export class Ec2ComputeStrategy implements ComputeStrategy { 'set -euo pipefail', '', '# Derive region from IMDS (SSM does not always set AWS_REGION)', - 'export AWS_REGION=$(ec2-metadata --availability-zone | cut -d" " -f2 | sed \'s/.$/\'\'/)\'', + "export AWS_REGION=$(ec2-metadata --availability-zone | cut -d' ' -f2 | sed 's/.$//')", 'export AWS_DEFAULT_REGION="$AWS_REGION"', '', '# Resolve instance ID for tag cleanup', From 6420ed9779610798c4dbb19d090ff66e3ef0a592 Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 14:22:12 -0700 Subject: [PATCH 4/7] fix(ec2): address TOCTOU race and heartbeat false-positive for EC2 tasks 1. TOCTOU race in instance selection: after tagging an instance as busy, re-describe to verify our task-id stuck. If another orchestrator won the race, try the next idle candidate instead of double-dispatching. 2. Heartbeat false-positive: EC2/ECS tasks invoke run_task() directly and may not send continuous heartbeats. Suppress sessionUnhealthy checks when compute-level crash detection (pollSession) is active, preventing premature task failure after ~6 minutes. 3. SSM Cancelling status: map to 'running' (transient) instead of 'failed' to avoid premature failure while cancel propagates. 4. Fix babel parse errors in test mocks (remove `: unknown` annotations from jest.mock factory callbacks). --- cdk/src/handlers/orchestrate-task.ts | 11 ++- .../shared/strategies/ec2-strategy.ts | 55 +++++++++++---- .../shared/strategies/ec2-strategy.test.ts | 69 +++++++++++++++---- 3 files changed, 108 insertions(+), 27 deletions(-) diff --git a/cdk/src/handlers/orchestrate-task.ts b/cdk/src/handlers/orchestrate-task.ts index 1fe924f..0d7fb82 100644 --- a/cdk/src/handlers/orchestrate-task.ts +++ b/cdk/src/handlers/orchestrate-task.ts @@ -231,7 +231,10 @@ const durableHandler: DurableExecutionHandler = asyn } } - return { ...ddbState, consecutiveComputePollFailures, consecutiveComputeCompletedPolls }; + // For ECS/EC2 tasks, suppress heartbeat-based sessionUnhealthy since those + // backends have compute-level crash detection and may not send heartbeats. + const suppressHeartbeat = computeStrategy ? { sessionUnhealthy: false } : {}; + return { ...ddbState, ...suppressHeartbeat, consecutiveComputePollFailures, consecutiveComputeCompletedPolls }; }, { initialState: { attempts: 0 }, @@ -239,7 +242,11 @@ const durableHandler: DurableExecutionHandler = asyn if (state.lastStatus && TERMINAL_STATUSES.includes(state.lastStatus)) { return { shouldContinue: false }; } - if (state.sessionUnhealthy) { + // Heartbeat-based health checks only apply to AgentCore tasks. + // ECS/EC2 tasks have compute-level crash detection (pollSession) in the + // poll callback, so stale heartbeats should not terminate polling early + // — the agent entrypoint on those backends may not send continuous heartbeats. + if (state.sessionUnhealthy && !computeStrategy) { return { shouldContinue: false }; } if (state.attempts >= MAX_POLL_ATTEMPTS) { diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts index b58ba22..6fb89a1 100644 --- a/cdk/src/handlers/shared/strategies/ec2-strategy.ts +++ b/cdk/src/handlers/shared/strategies/ec2-strategy.ts @@ -80,7 +80,10 @@ export class Ec2ComputeStrategy implements ComputeStrategy { ContentType: 'application/json', })); - // 2. Find an idle instance + // 2. Find an idle instance and claim it atomically via tag-then-verify. + // Multiple orchestrators may race for the same instance, so after tagging + // we re-describe to confirm our task-id stuck. If another invocation + // overwrote the tag, we try the next candidate. const describeResult = await getEc2Client().send(new DescribeInstancesCommand({ Filters: [ { Name: `tag:${EC2_FLEET_TAG_KEY}`, Values: [EC2_FLEET_TAG_VALUE] }, @@ -89,21 +92,47 @@ export class Ec2ComputeStrategy implements ComputeStrategy { ], })); - const instances = (describeResult.Reservations ?? []).flatMap(r => r.Instances ?? []); - if (instances.length === 0 || !instances[0]?.InstanceId) { + const candidates = (describeResult.Reservations ?? []).flatMap(r => r.Instances ?? []); + if (candidates.length === 0) { throw new Error('No idle EC2 instances available in fleet'); } - const instanceId = instances[0].InstanceId; + let instanceId: string | undefined; + for (const candidate of candidates) { + const candidateId = candidate.InstanceId; + if (!candidateId) continue; - // 3. Tag instance as busy - await getEc2Client().send(new CreateTagsCommand({ - Resources: [instanceId], - Tags: [ - { Key: 'bgagent:status', Value: 'busy' }, - { Key: 'bgagent:task-id', Value: taskId }, - ], - })); + // 3a. Tag instance as busy with our task-id + await getEc2Client().send(new CreateTagsCommand({ + Resources: [candidateId], + Tags: [ + { Key: 'bgagent:status', Value: 'busy' }, + { Key: 'bgagent:task-id', Value: taskId }, + ], + })); + + // 3b. Re-describe to verify we won the race + const verifyResult = await getEc2Client().send(new DescribeInstancesCommand({ + InstanceIds: [candidateId], + })); + const verifiedInstance = verifyResult.Reservations?.[0]?.Instances?.[0]; + const taskIdTag = verifiedInstance?.Tags?.find(t => t.Key === 'bgagent:task-id'); + + if (taskIdTag?.Value === taskId) { + instanceId = candidateId; + break; + } + + logger.warn('Lost instance claim race, trying next candidate', { + task_id: taskId, + instance_id: candidateId, + claimed_by: taskIdTag?.Value, + }); + } + + if (!instanceId) { + throw new Error('No idle EC2 instances available in fleet (all candidates claimed by other tasks)'); + } // 4. Build the boot script // All task data is read from the S3 payload at runtime to avoid shell @@ -209,13 +238,13 @@ export class Ec2ComputeStrategy implements ComputeStrategy { case 'InProgress': case 'Pending': case 'Delayed': + case 'Cancelling': // transient — command still running while cancel propagates return { status: 'running' }; case 'Success': return { status: 'completed' }; case 'Failed': case 'Cancelled': case 'TimedOut': - case 'Cancelling': return { status: 'failed', error: result.StatusDetails ?? `SSM command ${status}` }; default: // Covers any unexpected status values — treat as running to avoid diff --git a/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts index 6873722..70431eb 100644 --- a/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts +++ b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts @@ -33,23 +33,23 @@ process.env.ECR_IMAGE_URI = ECR_IMAGE; const mockEc2Send = jest.fn(); jest.mock('@aws-sdk/client-ec2', () => ({ EC2Client: jest.fn(() => ({ send: mockEc2Send })), - DescribeInstancesCommand: jest.fn((input: unknown) => ({ _type: 'DescribeInstances', input })), - CreateTagsCommand: jest.fn((input: unknown) => ({ _type: 'CreateTags', input })), - DeleteTagsCommand: jest.fn((input: unknown) => ({ _type: 'DeleteTags', input })), + DescribeInstancesCommand: jest.fn((input) => ({ _type: 'DescribeInstances', input })), + CreateTagsCommand: jest.fn((input) => ({ _type: 'CreateTags', input })), + DeleteTagsCommand: jest.fn((input) => ({ _type: 'DeleteTags', input })), })); const mockSsmSend = jest.fn(); jest.mock('@aws-sdk/client-ssm', () => ({ SSMClient: jest.fn(() => ({ send: mockSsmSend })), - SendCommandCommand: jest.fn((input: unknown) => ({ _type: 'SendCommand', input })), - GetCommandInvocationCommand: jest.fn((input: unknown) => ({ _type: 'GetCommandInvocation', input })), - CancelCommandCommand: jest.fn((input: unknown) => ({ _type: 'CancelCommand', input })), + SendCommandCommand: jest.fn((input) => ({ _type: 'SendCommand', input })), + GetCommandInvocationCommand: jest.fn((input) => ({ _type: 'GetCommandInvocation', input })), + CancelCommandCommand: jest.fn((input) => ({ _type: 'CancelCommand', input })), })); const mockS3Send = jest.fn(); jest.mock('@aws-sdk/client-s3', () => ({ S3Client: jest.fn(() => ({ send: mockS3Send })), - PutObjectCommand: jest.fn((input: unknown) => ({ _type: 'PutObject', input })), + PutObjectCommand: jest.fn((input) => ({ _type: 'PutObject', input })), })); import { Ec2ComputeStrategy } from '../../../../src/handlers/shared/strategies/ec2-strategy'; @@ -65,7 +65,7 @@ describe('Ec2ComputeStrategy', () => { }); describe('startSession', () => { - test('finds idle instance, tags as busy, uploads to S3, sends SSM command, returns handle', async () => { + test('finds idle instance, tags as busy, verifies claim, uploads to S3, sends SSM command, returns handle', async () => { // S3 upload mockS3Send.mockResolvedValueOnce({}); // DescribeInstances — return one idle instance @@ -74,6 +74,10 @@ describe('Ec2ComputeStrategy', () => { }); // CreateTags (mark busy) mockEc2Send.mockResolvedValueOnce({}); + // DescribeInstances — verify claim (tag matches our task-id) + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID, Tags: [{ Key: 'bgagent:task-id', Value: 'TASK001' }] }] }], + }); // SSM SendCommand mockSsmSend.mockResolvedValueOnce({ Command: { CommandId: COMMAND_ID }, @@ -98,8 +102,8 @@ describe('Ec2ComputeStrategy', () => { expect(s3Call.input.Bucket).toBe(PAYLOAD_BUCKET); expect(s3Call.input.Key).toBe('tasks/TASK001/payload.json'); - // Verify DescribeInstances filter - expect(mockEc2Send).toHaveBeenCalledTimes(2); + // Verify EC2 calls: DescribeInstances (find idle), CreateTags (claim), DescribeInstances (verify) + expect(mockEc2Send).toHaveBeenCalledTimes(3); const describeCall = mockEc2Send.mock.calls[0][0]; expect(describeCall.input.Filters).toEqual(expect.arrayContaining([ expect.objectContaining({ Name: `tag:${FLEET_TAG_KEY}`, Values: [FLEET_TAG_VALUE] }), @@ -123,6 +127,43 @@ describe('Ec2ComputeStrategy', () => { expect(ssmCall.input.TimeoutSeconds).toBe(32400); }); + test('tries next candidate when race is lost on first instance', async () => { + const INSTANCE_ID_2 = 'i-0987654321fedcba0'; + // S3 upload + mockS3Send.mockResolvedValueOnce({}); + // DescribeInstances — return two idle instances + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID }, { InstanceId: INSTANCE_ID_2 }] }], + }); + // CreateTags on first instance + mockEc2Send.mockResolvedValueOnce({}); + // Verify first instance — another task claimed it + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID, Tags: [{ Key: 'bgagent:task-id', Value: 'OTHER_TASK' }] }] }], + }); + // CreateTags on second instance + mockEc2Send.mockResolvedValueOnce({}); + // Verify second instance — our task-id stuck + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID_2, Tags: [{ Key: 'bgagent:task-id', Value: 'TASK001' }] }] }], + }); + // SSM SendCommand + mockSsmSend.mockResolvedValueOnce({ + Command: { CommandId: COMMAND_ID }, + }); + + const strategy = new Ec2ComputeStrategy(); + const handle = await strategy.startSession({ + taskId: 'TASK001', + payload: { repo_url: 'org/repo' }, + blueprintConfig: { compute_type: 'ec2', runtime_arn: '' }, + }); + + const ec2Handle = handle as Extract; + expect(ec2Handle.instanceId).toBe(INSTANCE_ID_2); + expect(mockEc2Send).toHaveBeenCalledTimes(5); // describe + 2*(tag + verify) + }); + test('throws when no idle instances available', async () => { // S3 upload mockS3Send.mockResolvedValueOnce({}); @@ -148,6 +189,10 @@ describe('Ec2ComputeStrategy', () => { }); // CreateTags mockEc2Send.mockResolvedValueOnce({}); + // DescribeInstances — verify claim + mockEc2Send.mockResolvedValueOnce({ + Reservations: [{ Instances: [{ InstanceId: INSTANCE_ID, Tags: [{ Key: 'bgagent:task-id', Value: 'TASK001' }] }] }], + }); // SSM SendCommand — return no CommandId mockSsmSend.mockResolvedValueOnce({ Command: {} }); @@ -226,12 +271,12 @@ describe('Ec2ComputeStrategy', () => { expect(result).toEqual({ status: 'failed', error: 'Command timed out' }); }); - test('returns failed for Cancelling status', async () => { + test('returns running for Cancelling status (transient)', async () => { mockSsmSend.mockResolvedValueOnce({ Status: 'Cancelling', StatusDetails: 'Command is being cancelled' }); const strategy = new Ec2ComputeStrategy(); const result = await strategy.pollSession(makeHandle()); - expect(result).toEqual({ status: 'failed', error: 'Command is being cancelled' }); + expect(result).toEqual({ status: 'running' }); }); test('returns running for unknown status (default case)', async () => { From b6caa02bc30c0e47d2e546e0b33244dddd24f345 Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 14:39:34 -0700 Subject: [PATCH 5/7] fix: use single quotes for non-interpolated template literals (lint) --- cdk/src/handlers/shared/strategies/ec2-strategy.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts index 6fb89a1..73227c7 100644 --- a/cdk/src/handlers/shared/strategies/ec2-strategy.ts +++ b/cdk/src/handlers/shared/strategies/ec2-strategy.ts @@ -153,8 +153,8 @@ export class Ec2ComputeStrategy implements ComputeStrategy { 'cleanup() {', ' docker system prune -f || true', ' rm -f /tmp/payload.json', - ` aws ec2 create-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:status,Value=idle || true`, - ` aws ec2 delete-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:task-id || true`, + ' aws ec2 create-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:status,Value=idle || true', + ' aws ec2 delete-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:task-id || true', '}', 'trap cleanup EXIT', '', From 2a53b4a0598f5aad857c7c8216654ff7d397b244 Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 14:53:29 -0700 Subject: [PATCH 6/7] fix(ec2): address third round of review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Add rollback on verify failure: if DescribeInstances throws during the tag-then-verify claim, roll back the busy/task-id tags so the instance isn't stuck. 2. Use docker container prune instead of docker system prune in cleanup trap to preserve cached images and avoid re-pulling on next task. 3. Add ecr:BatchCheckLayerAvailability to instance role ECR permissions — required for docker pull from ECR. 4. InvocationDoesNotExist now rethrows instead of returning failed, letting the orchestrator's consecutiveComputePollFailures counter handle transient propagation delays (fails after 3 consecutive). --- cdk/src/constructs/ec2-agent-fleet.ts | 1 + .../shared/strategies/ec2-strategy.ts | 47 ++++++++++++++----- .../shared/strategies/ec2-strategy.test.ts | 5 +- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/cdk/src/constructs/ec2-agent-fleet.ts b/cdk/src/constructs/ec2-agent-fleet.ts index 5fc5b35..1c38b3d 100644 --- a/cdk/src/constructs/ec2-agent-fleet.ts +++ b/cdk/src/constructs/ec2-agent-fleet.ts @@ -125,6 +125,7 @@ export class Ec2AgentFleet extends Construct { this.instanceRole.addToPrincipalPolicy(new iam.PolicyStatement({ actions: [ 'ecr:BatchGetImage', + 'ecr:BatchCheckLayerAvailability', 'ecr:GetDownloadUrlForLayer', ], resources: [props.agentImageAsset.repository.repositoryArn], diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts index 73227c7..ca78397 100644 --- a/cdk/src/handlers/shared/strategies/ec2-strategy.ts +++ b/cdk/src/handlers/shared/strategies/ec2-strategy.ts @@ -111,14 +111,35 @@ export class Ec2ComputeStrategy implements ComputeStrategy { ], })); - // 3b. Re-describe to verify we won the race - const verifyResult = await getEc2Client().send(new DescribeInstancesCommand({ - InstanceIds: [candidateId], - })); - const verifiedInstance = verifyResult.Reservations?.[0]?.Instances?.[0]; - const taskIdTag = verifiedInstance?.Tags?.find(t => t.Key === 'bgagent:task-id'); + // 3b. Re-describe to verify we won the race. If the verify call itself + // fails (throttle, network), roll back the tags so the instance isn't + // stuck as busy, then propagate the error. + let verified = false; + try { + const verifyResult = await getEc2Client().send(new DescribeInstancesCommand({ + InstanceIds: [candidateId], + })); + const verifiedInstance = verifyResult.Reservations?.[0]?.Instances?.[0]; + const taskIdTag = verifiedInstance?.Tags?.find(t => t.Key === 'bgagent:task-id'); + verified = taskIdTag?.Value === taskId; + } catch (verifyErr) { + // Best-effort rollback before propagating + try { + await getEc2Client().send(new CreateTagsCommand({ + Resources: [candidateId], + Tags: [{ Key: 'bgagent:status', Value: 'idle' }], + })); + await getEc2Client().send(new DeleteTagsCommand({ + Resources: [candidateId], + Tags: [{ Key: 'bgagent:task-id' }], + })); + } catch { + logger.warn('Failed to rollback instance tags after verify failure', { instance_id: candidateId, task_id: taskId }); + } + throw verifyErr; + } - if (taskIdTag?.Value === taskId) { + if (verified) { instanceId = candidateId; break; } @@ -126,7 +147,6 @@ export class Ec2ComputeStrategy implements ComputeStrategy { logger.warn('Lost instance claim race, trying next candidate', { task_id: taskId, instance_id: candidateId, - claimed_by: taskIdTag?.Value, }); } @@ -151,7 +171,7 @@ export class Ec2ComputeStrategy implements ComputeStrategy { '', '# Cleanup trap — always retag instance as idle on exit (success, error, or signal)', 'cleanup() {', - ' docker system prune -f || true', + ' docker container prune -f || true', ' rm -f /tmp/payload.json', ' aws ec2 create-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:status,Value=idle || true', ' aws ec2 delete-tags --resources "$INSTANCE_ID" --region "$AWS_REGION" --tags Key=bgagent:task-id || true', @@ -252,10 +272,11 @@ export class Ec2ComputeStrategy implements ComputeStrategy { return { status: 'running' }; } } catch (err) { - const errName = err instanceof Error ? err.name : undefined; - if (errName === 'InvocationDoesNotExist') { - return { status: 'failed', error: 'SSM command invocation not found' }; - } + // InvocationDoesNotExist can occur transiently while the command is + // still propagating to the instance. Rethrow so the orchestrator's + // consecutiveComputePollFailures counter handles it — the command + // will be retried on the next poll cycle and only failed after 3 + // consecutive errors. throw err; } } diff --git a/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts index 70431eb..370bf47 100644 --- a/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts +++ b/cdk/test/handlers/shared/strategies/ec2-strategy.test.ts @@ -287,14 +287,13 @@ describe('Ec2ComputeStrategy', () => { expect(result).toEqual({ status: 'running' }); }); - test('returns failed when InvocationDoesNotExist', async () => { + test('throws InvocationDoesNotExist so orchestrator retry counter handles it', async () => { const err = new Error('Invocation does not exist'); err.name = 'InvocationDoesNotExist'; mockSsmSend.mockRejectedValueOnce(err); const strategy = new Ec2ComputeStrategy(); - const result = await strategy.pollSession(makeHandle()); - expect(result).toEqual({ status: 'failed', error: 'SSM command invocation not found' }); + await expect(strategy.pollSession(makeHandle())).rejects.toThrow('Invocation does not exist'); }); test('throws when handle is not ec2 type', async () => { From c96db299a9f4efb7091e0e3fda2ddf9b99c8e832 Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Tue, 14 Apr 2026 17:31:40 -0700 Subject: [PATCH 7/7] fix(ec2): fix boot script module path and pass GitHub token secret to container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs prevented the EC2 compute strategy from working end-to-end: 1. Python sys.path used /app but the Docker image places modules at /app/src — fixed to sys.path.insert(0, "/app/src"). 2. GITHUB_TOKEN_SECRET_ARN was not passed to the Docker container, causing the agent to fail with "github_token is required" — now exported in the boot script and forwarded via docker run -e. Also enables the EC2 fleet construct in agent.ts with blueprints for krokoko/agent-plugins and aws-samples/sample-autonomous-cloud-coding-agents. --- .../shared/strategies/ec2-strategy.ts | 8 ++- cdk/src/stacks/agent.ts | 63 +++++++++++-------- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/cdk/src/handlers/shared/strategies/ec2-strategy.ts b/cdk/src/handlers/shared/strategies/ec2-strategy.ts index ca78397..80aff82 100644 --- a/cdk/src/handlers/shared/strategies/ec2-strategy.ts +++ b/cdk/src/handlers/shared/strategies/ec2-strategy.ts @@ -68,8 +68,9 @@ export class Ec2ComputeStrategy implements ComputeStrategy { ); } - const { taskId, payload } = input; + const { taskId, payload, blueprintConfig } = input; const payloadJson = JSON.stringify(payload); + const githubTokenSecretArn = blueprintConfig.github_token_secret_arn ?? ''; // 1. Upload payload to S3 const payloadKey = `tasks/${taskId}/payload.json`; @@ -182,14 +183,15 @@ export class Ec2ComputeStrategy implements ComputeStrategy { `aws s3 cp "s3://${EC2_PAYLOAD_BUCKET}/${payloadKey}" /tmp/payload.json`, 'export AGENT_PAYLOAD=$(cat /tmp/payload.json)', 'export CLAUDE_CODE_USE_BEDROCK=1', + `export GITHUB_TOKEN_SECRET_ARN='${githubTokenSecretArn}'`, '', '# ECR login and pull', `aws ecr get-login-password --region "$AWS_REGION" | docker login --username AWS --password-stdin $(echo '${ECR_IMAGE_URI}' | cut -d/ -f1)`, `docker pull '${ECR_IMAGE_URI}'`, '', '# Run the agent container — all config is read from AGENT_PAYLOAD inside the container', - `docker run --rm -e AGENT_PAYLOAD -e CLAUDE_CODE_USE_BEDROCK -e AWS_REGION -e AWS_DEFAULT_REGION '${ECR_IMAGE_URI}' \\`, - ' python -c \'import json, os, sys; sys.path.insert(0, "/app"); from entrypoint import run_task; p = json.loads(os.environ["AGENT_PAYLOAD"]); r = run_task(repo_url=p.get("repo_url",""), task_description=p.get("prompt",""), issue_number=str(p.get("issue_number","")), github_token=p.get("github_token",""), anthropic_model=p.get("model_id",""), max_turns=int(p.get("max_turns",100)), max_budget_usd=p.get("max_budget_usd"), aws_region=os.environ.get("AWS_REGION",""), task_id=p.get("task_id",""), hydrated_context=p.get("hydrated_context"), system_prompt_overrides=p.get("system_prompt_overrides",""), prompt_version=p.get("prompt_version",""), memory_id=p.get("memory_id",""), task_type=p.get("task_type","new_task"), branch_name=p.get("branch_name",""), pr_number=str(p.get("pr_number",""))); sys.exit(0 if r.get("status")=="success" else 1)\'', + `docker run --rm -e AGENT_PAYLOAD -e CLAUDE_CODE_USE_BEDROCK -e AWS_REGION -e AWS_DEFAULT_REGION -e GITHUB_TOKEN_SECRET_ARN '${ECR_IMAGE_URI}' \\`, + ' python -c \'import json, os, sys; sys.path.insert(0, "/app/src"); from entrypoint import run_task; p = json.loads(os.environ["AGENT_PAYLOAD"]); r = run_task(repo_url=p.get("repo_url",""), task_description=p.get("prompt",""), issue_number=str(p.get("issue_number","")), github_token=p.get("github_token",""), anthropic_model=p.get("model_id",""), max_turns=int(p.get("max_turns",100)), max_budget_usd=p.get("max_budget_usd"), aws_region=os.environ.get("AWS_REGION",""), task_id=p.get("task_id",""), hydrated_context=p.get("hydrated_context"), system_prompt_overrides=p.get("system_prompt_overrides",""), prompt_version=p.get("prompt_version",""), memory_id=p.get("memory_id",""), task_type=p.get("task_type","new_task"), branch_name=p.get("branch_name",""), pr_number=str(p.get("pr_number",""))); sys.exit(0 if r.get("status")=="success" else 1)\'', ].join('\n'); // 5. Send SSM Run Command — rollback instance tags on failure diff --git a/cdk/src/stacks/agent.ts b/cdk/src/stacks/agent.ts index 7fd89d1..eb154ae 100644 --- a/cdk/src/stacks/agent.ts +++ b/cdk/src/stacks/agent.ts @@ -23,8 +23,7 @@ import * as bedrock from '@aws-cdk/aws-bedrock-alpha'; import * as agentcoremixins from '@aws-cdk/mixins-preview/aws-bedrockagentcore'; import { Stack, StackProps, RemovalPolicy, CfnOutput, CfnResource } from 'aws-cdk-lib'; import * as ec2 from 'aws-cdk-lib/aws-ec2'; -// ecr_assets import is only needed when the ECS block below is uncommented -// import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets'; +import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets'; import * as iam from 'aws-cdk-lib/aws-iam'; import * as logs from 'aws-cdk-lib/aws-logs'; import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager'; @@ -37,7 +36,7 @@ import { Blueprint } from '../constructs/blueprint'; import { ConcurrencyReconciler } from '../constructs/concurrency-reconciler'; import { DnsFirewall } from '../constructs/dns-firewall'; // import { EcsAgentCluster } from '../constructs/ecs-agent-cluster'; -// import { Ec2AgentFleet } from '../constructs/ec2-agent-fleet'; +import { Ec2AgentFleet } from '../constructs/ec2-agent-fleet'; import { RepoTable } from '../constructs/repo-table'; import { TaskApi } from '../constructs/task-api'; import { TaskDashboard } from '../constructs/task-dashboard'; @@ -66,9 +65,20 @@ export class AgentStack extends Stack { const agentPluginsBlueprint = new Blueprint(this, 'AgentPluginsBlueprint', { repo: 'krokoko/agent-plugins', repoTable: repoTable.table, + compute: { + type: 'ec2', + }, + }); + + const ownRepoBlueprint = new Blueprint(this, 'OwnRepoBlueprint', { + repo: 'aws-samples/sample-autonomous-cloud-coding-agents', + repoTable: repoTable.table, + compute: { + type: 'ec2', + }, }); - const blueprints = [agentPluginsBlueprint]; + const blueprints = [agentPluginsBlueprint, ownRepoBlueprint]; // The AwsCustomResource singleton Lambda used by Blueprint constructs NagSuppressions.addResourceSuppressionsByPath(this, [ @@ -297,20 +307,21 @@ export class AgentStack extends Stack { // memoryId: agentMemory.memory.memoryId, // }); - // --- EC2 fleet compute backend (optional) --- - // To enable EC2 as an alternative compute backend, uncomment the block below - // and the Ec2AgentFleet import at the top of this file. Repos can then use - // compute_type: 'ec2' in their blueprint config to route tasks to the EC2 fleet. - // - // const ec2Fleet = new Ec2AgentFleet(this, 'Ec2AgentFleet', { - // vpc: agentVpc.vpc, - // agentImageAsset, - // taskTable: taskTable.table, - // taskEventsTable: taskEventsTable.table, - // userConcurrencyTable: userConcurrencyTable.table, - // githubTokenSecret, - // memoryId: agentMemory.memory.memoryId, - // }); + // --- EC2 fleet compute backend --- + const agentImageAsset = new ecr_assets.DockerImageAsset(this, 'AgentImage', { + directory: runnerPath, + platform: ecr_assets.Platform.LINUX_ARM64, + }); + + const ec2Fleet = new Ec2AgentFleet(this, 'Ec2AgentFleet', { + vpc: agentVpc.vpc, + agentImageAsset, + taskTable: taskTable.table, + taskEventsTable: taskEventsTable.table, + userConcurrencyTable: userConcurrencyTable.table, + githubTokenSecret, + memoryId: agentMemory.memory.memoryId, + }); // --- Task Orchestrator (durable Lambda function) --- const orchestrator = new TaskOrchestrator(this, 'TaskOrchestrator', { @@ -333,13 +344,12 @@ export class AgentStack extends Stack { // taskRoleArn: ecsCluster.taskRoleArn, // executionRoleArn: ecsCluster.executionRoleArn, // }, - // To wire EC2, uncomment the ec2Fleet block above and add: - // ec2Config: { - // fleetTagKey: ec2Fleet.fleetTagKey, - // fleetTagValue: ec2Fleet.fleetTagValue, - // payloadBucketName: ec2Fleet.payloadBucket.bucketName, - // ecrImageUri: agentImageAsset.imageUri, - // }, + ec2Config: { + fleetTagKey: ec2Fleet.fleetTagKey, + fleetTagValue: ec2Fleet.fleetTagValue, + payloadBucketName: ec2Fleet.payloadBucket.bucketName, + ecrImageUri: agentImageAsset.imageUri, + }, }); // Grant the orchestrator Lambda read+write access to memory @@ -364,8 +374,7 @@ export class AgentStack extends Stack { agentCoreStopSessionRuntimeArns: [runtime.agentRuntimeArn], // To allow cancel-task to stop ECS-backed tasks, uncomment: // ecsClusterArn: ecsCluster.cluster.clusterArn, - // To allow cancel-task to stop EC2-backed tasks, uncomment: - // ec2FleetConfig: {}, + ec2FleetConfig: {}, }); // --- Operator dashboard ---