Coverage for gco / stacks / regional_stack.py: 93%
445 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""
2Regional stack for GCO (Global Capacity Orchestrator on AWS) - EKS cluster and ALB per region.
4This is the largest stack in the project (~3200 lines) and creates all regional
5resources for a single AWS region. One instance is deployed per region defined
6in cdk.json.
8Resources Created:
9 VPC & Networking:
10 - VPC with 3 AZs, public subnets (ALB), private subnets (EKS nodes)
11 - 2 NAT Gateways for high availability
12 - VPC endpoints for ECR, S3, STS, Secrets Manager, SSM, CloudWatch
13 - VPC Flow Logs (CloudWatch Logs, 30-day retention)
15 EKS Cluster (Auto Mode):
16 - Managed control plane with full logging (API, Audit, Authenticator, Controller Manager, Scheduler)
17 - NodePools: system, general-purpose, gpu-x86, gpu-arm, inference, gpu-efa, neuron, cpu-general
18 - IRSA roles for service accounts (Secrets Manager, SQS, DynamoDB, CloudWatch, S3, EFS)
20 Load Balancing:
21 - ALB (created by Ingress via AWS Load Balancer Controller)
22 - Internal NLB for regional API Gateway VPC Link
23 - Global Accelerator endpoint registration (via ga-registration Lambda)
25 Storage:
26 - EFS with dynamic provisioning (CSI driver, access points, encryption at rest + in transit)
27 - FSx for Lustre (optional, toggled via cdk.json)
28 - Valkey Serverless cache (optional)
29 - Aurora Serverless v2 with pgvector (optional)
31 Lambda Functions:
32 - kubectl-applier: applies K8s manifests during deployment
33 - helm-installer: installs Helm charts (KEDA, Volcano, KubeRay, GPU Operator, etc.)
34 - ga-registration: registers ALB with Global Accelerator
35 - regional-api-proxy: proxies regional API Gateway to internal ALB
37 Container Images:
38 - ECR repositories + Docker image builds for health-monitor, manifest-processor,
39 inference-monitor, queue-processor
41 SQS:
42 - Regional job queue + dead letter queue (for gco jobs submit-sqs)
44Key Design Decisions:
45 - EKS Auto Mode handles node provisioning — no managed node groups or Karpenter provisioners
46 - NodePools use WhenEmpty consolidation for inference to avoid disrupting long-running pods
47 - IRSA (IAM Roles for Service Accounts) for least-privilege pod-level AWS access
48 - All optional features (FSx, Valkey, Aurora) are toggled via cdk.json context variables
49 - Template variables in K8s manifests ({{PLACEHOLDER}}) are replaced at deploy time
51Dependencies:
52 - GCOGlobalStack (for Global Accelerator endpoint group ARN, DynamoDB table names, S3 bucket)
53 - GCOApiGatewayGlobalStack (for auth secret ARN)
55Modification Guide:
56 - To add a new NodePool: add a YAML manifest in lambda/kubectl-applier-simple/manifests/ (40-49 range)
57 - To add a new service: add ECR image build here, Dockerfile in dockerfiles/, manifest in manifests/
58 - To add a new optional feature: add a cdk.json context toggle, guard with if/else in this file
59 - To change EKS version: update KUBERNETES_VERSION in constants.py
60"""
62from __future__ import annotations
64import time
65from typing import Any
67import aws_cdk.aws_eks_v2 as eks
68from aws_cdk import (
69 CfnJson,
70 CfnOutput,
71 CfnTag,
72 CustomResource,
73 Duration,
74 Fn,
75 RemovalPolicy,
76 Stack,
77)
78from aws_cdk import aws_ec2 as ec2
79from aws_cdk import aws_ecr as ecr
80from aws_cdk import aws_ecr_assets as ecr_assets
81from aws_cdk import aws_efs as efs
82from aws_cdk import aws_eks as eks_l1 # L1 constructs (CfnPodIdentityAssociation)
83from aws_cdk import aws_events as events
84from aws_cdk import aws_events_targets as events_targets
85from aws_cdk import aws_fsx as fsx
86from aws_cdk import aws_iam as iam
87from aws_cdk import aws_kms as kms
88from aws_cdk import aws_lambda as lambda_
89from aws_cdk import aws_logs as logs
90from aws_cdk import aws_sns as sns
91from aws_cdk import aws_sqs as sqs
92from aws_cdk import aws_ssm as ssm
93from aws_cdk import custom_resources as cr
94from constructs import Construct
96from gco.config.config_loader import ConfigLoader
97from gco.stacks.constants import (
98 AURORA_POSTGRES_VERSION,
99 EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
100 EKS_ADDON_EFS_CSI_DRIVER,
101 EKS_ADDON_FSX_CSI_DRIVER,
102 EKS_ADDON_METRICS_SERVER,
103 EKS_ADDON_POD_IDENTITY_AGENT,
104 LAMBDA_PYTHON_RUNTIME,
105)
108class GCORegionalStack(Stack):
109 """
110 Regional resources stack for a single AWS region.
112 Creates EKS cluster, load balancers, and supporting infrastructure
113 for running GCO services in a specific region.
115 Attributes:
116 vpc: VPC with public/private subnets
117 cluster: EKS Auto Mode cluster
118 """
120 @staticmethod
121 def _create_irsa_role(
122 scope: GCORegionalStack,
123 id: str,
124 oidc_provider_arn: str,
125 oidc_issuer_url: str,
126 service_account_names: list[str],
127 namespaces: list[str],
128 ) -> iam.Role:
129 """Create an IAM role trusted by both IRSA (OIDC) and EKS Pod Identity.
131 IRSA is the primary credential mechanism — it works reliably on EKS Auto
132 Mode by projecting a service-account token that the AWS SDK exchanges for
133 temporary credentials via the OIDC provider.
135 Pod Identity trust is added as a secondary path so the role is ready if/when
136 Pod Identity injection starts working on Auto Mode nodes.
138 Uses CfnJson to defer OIDC condition key resolution to deploy time,
139 because the issuer URL is a CloudFormation token that can't be used
140 as a Python dict key at synth time.
141 """
142 # Strip https:// from issuer URL for the OIDC condition
143 issuer = Fn.select(1, Fn.split("//", oidc_issuer_url))
145 # Build OIDC conditions using CfnJson to defer token resolution
146 # The issuer URL is a CFN token — can't be used as a dict key at synth time
147 aud_key = Fn.join("", [issuer, ":aud"])
148 sub_key = Fn.join("", [issuer, ":sub"])
150 conditions_json = CfnJson(
151 scope,
152 f"{id}OidcConditions",
153 value={
154 aud_key: "sts.amazonaws.com",
155 sub_key: [
156 f"system:serviceaccount:{ns}:{sa}"
157 for ns in namespaces
158 for sa in service_account_names
159 ],
160 },
161 )
163 role = iam.Role(
164 scope,
165 id,
166 assumed_by=iam.FederatedPrincipal(
167 federated=oidc_provider_arn,
168 conditions={
169 "StringEquals": conditions_json,
170 },
171 assume_role_action="sts:AssumeRoleWithWebIdentity",
172 ),
173 )
175 # Also allow Pod Identity (secondary path for future use)
176 assert role.assume_role_policy is not None # guaranteed by assumed_by parameter above
177 role.assume_role_policy.add_statements(
178 iam.PolicyStatement(
179 effect=iam.Effect.ALLOW,
180 principals=[iam.ServicePrincipal("pods.eks.amazonaws.com")],
181 actions=["sts:AssumeRole", "sts:TagSession"],
182 )
183 )
184 return role
    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        config: ConfigLoader,
        region: str,
        auth_secret_arn: str,
        **kwargs: Any,
    ) -> None:
        """Create all regional resources for a single AWS region.

        Args:
            scope: Parent construct (the CDK app).
            construct_id: Logical ID for this stack.
            config: Accessor over the cdk.json configuration.
            region: AWS region this stack instance deploys into.
            auth_secret_arn: ARN of the auth secret from the API Gateway
                global stack.
            **kwargs: Forwarded to ``Stack`` (e.g. ``env``).

        Note:
            The creation order below is deliberate: the shared
            AwsCustomResource role is created before the EKS cluster (whose
            addon helpers attach PassRole statements to it), and Kubernetes
            manifests are applied only after EFS so file-system IDs are
            available for template substitution.
        """
        super().__init__(scope, construct_id, **kwargs)

        self.config = config
        self.deployment_region = region
        self.auth_secret_arn = auth_secret_arn
        # Populated later once the Ingress-created ALB is known.
        self.alb_arn: str | None = None

        # Get cluster configuration for this region
        cluster_config = self.config.get_cluster_config(region)
        self.cluster_config = cluster_config

        # Create VPC for the EKS cluster
        self.vpc = ec2.Vpc(
            self,
            "GCOVpc",
            # vpc_name intentionally omitted - let CDK generate unique name
            max_azs=3,
            nat_gateways=2,  # For high availability
            subnet_configuration=[
                ec2.SubnetConfiguration(
                    name="PublicSubnet", subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24
                ),
                ec2.SubnetConfiguration(
                    name="PrivateSubnet",
                    subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,
                    cidr_mask=24,
                ),
            ],
        )

        # Enable VPC Flow Logs for network traffic analysis and security monitoring
        self._create_vpc_flow_logs()

        # Create SQS queue for job ingestion
        self._create_sqs_queue()

        # Create ECR repositories and build Docker images
        self._create_container_images()

        # Pre-create the execution role shared by every ``cr.AwsCustomResource``
        # in this stack. See ``_create_aws_custom_resource_role`` for the full
        # rationale — in short, CDK's default behavior of auto-generating a
        # Lambda role per ``AwsCustomResource`` (and then merging all the
        # ``policy=`` statements onto it during deploy) triggers an IAM
        # propagation race on cold creates. We sidestep the race by creating
        # a single long-lived role up front and attaching policies to it as
        # each consumer is built; every ``AwsCustomResource`` then passes
        # ``role=self.aws_custom_resource_role`` instead of ``policy=``, so
        # the singleton Lambda runs against a role whose inline policy has
        # already replicated globally.
        self._create_aws_custom_resource_role()

        # Create EKS cluster
        self._create_eks_cluster(cluster_config)

        # Create EFS for shared storage
        self._create_efs()

        # Create FSx for Lustre (if enabled) for high-performance storage
        self._create_fsx_lustre()

        # Create Valkey Serverless cache (if enabled) for K/V caching
        self._create_valkey_cache()

        # Create Aurora Serverless v2 + pgvector (if enabled) for vector DB
        self._create_aurora_pgvector()

        # Create GA registration Lambda for registering Ingress-created ALB
        self._create_ga_registration_lambda()

        # Create Helm installer Lambda for KEDA and other Helm-based installations
        self._create_helm_installer_lambda()

        # Apply Kubernetes manifests (after EFS so IDs are available)
        self._apply_kubernetes_manifests()

        # Create CloudFormation drift detection (daily schedule + SNS alerts)
        self._create_drift_detection()

        # Create dedicated IAM role for MCP server
        self._create_mcp_role()

        # Export cluster information
        self._create_outputs()

        # Apply cdk-nag suppressions for this stack
        self._apply_nag_suppressions()
283 def _create_vpc_flow_logs(self) -> None:
284 """Create VPC Flow Logs for network traffic monitoring.
286 Flow logs capture information about IP traffic going to and from
287 network interfaces in the VPC. This is required for security
288 monitoring and compliance (HIPAA, SOC2, etc.).
289 """
290 # Create CloudWatch Log Group for flow logs
291 flow_log_group = logs.LogGroup(
292 self,
293 "VpcFlowLogGroup",
294 # log_group_name intentionally omitted - let CDK generate unique name
295 retention=logs.RetentionDays.ONE_MONTH,
296 removal_policy=RemovalPolicy.DESTROY,
297 )
299 # Create IAM role for VPC Flow Logs
300 flow_log_role = iam.Role(
301 self,
302 "VpcFlowLogRole",
303 assumed_by=iam.ServicePrincipal("vpc-flow-logs.amazonaws.com"),
304 )
306 flow_log_role.add_to_policy(
307 iam.PolicyStatement(
308 actions=[
309 "logs:CreateLogStream",
310 "logs:PutLogEvents",
311 "logs:DescribeLogGroups",
312 "logs:DescribeLogStreams",
313 ],
314 resources=[flow_log_group.log_group_arn, f"{flow_log_group.log_group_arn}:*"],
315 )
316 )
318 # Create VPC Flow Log
319 ec2.FlowLog(
320 self,
321 "VpcFlowLog",
322 resource_type=ec2.FlowLogResourceType.from_vpc(self.vpc),
323 destination=ec2.FlowLogDestination.to_cloud_watch_logs(flow_log_group, flow_log_role),
324 traffic_type=ec2.FlowLogTrafficType.ALL,
325 )
327 def _apply_nag_suppressions(self) -> None:
328 """Apply cdk-nag suppressions for this stack."""
329 from gco.stacks.nag_suppressions import apply_all_suppressions
331 apply_all_suppressions(
332 self,
333 stack_type="regional",
334 regions=self.config.get_regions(),
335 global_region=self.config.get_global_region(),
336 )
338 def _create_sqs_queue(self) -> None:
339 """Create SQS queue for job ingestion.
341 Creates an SQS queue that serves as the default job ingestion point
342 for this region. Jobs submitted to this queue are processed by the
343 manifest processor and KEDA scales based on queue depth.
345 Also creates a dead-letter queue for failed messages.
346 Both queues use server-side encryption with AWS managed keys.
347 """
348 project_name = self.config.get_project_name()
350 # Create dead-letter queue for failed messages
351 self.job_dlq = sqs.Queue(
352 self,
353 "JobDeadLetterQueue",
354 queue_name=f"{project_name}-jobs-dlq-{self.deployment_region}",
355 retention_period=Duration.days(14),
356 removal_policy=RemovalPolicy.DESTROY,
357 enforce_ssl=True, # Require SSL for all requests
358 encryption=sqs.QueueEncryption.SQS_MANAGED, # Server-side encryption
359 )
361 # Create main job queue
362 self.job_queue = sqs.Queue(
363 self,
364 "JobQueue",
365 queue_name=f"{project_name}-jobs-{self.deployment_region}",
366 visibility_timeout=Duration.minutes(5), # Match Lambda timeout
367 retention_period=Duration.days(7),
368 dead_letter_queue=sqs.DeadLetterQueue(
369 max_receive_count=3, # Move to DLQ after 3 failed attempts
370 queue=self.job_dlq,
371 ),
372 removal_policy=RemovalPolicy.DESTROY,
373 enforce_ssl=True, # Require SSL for all requests
374 encryption=sqs.QueueEncryption.SQS_MANAGED, # Server-side encryption
375 )
377 # Output queue information
378 CfnOutput(
379 self,
380 "JobQueueUrl",
381 value=self.job_queue.queue_url,
382 description=f"SQS Job Queue URL for {self.deployment_region}",
383 export_name=f"{project_name}-job-queue-url-{self.deployment_region}",
384 )
386 CfnOutput(
387 self,
388 "JobQueueArn",
389 value=self.job_queue.queue_arn,
390 description=f"SQS Job Queue ARN for {self.deployment_region}",
391 export_name=f"{project_name}-job-queue-arn-{self.deployment_region}",
392 )
394 CfnOutput(
395 self,
396 "JobDlqUrl",
397 value=self.job_dlq.queue_url,
398 description=f"SQS Dead Letter Queue URL for {self.deployment_region}",
399 export_name=f"{project_name}-job-dlq-url-{self.deployment_region}",
400 )
    def _create_aws_custom_resource_role(self) -> None:
        """Pre-create the execution role shared by every ``AwsCustomResource``.

        CDK's ``cr.AwsCustomResource`` defaults to auto-generating a per-
        construct Lambda execution role from the ``policy=`` parameter.
        Internally, CDK deduplicates those auto-generated roles onto a
        single *singleton* provider Lambda (logical id prefix
        ``AWS679f53fac002430cb0da5b7982bd22872``), and merges each custom
        resource's policy statements onto that Lambda's role at stack
        create time. On cold deploys, CloudFormation invokes the Lambda
        within 2-3 seconds of attaching a new policy statement, which is
        faster than IAM's global propagation window. The symptom is a
        ``iam:PassRole NOT authorized`` failure on whichever addon role
        update happens to run right after its ``iam:PassRole`` policy
        statement was attached but before it had replicated.

        The fix is to create the role up front, attach every policy
        statement the stack will need during stack creation, and pass
        ``role=self.aws_custom_resource_role`` to every
        ``AwsCustomResource`` instead of ``policy=``. Because the role
        already exists — and its inline policy has had minutes to
        replicate by the time any ``AwsCustomResource`` actually fires —
        the race disappears entirely.

        This method creates the role with the statements we can compute
        without a cluster reference (EKS ``UpdateAddon`` / ``DescribeAddon``
        scoped to this cluster, and SSM ``GetParameter`` for the endpoint
        group ARN). ``iam:PassRole`` statements for individual addon
        roles (EFS CSI, FSx CSI, CloudWatch Observability) are appended
        by each ``_create_*_addon`` method after the corresponding IRSA
        role has been created, so every PassRole ``resources=`` list
        stays precise (no wildcards) and cdk-nag stays happy.

        Sets:
            self.aws_custom_resource_role: the shared ``iam.Role`` that
                every ``cr.AwsCustomResource`` in this stack must pass as
                its ``role=``.
        """
        project_name = self.config.get_project_name()
        global_region = self.config.get_global_region()

        self.aws_custom_resource_role = iam.Role(
            self,
            "AwsCustomResourceRole",
            assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
            description=(
                "Shared execution role for every cr.AwsCustomResource in this "
                "stack. Pre-created to avoid the IAM policy propagation race "
                "that occurs when CDK auto-generates per-CR roles and the "
                "singleton provider Lambda fires before the freshly-attached "
                "policy has replicated globally."
            ),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSLambdaBasicExecutionRole"
                ),
            ],
        )

        # EKS UpdateAddon / DescribeAddon — used by the three updateAddon
        # custom resources (EFS CSI, FSx CSI, CloudWatch Observability).
        # Scoped to this cluster's addons by ARN.
        self.aws_custom_resource_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["eks:UpdateAddon", "eks:DescribeAddon"],
                resources=[
                    f"arn:aws:eks:{self.deployment_region}:{self.account}"
                    f":addon/{self.cluster_config.cluster_name}/*"
                ],
            )
        )

        # SSM GetParameter — used by the GetEndpointGroupArn custom
        # resource in _create_ga_registration_lambda to read the ARN of
        # the Global Accelerator endpoint group published by the global
        # stack during its deploy.
        self.aws_custom_resource_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["ssm:GetParameter"],
                resources=[
                    f"arn:aws:ssm:{global_region}:{self.account}" f":parameter/{project_name}/*"
                ],
            )
        )

        # cdk-nag suppressions: the two wildcard-bearing ARNs above are
        # intentional and both scoped as tightly as AWS IAM permits.
        #
        # - The ``eks:UpdateAddon`` / ``eks:DescribeAddon`` statement uses
        #   ``addon/<cluster>/*`` as its resource because the same shared
        #   role is consumed by three different updateAddon custom
        #   resources (EFS CSI, FSx CSI, CloudWatch Observability). Each
        #   addon has its own ARN and we'd otherwise need three separate
        #   statements that each grant access to a known addon name. The
        #   wildcard is scoped to a single cluster in a single region in
        #   a single account — it cannot be used against any addon
        #   belonging to a different cluster or a different service.
        #
        # - The ``ssm:GetParameter`` statement uses
        #   ``parameter/<project>/*`` because the exact parameter name
        #   (``endpoint-group-<region>-arn``) is only known at Global
        #   Accelerator registration time and the endpoint path
        #   structure is ``<project>/<parameter>``. Scoping to the
        #   project prefix restricts access to parameters owned by this
        #   project only.
        # Imported lazily, matching _apply_nag_suppressions.
        from cdk_nag import NagSuppressions

        NagSuppressions.add_resource_suppressions(
            self.aws_custom_resource_role,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "Scoped to a single EKS cluster's addons "
                        "(addon/<cluster>/*) and this project's SSM "
                        "parameters (parameter/<project>/*). Both wildcards "
                        "are as tight as AWS IAM permits: addon names and "
                        "parameter names are not known at stack synthesis "
                        "time because the addons are created later in the "
                        "same stack and the GA endpoint group ARN is "
                        "published by a separate stack during deploy. The "
                        "shared role pattern itself is deliberate — see "
                        "_create_aws_custom_resource_role docstring for why "
                        "we pre-create instead of letting CDK auto-generate "
                        "per-CR roles."
                    ),
                    "appliesTo": [
                        f"Resource::arn:aws:eks:{self.deployment_region}"
                        f":<AWS::AccountId>:addon/{self.cluster_config.cluster_name}/*",
                        f"Resource::arn:aws:ssm:{global_region}"
                        f":<AWS::AccountId>:parameter/{project_name}/*",
                    ],
                },
            ],
            apply_to_children=True,
        )
536 def _create_container_images(self) -> None:
537 """Create ECR repositories and build Docker images for services"""
539 # Create ECR repository for health monitor
540 self.health_monitor_repo = ecr.Repository(
541 self,
542 "HealthMonitorRepo",
543 # repository_name intentionally omitted - let CDK generate unique name
544 removal_policy=RemovalPolicy.DESTROY, # For dev/test; use RETAIN for production
545 empty_on_delete=True, # Clean up images on stack deletion
546 image_scan_on_push=True, # Enable vulnerability scanning on push
547 )
549 # All Docker images target AMD64 (x86_64) to match EKS Auto Mode's
550 # default system nodepool.
552 # Build and push health monitor Docker image
553 self.health_monitor_image = ecr_assets.DockerImageAsset(
554 self,
555 "HealthMonitorImage",
556 directory=".", # Root directory
557 file="dockerfiles/health-monitor-dockerfile",
558 platform=ecr_assets.Platform.LINUX_AMD64,
559 )
561 # Create ECR repository for manifest processor
562 self.manifest_processor_repo = ecr.Repository(
563 self,
564 "ManifestProcessorRepo",
565 # repository_name intentionally omitted - let CDK generate unique name
566 removal_policy=RemovalPolicy.DESTROY,
567 empty_on_delete=True,
568 image_scan_on_push=True, # Enable vulnerability scanning on push
569 )
571 # Build and push manifest processor Docker image
572 self.manifest_processor_image = ecr_assets.DockerImageAsset(
573 self,
574 "ManifestProcessorImage",
575 directory=".",
576 file="dockerfiles/manifest-processor-dockerfile",
577 platform=ecr_assets.Platform.LINUX_AMD64,
578 )
580 # Output image URIs for reference
581 CfnOutput(
582 self,
583 "HealthMonitorImageUri",
584 value=self.health_monitor_image.image_uri,
585 description="Health Monitor Docker image URI",
586 )
588 CfnOutput(
589 self,
590 "ManifestProcessorImageUri",
591 value=self.manifest_processor_image.image_uri,
592 description="Manifest Processor Docker image URI",
593 )
595 # Build and push inference monitor Docker image
596 self.inference_monitor_image = ecr_assets.DockerImageAsset(
597 self,
598 "InferenceMonitorImage",
599 directory=".",
600 file="dockerfiles/inference-monitor-dockerfile",
601 platform=ecr_assets.Platform.LINUX_AMD64,
602 )
604 CfnOutput(
605 self,
606 "InferenceMonitorImageUri",
607 value=self.inference_monitor_image.image_uri,
608 description="Inference Monitor Docker image URI",
609 )
611 # Build and push queue processor Docker image (if enabled).
612 # The queue processor is a KEDA ScaledJob that consumes manifests from
613 # the regional SQS queue. It can be disabled in cdk.json if users want
614 # to implement their own consumer. When disabled, the post-helm-sqs-consumer.yaml
615 # manifest is skipped (unreplaced template variables cause it to be skipped).
616 queue_processor_config = self.node.try_get_context("queue_processor") or {}
617 self.queue_processor_enabled = queue_processor_config.get("enabled", True)
619 if self.queue_processor_enabled: 619 ↛ exitline 619 didn't return from function '_create_container_images' because the condition on line 619 was always true
620 self.queue_processor_image = ecr_assets.DockerImageAsset(
621 self,
622 "QueueProcessorImage",
623 directory=".",
624 file="dockerfiles/queue-processor-dockerfile",
625 platform=ecr_assets.Platform.LINUX_AMD64,
626 )
628 CfnOutput(
629 self,
630 "QueueProcessorImageUri",
631 value=self.queue_processor_image.image_uri,
632 description="Queue Processor Docker image URI",
633 )
    def _create_eks_cluster(self, cluster_config: Any) -> None:
        """Create the EKS Auto Mode cluster, its IAM roles, and core add-ons.

        Args:
            cluster_config: Per-region cluster settings loaded from cdk.json
                (provides at least ``cluster_name`` and
                ``kubernetes_version`` as used below).
        """

        # Create cluster admin role
        # role_name intentionally omitted - let CDK generate unique name
        cluster_admin_role = iam.Role(
            self,
            "ClusterAdminRole",
            assumed_by=iam.ServicePrincipal("eks.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSClusterPolicy")
            ],
        )

        # Create node group role
        # role_name intentionally omitted - let CDK generate unique name
        # NOTE(review): this role is constructed but never assigned or
        # referenced anywhere in this method — presumably intended for node
        # bootstrap or legacy tooling; confirm it is still needed.
        iam.Role(
            self,
            "NodeGroupRole",
            assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSWorkerNodePolicy"),
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKS_CNI_Policy"),
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AmazonEC2ContainerRegistryReadOnly"
                ),
            ],
        )

        # Create EKS Auto Mode cluster with built-in system and general-purpose nodepools
        # Auto Mode automatically manages compute resources and comes with essential addons
        # Get endpoint access configuration
        eks_config = self.config.get_eks_cluster_config()
        endpoint_access_mode = eks_config.get("endpoint_access", "PRIVATE")

        # Map config string to EKS EndpointAccess enum
        # (any value other than "PRIVATE" falls back to PUBLIC_AND_PRIVATE)
        endpoint_access = (
            eks.EndpointAccess.PRIVATE
            if endpoint_access_mode == "PRIVATE"
            else eks.EndpointAccess.PUBLIC_AND_PRIVATE
        )

        # Create KMS key for EKS secrets encryption
        # RETAIN: deleting the key would make existing encrypted secrets unreadable
        self.eks_encryption_key = kms.Key(
            self,
            "EksSecretsEncryptionKey",
            description="KMS key for EKS Kubernetes secrets encryption",
            enable_key_rotation=True,
            removal_policy=RemovalPolicy.RETAIN,
        )

        # Get Kubernetes version - use custom version if not available in CDK enum
        # e.g. "1.31" -> eks.KubernetesVersion.V1_31
        k8s_version_str = cluster_config.kubernetes_version
        try:
            k8s_version = getattr(eks.KubernetesVersion, f"V{k8s_version_str.replace('.', '_')}")
        except AttributeError:
            # Version not in CDK enum yet, use custom version
            k8s_version = eks.KubernetesVersion.of(k8s_version_str)

        self.cluster = eks.Cluster(
            self,
            "GCOEksCluster",
            cluster_name=cluster_config.cluster_name,
            version=k8s_version,  # Use configured version for Auto Mode with DRA support
            vpc=self.vpc,
            compute=eks.ComputeConfig(
                # Enable both built-in node pools - Auto Mode manages these automatically
                node_pools=["system", "general-purpose"]
            ),
            # SECURITY: Endpoint access controlled via cdk.json eks_cluster.endpoint_access
            #   PRIVATE (default): EKS API accessible only from within VPC - most secure
            #     Job submission works via API Gateway → Lambda (in VPC) or SQS
            #     For kubectl access, use a bastion host, VPN, or AWS SSM Session Manager
            #   PUBLIC_AND_PRIVATE: EKS API accessible from internet and VPC
            #     Allows direct kubectl access but less secure
            endpoint_access=endpoint_access,
            role=cluster_admin_role,
            vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)],
            # Enable all control plane logging for security and compliance
            cluster_logging=[
                eks.ClusterLoggingTypes.API,
                eks.ClusterLoggingTypes.AUDIT,
                eks.ClusterLoggingTypes.AUTHENTICATOR,
                eks.ClusterLoggingTypes.CONTROLLER_MANAGER,
                eks.ClusterLoggingTypes.SCHEDULER,
            ],
            # SECURITY: Enable envelope encryption for Kubernetes secrets using KMS
            secrets_encryption_key=self.eks_encryption_key,
        )

        # Auto Mode comes with essential addons pre-configured:
        # - AWS Load Balancer Controller (for ALB/NLB integration)
        # - CoreDNS, kube-proxy, VPC CNI (standard Kubernetes components)

        # OIDC provider for IRSA — the primary credential injection mechanism.
        # IRSA uses projected service-account tokens exchanged via the OIDC provider
        # for temporary AWS credentials. This works reliably on EKS Auto Mode.
        self.oidc_provider = eks.OidcProviderNative(
            self,
            "OidcProvider",
            url=self.cluster.cluster_open_id_connect_issuer_url,
        )

        # Pod Identity Agent add-on — registers the admission webhook that injects
        # Pod Identity credentials. On Auto Mode the DaemonSet schedules 0 pods
        # (the agent is built into the node), but the add-on registration is still
        # needed for the control-plane webhook. Kept as a secondary credential path.
        self._create_pod_identity_agent_addon()

        # Add Metrics Server add-on for HPA and resource monitoring
        self._create_metrics_server_addon()

        # Add EFS CSI Driver add-on for shared storage
        self._create_efs_csi_driver_addon()

        # Add CloudWatch Observability add-on for Container Insights metrics
        self._create_cloudwatch_observability_addon()

        # NOTE: GPU compute is configured via Karpenter NodePools (not managed node groups)
        # NodePool manifests are located in lambda/kubectl-applier-simple/manifests/:
        # - 40-nodepool-gpu-x86.yaml: x86_64 GPU instances (g4dn, g5, g6, g6e, p3)
        # - 41-nodepool-gpu-arm.yaml: ARM64 GPU instances (g5g)
        # - 42-nodepool-inference.yaml: inference-optimized GPU instances
        # - 43-nodepool-efa.yaml: EFA-enabled instances (p4d, p5, p6)
        # - 44-nodepool-neuron.yaml: Trainium/Inferentia instances
        # These will be applied by the kubectl Lambda custom resource (created below)

        # Create IRSA role for service account to access secrets
        self._create_service_account_role()

        # Create kubectl Lambda for applying Kubernetes manifests
        self._create_kubectl_lambda()
    # ── Shared toleration config for EKS add-ons ──────────────────────────
    # All GCO nodepools apply taints (nvidia.com/gpu, aws.amazon.com/neuron,
    # vpc.amazonaws.com/efa) that prevent DaemonSet pods from scheduling.
    # Every add-on that runs a DaemonSet (or may schedule on tainted nodes)
    # must tolerate these taints so that storage drivers, metrics agents, and
    # other infrastructure components work on every node type.
    # Class-level constant, consumed by the _create_*_addon methods via
    # self._ADDON_NODE_TOLERATIONS.
    _ADDON_NODE_TOLERATIONS: list[dict[str, str]] = [
        {"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"},
        {"key": "aws.amazon.com/neuron", "operator": "Exists", "effect": "NoSchedule"},
        {"key": "vpc.amazonaws.com/efa", "operator": "Exists", "effect": "NoSchedule"},
    ]
780 def _create_pod_identity_agent_addon(self) -> None:
781 """Create EKS Pod Identity Agent add-on.
783 On Auto Mode the DaemonSet schedules 0 pods (the agent is built into
784 the node runtime), but the add-on registration is still required for
785 the control-plane admission webhook that injects Pod Identity tokens.
786 """
787 eks.Addon(
788 self,
789 "PodIdentityAgentAddon",
790 cluster=self.cluster, # type: ignore[arg-type]
791 addon_name="eks-pod-identity-agent",
792 addon_version=EKS_ADDON_POD_IDENTITY_AGENT,
793 preserve_on_delete=False,
794 configuration_values={
795 "tolerations": self._ADDON_NODE_TOLERATIONS,
796 },
797 )
799 def _create_metrics_server_addon(self) -> None:
800 """Create Metrics Server add-on for resource metrics.
802 The Metrics Server collects resource metrics from kubelets and exposes
803 them via the Kubernetes API server. This is required for:
804 - Horizontal Pod Autoscaler (HPA)
805 - Vertical Pod Autoscaler (VPA)
806 - kubectl top commands
807 - Resource monitoring dashboards
809 Note: Metrics Server doesn't require an IRSA role as it only needs
810 in-cluster permissions which are handled by its service account.
811 """
812 eks.Addon(
813 self,
814 "MetricsServerAddon",
815 cluster=self.cluster, # type: ignore[arg-type]
816 addon_name="metrics-server",
817 addon_version=EKS_ADDON_METRICS_SERVER,
818 preserve_on_delete=False,
819 configuration_values={
820 "tolerations": self._ADDON_NODE_TOLERATIONS,
821 },
822 )
824 def _create_efs_csi_driver_addon(self) -> None:
825 """Create EFS CSI Driver add-on for shared storage support.
827 The EFS CSI driver enables Kubernetes pods to mount EFS file systems
828 as persistent volumes. This is required for the shared storage feature.
830 We create a Pod Identity role for the EFS CSI driver and update the add-on
831 to use it via a custom resource after the add-on is created.
832 """
833 # Create IAM role for EFS CSI Driver using IRSA + Pod Identity
834 self.efs_csi_role = GCORegionalStack._create_irsa_role(
835 self,
836 "EfsCsiDriverRole",
837 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
838 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
839 service_account_names=["efs-csi-controller-sa"],
840 namespaces=["kube-system"],
841 )
843 # Add EFS CSI driver permissions
844 self.efs_csi_role.add_managed_policy(
845 iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEFSCSIDriverPolicy")
846 )
848 # Create EFS CSI Driver add-on
849 efs_addon = eks.Addon(
850 self,
851 "EfsCsiDriverAddon",
852 cluster=self.cluster, # type: ignore[arg-type]
853 addon_name="aws-efs-csi-driver",
854 addon_version=EKS_ADDON_EFS_CSI_DRIVER,
855 preserve_on_delete=False,
856 configuration_values={
857 "node": {
858 "tolerations": self._ADDON_NODE_TOLERATIONS,
859 },
860 "controller": {
861 "tolerations": self._ADDON_NODE_TOLERATIONS,
862 },
863 },
864 )
866 # Append the PassRole statement for the EFS CSI role to the shared
867 # AwsCustomResource execution role. See the role's creation in
868 # _create_aws_custom_resource_role for the full rationale on why
869 # we pre-create + attach up-front instead of letting CDK
870 # auto-generate per-CR roles.
871 self.aws_custom_resource_role.add_to_policy(
872 iam.PolicyStatement(
873 effect=iam.Effect.ALLOW,
874 actions=["iam:PassRole"],
875 resources=[self.efs_csi_role.role_arn],
876 )
877 )
879 # Update the add-on to use the IRSA role via custom resource
880 # This is needed because the eks v2 alpha Addon doesn't support service_account_role directly
881 update_addon = cr.AwsCustomResource(
882 self,
883 "UpdateEfsCsiAddonRole",
884 on_create=cr.AwsSdkCall(
885 service="EKS",
886 action="updateAddon",
887 parameters={
888 "clusterName": self.cluster.cluster_name,
889 "addonName": "aws-efs-csi-driver",
890 "serviceAccountRoleArn": self.efs_csi_role.role_arn,
891 },
892 physical_resource_id=cr.PhysicalResourceId.of(
893 f"{self.cluster.cluster_name}-efs-csi-role-update"
894 ),
895 ),
896 on_update=cr.AwsSdkCall(
897 service="EKS",
898 action="updateAddon",
899 parameters={
900 "clusterName": self.cluster.cluster_name,
901 "addonName": "aws-efs-csi-driver",
902 "serviceAccountRoleArn": self.efs_csi_role.role_arn,
903 },
904 ),
905 role=self.aws_custom_resource_role,
906 )
908 # Ensure the update happens after the add-on is created. We also
909 # depend on the shared execution role so CloudFormation has fully
910 # attached + replicated its inline policy before the Lambda fires.
911 update_addon.node.add_dependency(efs_addon)
912 update_addon.node.add_dependency(self.efs_csi_role)
913 update_addon.node.add_dependency(self.aws_custom_resource_role)
915 # Expose the update-addon resource so _apply_kubernetes_manifests can
916 # make the kubectl Lambda wait for the IRSA annotation patch to land
917 # before it tries to rollout-restart the efs-csi-controller. Without
918 # this ordering, the restart could fire before EKS has re-attached
919 # the role ARN, leaving the new pods just as credential-less as the
920 # old ones and causing every EFS CreateAccessPoint to fail with a
921 # 401 from IMDS.
922 self._efs_csi_addon_role_update = update_addon
924 def _create_cloudwatch_observability_addon(self) -> None:
925 """Create CloudWatch Observability add-on for Container Insights.
927 The CloudWatch Observability add-on enables Container Insights metrics
928 for the EKS cluster, providing visibility into:
929 - Cluster CPU and memory utilization
930 - Node-level metrics
931 - Pod and container metrics
932 - Application logs (optional)
934 These metrics are used by the monitoring dashboard to display
935 cluster health and resource utilization.
936 """
938 # Create IAM role for CloudWatch agent using IRSA + Pod Identity
939 self.cloudwatch_role = GCORegionalStack._create_irsa_role(
940 self,
941 "CloudWatchObservabilityRole",
942 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
943 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
944 service_account_names=["cloudwatch-agent"],
945 namespaces=["amazon-cloudwatch"],
946 )
948 # Add CloudWatch agent permissions
949 self.cloudwatch_role.add_managed_policy(
950 iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy")
951 )
952 self.cloudwatch_role.add_managed_policy(
953 iam.ManagedPolicy.from_aws_managed_policy_name("AWSXrayWriteOnlyAccess")
954 )
956 # Create CloudWatch Observability add-on
957 cw_addon = eks.Addon(
958 self,
959 "CloudWatchObservabilityAddon",
960 cluster=self.cluster, # type: ignore[arg-type]
961 addon_name="amazon-cloudwatch-observability",
962 addon_version=EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
963 preserve_on_delete=False,
964 configuration_values={
965 "tolerations": self._ADDON_NODE_TOLERATIONS,
966 # Enable Container Insights with application log collection
967 # Logs are sent to /aws/containerinsights/{cluster}/application
968 "containerLogs": {
969 "enabled": True,
970 },
971 },
972 )
974 # Append the PassRole statement for the CloudWatch Observability
975 # role to the shared AwsCustomResource execution role. See
976 # _create_aws_custom_resource_role for the full rationale.
977 self.aws_custom_resource_role.add_to_policy(
978 iam.PolicyStatement(
979 effect=iam.Effect.ALLOW,
980 actions=["iam:PassRole"],
981 resources=[self.cloudwatch_role.role_arn],
982 )
983 )
985 # Update the add-on to use the IRSA role via custom resource
986 update_cw_addon = cr.AwsCustomResource(
987 self,
988 "UpdateCloudWatchAddonRole",
989 on_create=cr.AwsSdkCall(
990 service="EKS",
991 action="updateAddon",
992 parameters={
993 "clusterName": self.cluster.cluster_name,
994 "addonName": "amazon-cloudwatch-observability",
995 "serviceAccountRoleArn": self.cloudwatch_role.role_arn,
996 },
997 physical_resource_id=cr.PhysicalResourceId.of(
998 f"{self.cluster.cluster_name}-cw-obs-role-update"
999 ),
1000 ),
1001 on_update=cr.AwsSdkCall(
1002 service="EKS",
1003 action="updateAddon",
1004 parameters={
1005 "clusterName": self.cluster.cluster_name,
1006 "addonName": "amazon-cloudwatch-observability",
1007 "serviceAccountRoleArn": self.cloudwatch_role.role_arn,
1008 },
1009 ),
1010 role=self.aws_custom_resource_role,
1011 )
1013 # Ensure the update happens after the add-on is created. Depend on
1014 # the shared execution role so CFN has fully attached + replicated
1015 # its inline policy before the Lambda fires. No CR→CR dependency
1016 # chain needed anymore — the race it was serializing against is
1017 # eliminated by pre-creating the role.
1018 update_cw_addon.node.add_dependency(cw_addon)
1019 update_cw_addon.node.add_dependency(self.cloudwatch_role)
1020 update_cw_addon.node.add_dependency(self.aws_custom_resource_role)
1022 # Expose the update-addon resource so _apply_kubernetes_manifests can
1023 # make the kubectl Lambda wait for the IRSA annotation patch to land
1024 # before it rollout-restarts the cloudwatch-agent DaemonSet. See the
1025 # EFS CSI equivalent for the full rationale — same race, same fix.
1026 self._cloudwatch_addon_role_update = update_cw_addon
1028 def _create_service_account_role(self) -> None:
1029 """Create IAM role for Kubernetes service account using EKS Pod Identity.
1031 Pod Identity is the recommended mechanism for EKS Auto Mode. It's simpler
1032 and more reliable than IRSA — no OIDC provider, no webhook injection, no
1033 projected tokens. EKS manages the credential injection automatically.
1035 This role can be assumed by the gco-service-account in:
1036 - gco-system namespace (for system services like health-monitor, manifest-processor)
1037 - gco-jobs namespace (for user jobs that need SQS access for KEDA scaling)
1038 - gco-inference namespace (for inference endpoints)
1039 """
1040 # Create IAM role with IRSA (OIDC) trust + Pod Identity trust
1041 #
1042 # The trust policy's `sub` condition must list every ServiceAccount
1043 # that needs to assume this role. Keep in sync with:
1044 # - lambda/kubectl-applier-simple/manifests/01-serviceaccounts.yaml
1045 # (gco-service-account)
1046 # - lambda/kubectl-applier-simple/manifests/02-rbac.yaml
1047 # (gco-health-monitor-sa, gco-manifest-processor-sa,
1048 # gco-inference-monitor-sa)
1049 # - lambda/kubectl-applier-simple/manifests/04a-jobs-serviceaccount.yaml
1050 # (gco-service-account in gco-jobs)
1051 self.service_account_role = GCORegionalStack._create_irsa_role(
1052 self,
1053 "ServiceAccountRole",
1054 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
1055 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
1056 service_account_names=[
1057 "gco-service-account",
1058 "gco-health-monitor-sa",
1059 "gco-manifest-processor-sa",
1060 "gco-inference-monitor-sa",
1061 ],
1062 namespaces=["gco-system", "gco-jobs", "gco-inference"],
1063 )
1065 # Grant permission to read the auth secret
1066 # Note: We use an explicit IAM policy statement with a wildcard (*) because:
1067 # 1. The secret is in a different region (API Gateway region)
1068 # 2. CDK's grant_read() generates a policy with ?????? suffix which requires
1069 # exactly 6 characters, but the SDK can call GetSecretValue with either
1070 # the full ARN (with suffix) or partial ARN (without suffix)
1071 # 3. Using * ensures both forms work correctly
1072 self.service_account_role.add_to_policy(
1073 iam.PolicyStatement(
1074 effect=iam.Effect.ALLOW,
1075 actions=[
1076 "secretsmanager:GetSecretValue",
1077 "secretsmanager:DescribeSecret",
1078 ],
1079 resources=[f"{self.auth_secret_arn}*"], # Wildcard to match with or without suffix
1080 )
1081 )
1083 # cdk-nag suppression: the trailing ``*`` on the auth secret
1084 # ARN above is intentional and is NOT a broad wildcard. Secrets
1085 # Manager appends a random 6-character suffix to every secret
1086 # ARN at creation time (``arn:...:secret:my-secret-AbC123``).
1087 # The secret lives in a separate stack (api_gateway_global_stack)
1088 # and is referenced here via a cross-stack token, so the actual
1089 # suffix is unknown at synth time. The wildcard matches the
1090 # suffix only — every finding under this rule is still scoped
1091 # to this single secret.
1092 from cdk_nag import NagSuppressions
1094 NagSuppressions.add_resource_suppressions(
1095 self.service_account_role,
1096 [
1097 {
1098 "id": "AwsSolutions-IAM5",
1099 "reason": (
1100 "The trailing ``*`` matches the 6-character "
1101 "random suffix Secrets Manager appends to secret "
1102 "ARNs. The secret is created in a different stack "
1103 "(api_gateway_global_stack) and referenced here "
1104 "via a cross-stack token, so the actual suffix "
1105 "isn't known at synth time. The wildcard is "
1106 "bounded to a single secret — it does not grant "
1107 "access to any other secret in the account."
1108 ),
1109 "appliesTo": [
1110 {"regex": "/^Resource::<GCOAuthSecret.*>\\*$/"},
1111 ],
1112 },
1113 ],
1114 apply_to_children=True,
1115 )
1117 # cdk-nag suppression: the ServiceAccountRole grants ec2:Describe*
1118 # and elasticloadbalancing:Describe* for the AWS Load Balancer
1119 # Controller. These AWS APIs do not support resource-level IAM
1120 # scoping — Resource: * is the only valid form.
1121 NagSuppressions.add_resource_suppressions(
1122 self.service_account_role,
1123 [
1124 {
1125 "id": "AwsSolutions-IAM5",
1126 "reason": (
1127 "The ServiceAccountRole grants ec2:Describe* and "
1128 "elasticloadbalancing:Describe* for the AWS Load Balancer "
1129 "Controller. These AWS APIs do not support resource-level "
1130 "IAM scoping — Resource: * is the only valid form. See "
1131 "https://docs.aws.amazon.com/service-authorization/latest/"
1132 "reference/list_amazonec2.html"
1133 ),
1134 "appliesTo": ["Resource::*"],
1135 },
1136 ],
1137 apply_to_children=True,
1138 )
1140 # Add permissions for AWS Load Balancer Controller
1141 self.service_account_role.add_to_policy(
1142 iam.PolicyStatement(
1143 effect=iam.Effect.ALLOW,
1144 actions=[
1145 "ec2:DescribeAccountAttributes",
1146 "ec2:DescribeAddresses",
1147 "ec2:DescribeAvailabilityZones",
1148 "ec2:DescribeInternetGateways",
1149 "ec2:DescribeVpcs",
1150 "ec2:DescribeVpcPeeringConnections",
1151 "ec2:DescribeSubnets",
1152 "ec2:DescribeSecurityGroups",
1153 "ec2:DescribeInstances",
1154 "ec2:DescribeNetworkInterfaces",
1155 "ec2:DescribeTags",
1156 "ec2:GetCoipPoolUsage",
1157 "ec2:DescribeCoipPools",
1158 "elasticloadbalancing:DescribeLoadBalancers",
1159 "elasticloadbalancing:DescribeLoadBalancerAttributes",
1160 "elasticloadbalancing:DescribeListeners",
1161 "elasticloadbalancing:DescribeListenerCertificates",
1162 "elasticloadbalancing:DescribeSSLPolicies",
1163 "elasticloadbalancing:DescribeRules",
1164 "elasticloadbalancing:DescribeTargetGroups",
1165 "elasticloadbalancing:DescribeTargetGroupAttributes",
1166 "elasticloadbalancing:DescribeTargetHealth",
1167 "elasticloadbalancing:DescribeTags",
1168 ],
1169 resources=["*"],
1170 )
1171 )
1173 self.service_account_role.add_to_policy(
1174 iam.PolicyStatement(
1175 effect=iam.Effect.ALLOW,
1176 actions=[
1177 "elasticloadbalancing:CreateLoadBalancer",
1178 "elasticloadbalancing:CreateTargetGroup",
1179 "elasticloadbalancing:CreateListener",
1180 "elasticloadbalancing:DeleteLoadBalancer",
1181 "elasticloadbalancing:DeleteTargetGroup",
1182 "elasticloadbalancing:DeleteListener",
1183 "elasticloadbalancing:ModifyLoadBalancerAttributes",
1184 "elasticloadbalancing:ModifyTargetGroup",
1185 "elasticloadbalancing:ModifyTargetGroupAttributes",
1186 "elasticloadbalancing:ModifyListener",
1187 "elasticloadbalancing:RegisterTargets",
1188 "elasticloadbalancing:DeregisterTargets",
1189 "elasticloadbalancing:SetWebAcl",
1190 "elasticloadbalancing:SetSecurityGroups",
1191 "elasticloadbalancing:SetSubnets",
1192 "elasticloadbalancing:AddTags",
1193 "elasticloadbalancing:RemoveTags",
1194 ],
1195 resources=["*"],
1196 )
1197 )
1199 self.service_account_role.add_to_policy(
1200 iam.PolicyStatement(
1201 effect=iam.Effect.ALLOW,
1202 actions=[
1203 "ec2:CreateSecurityGroup",
1204 "ec2:CreateTags",
1205 "ec2:DeleteTags",
1206 "ec2:AuthorizeSecurityGroupIngress",
1207 "ec2:RevokeSecurityGroupIngress",
1208 "ec2:DeleteSecurityGroup",
1209 ],
1210 resources=["*"],
1211 )
1212 )
1214 self.service_account_role.add_to_policy(
1215 iam.PolicyStatement(
1216 effect=iam.Effect.ALLOW,
1217 actions=["iam:CreateServiceLinkedRole"],
1218 resources=["*"],
1219 conditions={
1220 "StringEquals": {"iam:AWSServiceName": "elasticloadbalancing.amazonaws.com"}
1221 },
1222 )
1223 )
1225 self.service_account_role.add_to_policy(
1226 iam.PolicyStatement(
1227 effect=iam.Effect.ALLOW,
1228 actions=[
1229 "wafv2:GetWebACL",
1230 "wafv2:GetWebACLForResource",
1231 "wafv2:AssociateWebACL",
1232 "wafv2:DisassociateWebACL",
1233 ],
1234 resources=["*"],
1235 )
1236 )
1238 self.service_account_role.add_to_policy(
1239 iam.PolicyStatement(
1240 effect=iam.Effect.ALLOW,
1241 actions=[
1242 "shield:GetSubscriptionState",
1243 "shield:DescribeProtection",
1244 "shield:CreateProtection",
1245 "shield:DeleteProtection",
1246 ],
1247 resources=["*"],
1248 )
1249 )
1251 self.service_account_role.add_to_policy(
1252 iam.PolicyStatement(
1253 effect=iam.Effect.ALLOW,
1254 actions=["acm:ListCertificates", "acm:DescribeCertificate"],
1255 resources=["*"],
1256 )
1257 )
1259 self.service_account_role.add_to_policy(
1260 iam.PolicyStatement(
1261 effect=iam.Effect.ALLOW,
1262 actions=["cognito-idp:DescribeUserPoolClient"],
1263 resources=["*"],
1264 )
1265 )
1267 # Add SQS permissions for KEDA to scale based on queue depth
1268 self.service_account_role.add_to_policy(
1269 iam.PolicyStatement(
1270 effect=iam.Effect.ALLOW,
1271 actions=[
1272 "sqs:GetQueueAttributes",
1273 "sqs:GetQueueUrl",
1274 "sqs:ReceiveMessage",
1275 "sqs:DeleteMessage",
1276 "sqs:SendMessage",
1277 ],
1278 resources=[
1279 self.job_queue.queue_arn,
1280 self.job_dlq.queue_arn,
1281 ],
1282 )
1283 )
1285 # Add CloudWatch permissions for publishing custom metrics
1286 # Used by health-monitor and manifest-processor to publish metrics
1287 self.service_account_role.add_to_policy(
1288 iam.PolicyStatement(
1289 effect=iam.Effect.ALLOW,
1290 actions=["cloudwatch:PutMetricData"],
1291 resources=["*"],
1292 conditions={
1293 "StringEquals": {
1294 "cloudwatch:namespace": [
1295 "GCO/HealthMonitor",
1296 "GCO/ManifestProcessor",
1297 ]
1298 }
1299 },
1300 )
1301 )
1303 # Add DynamoDB permissions for templates, webhooks, and job queue
1304 # Tables are created in the global stack and accessed from all regions
1305 project_name = self.config.get_project_name()
1306 global_region = self.config.get_global_region()
1308 self.service_account_role.add_to_policy(
1309 iam.PolicyStatement(
1310 effect=iam.Effect.ALLOW,
1311 actions=[
1312 "dynamodb:GetItem",
1313 "dynamodb:PutItem",
1314 "dynamodb:UpdateItem",
1315 "dynamodb:DeleteItem",
1316 "dynamodb:Query",
1317 "dynamodb:Scan",
1318 ],
1319 resources=[
1320 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates",
1321 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates/index/*",
1322 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks",
1323 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks/index/*",
1324 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs",
1325 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs/index/*",
1326 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints",
1327 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints/index/*",
1328 ],
1329 )
1330 )
1332 # Add S3 permissions for model weights bucket (used by inference init containers)
1333 self.service_account_role.add_to_policy(
1334 iam.PolicyStatement(
1335 effect=iam.Effect.ALLOW,
1336 actions=[
1337 "s3:GetObject",
1338 "s3:ListBucket",
1339 ],
1340 resources=[
1341 f"arn:aws:s3:::{project_name}-*",
1342 f"arn:aws:s3:::{project_name}-*/*",
1343 ],
1344 )
1345 )
1347 # KMS decrypt for model weights bucket (S3-scoped)
1348 self.service_account_role.add_to_policy(
1349 iam.PolicyStatement(
1350 effect=iam.Effect.ALLOW,
1351 actions=["kms:Decrypt", "kms:GenerateDataKey"],
1352 resources=[f"arn:aws:kms:*:{self.account}:key/*"],
1353 conditions={
1354 "StringLike": {
1355 "kms:ViaService": "s3.*.amazonaws.com",
1356 }
1357 },
1358 )
1359 )
1361 # Create KEDA operator IAM role for SQS access
1362 self._create_keda_operator_role()
1364 # Create Pod Identity Associations for all service accounts
1365 self._create_pod_identity_associations()
1367 def _create_keda_operator_role(self) -> None:
1368 """Create IAM role for KEDA operator service account using EKS Pod Identity.
1370 This role allows the KEDA operator to access SQS queues for scaling
1371 based on queue depth. The role is assumed by the keda-operator service
1372 account in the keda namespace.
1373 """
1374 # Create IAM role with IRSA (OIDC) trust + Pod Identity trust
1375 self.keda_operator_role = GCORegionalStack._create_irsa_role(
1376 self,
1377 "KedaOperatorRole",
1378 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
1379 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
1380 service_account_names=["keda-operator"],
1381 namespaces=["keda"],
1382 )
1384 # Add SQS permissions for KEDA to read queue metrics
1385 self.keda_operator_role.add_to_policy(
1386 iam.PolicyStatement(
1387 effect=iam.Effect.ALLOW,
1388 actions=[
1389 "sqs:GetQueueAttributes",
1390 "sqs:GetQueueUrl",
1391 ],
1392 resources=[
1393 self.job_queue.queue_arn,
1394 self.job_dlq.queue_arn,
1395 ],
1396 )
1397 )
1399 def _create_pod_identity_associations(self) -> None:
1400 """Create EKS Pod Identity Associations for all service accounts.
1402 Pod Identity is the recommended mechanism for EKS Auto Mode. Each
1403 association links an IAM role to a Kubernetes service account in a
1404 specific namespace. EKS manages credential injection automatically.
1406 Stores associations in self._pod_identity_associations so the
1407 kubectl-applier custom resource can declare an explicit dependency,
1408 ensuring credentials are available before workloads start.
1409 """
1410 self._pod_identity_associations: list[Any] = []
1412 # GCO service account — used by health-monitor, manifest-processor, inference-monitor
1413 for namespace in ["gco-system", "gco-jobs", "gco-inference"]:
1414 assoc = eks_l1.CfnPodIdentityAssociation(
1415 self,
1416 f"PodIdentity-gco-sa-{namespace}",
1417 cluster_name=self.cluster.cluster_name,
1418 namespace=namespace,
1419 service_account="gco-service-account",
1420 role_arn=self.service_account_role.role_arn,
1421 )
1422 self._pod_identity_associations.append(assoc)
1424 # KEDA operator — needs SQS access for queue-based scaling
1425 keda_assoc = eks_l1.CfnPodIdentityAssociation(
1426 self,
1427 "PodIdentity-keda-operator",
1428 cluster_name=self.cluster.cluster_name,
1429 namespace="keda",
1430 service_account="keda-operator",
1431 role_arn=self.keda_operator_role.role_arn,
1432 )
1433 self._pod_identity_associations.append(keda_assoc)
1435 # EFS CSI driver — needs EFS access for shared storage
1436 efs_assoc = eks_l1.CfnPodIdentityAssociation(
1437 self,
1438 "PodIdentity-efs-csi",
1439 cluster_name=self.cluster.cluster_name,
1440 namespace="kube-system",
1441 service_account="efs-csi-controller-sa",
1442 role_arn=self.efs_csi_role.role_arn,
1443 )
1444 self._pod_identity_associations.append(efs_assoc)
1446 # CloudWatch agent — needs CloudWatch access for observability
1447 cw_assoc = eks_l1.CfnPodIdentityAssociation(
1448 self,
1449 "PodIdentity-cloudwatch",
1450 cluster_name=self.cluster.cluster_name,
1451 namespace="amazon-cloudwatch",
1452 service_account="cloudwatch-agent",
1453 role_arn=self.cloudwatch_role.role_arn,
1454 )
1455 self._pod_identity_associations.append(cw_assoc)
1457 # FSx CSI driver — only when FSx is enabled (created later in _create_fsx_lustre)
1458 # The FSx Pod Identity association is added in _create_fsx_lustre instead
1460 def _create_kubectl_lambda(self) -> None:
1461 """Create Lambda function to apply Kubernetes manifests using Python client.
1463 Note: This creates the Lambda and provider but does NOT create the custom resource.
1464 The custom resource is created in _apply_kubernetes_manifests() after ALB is created,
1465 so that target group ARNs can be passed to the manifests.
1466 """
1467 project_name = self.config.get_project_name()
1469 # Create IAM role for kubectl Lambda
1470 kubectl_lambda_role = iam.Role(
1471 self,
1472 "KubectlLambdaRole",
1473 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
1474 managed_policies=[
1475 iam.ManagedPolicy.from_aws_managed_policy_name(
1476 "service-role/AWSLambdaVPCAccessExecutionRole"
1477 ),
1478 iam.ManagedPolicy.from_aws_managed_policy_name(
1479 "service-role/AWSLambdaBasicExecutionRole"
1480 ),
1481 ],
1482 )
1484 # Add EKS permissions
1485 kubectl_lambda_role.add_to_policy(
1486 iam.PolicyStatement(
1487 actions=[
1488 "eks:DescribeCluster",
1489 "eks:ListClusters",
1490 ],
1491 resources=[self.cluster.cluster_arn],
1492 )
1493 )
1495 # Add permissions to assume cluster admin role
1496 kubectl_lambda_role.add_to_policy(
1497 iam.PolicyStatement(actions=["sts:AssumeRole"], resources=["*"])
1498 )
1500 # Create security group for kubectl Lambda
1501 kubectl_lambda_sg = ec2.SecurityGroup(
1502 self,
1503 "KubectlLambdaSG",
1504 vpc=self.vpc,
1505 description="Security group for kubectl Lambda to access EKS cluster",
1506 security_group_name=f"{self.config.get_project_name()}-kubectl-lambda-sg-{self.deployment_region}",
1507 allow_all_outbound=True, # Lambda needs outbound access to EKS API
1508 )
1510 # Allow Lambda security group to access EKS cluster security group on port 443
1511 # The EKS cluster security group is automatically created by EKS
1512 self.cluster.cluster_security_group.add_ingress_rule(
1513 peer=kubectl_lambda_sg,
1514 connection=ec2.Port.tcp(443),
1515 description="Allow kubectl Lambda to access EKS API",
1516 )
1518 # Create Lambda function (Python-only, no Docker!)
1519 # Store function name as string attribute for cross-stack references
1520 # This avoids CDK cross-environment resolution issues when account is unresolved
1521 self.kubectl_lambda_function_name = f"{project_name}-kubectl-{self.deployment_region}"
1522 self.kubectl_lambda = lambda_.Function(
1523 self,
1524 "KubectlApplierFunction",
1525 function_name=self.kubectl_lambda_function_name,
1526 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
1527 handler="handler.lambda_handler",
1528 code=lambda_.Code.from_asset("lambda/kubectl-applier-simple-build"),
1529 timeout=Duration.minutes(15), # Max Lambda timeout
1530 memory_size=512,
1531 role=kubectl_lambda_role,
1532 vpc=self.vpc,
1533 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
1534 security_groups=[kubectl_lambda_sg], # Use the security group we created
1535 environment={
1536 "CLUSTER_NAME": self.cluster.cluster_name,
1537 "REGION": self.deployment_region,
1538 },
1539 tracing=lambda_.Tracing.ACTIVE,
1540 )
1542 # Add EKS access entry for the Lambda role to authenticate with the cluster
1543 # This grants the Lambda role cluster admin permissions
1544 eks.AccessEntry(
1545 self,
1546 "KubectlLambdaAccessEntry",
1547 cluster=self.cluster, # type: ignore[arg-type]
1548 principal=kubectl_lambda_role.role_arn,
1549 access_policies=[
1550 eks.AccessPolicy.from_access_policy_name(
1551 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
1552 )
1553 ],
1554 )
1556 # Create log group for kubectl provider
1557 kubectl_provider_log_group = logs.LogGroup(
1558 self,
1559 "KubectlProviderLogGroup",
1560 retention=logs.RetentionDays.ONE_WEEK,
1561 removal_policy=RemovalPolicy.DESTROY,
1562 )
1564 # Create custom resource provider (stored for use in _apply_kubernetes_manifests)
1565 self.kubectl_provider = cr.Provider(
1566 self,
1567 "KubectlProvider",
1568 on_event_handler=self.kubectl_lambda,
1569 log_group=kubectl_provider_log_group,
1570 )
1572 # cdk-nag suppression: the kubectl-applier Lambda requires broad
1573 # EKS and Kubernetes API access to apply arbitrary manifests.
1574 from cdk_nag import NagSuppressions
1576 NagSuppressions.add_resource_suppressions(
1577 kubectl_lambda_role,
1578 [
1579 {
1580 "id": "AwsSolutions-IAM5",
1581 "reason": (
1582 "The kubectl-applier Lambda requires broad EKS and Kubernetes API "
1583 "access to apply arbitrary manifests (RBAC, ServiceAccounts, "
1584 "Deployments, Jobs, NetworkPolicies) across multiple namespaces. "
1585 "Resource: * is required because the set of Kubernetes resources "
1586 "is dynamic and not known at synth time."
1587 ),
1588 "appliesTo": ["Resource::*"],
1589 },
1590 ],
1591 apply_to_children=True,
1592 )
1594 def _apply_kubernetes_manifests(self) -> None:
1595 """Apply Kubernetes manifests using the kubectl Lambda custom resource.
1597 This is called after ALB security group and EFS are created.
1598 The Ingress will use the security group ID to create the ALB.
1599 """
1601 # Get public subnet IDs for Ingress annotation (currently unused but kept for future use)
1602 # public_subnet_ids = ",".join([subnet.subnet_id for subnet in self.vpc.public_subnets])
1604 # Apply manifests using custom resource
1605 # Build image replacements dict
1606 # Include a deployment timestamp to force pod rollouts when code changes
1607 from datetime import UTC, datetime
1609 deployment_timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
1611 # Get resource thresholds from config
1612 thresholds = self.config.get_resource_thresholds()
1614 # Get manifest processor resource quotas.
1615 # Resource quotas and the security/image policy now live under the
1616 # shared job_validation_policy section because both the REST
1617 # manifest_processor and the SQS queue_processor read them. Service-
1618 # specific knobs (replicas, validation_enabled, max_request_body_bytes,
1619 # etc.) stay under manifest_processor.
1620 mp_config = self.node.try_get_context("manifest_processor") or {}
1621 job_policy = self.node.try_get_context("job_validation_policy") or {}
1622 job_quotas = job_policy.get("resource_quotas", {})
1624 image_replacements = {
1625 "{{HEALTH_MONITOR_IMAGE}}": self.health_monitor_image.image_uri,
1626 "{{MANIFEST_PROCESSOR_IMAGE}}": self.manifest_processor_image.image_uri,
1627 "{{INFERENCE_MONITOR_IMAGE}}": self.inference_monitor_image.image_uri,
1628 "{{CLUSTER_NAME}}": self.cluster.cluster_name,
1629 "{{REGION}}": self.deployment_region,
1630 "{{AUTH_SECRET_ARN}}": self.auth_secret_arn,
1631 "{{SERVICE_ACCOUNT_ROLE_ARN}}": self.service_account_role.role_arn,
1632 "{{EFS_FILE_SYSTEM_ID}}": self.efs_file_system.file_system_id,
1633 "{{EFS_ACCESS_POINT_ID}}": self.efs_access_point.access_point_id,
1634 "{{JOB_QUEUE_URL}}": self.job_queue.queue_url,
1635 "{{JOB_QUEUE_ARN}}": self.job_queue.queue_arn,
1636 "{{DEPLOYMENT_TIMESTAMP}}": deployment_timestamp,
1637 # Resource thresholds
1638 "{{CPU_THRESHOLD}}": str(thresholds.cpu_threshold),
1639 "{{MEMORY_THRESHOLD}}": str(thresholds.memory_threshold),
1640 "{{GPU_THRESHOLD}}": str(thresholds.gpu_threshold),
1641 "{{PENDING_PODS_THRESHOLD}}": str(thresholds.pending_pods_threshold),
1642 "{{PENDING_REQUESTED_CPU_VCPUS}}": str(thresholds.pending_requested_cpu_vcpus),
1643 "{{PENDING_REQUESTED_MEMORY_GB}}": str(thresholds.pending_requested_memory_gb),
1644 "{{PENDING_REQUESTED_GPUS}}": str(thresholds.pending_requested_gpus),
1645 # DynamoDB table names (from global stack)
1646 "{{TEMPLATES_TABLE_NAME}}": f"{self.config.get_project_name()}-job-templates",
1647 "{{WEBHOOKS_TABLE_NAME}}": f"{self.config.get_project_name()}-webhooks",
1648 "{{JOBS_TABLE_NAME}}": f"{self.config.get_project_name()}-jobs",
1649 # DynamoDB region (global stack region, may differ from cluster region)
1650 "{{DYNAMODB_REGION}}": self.config.get_global_region(),
1651 # Manifest processor resource quotas (sourced from shared policy).
1652 "{{MP_MAX_CPU_PER_MANIFEST}}": str(job_quotas.get("max_cpu_per_manifest", "10")),
1653 "{{MP_MAX_MEMORY_PER_MANIFEST}}": str(
1654 job_quotas.get("max_memory_per_manifest", "32Gi")
1655 ),
1656 "{{MP_MAX_GPU_PER_MANIFEST}}": str(job_quotas.get("max_gpu_per_manifest", 4)),
1657 # Manifest processor namespace allowlist (sourced from shared policy).
1658 # Both the REST manifest processor and the SQS queue processor
1659 # read from job_validation_policy.allowed_namespaces so a single
1660 # edit takes effect on both submission paths at the next deploy.
1661 "{{MP_ALLOWED_NAMESPACES}}": ",".join(
1662 job_policy.get("allowed_namespaces", ["default", "gco-jobs"])
1663 ),
1664 # Manifest processor request body size cap (HTTP 413 middleware).
1665 # Lives at cdk.json::manifest_processor.max_request_body_bytes.
1666 "{{MP_MAX_REQUEST_BODY_BYTES}}": str(
1667 mp_config.get("max_request_body_bytes", 1_048_576)
1668 ),
1669 }
1671 # Add queue processor replacements if enabled
1672 qp_config = self.node.try_get_context("queue_processor") or {}
1674 # Add VPC endpoint CIDR replacements for network policy restrictions
1675 # Generates a YAML block of ipBlock entries from the vpc_endpoint_cidrs array.
1676 # The placeholder {{VPC_ENDPOINT_CIDR_BLOCKS}} sits at 8-space indentation in
1677 # the manifest, so the first entry needs no leading indent (the manifest provides
1678 # it) and subsequent entries are indented to align.
1679 vpc_endpoint_cidrs = self.node.try_get_context("vpc_endpoint_cidrs") or ["10.0.0.0/16"]
1680 cidr_lines = []
1681 for i, cidr in enumerate(vpc_endpoint_cidrs):
1682 prefix = "" if i == 0 else " "
1683 cidr_lines.append(f'{prefix}- ipBlock:\n cidr: "{cidr}"')
1684 image_replacements["{{VPC_ENDPOINT_CIDR_BLOCKS}}"] = "\n".join(cidr_lines)
1686 # Resource governance for gco-jobs namespace: ResourceQuota caps aggregate
1687 # resource consumption across the namespace, LimitRange caps per-container
1688 # maxima. Values come from cdk.json `resource_quota` context with defaults
1689 # sized for a modest multi-tenant dev cluster.
1690 resource_quota = self.node.try_get_context("resource_quota") or {}
1691 image_replacements["{{QUOTA_MAX_CPU}}"] = str(resource_quota.get("max_cpu", "100"))
1692 image_replacements["{{QUOTA_MAX_MEMORY}}"] = str(resource_quota.get("max_memory", "512Gi"))
1693 image_replacements["{{QUOTA_MAX_GPU}}"] = str(resource_quota.get("max_gpu", "32"))
1694 image_replacements["{{QUOTA_MAX_PODS}}"] = str(resource_quota.get("max_pods", "50"))
1695 image_replacements["{{LIMIT_MAX_CPU}}"] = str(resource_quota.get("container_max_cpu", "10"))
1696 image_replacements["{{LIMIT_MAX_MEMORY}}"] = str(
1697 resource_quota.get("container_max_memory", "64Gi")
1698 )
1699 image_replacements["{{LIMIT_MAX_GPU}}"] = str(resource_quota.get("container_max_gpu", "4"))
1701 if self.queue_processor_enabled: 1701 ↛ 1775line 1701 didn't jump to line 1775 because the condition on line 1701 was always true
1702 image_replacements["{{QUEUE_PROCESSOR_IMAGE}}"] = self.queue_processor_image.image_uri
1703 image_replacements["{{QP_POLLING_INTERVAL}}"] = str(
1704 qp_config.get("polling_interval", 10)
1705 )
1706 image_replacements["{{QP_MAX_CONCURRENT_JOBS}}"] = str(
1707 qp_config.get("max_concurrent_jobs", 10)
1708 )
1709 image_replacements["{{QP_MESSAGES_PER_JOB}}"] = str(
1710 qp_config.get("messages_per_job", 1)
1711 )
1712 image_replacements["{{QP_SUCCESSFUL_JOBS_HISTORY}}"] = str(
1713 qp_config.get("successful_jobs_history", 20)
1714 )
1715 image_replacements["{{QP_FAILED_JOBS_HISTORY}}"] = str(
1716 qp_config.get("failed_jobs_history", 10)
1717 )
1718 image_replacements["{{QP_ALLOWED_NAMESPACES}}"] = ",".join(
1719 job_policy.get("allowed_namespaces", ["default", "gco-jobs"])
1720 )
1721 # Resource caps, image allowlist, and security policy are shared
1722 # with the REST manifest processor. Source them from the
1723 # job_validation_policy section so a single change in cdk.json
1724 # takes effect on both submission paths at the next deploy.
1725 image_replacements["{{QP_MAX_GPU_PER_MANIFEST}}"] = str(
1726 job_quotas.get("max_gpu_per_manifest", 4)
1727 )
1728 image_replacements["{{QP_MAX_CPU_PER_MANIFEST}}"] = str(
1729 job_quotas.get("max_cpu_per_manifest", "10")
1730 )
1731 image_replacements["{{QP_MAX_MEMORY_PER_MANIFEST}}"] = str(
1732 job_quotas.get("max_memory_per_manifest", "32Gi")
1733 )
1734 image_replacements["{{QP_TRUSTED_REGISTRIES}}"] = ",".join(
1735 job_policy.get("trusted_registries", [])
1736 )
1737 image_replacements["{{QP_TRUSTED_DOCKERHUB_ORGS}}"] = ",".join(
1738 job_policy.get("trusted_dockerhub_orgs", [])
1739 )
1741 # Security policy toggles — shared with the REST manifest_processor.
1742 # Both services read the same cdk.json section so a single policy
1743 # flip (e.g. block_run_as_root: true) takes effect on both paths.
1744 security_policy = job_policy.get("manifest_security_policy", {})
1746 def _policy_str(v: object) -> str:
1747 return "true" if v else "false"
1749 image_replacements["{{QP_BLOCK_PRIVILEGED}}"] = _policy_str(
1750 security_policy.get("block_privileged", True)
1751 )
1752 image_replacements["{{QP_BLOCK_PRIVILEGE_ESCALATION}}"] = _policy_str(
1753 security_policy.get("block_privilege_escalation", True)
1754 )
1755 image_replacements["{{QP_BLOCK_HOST_NETWORK}}"] = _policy_str(
1756 security_policy.get("block_host_network", True)
1757 )
1758 image_replacements["{{QP_BLOCK_HOST_PID}}"] = _policy_str(
1759 security_policy.get("block_host_pid", True)
1760 )
1761 image_replacements["{{QP_BLOCK_HOST_IPC}}"] = _policy_str(
1762 security_policy.get("block_host_ipc", True)
1763 )
1764 image_replacements["{{QP_BLOCK_HOST_PATH}}"] = _policy_str(
1765 security_policy.get("block_host_path", True)
1766 )
1767 image_replacements["{{QP_BLOCK_ADDED_CAPABILITIES}}"] = _policy_str(
1768 security_policy.get("block_added_capabilities", True)
1769 )
1770 image_replacements["{{QP_BLOCK_RUN_AS_ROOT}}"] = _policy_str(
1771 security_policy.get("block_run_as_root", False)
1772 )
1774 # Add Valkey endpoint if enabled
1775 if hasattr(self, "valkey_cache") and self.valkey_cache: 1775 ↛ 1776line 1775 didn't jump to line 1776 because the condition on line 1775 was never true
1776 image_replacements["{{VALKEY_ENDPOINT}}"] = self.valkey_cache.attr_endpoint_address
1777 image_replacements["{{VALKEY_PORT}}"] = self.valkey_cache.attr_endpoint_port
1779 # Add Aurora pgvector endpoint if enabled
1780 if hasattr(self, "aurora_cluster") and self.aurora_cluster:
1781 image_replacements["{{AURORA_PGVECTOR_ENDPOINT}}"] = (
1782 self.aurora_cluster.cluster_endpoint.hostname
1783 )
1784 image_replacements["{{AURORA_PGVECTOR_READER_ENDPOINT}}"] = (
1785 self.aurora_cluster.cluster_read_endpoint.hostname
1786 )
1787 image_replacements["{{AURORA_PGVECTOR_PORT}}"] = str(
1788 self.aurora_cluster.cluster_endpoint.port
1789 )
1790 if self.aurora_cluster.secret: 1790 ↛ 1796line 1790 didn't jump to line 1796 because the condition on line 1790 was always true
1791 image_replacements["{{AURORA_PGVECTOR_SECRET_ARN}}"] = (
1792 self.aurora_cluster.secret.secret_arn
1793 )
1795 # Add FSx replacements if enabled
1796 if self.fsx_file_system:
1797 image_replacements["{{FSX_FILE_SYSTEM_ID}}"] = self.fsx_file_system.ref
1798 image_replacements["{{FSX_DNS_NAME}}"] = self.fsx_file_system.attr_dns_name
1799 image_replacements["{{FSX_MOUNT_NAME}}"] = self.fsx_file_system.attr_lustre_mount_name
1800 image_replacements["{{PRIVATE_SUBNET_ID}}"] = self.vpc.private_subnets[0].subnet_id
1801 image_replacements["{{FSX_SECURITY_GROUP_ID}}"] = (
1802 self.fsx_security_group.security_group_id
1803 )
1805 kubectl_apply = CustomResource(
1806 self,
1807 "KubectlApplyManifests",
1808 service_token=self.kubectl_provider.service_token,
1809 properties={
1810 "ClusterName": self.cluster.cluster_name,
1811 "Region": self.deployment_region,
1812 "SkipDeletionOnStackDelete": "true", # Don't delete resources on stack deletion
1813 "ImageReplacements": image_replacements,
1814 # Include FSx file system ID directly to force update when FSx changes
1815 "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none",
1816 # Force update on each deployment to trigger pod rollouts
1817 "DeploymentTimestamp": deployment_timestamp,
1818 },
1819 )
1821 # Ensure manifests are applied after cluster, EFS, and FSx are ready
1822 # Note: ALB is created by EKS Auto Mode when Ingress is applied
1823 kubectl_apply.node.add_dependency(self.cluster)
1824 kubectl_apply.node.add_dependency(self.efs_file_system)
1825 if self.fsx_file_system:
1826 kubectl_apply.node.add_dependency(self.fsx_file_system)
1828 # Wait for EKS to have patched the IRSA role ARN onto each managed
1829 # addon's service account before the kubectl Lambda rollout-restarts
1830 # the controllers at the end of this invocation. Otherwise the
1831 # restart sees the old (annotation-less) SA, the mutating webhook
1832 # can't inject AWS_ROLE_ARN, and the new pods are just as
1833 # credential-less as the ones they replaced. The symptom is
1834 # controller pods silently failing with "no EC2 IMDS role found" —
1835 # for EFS/FSx that manifests as PVCs stuck Pending forever, for
1836 # CloudWatch as missing Container Insights metrics. See the
1837 # UpdateEfsCsiAddonRole custom resource in _create_efs_csi_driver_addon
1838 # for the full rationale.
1839 for attr in (
1840 "_efs_csi_addon_role_update",
1841 "_fsx_csi_addon_role_update",
1842 "_cloudwatch_addon_role_update",
1843 ):
1844 update_cr = getattr(self, attr, None)
1845 if update_cr is not None:
1846 kubectl_apply.node.add_dependency(update_cr)
1848 # Ensure Pod Identity associations exist before workloads start,
1849 # so pods get IAM credentials on first launch
1850 for assoc in self._pod_identity_associations:
1851 kubectl_apply.node.add_dependency(assoc)
1853 # Install Helm charts (KEDA, etc.) after base manifests are applied
1854 # This ensures namespaces and RBAC are in place before Helm installations
1855 helm_install = CustomResource(
1856 self,
1857 "HelmInstallCharts",
1858 service_token=self.helm_installer_provider.service_token,
1859 properties={
1860 "ClusterName": self.cluster.cluster_name,
1861 "Region": self.deployment_region,
1862 # Enable core AI/ML infrastructure charts by default
1863 # NVIDIA Network Operator toggled via cdk.json nvidia_network_operator.enabled
1864 "EnabledCharts": self._get_enabled_helm_charts(),
1865 # Override chart values if needed
1866 "Charts": {},
1867 # Pass IAM role ARNs for service account annotations
1868 "KedaOperatorRoleArn": self.keda_operator_role.role_arn,
1869 # Force re-invocation on every deployment to pick up charts.yaml changes
1870 "DeploymentTimestamp": deployment_timestamp,
1871 },
1872 )
1874 # Helm charts depend on kubectl manifests being applied first
1875 helm_install.node.add_dependency(kubectl_apply)
1877 # Apply CRD-dependent manifests after Helm installs the CRDs.
1878 # KEDA ScaledJob/ScaledObject require the KEDA CRDs to exist first.
1879 # This second kubectl pass runs after Helm and applies only those resources.
1880 kubectl_apply_post_helm = CustomResource(
1881 self,
1882 "KubectlApplyPostHelmManifests",
1883 service_token=self.kubectl_provider.service_token,
1884 properties={
1885 "ClusterName": self.cluster.cluster_name,
1886 "Region": self.deployment_region,
1887 "SkipDeletionOnStackDelete": "true",
1888 "ImageReplacements": image_replacements,
1889 "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none",
1890 "DeploymentTimestamp": deployment_timestamp,
1891 # PostHelm: "true" tells the handler to apply only post-helm-* manifests
1892 "PostHelm": "true",
1893 },
1894 )
1896 # Must run after Helm has installed the CRDs
1897 kubectl_apply_post_helm.node.add_dependency(helm_install)
1899 # Create GA registration custom resource AFTER manifests are applied
1900 # This waits for the Ingress to create the ALB and registers it with GA
1901 #
1902 # IMPORTANT: We include a deployment timestamp to force CloudFormation to
1903 # re-invoke the Lambda on every deployment. This ensures the ALB is always
1904 # registered with the Global Accelerator, even if other properties haven't changed.
1905 # Without this, CloudFormation may skip the custom resource if it thinks
1906 # nothing has changed, leaving the ALB unregistered after GA recreation.
1907 deployment_timestamp = str(int(time.time()))
1909 ga_registration = CustomResource(
1910 self,
1911 "GaRegistration",
1912 service_token=self.ga_registration_provider.service_token,
1913 properties={
1914 "ClusterName": self.cluster.cluster_name,
1915 "Region": self.deployment_region,
1916 "EndpointGroupArn": self.endpoint_group_arn,
1917 "IngressName": "gco-ingress",
1918 "Namespace": "gco-system",
1919 # Pass global region and project name for SSM storage
1920 "GlobalRegion": self.config.get_global_region(),
1921 "ProjectName": self.config.get_project_name(),
1922 # Force re-invocation on every deployment
1923 "DeploymentTimestamp": deployment_timestamp,
1924 },
1925 )
1927 # GA registration must happen after manifests are applied
1928 ga_registration.node.add_dependency(kubectl_apply)
    def _create_ga_registration_lambda(self) -> None:
        """Create Lambda function to register Ingress-created ALB with Global Accelerator.

        This Lambda:
        1. Waits for the Ingress to get an ALB address
        2. Gets the ALB ARN from the address
        3. Registers that ALB with Global Accelerator

        This is necessary because the ALB is created by the AWS Load Balancer Controller
        (not CDK), so we can't directly reference its ARN.

        Side effects (attributes consumed later when the GaRegistration custom
        resource is created):
        - ``self.ga_registration_provider``: cr.Provider wrapping this Lambda
        - ``self.endpoint_group_arn``: GA endpoint group ARN read from SSM
        """
        project_name = self.config.get_project_name()

        # Create Lambda function for GA registration using external handler
        # (lambda/ga-registration). Placed in the VPC so it can reach the
        # EKS API endpoint over private networking.
        ga_registration_lambda = lambda_.Function(
            self,
            "GaRegistrationFunction",
            runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
            handler="handler.lambda_handler",
            code=lambda_.Code.from_asset("lambda/ga-registration"),
            timeout=Duration.minutes(15),  # Max Lambda timeout; handler uses 14 min budget
            memory_size=256,
            vpc=self.vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
            environment={
                "CLUSTER_NAME": self.cluster.cluster_name,
                "REGION": self.deployment_region,
            },
            tracing=lambda_.Tracing.ACTIVE,
        )

        # Grant permissions
        # DescribeCluster: the handler needs cluster details to build a
        # kubeconfig and read the Ingress status.
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["eks:DescribeCluster"],
                resources=[self.cluster.cluster_arn],
            )
        )
        # ELB Describe* calls do not support resource-level scoping, hence
        # Resource: * (cdk-nag suppression added at the bottom of this method).
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "elasticloadbalancing:DescribeLoadBalancers",
                    "elasticloadbalancing:DescribeTags",  # Required for tag-based ALB detection
                ],
                resources=["*"],
            )
        )
        # Endpoint (de)registration against the Global Accelerator endpoint group.
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "globalaccelerator:AddEndpoints",
                    "globalaccelerator:RemoveEndpoints",
                    "globalaccelerator:UpdateEndpointGroup",
                    "globalaccelerator:DescribeEndpointGroup",
                ],
                resources=["*"],
            )
        )
        # SSM access is scoped to this project's parameter namespace in the
        # global region (where GA/ALB discovery parameters live).
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["ssm:GetParameter", "ssm:PutParameter", "ssm:DeleteParameter"],
                resources=[
                    f"arn:aws:ssm:{self.config.get_global_region()}:{self.account}:parameter/{project_name}/*"
                ],
            )
        )

        # Add EKS access entry for the Lambda role so it can query the
        # Kubernetes API (Ingress status) as cluster admin.
        if ga_registration_lambda.role is not None:
            eks.AccessEntry(
                self,
                "GaRegistrationLambdaAccessEntry",
                cluster=self.cluster,  # type: ignore[arg-type]
                principal=ga_registration_lambda.role.role_arn,
                access_policies=[
                    eks.AccessPolicy.from_access_policy_name(
                        "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
                    )
                ],
            )

        # Allow Lambda to access EKS API
        # NOTE(review): this opens 443 from the entire VPC CIDR rather than a
        # dedicated Lambda security group (compare _create_helm_installer_lambda,
        # which peers on its own SG) — confirm the broader rule is intentional.
        self.cluster.cluster_security_group.add_ingress_rule(
            peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
            connection=ec2.Port.tcp(443),
            description="Allow GA registration Lambda to access EKS API",
        )

        # Get endpoint group ARN from SSM (stored in global region).
        # Uses the shared AwsCustomResource execution role (pre-created in
        # _create_aws_custom_resource_role) — the SSM GetParameter
        # statement was attached there up-front so the Lambda never hits
        # an IAM propagation race on cold deploys.
        global_region = self.config.get_global_region()
        get_endpoint_group_arn = cr.AwsCustomResource(
            self,
            "GetEndpointGroupArn",
            on_create=cr.AwsSdkCall(
                service="SSM",
                action="getParameter",
                parameters={"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"},
                region=global_region,
                physical_resource_id=cr.PhysicalResourceId.of(
                    f"{project_name}-get-endpoint-group-arn-{self.deployment_region}"
                ),
            ),
            # on_update repeats the read so stack updates pick up a re-created
            # endpoint group's new ARN.
            on_update=cr.AwsSdkCall(
                service="SSM",
                action="getParameter",
                parameters={"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"},
                region=global_region,
            ),
            role=self.aws_custom_resource_role,
        )
        get_endpoint_group_arn.node.add_dependency(self.aws_custom_resource_role)

        # Deploy-time token resolving to the SSM parameter's value.
        endpoint_group_arn = get_endpoint_group_arn.get_response_field("Parameter.Value")

        # Create log group for GA registration provider (short retention;
        # destroyed with the stack).
        ga_provider_log_group = logs.LogGroup(
            self,
            "GaRegistrationProviderLogGroup",
            retention=logs.RetentionDays.ONE_WEEK,
            removal_policy=RemovalPolicy.DESTROY,
        )

        # Create provider and custom resource
        ga_provider = cr.Provider(
            self,
            "GaRegistrationProvider",
            on_event_handler=ga_registration_lambda,
            log_group=ga_provider_log_group,
        )

        # Store for use after kubectl apply (the GaRegistration custom
        # resource is created elsewhere, once manifests are applied).
        self.ga_registration_provider = ga_provider
        self.endpoint_group_arn = endpoint_group_arn

        # cdk-nag suppression: the GA registration Lambda needs broad
        # Global Accelerator and ELB Describe access with Resource: *.
        from cdk_nag import NagSuppressions

        NagSuppressions.add_resource_suppressions(
            ga_registration_lambda,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "The GA registration Lambda needs elasticloadbalancing:Describe* "
                        "and globalaccelerator:* to discover the Ingress-created ALB and "
                        "register it with Global Accelerator. These APIs do not support "
                        "resource-level IAM scoping — Resource: * is the only valid form."
                    ),
                    "appliesTo": ["Resource::*"],
                },
            ],
            apply_to_children=True,
        )
2093 def _get_enabled_helm_charts(self) -> list[str]:
2094 """Return the list of Helm charts to install based on cdk.json helm config.
2096 Reads the 'helm' section from cdk.json context. Each key maps to one or
2097 more Helm chart names. Charts are returned in dependency order with Kueue
2098 last (its webhook intercepts all Job/Deployment mutations).
2099 """
2100 helm_config = self.node.try_get_context("helm") or {}
2102 # Mapping from cdk.json helm key → Helm chart name(s) in charts.yaml
2103 # Order matters: dependencies first, Kueue last
2104 chart_map: list[tuple[str, list[str]]] = [
2105 ("keda", ["keda"]),
2106 ("nvidia_gpu_operator", ["nvidia-gpu-operator"]),
2107 ("nvidia_dra_driver", ["nvidia-dra-driver"]),
2108 ("nvidia_network_operator", ["nvidia-network-operator"]),
2109 ("aws_efa_device_plugin", ["aws-efa-device-plugin"]),
2110 ("aws_neuron_device_plugin", ["aws-neuron-device-plugin"]),
2111 ("volcano", ["volcano"]),
2112 ("kuberay", ["kuberay-operator"]),
2113 ("cert_manager", ["cert-manager"]),
2114 ("slurm", ["slinky-slurm-operator", "slinky-slurm"]),
2115 ("yunikorn", ["yunikorn"]),
2116 ("kueue", ["kueue"]), # Must be last
2117 ]
2119 enabled_charts = []
2120 for config_key, chart_names in chart_map:
2121 chart_config = helm_config.get(config_key, {})
2122 if chart_config.get("enabled", True): 2122 ↛ 2120line 2122 didn't jump to line 2120 because the condition on line 2122 was always true
2123 enabled_charts.extend(chart_names)
2125 return enabled_charts
2127 def _create_helm_installer_lambda(self) -> None:
2128 """Create Lambda function to install Helm charts (KEDA, NVIDIA DRA, etc.).
2130 This Lambda uses Helm to install charts that require complex setup
2131 (TLS certificates, CRDs, etc.) that are difficult to manage via raw manifests.
2133 Charts installed:
2134 - KEDA: Kubernetes Event-Driven Autoscaling (enabled by default)
2135 - NVIDIA DRA Driver: Dynamic Resource Allocation for GPUs (disabled by default)
2136 """
2137 project_name = self.config.get_project_name()
2139 # Create IAM role for Helm installer Lambda
2140 helm_lambda_role = iam.Role(
2141 self,
2142 "HelmInstallerLambdaRole",
2143 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
2144 managed_policies=[
2145 iam.ManagedPolicy.from_aws_managed_policy_name(
2146 "service-role/AWSLambdaVPCAccessExecutionRole"
2147 ),
2148 iam.ManagedPolicy.from_aws_managed_policy_name(
2149 "service-role/AWSLambdaBasicExecutionRole"
2150 ),
2151 ],
2152 )
2154 # Add EKS permissions
2155 helm_lambda_role.add_to_policy(
2156 iam.PolicyStatement(
2157 actions=["eks:DescribeCluster", "eks:ListClusters"],
2158 resources=[self.cluster.cluster_arn],
2159 )
2160 )
2162 # Create security group for Helm installer Lambda
2163 helm_lambda_sg = ec2.SecurityGroup(
2164 self,
2165 "HelmInstallerLambdaSG",
2166 vpc=self.vpc,
2167 description="Security group for Helm installer Lambda to access EKS cluster",
2168 security_group_name=f"{project_name}-helm-lambda-sg-{self.deployment_region}",
2169 allow_all_outbound=True,
2170 )
2172 # Allow Lambda to access EKS cluster API
2173 self.cluster.cluster_security_group.add_ingress_rule(
2174 peer=helm_lambda_sg,
2175 connection=ec2.Port.tcp(443),
2176 description="Allow Helm installer Lambda to access EKS API",
2177 )
2179 # Build Docker image for Helm installer Lambda
2180 # Points at helm-installer-build/ which is rebuilt fresh every deploy
2181 # by _build_helm_installer_lambda() in cli/stacks.py
2182 ecr_assets.DockerImageAsset(
2183 self,
2184 "HelmInstallerImage",
2185 directory="lambda/helm-installer-build",
2186 platform=ecr_assets.Platform.LINUX_AMD64,
2187 )
2189 # Create Lambda function using Docker image
2190 # Store function name as string attribute for cross-stack references
2191 # This avoids CDK cross-environment resolution issues when account is unresolved
2192 self.helm_installer_lambda_function_name = f"{project_name}-helm-{self.deployment_region}"
2193 self.helm_installer_lambda = lambda_.DockerImageFunction(
2194 self,
2195 "HelmInstallerFunction",
2196 function_name=self.helm_installer_lambda_function_name,
2197 code=lambda_.DockerImageCode.from_image_asset(
2198 directory="lambda/helm-installer-build",
2199 platform=ecr_assets.Platform.LINUX_AMD64,
2200 ),
2201 timeout=Duration.minutes(15),
2202 memory_size=1024,
2203 architecture=lambda_.Architecture.X86_64,
2204 role=helm_lambda_role,
2205 vpc=self.vpc,
2206 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
2207 security_groups=[helm_lambda_sg],
2208 environment={
2209 "CLUSTER_NAME": self.cluster.cluster_name,
2210 "REGION": self.deployment_region,
2211 },
2212 tracing=lambda_.Tracing.ACTIVE,
2213 )
2215 # Add EKS access entry for the Lambda role
2216 eks.AccessEntry(
2217 self,
2218 "HelmInstallerLambdaAccessEntry",
2219 cluster=self.cluster, # type: ignore[arg-type]
2220 principal=helm_lambda_role.role_arn,
2221 access_policies=[
2222 eks.AccessPolicy.from_access_policy_name(
2223 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
2224 )
2225 ],
2226 )
2228 # Create log group for Helm installer provider
2229 helm_provider_log_group = logs.LogGroup(
2230 self,
2231 "HelmInstallerProviderLogGroup",
2232 retention=logs.RetentionDays.ONE_WEEK,
2233 removal_policy=RemovalPolicy.DESTROY,
2234 )
2236 # Create custom resource provider
2237 self.helm_installer_provider = cr.Provider(
2238 self,
2239 "HelmInstallerProvider",
2240 on_event_handler=self.helm_installer_lambda,
2241 log_group=helm_provider_log_group,
2242 )
2244 # cdk-nag suppression: the Helm installer Lambda requires broad
2245 # EKS and Kubernetes API access to install Helm charts.
2246 from cdk_nag import NagSuppressions
2248 NagSuppressions.add_resource_suppressions(
2249 helm_lambda_role,
2250 [
2251 {
2252 "id": "AwsSolutions-IAM5",
2253 "reason": (
2254 "The Helm installer Lambda requires broad EKS and Kubernetes API "
2255 "access to install Helm charts (KEDA, NVIDIA DRA, etc.) that create "
2256 "CRDs, RBAC rules, and workloads across multiple namespaces. "
2257 "Resource: * is required because the set of Kubernetes resources "
2258 "is dynamic and not known at synth time."
2259 ),
2260 "appliesTo": ["Resource::*"],
2261 },
2262 ],
2263 apply_to_children=True,
2264 )
2266 def _create_efs(self) -> None:
2267 """Create EFS file system for shared storage across jobs.
2269 Creates an EFS file system with mount targets in each private subnet,
2270 allowing pods to share data and persist outputs. The EFS is configured
2271 with:
2272 - Encryption at rest
2273 - Automatic backups
2274 - General Purpose performance mode (suitable for most workloads)
2275 - Bursting throughput mode
2277 Kubernetes resources (StorageClass, PV, PVC) are created via manifests.
2278 """
2279 project_name = self.config.get_project_name()
2281 # Create security group for EFS
2282 self.efs_security_group = ec2.SecurityGroup(
2283 self,
2284 "EfsSecurityGroup",
2285 vpc=self.vpc,
2286 description=f"Security group for {project_name} EFS in {self.deployment_region}",
2287 security_group_name=f"{project_name}-efs-sg-{self.deployment_region}",
2288 allow_all_outbound=False, # EFS doesn't need outbound
2289 )
2291 # Allow NFS traffic from EKS cluster security group
2292 self.efs_security_group.add_ingress_rule(
2293 peer=self.cluster.cluster_security_group,
2294 connection=ec2.Port.tcp(2049),
2295 description="Allow NFS from EKS cluster",
2296 )
2298 # Create EFS file system
2299 self.efs_file_system = efs.FileSystem(
2300 self,
2301 "GCOEfs",
2302 vpc=self.vpc,
2303 file_system_name=f"{project_name}-efs-{self.deployment_region}",
2304 security_group=self.efs_security_group,
2305 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
2306 encrypted=True,
2307 performance_mode=efs.PerformanceMode.GENERAL_PURPOSE,
2308 throughput_mode=efs.ThroughputMode.BURSTING,
2309 removal_policy=RemovalPolicy.DESTROY, # For dev/test; use RETAIN for production
2310 enable_automatic_backups=True,
2311 )
2313 # Add file system policy to allow mounting without IAM authorization
2314 # This allows any client that can reach the mount target to mount the file system
2315 self.efs_file_system.add_to_resource_policy(
2316 iam.PolicyStatement(
2317 effect=iam.Effect.ALLOW,
2318 principals=[iam.AnyPrincipal()],
2319 actions=[
2320 "elasticfilesystem:ClientMount",
2321 "elasticfilesystem:ClientWrite",
2322 "elasticfilesystem:ClientRootAccess",
2323 ],
2324 conditions={"Bool": {"elasticfilesystem:AccessedViaMountTarget": "true"}},
2325 )
2326 )
2328 # Create access point for the gco-jobs directory
2329 self.efs_access_point = self.efs_file_system.add_access_point(
2330 "JobsAccessPoint",
2331 path="/gco-jobs",
2332 create_acl=efs.Acl(owner_uid="1000", owner_gid="1000", permissions="755"),
2333 posix_user=efs.PosixUser(uid="1000", gid="1000"),
2334 )
2336 # Output EFS information
2337 CfnOutput(
2338 self,
2339 "EfsFileSystemId",
2340 value=self.efs_file_system.file_system_id,
2341 description="EFS File System ID for shared job storage",
2342 )
2344 CfnOutput(
2345 self,
2346 "EfsAccessPointId",
2347 value=self.efs_access_point.access_point_id,
2348 description="EFS Access Point ID for job outputs",
2349 )
2351 def _create_fsx_lustre(self) -> None:
2352 """Create FSx for Lustre file system for high-performance storage.
2354 FSx for Lustre provides high-performance parallel file system storage
2355 ideal for ML training workloads that require high throughput and low latency.
2357 This is optional and controlled by the fsx_lustre.enabled config setting.
2359 Supported deployment types:
2360 - SCRATCH_1: Temporary storage, no data replication
2361 - SCRATCH_2: Temporary storage with better burst performance
2362 - PERSISTENT_1: Persistent storage with data replication
2363 - PERSISTENT_2: Latest persistent storage with higher throughput
2364 """
2365 fsx_config = self.config.get_fsx_lustre_config(self.deployment_region)
2367 if not fsx_config.get("enabled", False):
2368 self.fsx_file_system = None
2369 return
2371 project_name = self.config.get_project_name()
2373 # Create security group for FSx
2374 self.fsx_security_group = ec2.SecurityGroup(
2375 self,
2376 "FsxSecurityGroup",
2377 vpc=self.vpc,
2378 description=f"Security group for {project_name} FSx Lustre in {self.deployment_region}",
2379 security_group_name=f"{project_name}-fsx-sg-{self.deployment_region}",
2380 allow_all_outbound=False,
2381 )
2383 # Allow Lustre traffic from EKS cluster security group
2384 # Lustre uses ports 988 (control) and 1021-1023 (data)
2385 self.fsx_security_group.add_ingress_rule(
2386 peer=self.cluster.cluster_security_group,
2387 connection=ec2.Port.tcp(988),
2388 description="Allow Lustre control traffic from EKS cluster",
2389 )
2390 self.fsx_security_group.add_ingress_rule(
2391 peer=self.cluster.cluster_security_group,
2392 connection=ec2.Port.tcp_range(1021, 1023),
2393 description="Allow Lustre data traffic from EKS cluster",
2394 )
2396 # Allow self-referencing traffic for FSx Lustre internal communication
2397 # FSx Lustre nodes need to communicate with each other on port 988
2398 self.fsx_security_group.add_ingress_rule(
2399 peer=self.fsx_security_group,
2400 connection=ec2.Port.tcp(988),
2401 description="Allow Lustre internal traffic on port 988",
2402 )
2403 self.fsx_security_group.add_ingress_rule(
2404 peer=self.fsx_security_group,
2405 connection=ec2.Port.tcp_range(1021, 1023),
2406 description="Allow Lustre internal traffic on ports 1021-1023",
2407 )
2409 # Get deployment type
2410 deployment_type = fsx_config.get("deployment_type", "SCRATCH_2")
2411 storage_capacity = fsx_config.get("storage_capacity_gib", 1200)
2413 # Build Lustre configuration based on deployment type
2414 lustre_config = {
2415 "deploymentType": deployment_type,
2416 "dataCompressionType": fsx_config.get("data_compression_type", "LZ4"),
2417 }
2419 # Add throughput for PERSISTENT types
2420 if deployment_type.startswith("PERSISTENT"):
2421 lustre_config["perUnitStorageThroughput"] = fsx_config.get(
2422 "per_unit_storage_throughput", 200
2423 )
2425 # Add S3 import/export if configured
2426 import_path = fsx_config.get("import_path")
2427 export_path = fsx_config.get("export_path")
2429 if import_path:
2430 lustre_config["importPath"] = import_path
2431 lustre_config["autoImportPolicy"] = fsx_config.get(
2432 "auto_import_policy", "NEW_CHANGED_DELETED"
2433 )
2435 if export_path:
2436 lustre_config["exportPath"] = export_path
2438 # Get file system type version (default to 2.15 for kernel 6.x compatibility)
2439 # IMPORTANT: Lustre 2.10 is NOT compatible with kernel 6.x (AL2023, Bottlerocket 1.19+)
2440 # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/lustre-client-matrix.html
2441 file_system_type_version = fsx_config.get("file_system_type_version", "2.15")
2443 # Create FSx for Lustre file system
2444 self.fsx_file_system = fsx.CfnFileSystem(
2445 self,
2446 "GCOFsxLustre",
2447 file_system_type="LUSTRE",
2448 file_system_type_version=file_system_type_version,
2449 storage_capacity=storage_capacity,
2450 subnet_ids=[self.vpc.private_subnets[0].subnet_id],
2451 security_group_ids=[self.fsx_security_group.security_group_id],
2452 lustre_configuration=lustre_config,
2453 tags=[
2454 {"key": "Name", "value": f"{project_name}-fsx-{self.deployment_region}"},
2455 {"key": "Project", "value": project_name},
2456 ],
2457 )
2459 # Ensure FSx file system waits for security group ingress rules to be created
2460 # This prevents "security group does not permit Lustre LNET traffic" errors
2461 self.fsx_file_system.node.add_dependency(self.fsx_security_group)
2463 # Create FSx CSI Driver add-on for Kubernetes integration
2464 self._create_fsx_csi_driver_addon()
2466 # Output FSx information
2467 CfnOutput(
2468 self,
2469 "FsxFileSystemId",
2470 value=self.fsx_file_system.ref,
2471 description="FSx for Lustre File System ID",
2472 )
2474 CfnOutput(
2475 self,
2476 "FsxDnsName",
2477 value=self.fsx_file_system.attr_dns_name,
2478 description="FSx for Lustre DNS Name",
2479 )
2481 CfnOutput(
2482 self,
2483 "FsxMountName",
2484 value=self.fsx_file_system.attr_lustre_mount_name,
2485 description="FSx for Lustre Mount Name",
2486 )
    def _create_valkey_cache(self) -> None:
        """Create an ElastiCache Serverless Valkey cache for K/V caching.

        Provides a low-latency key-value store that inference endpoints and
        jobs can use for prompt caching, session state, feature stores, or
        any shared state across pods. Valkey Serverless auto-scales and
        requires no node management.

        The cache is placed in the VPC private subnets. The ingress rule
        below admits the entire VPC CIDR on port 6379 — broader than the
        Aurora pgvector rule, which is scoped to the EKS cluster security
        group. NOTE(review): confirm VPC-wide reachability is intentional.

        No-op (creates nothing) unless ``valkey.enabled`` is true in
        cdk.json.
        """
        valkey_config = self.config.get_valkey_config()
        # Feature toggle — bail out early when Valkey is disabled.
        if not valkey_config.get("enabled", False):
            return

        # Imported lazily so deployments with Valkey disabled never load it.
        from aws_cdk import aws_elasticache as elasticache

        # Security group for Valkey (allow access from EKS cluster)
        valkey_sg = ec2.SecurityGroup(
            self,
            "ValkeySG",
            vpc=self.vpc,
            description="Security group for Valkey Serverless cache",
            allow_all_outbound=False,
        )
        # 6379 is the standard Redis/Valkey port. Peer is the whole VPC CIDR
        # (see docstring note), not just the cluster security group.
        valkey_sg.add_ingress_rule(
            ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
            ec2.Port.tcp(6379),
            "Allow Valkey access from VPC",
        )

        private_subnet_ids = [s.subnet_id for s in self.vpc.private_subnets]

        self.valkey_cache = elasticache.CfnServerlessCache(
            self,
            "ValkeyCache",
            engine="valkey",
            serverless_cache_name=f"gco-{self.deployment_region}",
            description=f"GCO K/V cache for {self.deployment_region}",
            major_engine_version="8",
            security_group_ids=[valkey_sg.security_group_id],
            subnet_ids=private_subnet_ids,
            # Usage ceilings cap serverless auto-scaling (and cost); both
            # maxima are overridable via cdk.json.
            cache_usage_limits=elasticache.CfnServerlessCache.CacheUsageLimitsProperty(
                data_storage=elasticache.CfnServerlessCache.DataStorageProperty(
                    maximum=valkey_config.get("max_data_storage_gb", 5),
                    minimum=1,
                    unit="GB",
                ),
                ecpu_per_second=elasticache.CfnServerlessCache.ECPUPerSecondProperty(
                    maximum=valkey_config.get("max_ecpu_per_second", 5000),
                    minimum=1000,
                ),
            ),
            snapshot_retention_limit=valkey_config.get("snapshot_retention_limit", 1),
            tags=[
                CfnTag(key="Project", value="gco"),
                CfnTag(key="Region", value=self.deployment_region),
            ],
        )

        CfnOutput(
            self,
            "ValkeyEndpoint",
            value=self.valkey_cache.attr_endpoint_address,
            description="Valkey Serverless cache endpoint",
        )
        CfnOutput(
            self,
            "ValkeyPort",
            value=self.valkey_cache.attr_endpoint_port,
            description="Valkey Serverless cache port",
        )

        # Store endpoint in SSM for discovery by pods
        ssm.StringParameter(
            self,
            "ValkeyEndpointParam",
            parameter_name=f"/{self.config.get_project_name()}/valkey-endpoint-{self.deployment_region}",
            string_value=self.valkey_cache.attr_endpoint_address,
            description=f"Valkey endpoint for {self.deployment_region}",
        )
    def _create_aurora_pgvector(self) -> None:
        """Create an Aurora Serverless v2 PostgreSQL cluster with pgvector.

        Provides a fully managed vector database that inference endpoints and
        jobs can use for RAG (retrieval-augmented generation), semantic search,
        embedding storage, and similarity queries. Aurora Serverless v2
        auto-scales capacity and requires no instance management.

        The cluster is placed in the VPC private subnets and accessible from
        any pod via the cluster security group. Credentials are stored in
        Secrets Manager and the endpoint is published to SSM + a K8s ConfigMap
        for automatic discovery.

        No-op (creates nothing) unless ``aurora_pgvector.enabled`` is true
        in cdk.json.

        See: https://aws.amazon.com/blogs/database/accelerate-generative-ai-workloads-on-amazon-aurora-with-optimized-reads-and-pgvector/
        """
        aurora_config = self.config.get_aurora_pgvector_config()
        # Feature toggle — bail out early when Aurora pgvector is disabled.
        if not aurora_config.get("enabled", False):
            return

        # Imported lazily so deployments with Aurora disabled never load it.
        from aws_cdk import aws_rds as rds

        project_name = self.config.get_project_name()

        # Security group for Aurora (allow PostgreSQL access from EKS cluster only)
        aurora_sg = ec2.SecurityGroup(
            self,
            "AuroraPgvectorSG",
            vpc=self.vpc,
            description="Security group for Aurora Serverless v2 pgvector",
            allow_all_outbound=False,
        )
        # Ingress is scoped to the EKS cluster SG (tighter than the VPC-wide
        # Valkey rule); 5432 is the standard PostgreSQL port.
        aurora_sg.add_ingress_rule(
            self.cluster.cluster_security_group,
            ec2.Port.tcp(5432),
            "Allow PostgreSQL access from EKS cluster",
        )

        # Subnet group for Aurora (private subnets only)
        subnet_group = rds.SubnetGroup(
            self,
            "AuroraPgvectorSubnetGroup",
            description=f"Subnet group for GCO Aurora pgvector in {self.deployment_region}",
            vpc=self.vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        )

        # Aurora Serverless v2 cluster with PostgreSQL 16 + pgvector
        self.aurora_cluster = rds.DatabaseCluster(
            self,
            "AuroraPgvectorCluster",
            # Engine version is resolved from the module-level constant so
            # every stack pins the same PostgreSQL release.
            engine=rds.DatabaseClusterEngine.aurora_postgres(
                version=getattr(rds.AuroraPostgresEngineVersion, AURORA_POSTGRES_VERSION),
            ),
            # min_acu 0 allows the cluster to scale to zero when idle.
            # NOTE(review): 0-ACU auto-pause requires a sufficiently recent
            # engine version — confirm AURORA_POSTGRES_VERSION supports it.
            serverless_v2_min_capacity=aurora_config.get("min_acu", 0),
            serverless_v2_max_capacity=aurora_config.get("max_acu", 16),
            writer=rds.ClusterInstance.serverless_v2(
                "Writer",
                auto_minor_version_upgrade=True,
            ),
            readers=[
                rds.ClusterInstance.serverless_v2(
                    "Reader",
                    auto_minor_version_upgrade=True,
                    # Size the reader with the writer so failover capacity
                    # matches.
                    scale_with_writer=True,
                ),
            ],
            vpc=self.vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
            subnet_group=subnet_group,
            security_groups=[aurora_sg],
            default_database_name="gco_vectors",
            backup=rds.BackupProps(
                retention=Duration.days(aurora_config.get("backup_retention_days", 7)),
            ),
            # Deletion protection is operator-configurable; the removal
            # policy is intentionally DESTROY for dev/test teardown (see the
            # nag suppressions below for the production guidance).
            deletion_protection=aurora_config.get("deletion_protection", False),
            removal_policy=RemovalPolicy.DESTROY,
            storage_encrypted=True,
            iam_authentication=True,
            cloudwatch_logs_exports=["postgresql"],
            monitoring_interval=Duration.seconds(60),
            cluster_identifier=f"{project_name}-pgvector-{self.deployment_region}",
        )

        # Construct-level cdk-nag suppressions for Aurora pgvector
        from cdk_nag import NagPackSuppression, NagSuppressions

        NagSuppressions.add_resource_suppressions(
            self.aurora_cluster,
            [
                NagPackSuppression(
                    id="AwsSolutions-RDS10",
                    reason=(
                        "Deletion protection is intentionally disabled for dev/test deployments. "
                        "Production deployments should set aurora_pgvector.deletion_protection=true "
                        "in cdk.json."
                    ),
                ),
                NagPackSuppression(
                    id="AwsSolutions-SMG4",
                    reason=(
                        "Aurora manages credential rotation via the RDS integration with Secrets "
                        "Manager. Manual Secrets Manager rotation is not required. "
                        "See: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/rds-secrets-manager.html"
                    ),
                ),
                NagPackSuppression(
                    id="HIPAA.Security-RDSInstanceDeletionProtectionEnabled",
                    reason=(
                        "Deletion protection is intentionally disabled for dev/test deployments. "
                        "Production deployments should set aurora_pgvector.deletion_protection=true "
                        "in cdk.json."
                    ),
                ),
                NagPackSuppression(
                    id="NIST.800.53.R5-RDSInstanceDeletionProtectionEnabled",
                    reason=(
                        "Deletion protection is intentionally disabled for dev/test deployments. "
                        "Production deployments should set aurora_pgvector.deletion_protection=true "
                        "in cdk.json."
                    ),
                ),
                NagPackSuppression(
                    id="PCI.DSS.321-SecretsManagerUsingKMSKey",
                    reason=(
                        "Aurora Serverless v2 credentials in Secrets Manager are encrypted with "
                        "AWS-managed keys by default. Customer-managed KMS can be enabled if "
                        "required for PCI compliance."
                    ),
                ),
            ],
            apply_to_children=True,
        )

        # Outputs
        CfnOutput(
            self,
            "AuroraPgvectorEndpoint",
            value=self.aurora_cluster.cluster_endpoint.hostname,
            description="Aurora pgvector cluster writer endpoint",
        )
        CfnOutput(
            self,
            "AuroraPgvectorReaderEndpoint",
            value=self.aurora_cluster.cluster_read_endpoint.hostname,
            description="Aurora pgvector cluster reader endpoint",
        )
        CfnOutput(
            self,
            "AuroraPgvectorPort",
            value=str(self.aurora_cluster.cluster_endpoint.port),
            description="Aurora pgvector cluster port",
        )
        # secret can be None when credentials are supplied externally, so
        # guard the attribute access.
        CfnOutput(
            self,
            "AuroraPgvectorSecretArn",
            value=self.aurora_cluster.secret.secret_arn if self.aurora_cluster.secret else "",
            description="Aurora pgvector credentials secret ARN",
        )

        # Store endpoint in SSM for discovery by pods and external tools
        ssm.StringParameter(
            self,
            "AuroraPgvectorEndpointParam",
            parameter_name=f"/{project_name}/aurora-pgvector-endpoint-{self.deployment_region}",
            string_value=self.aurora_cluster.cluster_endpoint.hostname,
            description=f"Aurora pgvector endpoint for {self.deployment_region}",
        )

        # Grant the ServiceAccountRole read access to the Aurora secret
        # so pods can retrieve credentials via the ConfigMap + Secrets Manager.
        if self.aurora_cluster.secret:
            self.aurora_cluster.secret.grant_read(self.service_account_role)
    def _create_fsx_csi_driver_addon(self) -> None:
        """Create FSx CSI Driver add-on for Kubernetes integration.

        The FSx CSI driver enables Kubernetes pods to mount FSx for Lustre
        file systems as persistent volumes.

        Sequence (order matters):
        1. Create an IRSA role for the fsx-csi-controller-sa service account.
        2. Install the aws-fsx-csi-driver EKS add-on.
        3. Patch the add-on with the IRSA role via an AwsCustomResource
           (updateAddon), since the role is attached after installation.
        4. Register a Pod Identity Association for the same service account.
        """
        # Create IAM role for FSx CSI Driver using IRSA + Pod Identity
        self.fsx_csi_role = GCORegionalStack._create_irsa_role(
            self,
            "FsxCsiDriverRole",
            oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
            oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
            service_account_names=["fsx-csi-controller-sa"],
            namespaces=["kube-system"],
        )

        # Add FSx CSI driver permissions.
        # NOTE(review): fsx:CreateVolume/DeleteVolume/TagResource also use
        # Resource:* here — confirm they cannot be scoped more tightly.
        self.fsx_csi_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "fsx:DescribeFileSystems",
                    "fsx:DescribeVolumes",
                    "fsx:CreateVolume",
                    "fsx:DeleteVolume",
                    "fsx:TagResource",
                ],
                resources=["*"],
            )
        )

        # EC2 discovery permissions the driver needs to resolve network
        # placement for mounts.
        self.fsx_csi_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "ec2:DescribeInstances",
                    "ec2:DescribeVolumes",
                    "ec2:DescribeVpcs",
                    "ec2:DescribeSubnets",
                    "ec2:DescribeSecurityGroups",
                ],
                resources=["*"],
            )
        )

        # cdk-nag suppression: the FSx CSI driver role grants
        # ec2:Describe* APIs that don't support resource-level scoping.
        from cdk_nag import NagSuppressions

        NagSuppressions.add_resource_suppressions(
            self.fsx_csi_role,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "The FSx CSI driver role grants ec2:Describe* for volume "
                        "and network discovery. These AWS APIs do not support "
                        "resource-level IAM scoping — Resource: * is the only "
                        "valid form."
                    ),
                    "appliesTo": ["Resource::*"],
                },
            ],
            apply_to_children=True,
        )

        # Create FSx CSI Driver add-on
        fsx_addon = eks.Addon(
            self,
            "FsxCsiDriverAddon",
            cluster=self.cluster,  # type: ignore[arg-type]
            addon_name="aws-fsx-csi-driver",
            addon_version=EKS_ADDON_FSX_CSI_DRIVER,
            preserve_on_delete=False,
            # Tolerations let the driver pods schedule onto the tainted
            # system/GPU node pools (shared class-level constant).
            configuration_values={
                "node": {
                    "tolerations": self._ADDON_NODE_TOLERATIONS,
                },
                "controller": {
                    "tolerations": self._ADDON_NODE_TOLERATIONS,
                },
            },
        )

        # Append the PassRole statement for the FSx CSI role to the shared
        # AwsCustomResource execution role. See
        # _create_aws_custom_resource_role for the full rationale.
        self.aws_custom_resource_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["iam:PassRole"],
                resources=[self.fsx_csi_role.role_arn],
            )
        )

        # Update the add-on to use the IRSA role (the Addon construct here
        # does not attach it at install time, so patch it afterwards).
        update_fsx_addon = cr.AwsCustomResource(
            self,
            "UpdateFsxCsiAddonRole",
            on_create=cr.AwsSdkCall(
                service="EKS",
                action="updateAddon",
                parameters={
                    "clusterName": self.cluster.cluster_name,
                    "addonName": "aws-fsx-csi-driver",
                    "serviceAccountRoleArn": self.fsx_csi_role.role_arn,
                },
                physical_resource_id=cr.PhysicalResourceId.of(
                    f"{self.cluster.cluster_name}-fsx-csi-role-update"
                ),
            ),
            on_update=cr.AwsSdkCall(
                service="EKS",
                action="updateAddon",
                parameters={
                    "clusterName": self.cluster.cluster_name,
                    "addonName": "aws-fsx-csi-driver",
                    "serviceAccountRoleArn": self.fsx_csi_role.role_arn,
                },
            ),
            role=self.aws_custom_resource_role,
        )

        # Explicit ordering: the updateAddon call must run after both the
        # add-on and the role (plus its execution role) exist.
        update_fsx_addon.node.add_dependency(fsx_addon)
        update_fsx_addon.node.add_dependency(self.fsx_csi_role)
        update_fsx_addon.node.add_dependency(self.aws_custom_resource_role)

        # Expose the update-addon resource so _apply_kubernetes_manifests can
        # make the kubectl Lambda wait for the IRSA annotation patch to land
        # before it rollout-restarts the fsx-csi-controller. See the EFS CSI
        # equivalent for the full rationale — same race, same fix, same
        # symptom (PVCs stuck Pending with "no EC2 IMDS role found").
        self._fsx_csi_addon_role_update = update_fsx_addon

        # Create Pod Identity Association for FSx CSI driver
        eks_l1.CfnPodIdentityAssociation(
            self,
            "PodIdentity-fsx-csi",
            cluster_name=self.cluster.cluster_name,
            namespace="kube-system",
            service_account="fsx-csi-controller-sa",
            role_arn=self.fsx_csi_role.role_arn,
        )
2887 def _create_drift_detection(self) -> None:
2888 """Create CloudFormation drift detection on a daily schedule.
2890 Creates:
2891 - SNS topic (KMS-encrypted) for drift alerts
2892 - Lambda function that initiates drift detection on this stack, polls
2893 until detection completes, and publishes to SNS if drift is found
2894 - EventBridge rule on a daily schedule (configurable via cdk.json
2895 ``drift_detection.schedule_hours``) that invokes the Lambda
2897 Operators can disable drift detection entirely by setting
2898 ``drift_detection.enabled`` to ``false`` in cdk.json. When disabled,
2899 no resources are created.
2900 """
2901 drift_config = self.node.try_get_context("drift_detection") or {}
2902 if not drift_config.get("enabled", True):
2903 return
2905 schedule_hours = int(drift_config.get("schedule_hours", 24))
2907 # KMS key for SNS topic encryption. SNS with AWS-managed keys doesn't
2908 # allow CloudFormation/Lambda to publish, so we use a customer-managed
2909 # key we can grant publish access on.
2910 drift_topic_key = kms.Key(
2911 self,
2912 "DriftDetectionTopicKey",
2913 description="KMS key for GCO drift detection SNS topic",
2914 enable_key_rotation=True,
2915 removal_policy=RemovalPolicy.DESTROY,
2916 )
2918 self.drift_detection_topic = sns.Topic(
2919 self,
2920 "DriftDetectionTopic",
2921 display_name="GCO CloudFormation Drift Alerts",
2922 master_key=drift_topic_key,
2923 )
2925 # IAM role for the drift detection Lambda
2926 drift_lambda_role = iam.Role(
2927 self,
2928 "DriftDetectionLambdaRole",
2929 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
2930 managed_policies=[
2931 iam.ManagedPolicy.from_aws_managed_policy_name(
2932 "service-role/AWSLambdaBasicExecutionRole"
2933 ),
2934 ],
2935 )
2937 # CloudFormation drift APIs operate at the stack level; the API does
2938 # not support resource-level ARN scoping for these actions, so we scope
2939 # to this stack's ARN where supported and accept "*" where not.
2940 drift_lambda_role.add_to_policy(
2941 iam.PolicyStatement(
2942 effect=iam.Effect.ALLOW,
2943 actions=[
2944 "cloudformation:DetectStackDrift",
2945 "cloudformation:DescribeStackDriftDetectionStatus",
2946 "cloudformation:DescribeStackResourceDrifts",
2947 "cloudformation:DescribeStackResource",
2948 "cloudformation:DescribeStackResources",
2949 ],
2950 resources=["*"],
2951 )
2952 )
2954 self.drift_detection_topic.grant_publish(drift_lambda_role)
2956 # Lambda function — one per stack; stack name is baked into env vars
2957 drift_lambda = lambda_.Function(
2958 self,
2959 "DriftDetectionFunction",
2960 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
2961 handler="handler.lambda_handler",
2962 code=lambda_.Code.from_asset("lambda/drift-detection"),
2963 timeout=Duration.minutes(14), # Leave headroom under Lambda 15-min cap
2964 memory_size=256,
2965 role=drift_lambda_role,
2966 environment={
2967 "STACK_NAME": self.stack_name,
2968 "SNS_TOPIC_ARN": self.drift_detection_topic.topic_arn,
2969 "REGION": self.deployment_region,
2970 },
2971 tracing=lambda_.Tracing.ACTIVE,
2972 )
2974 # Dead-letter queue for EventBridge → Lambda target failures.
2975 # Captures events that fail to reach the Lambda (e.g. due to
2976 # throttling or permission issues) so operators can retry or
2977 # investigate. Required by Serverless-EventBusDLQ cdk-nag rule.
2978 drift_rule_dlq = sqs.Queue(
2979 self,
2980 "DriftDetectionRuleDlq",
2981 retention_period=Duration.days(14),
2982 enforce_ssl=True,
2983 encryption=sqs.QueueEncryption.SQS_MANAGED,
2984 removal_policy=RemovalPolicy.DESTROY,
2985 )
2987 # DLQs themselves are terminal — they don't need their own DLQ.
2988 # Suppress the circular AwsSolutions-SQS3 nag finding.
2989 from cdk_nag import NagSuppressions as _DlqNagSuppressions
2991 _DlqNagSuppressions.add_resource_suppressions(
2992 drift_rule_dlq,
2993 [
2994 {
2995 "id": "AwsSolutions-SQS3",
2996 "reason": (
2997 "This queue IS the dead-letter queue for the "
2998 "DriftDetectionSchedule EventBridge rule. A DLQ for a "
2999 "DLQ is circular; if events fail to reach this queue "
3000 "they are captured by EventBridge's own retry metrics "
3001 "(CloudWatch FailedInvocations)."
3002 ),
3003 },
3004 ],
3005 )
3007 # EventBridge rule — daily schedule by default
3008 events.Rule(
3009 self,
3010 "DriftDetectionSchedule",
3011 description=(f"Daily CloudFormation drift detection for {self.stack_name}"),
3012 schedule=events.Schedule.rate(Duration.hours(schedule_hours)),
3013 targets=[
3014 events_targets.LambdaFunction(
3015 drift_lambda,
3016 dead_letter_queue=drift_rule_dlq,
3017 retry_attempts=2,
3018 )
3019 ],
3020 )
3022 # Outputs for operators to subscribe to the topic
3023 CfnOutput(
3024 self,
3025 "DriftDetectionTopicArn",
3026 value=self.drift_detection_topic.topic_arn,
3027 description=(
3028 f"SNS topic ARN for CloudFormation drift alerts in "
3029 f"{self.deployment_region}. Subscribe an endpoint (email, "
3030 f"Slack, PagerDuty) to receive drift notifications."
3031 ),
3032 )
3034 # cdk-nag suppressions for this component
3035 from cdk_nag import NagSuppressions
3037 NagSuppressions.add_resource_suppressions(
3038 drift_lambda_role,
3039 [
3040 {
3041 "id": "AwsSolutions-IAM4",
3042 "reason": (
3043 "AWSLambdaBasicExecutionRole provides standard "
3044 "CloudWatch Logs permissions required for Lambda "
3045 "logging. This is the AWS-recommended managed policy."
3046 ),
3047 },
3048 {
3049 "id": "AwsSolutions-IAM5",
3050 "reason": (
3051 "CloudFormation drift detection APIs (DetectStackDrift, "
3052 "DescribeStackDriftDetectionStatus, "
3053 "DescribeStackResourceDrifts) cannot be scoped to a "
3054 "specific stack resource via IAM; the action-level "
3055 "scoping requires wildcard resources. The Lambda's "
3056 "environment pins it to a single stack name, so the "
3057 "effective blast radius is limited."
3058 ),
3059 },
3060 ],
3061 apply_to_children=True,
3062 )
3064 def _create_mcp_role(self) -> None:
3065 """Create dedicated IAM role for the MCP server.
3067 The MCP server exposes GCO CLI tools to LLM agents. Without a dedicated
3068 role, the server would inherit the full ambient credentials of the user
3069 who launches it (often an administrator). This method creates a
3070 least-privilege role that the MCP server can assume at startup via
3071 ``GCO_MCP_ROLE_ARN``.
3073 Permissions are scoped to the minimum needed by the tools exposed:
3075 - ``eks:DescribeCluster`` on this regional EKS cluster ARN only.
3076 - ``s3:GetObject`` on model weights buckets. The model bucket lives in
3077 the global stack, so we scope to the same name pattern used by the
3078 service account role (``{project_name}-*``). This is a deliberate
3079 compromise: a precise cross-stack ARN export would force a tight
3080 dependency on the global stack, and cdk-nag will flag it anyway
3081 because the bucket name is auto-generated.
3082 - ``cloudwatch:GetMetricData`` / ``cloudwatch:ListMetrics``. These APIs
3083 do not support resource-level IAM, so wildcard is required. Read-only.
3084 - ``sqs:SendMessage`` scoped to this region's job queue ARN only.
3086 The trust policy uses ``AccountRootPrincipal`` so any IAM user/role in
3087 the account can assume it (gated by an explicit sts:AssumeRole
3088 permission on the caller — standard AWS behavior). Operators who want
3089 to restrict assumption further should add an external-id or principal
3090 condition to the trust policy after deployment.
3092 Operators can disable this component entirely by setting
3093 ``mcp_server.enabled`` to ``false`` in cdk.json.
3094 """
3095 mcp_config = self.node.try_get_context("mcp_server") or {}
3096 if not mcp_config.get("enabled", True):
3097 return
3099 project_name = self.config.get_project_name()
3101 self.mcp_server_role = iam.Role(
3102 self,
3103 "McpServerRole",
3104 assumed_by=iam.AccountRootPrincipal(),
3105 description=(
3106 "Least-privilege role assumed by the GCO MCP server at startup. "
3107 "Grants only the permissions needed by MCP tools: eks:DescribeCluster, "
3108 "s3:GetObject on model buckets, cloudwatch read-only metrics, and "
3109 "sqs:SendMessage to the regional job queue."
3110 ),
3111 max_session_duration=Duration.hours(12),
3112 )
3114 # eks:DescribeCluster on this region's cluster only
3115 self.mcp_server_role.add_to_policy(
3116 iam.PolicyStatement(
3117 effect=iam.Effect.ALLOW,
3118 actions=["eks:DescribeCluster"],
3119 resources=[self.cluster.cluster_arn],
3120 )
3121 )
3123 # s3:GetObject on model weights buckets. Bucket name is auto-generated
3124 # in the global stack, so we match the same prefix pattern used by the
3125 # service account role.
3126 self.mcp_server_role.add_to_policy(
3127 iam.PolicyStatement(
3128 effect=iam.Effect.ALLOW,
3129 actions=["s3:GetObject", "s3:ListBucket"],
3130 resources=[
3131 f"arn:aws:s3:::{project_name}-*",
3132 f"arn:aws:s3:::{project_name}-*/*",
3133 ],
3134 )
3135 )
3137 # CloudWatch read-only metrics APIs. These APIs do not support
3138 # resource-level IAM so wildcard is required.
3139 self.mcp_server_role.add_to_policy(
3140 iam.PolicyStatement(
3141 effect=iam.Effect.ALLOW,
3142 actions=[
3143 "cloudwatch:GetMetricData",
3144 "cloudwatch:GetMetricStatistics",
3145 "cloudwatch:ListMetrics",
3146 ],
3147 resources=["*"],
3148 )
3149 )
3151 # sqs:SendMessage scoped to the regional job queue only
3152 self.mcp_server_role.add_to_policy(
3153 iam.PolicyStatement(
3154 effect=iam.Effect.ALLOW,
3155 actions=["sqs:SendMessage", "sqs:GetQueueUrl", "sqs:GetQueueAttributes"],
3156 resources=[self.job_queue.queue_arn],
3157 )
3158 )
3160 # Export the role ARN so operators can set GCO_MCP_ROLE_ARN in their
3161 # MCP server environment.
3162 CfnOutput(
3163 self,
3164 "McpServerRoleArn",
3165 value=self.mcp_server_role.role_arn,
3166 description=(
3167 "IAM role ARN for the GCO MCP server. Set GCO_MCP_ROLE_ARN to "
3168 "this value when launching the MCP server so it assumes a "
3169 "least-privilege role instead of ambient credentials."
3170 ),
3171 export_name=f"{project_name}-mcp-server-role-arn-{self.deployment_region}",
3172 )
3174 # cdk-nag suppressions: CloudWatch metrics APIs cannot be scoped.
3175 from cdk_nag import NagSuppressions
3177 NagSuppressions.add_resource_suppressions(
3178 self.mcp_server_role,
3179 [
3180 {
3181 "id": "AwsSolutions-IAM5",
3182 "reason": (
3183 "The CloudWatch metrics APIs (GetMetricData, "
3184 "GetMetricStatistics, ListMetrics) do not support "
3185 "resource-level IAM; wildcard resource is required. "
3186 "The S3 permissions use the {project_name}-* prefix "
3187 "pattern because the model weights bucket name is "
3188 "auto-generated by CDK in the global stack and a "
3189 "cross-stack ARN export would create tight stack "
3190 "coupling. All actions are read-only or scoped "
3191 "send-only (SQS)."
3192 ),
3193 },
3194 ],
3195 apply_to_children=True,
3196 )
3198 def _create_outputs(self) -> None:
3199 """Create CloudFormation outputs for cluster information"""
3200 project_name = self.config.get_project_name()
3202 # Export cluster information
3203 CfnOutput(
3204 self,
3205 "ClusterName",
3206 value=self.cluster.cluster_name,
3207 description=f"EKS cluster name for {self.deployment_region}",
3208 export_name=f"{project_name}-cluster-name-{self.deployment_region}",
3209 )
3211 CfnOutput(
3212 self,
3213 "ClusterArn",
3214 value=self.cluster.cluster_arn,
3215 description=f"EKS cluster ARN for {self.deployment_region}",
3216 export_name=f"{project_name}-cluster-arn-{self.deployment_region}",
3217 )
3219 CfnOutput(
3220 self,
3221 "ClusterEndpoint",
3222 value=self.cluster.cluster_endpoint,
3223 description=f"EKS cluster endpoint for {self.deployment_region}",
3224 export_name=f"{project_name}-cluster-endpoint-{self.deployment_region}",
3225 )
3227 CfnOutput(
3228 self,
3229 "ClusterSecurityGroupId",
3230 value=self.cluster.cluster_security_group_id,
3231 description=f"EKS cluster security group ID for {self.deployment_region}",
3232 export_name=f"{project_name}-cluster-sg-{self.deployment_region}",
3233 )
3235 CfnOutput(
3236 self,
3237 "VpcId",
3238 value=self.vpc.vpc_id,
3239 description=f"VPC ID for {self.deployment_region}",
3240 export_name=f"{project_name}-vpc-id-{self.deployment_region}",
3241 )
3243 # Export public subnet IDs for ALB
3244 public_subnet_ids = [subnet.subnet_id for subnet in self.vpc.public_subnets]
3245 CfnOutput(
3246 self,
3247 "PublicSubnetIds",
3248 value=Fn.join(",", public_subnet_ids),
3249 description=f"Public subnet IDs for ALB in {self.deployment_region}",
3250 export_name=f"{project_name}-public-subnets-{self.deployment_region}",
3251 )
3253 # Note: ALB is created by AWS Load Balancer Controller via Ingress
3254 # The ALB ARN is registered with Global Accelerator by the GA registration Lambda
    def get_cluster(self) -> eks.Cluster:
        """Return the EKS cluster construct created by this stack."""
        return self.cluster
    def get_vpc(self) -> ec2.Vpc:
        """Return the regional VPC construct created by this stack."""
        return self.vpc