Coverage for gco / stacks / regional_stack.py: 93%

445 statements  

coverage.py v7.13.5, created at 2026-04-30 21:47 +0000

1""" 

2Regional stack for GCO (Global Capacity Orchestrator on AWS) - EKS cluster and ALB per region. 

3 

4This is the largest stack in the project (~3200 lines) and creates all regional 

5resources for a single AWS region. One instance is deployed per region defined 

6in cdk.json. 

7 

8Resources Created: 

9 VPC & Networking: 

10 - VPC with 3 AZs, public subnets (ALB), private subnets (EKS nodes) 

11 - 2 NAT Gateways for high availability 

12 - VPC endpoints for ECR, S3, STS, Secrets Manager, SSM, CloudWatch 

13 - VPC Flow Logs (CloudWatch Logs, 30-day retention) 

14 

15 EKS Cluster (Auto Mode): 

16 - Managed control plane with full logging (API, Audit, Authenticator, Controller Manager, Scheduler) 

17 - NodePools: system, general-purpose, gpu-x86, gpu-arm, inference, gpu-efa, neuron, cpu-general 

18 - IRSA roles for service accounts (Secrets Manager, SQS, DynamoDB, CloudWatch, S3, EFS) 

19 

20 Load Balancing: 

21 - ALB (created by Ingress via AWS Load Balancer Controller) 

22 - Internal NLB for regional API Gateway VPC Link 

23 - Global Accelerator endpoint registration (via ga-registration Lambda) 

24 

25 Storage: 

26 - EFS with dynamic provisioning (CSI driver, access points, encryption at rest + in transit) 

27 - FSx for Lustre (optional, toggled via cdk.json) 

28 - Valkey Serverless cache (optional) 

29 - Aurora Serverless v2 with pgvector (optional) 

30 

31 Lambda Functions: 

32 - kubectl-applier: applies K8s manifests during deployment 

33 - helm-installer: installs Helm charts (KEDA, Volcano, KubeRay, GPU Operator, etc.) 

34 - ga-registration: registers ALB with Global Accelerator 

35 - regional-api-proxy: proxies regional API Gateway to internal ALB 

36 

37 Container Images: 

38 - ECR repositories + Docker image builds for health-monitor, manifest-processor, 

39 inference-monitor, queue-processor 

40 

41 SQS: 

42 - Regional job queue + dead letter queue (for gco jobs submit-sqs) 

43 

44Key Design Decisions: 

45 - EKS Auto Mode handles node provisioning — no managed node groups or Karpenter provisioners 

46 - NodePools use WhenEmpty consolidation for inference to avoid disrupting long-running pods 

47 - IRSA (IAM Roles for Service Accounts) for least-privilege pod-level AWS access 

48 - All optional features (FSx, Valkey, Aurora) are toggled via cdk.json context variables 

49 - Template variables in K8s manifests ({{PLACEHOLDER}}) are replaced at deploy time 

50 

51Dependencies: 

52 - GCOGlobalStack (for Global Accelerator endpoint group ARN, DynamoDB table names, S3 bucket) 

53 - GCOApiGatewayGlobalStack (for auth secret ARN) 

54 

55Modification Guide: 

56 - To add a new NodePool: add a YAML manifest in lambda/kubectl-applier-simple/manifests/ (40-49 range) 

57 - To add a new service: add ECR image build here, Dockerfile in dockerfiles/, manifest in manifests/ 

58 - To add a new optional feature: add a cdk.json context toggle, guard with if/else in this file 

59 - To change EKS version: update KUBERNETES_VERSION in constants.py 

60""" 

61 
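# ------------------------------------------------------------------------------
# Illustrative sketch only, not part of this module: how the deploy-time
# {{PLACEHOLDER}} substitution mentioned under "Key Design Decisions" could work.
# The helper name, placeholder keys, and the skip-on-unreplaced convention below
# are assumptions; the real logic lives in the kubectl-applier Lambda.
#
# def render_manifest(template_text: str, values: dict[str, str]) -> str:
#     rendered = template_text
#     for key, value in values.items():
#         rendered = rendered.replace("{{" + key + "}}", value)
#     return rendered
#
# rendered = render_manifest(
#     "image: {{QUEUE_PROCESSOR_IMAGE}}\nqueueUrl: {{JOB_QUEUE_URL}}\n",
#     {"QUEUE_PROCESSOR_IMAGE": "123456789012.dkr.ecr.us-east-1.amazonaws.com/gco:latest"},
# )
# # "{{JOB_QUEUE_URL}}" is left unreplaced here, which is the signal the applier
# # uses to skip a manifest whose feature is disabled.
# ------------------------------------------------------------------------------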

62from __future__ import annotations 

63 

64import time 

65from typing import Any 

66 

67import aws_cdk.aws_eks_v2 as eks 

68from aws_cdk import ( 

69 CfnJson, 

70 CfnOutput, 

71 CfnTag, 

72 CustomResource, 

73 Duration, 

74 Fn, 

75 RemovalPolicy, 

76 Stack, 

77) 

78from aws_cdk import aws_ec2 as ec2 

79from aws_cdk import aws_ecr as ecr 

80from aws_cdk import aws_ecr_assets as ecr_assets 

81from aws_cdk import aws_efs as efs 

82from aws_cdk import aws_eks as eks_l1 # L1 constructs (CfnPodIdentityAssociation) 

83from aws_cdk import aws_events as events 

84from aws_cdk import aws_events_targets as events_targets 

85from aws_cdk import aws_fsx as fsx 

86from aws_cdk import aws_iam as iam 

87from aws_cdk import aws_kms as kms 

88from aws_cdk import aws_lambda as lambda_ 

89from aws_cdk import aws_logs as logs 

90from aws_cdk import aws_sns as sns 

91from aws_cdk import aws_sqs as sqs 

92from aws_cdk import aws_ssm as ssm 

93from aws_cdk import custom_resources as cr 

94from constructs import Construct 

95 

96from gco.config.config_loader import ConfigLoader 

97from gco.stacks.constants import ( 

98 AURORA_POSTGRES_VERSION, 

99 EKS_ADDON_CLOUDWATCH_OBSERVABILITY, 

100 EKS_ADDON_EFS_CSI_DRIVER, 

101 EKS_ADDON_FSX_CSI_DRIVER, 

102 EKS_ADDON_METRICS_SERVER, 

103 EKS_ADDON_POD_IDENTITY_AGENT, 

104 LAMBDA_PYTHON_RUNTIME, 

105) 

106 

107 

108class GCORegionalStack(Stack): 

109 """ 

110 Regional resources stack for a single AWS region. 

111 

112 Creates EKS cluster, load balancers, and supporting infrastructure 

113 for running GCO services in a specific region. 

114 

115 Attributes: 

116 vpc: VPC with public/private subnets 

117 cluster: EKS Auto Mode cluster 

118 """ 

119 

120 @staticmethod 

121 def _create_irsa_role( 

122 scope: GCORegionalStack, 

123 id: str, 

124 oidc_provider_arn: str, 

125 oidc_issuer_url: str, 

126 service_account_names: list[str], 

127 namespaces: list[str], 

128 ) -> iam.Role: 

129 """Create an IAM role trusted by both IRSA (OIDC) and EKS Pod Identity. 

130 

131 IRSA is the primary credential mechanism — it works reliably on EKS Auto 

132 Mode by projecting a service-account token that the AWS SDK exchanges for 

133 temporary credentials via the OIDC provider. 

134 

135 Pod Identity trust is added as a secondary path so the role is ready if/when 

136 Pod Identity injection starts working on Auto Mode nodes. 

137 

138 Uses CfnJson to defer OIDC condition key resolution to deploy time, 

139 because the issuer URL is a CloudFormation token that can't be used 

140 as a Python dict key at synth time. 

141 """ 

142 # Strip https:// from issuer URL for the OIDC condition 

143 issuer = Fn.select(1, Fn.split("//", oidc_issuer_url)) 

144 

145 # Build OIDC conditions using CfnJson to defer token resolution 

146 # The issuer URL is a CFN token — can't be used as a dict key at synth time 

147 aud_key = Fn.join("", [issuer, ":aud"]) 

148 sub_key = Fn.join("", [issuer, ":sub"]) 

149 

150 conditions_json = CfnJson( 

151 scope, 

152 f"{id}OidcConditions", 

153 value={ 

154 aud_key: "sts.amazonaws.com", 

155 sub_key: [ 

156 f"system:serviceaccount:{ns}:{sa}" 

157 for ns in namespaces 

158 for sa in service_account_names 

159 ], 

160 }, 

161 ) 

162 

163 role = iam.Role( 

164 scope, 

165 id, 

166 assumed_by=iam.FederatedPrincipal( 

167 federated=oidc_provider_arn, 

168 conditions={ 

169 "StringEquals": conditions_json, 

170 }, 

171 assume_role_action="sts:AssumeRoleWithWebIdentity", 

172 ), 

173 ) 

174 

175 # Also allow Pod Identity (secondary path for future use) 

176 assert role.assume_role_policy is not None # guaranteed by assumed_by parameter above 

177 role.assume_role_policy.add_statements( 

178 iam.PolicyStatement( 

179 effect=iam.Effect.ALLOW, 

180 principals=[iam.ServicePrincipal("pods.eks.amazonaws.com")], 

181 actions=["sts:AssumeRole", "sts:TagSession"], 

182 ) 

183 ) 

184 return role 

185 
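# Illustrative sketch of what the CfnJson-deferred conditions above resolve to at
# deploy time (the region, issuer id, and service account below are placeholders):
#
#   "Condition": {
#     "StringEquals": {
#       "oidc.eks.us-east-1.amazonaws.com/id/EXAMPLE0123456789:aud": "sts.amazonaws.com",
#       "oidc.eks.us-east-1.amazonaws.com/id/EXAMPLE0123456789:sub": [
#         "system:serviceaccount:gco-system:gco-service-account"
#       ]
#     }
#   }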

186 def __init__( 

187 self, 

188 scope: Construct, 

189 construct_id: str, 

190 config: ConfigLoader, 

191 region: str, 

192 auth_secret_arn: str, 

193 **kwargs: Any, 

194 ) -> None: 

195 super().__init__(scope, construct_id, **kwargs) 

196 

197 self.config = config 

198 self.deployment_region = region 

199 self.auth_secret_arn = auth_secret_arn 

200 self.alb_arn: str | None = None 

201 

202 # Get cluster configuration for this region 

203 cluster_config = self.config.get_cluster_config(region) 

204 self.cluster_config = cluster_config 

205 

206 # Create VPC for the EKS cluster 

207 self.vpc = ec2.Vpc( 

208 self, 

209 "GCOVpc", 

210 # vpc_name intentionally omitted - let CDK generate unique name 

211 max_azs=3, 

212 nat_gateways=2, # For high availability 

213 subnet_configuration=[ 

214 ec2.SubnetConfiguration( 

215 name="PublicSubnet", subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24 

216 ), 

217 ec2.SubnetConfiguration( 

218 name="PrivateSubnet", 

219 subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, 

220 cidr_mask=24, 

221 ), 

222 ], 

223 ) 

224 

225 # Enable VPC Flow Logs for network traffic analysis and security monitoring 

226 self._create_vpc_flow_logs() 

227 

228 # Create SQS queue for job ingestion 

229 self._create_sqs_queue() 

230 

231 # Create ECR repositories and build Docker images 

232 self._create_container_images() 

233 

234 # Pre-create the execution role shared by every ``cr.AwsCustomResource`` 

235 # in this stack. See ``_create_aws_custom_resource_role`` for the full 

236 # rationale — in short, CDK's default behavior of auto-generating a 

237 # Lambda role per ``AwsCustomResource`` (and then merging all the 

238 # ``policy=`` statements onto it during deploy) triggers an IAM 

239 # propagation race on cold creates. We sidestep the race by creating 

240 # a single long-lived role up front and attaching policies to it as 

241 # each consumer is built; every ``AwsCustomResource`` then passes 

242 # ``role=self.aws_custom_resource_role`` instead of ``policy=``, so 

243 # the singleton Lambda runs against a role whose inline policy has 

244 # already replicated globally. 

245 self._create_aws_custom_resource_role() 

246 

247 # Create EKS cluster 

248 self._create_eks_cluster(cluster_config) 

249 

250 # Create EFS for shared storage 

251 self._create_efs() 

252 

253 # Create FSx for Lustre (if enabled) for high-performance storage 

254 self._create_fsx_lustre() 

255 

256 # Create Valkey Serverless cache (if enabled) for K/V caching 

257 self._create_valkey_cache() 

258 

259 # Create Aurora Serverless v2 + pgvector (if enabled) for vector DB 

260 self._create_aurora_pgvector() 

261 

262 # Create GA registration Lambda for registering Ingress-created ALB 

263 self._create_ga_registration_lambda() 

264 

265 # Create Helm installer Lambda for KEDA and other Helm-based installations 

266 self._create_helm_installer_lambda() 

267 

268 # Apply Kubernetes manifests (after EFS so IDs are available) 

269 self._apply_kubernetes_manifests() 

270 

271 # Create CloudFormation drift detection (daily schedule + SNS alerts) 

272 self._create_drift_detection() 

273 

274 # Create dedicated IAM role for MCP server 

275 self._create_mcp_role() 

276 

277 # Export cluster information 

278 self._create_outputs() 

279 

280 # Apply cdk-nag suppressions for this stack 

281 self._apply_nag_suppressions() 

282 

283 def _create_vpc_flow_logs(self) -> None: 

284 """Create VPC Flow Logs for network traffic monitoring. 

285 

286 Flow logs capture information about IP traffic going to and from 

287 network interfaces in the VPC. This is required for security 

288 monitoring and compliance (HIPAA, SOC2, etc.). 

289 """ 

290 # Create CloudWatch Log Group for flow logs 

291 flow_log_group = logs.LogGroup( 

292 self, 

293 "VpcFlowLogGroup", 

294 # log_group_name intentionally omitted - let CDK generate unique name 

295 retention=logs.RetentionDays.ONE_MONTH, 

296 removal_policy=RemovalPolicy.DESTROY, 

297 ) 

298 

299 # Create IAM role for VPC Flow Logs 

300 flow_log_role = iam.Role( 

301 self, 

302 "VpcFlowLogRole", 

303 assumed_by=iam.ServicePrincipal("vpc-flow-logs.amazonaws.com"), 

304 ) 

305 

306 flow_log_role.add_to_policy( 

307 iam.PolicyStatement( 

308 actions=[ 

309 "logs:CreateLogStream", 

310 "logs:PutLogEvents", 

311 "logs:DescribeLogGroups", 

312 "logs:DescribeLogStreams", 

313 ], 

314 resources=[flow_log_group.log_group_arn, f"{flow_log_group.log_group_arn}:*"], 

315 ) 

316 ) 

317 

318 # Create VPC Flow Log 

319 ec2.FlowLog( 

320 self, 

321 "VpcFlowLog", 

322 resource_type=ec2.FlowLogResourceType.from_vpc(self.vpc), 

323 destination=ec2.FlowLogDestination.to_cloud_watch_logs(flow_log_group, flow_log_role), 

324 traffic_type=ec2.FlowLogTrafficType.ALL, 

325 ) 

326 

327 def _apply_nag_suppressions(self) -> None: 

328 """Apply cdk-nag suppressions for this stack.""" 

329 from gco.stacks.nag_suppressions import apply_all_suppressions 

330 

331 apply_all_suppressions( 

332 self, 

333 stack_type="regional", 

334 regions=self.config.get_regions(), 

335 global_region=self.config.get_global_region(), 

336 ) 

337 

338 def _create_sqs_queue(self) -> None: 

339 """Create SQS queue for job ingestion. 

340 

341 Creates an SQS queue that serves as the default job ingestion point 

342 for this region. Jobs submitted to this queue are processed by the 

343 manifest processor, and KEDA scales consumers based on queue depth.

344 

345 Also creates a dead-letter queue for failed messages. 

346 Both queues use server-side encryption with AWS managed keys. 

347 """ 

348 project_name = self.config.get_project_name() 

349 

350 # Create dead-letter queue for failed messages 

351 self.job_dlq = sqs.Queue( 

352 self, 

353 "JobDeadLetterQueue", 

354 queue_name=f"{project_name}-jobs-dlq-{self.deployment_region}", 

355 retention_period=Duration.days(14), 

356 removal_policy=RemovalPolicy.DESTROY, 

357 enforce_ssl=True, # Require SSL for all requests 

358 encryption=sqs.QueueEncryption.SQS_MANAGED, # Server-side encryption 

359 ) 

360 

361 # Create main job queue 

362 self.job_queue = sqs.Queue( 

363 self, 

364 "JobQueue", 

365 queue_name=f"{project_name}-jobs-{self.deployment_region}", 

366 visibility_timeout=Duration.minutes(5), # Match Lambda timeout 

367 retention_period=Duration.days(7), 

368 dead_letter_queue=sqs.DeadLetterQueue( 

369 max_receive_count=3, # Move to DLQ after 3 failed attempts 

370 queue=self.job_dlq, 

371 ), 

372 removal_policy=RemovalPolicy.DESTROY, 

373 enforce_ssl=True, # Require SSL for all requests 

374 encryption=sqs.QueueEncryption.SQS_MANAGED, # Server-side encryption 

375 ) 

376 

377 # Output queue information 

378 CfnOutput( 

379 self, 

380 "JobQueueUrl", 

381 value=self.job_queue.queue_url, 

382 description=f"SQS Job Queue URL for {self.deployment_region}", 

383 export_name=f"{project_name}-job-queue-url-{self.deployment_region}", 

384 ) 

385 

386 CfnOutput( 

387 self, 

388 "JobQueueArn", 

389 value=self.job_queue.queue_arn, 

390 description=f"SQS Job Queue ARN for {self.deployment_region}", 

391 export_name=f"{project_name}-job-queue-arn-{self.deployment_region}", 

392 ) 

393 

394 CfnOutput( 

395 self, 

396 "JobDlqUrl", 

397 value=self.job_dlq.queue_url, 

398 description=f"SQS Dead Letter Queue URL for {self.deployment_region}", 

399 export_name=f"{project_name}-job-dlq-url-{self.deployment_region}", 

400 ) 

401 
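# Illustrative sketch (assumption; the real trigger ships in the post-helm SQS
# consumer manifest): KEDA scales on this queue's depth with an aws-sqs-queue
# trigger along these lines:
#
#   triggers:
#     - type: aws-sqs-queue
#       metadata:
#         queueURL: https://sqs.us-east-1.amazonaws.com/123456789012/gco-jobs-us-east-1
#         queueLength: "5"
#         awsRegion: us-east-1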

402 def _create_aws_custom_resource_role(self) -> None: 

403 """Pre-create the execution role shared by every ``AwsCustomResource``. 

404 

405 CDK's ``cr.AwsCustomResource`` defaults to auto-generating a per- 

406 construct Lambda execution role from the ``policy=`` parameter. 

407 Internally, CDK deduplicates those auto-generated roles onto a 

408 single *singleton* provider Lambda (logical id prefix 

409 ``AWS679f53fac002430cb0da5b7982bd22872``), and merges each custom 

410 resource's policy statements onto that Lambda's role at stack 

411 create time. On cold deploys, CloudFormation invokes the Lambda 

412 within 2-3 seconds of attaching a new policy statement, which is 

413 faster than IAM's global propagation window. The symptom is a 

414 ``iam:PassRole NOT authorized`` failure on whichever addon role 

415 update happens to run right after its ``iam:PassRole`` policy 

416 statement was attached but before it had replicated. 

417 

418 The fix is to create the role up front, attach every policy 

419 statement the stack will need during stack creation, and pass 

420 ``role=self.aws_custom_resource_role`` to every 

421 ``AwsCustomResource`` instead of ``policy=``. Because the role 

422 already exists — and its inline policy has had minutes to 

423 replicate by the time any ``AwsCustomResource`` actually fires — 

424 the race disappears entirely. 

425 

426 This method creates the role with the statements we can compute 

427 without a cluster reference (EKS ``UpdateAddon`` / ``DescribeAddon`` 

428 scoped to this cluster, and SSM ``GetParameter`` for the endpoint 

429 group ARN). ``iam:PassRole`` statements for individual addon 

430 roles (EFS CSI, FSx CSI, CloudWatch Observability) are appended 

431 by each ``_create_*_addon`` method after the corresponding IRSA 

432 role has been created, so every PassRole ``resources=`` list 

433 stays precise (no wildcards) and cdk-nag stays happy. 

434 """ 

435 project_name = self.config.get_project_name() 

436 global_region = self.config.get_global_region() 

437 

438 self.aws_custom_resource_role = iam.Role( 

439 self, 

440 "AwsCustomResourceRole", 

441 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), 

442 description=( 

443 "Shared execution role for every cr.AwsCustomResource in this " 

444 "stack. Pre-created to avoid the IAM policy propagation race " 

445 "that occurs when CDK auto-generates per-CR roles and the " 

446 "singleton provider Lambda fires before the freshly-attached " 

447 "policy has replicated globally." 

448 ), 

449 managed_policies=[ 

450 iam.ManagedPolicy.from_aws_managed_policy_name( 

451 "service-role/AWSLambdaBasicExecutionRole" 

452 ), 

453 ], 

454 ) 

455 

456 # EKS UpdateAddon / DescribeAddon — used by the three updateAddon 

457 # custom resources (EFS CSI, FSx CSI, CloudWatch Observability). 

458 # Scoped to this cluster's addons by ARN. 

459 self.aws_custom_resource_role.add_to_policy( 

460 iam.PolicyStatement( 

461 effect=iam.Effect.ALLOW, 

462 actions=["eks:UpdateAddon", "eks:DescribeAddon"], 

463 resources=[ 

464 f"arn:aws:eks:{self.deployment_region}:{self.account}" 

465 f":addon/{self.cluster_config.cluster_name}/*" 

466 ], 

467 ) 

468 ) 

469 

470 # SSM GetParameter — used by the GetEndpointGroupArn custom 

471 # resource in _create_ga_registration_lambda to read the ARN of 

472 # the Global Accelerator endpoint group published by the global 

473 # stack during its deploy. 

474 self.aws_custom_resource_role.add_to_policy( 

475 iam.PolicyStatement( 

476 effect=iam.Effect.ALLOW, 

477 actions=["ssm:GetParameter"], 

478 resources=[ 

479 f"arn:aws:ssm:{global_region}:{self.account}" f":parameter/{project_name}/*" 

480 ], 

481 ) 

482 ) 

483 

484 # cdk-nag suppressions: the two wildcard-bearing ARNs above are 

485 # intentional and both scoped as tightly as AWS IAM permits. 

486 # 

487 # - The ``eks:UpdateAddon`` / ``eks:DescribeAddon`` statement uses 

488 # ``addon/<cluster>/*`` as its resource because the same shared 

489 # role is consumed by three different updateAddon custom 

490 # resources (EFS CSI, FSx CSI, CloudWatch Observability). Each 

491 # addon has its own ARN and we'd otherwise need three separate 

492 # statements that each grant access to a known addon name. The 

493 # wildcard is scoped to a single cluster in a single region in 

494 # a single account — it cannot be used against any addon 

495 # belonging to a different cluster or a different service. 

496 # 

497 # - The ``ssm:GetParameter`` statement uses 

498 # ``parameter/<project>/*`` because the exact parameter name 

499 # (``endpoint-group-<region>-arn``) is only known at Global 

500 Accelerator registration time and the SSM parameter path

501 structure is ``<project>/<parameter>``. Scoping to the

502 # project prefix restricts access to parameters owned by this 

503 # project only. 

504 from cdk_nag import NagSuppressions 

505 

506 NagSuppressions.add_resource_suppressions( 

507 self.aws_custom_resource_role, 

508 [ 

509 { 

510 "id": "AwsSolutions-IAM5", 

511 "reason": ( 

512 "Scoped to a single EKS cluster's addons " 

513 "(addon/<cluster>/*) and this project's SSM " 

514 "parameters (parameter/<project>/*). Both wildcards " 

515 "are as tight as AWS IAM permits: addon names and " 

516 "parameter names are not known at stack synthesis " 

517 "time because the addons are created later in the " 

518 "same stack and the GA endpoint group ARN is " 

519 "published by a separate stack during deploy. The " 

520 "shared role pattern itself is deliberate — see " 

521 "_create_aws_custom_resource_role docstring for why " 

522 "we pre-create instead of letting CDK auto-generate " 

523 "per-CR roles." 

524 ), 

525 "appliesTo": [ 

526 f"Resource::arn:aws:eks:{self.deployment_region}" 

527 f":<AWS::AccountId>:addon/{self.cluster_config.cluster_name}/*", 

528 f"Resource::arn:aws:ssm:{global_region}" 

529 f":<AWS::AccountId>:parameter/{project_name}/*", 

530 ], 

531 }, 

532 ], 

533 apply_to_children=True, 

534 ) 

535 
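# Illustrative before/after sketch (simplified parameters, assumption): the pattern
# this shared role enables. Letting CDK synthesize a role from ``policy=`` is what
# triggers the propagation race on cold deploys:
#
#   cr.AwsCustomResource(self, "UpdateAddonExample",
#       on_create=some_sdk_call,
#       policy=cr.AwsCustomResourcePolicy.from_statements([some_statement]))
#
# whereas every consumer in this stack passes the pre-created role instead:
#
#   cr.AwsCustomResource(self, "UpdateAddonExample",
#       on_create=some_sdk_call,
#       role=self.aws_custom_resource_role)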

536 def _create_container_images(self) -> None: 

537 """Create ECR repositories and build Docker images for services""" 

538 

539 # Create ECR repository for health monitor 

540 self.health_monitor_repo = ecr.Repository( 

541 self, 

542 "HealthMonitorRepo", 

543 # repository_name intentionally omitted - let CDK generate unique name 

544 removal_policy=RemovalPolicy.DESTROY, # For dev/test; use RETAIN for production 

545 empty_on_delete=True, # Clean up images on stack deletion 

546 image_scan_on_push=True, # Enable vulnerability scanning on push 

547 ) 

548 

549 # All Docker images target AMD64 (x86_64) to match EKS Auto Mode's 

550 # default system nodepool. 

551 

552 # Build and push health monitor Docker image 

553 self.health_monitor_image = ecr_assets.DockerImageAsset( 

554 self, 

555 "HealthMonitorImage", 

556 directory=".", # Root directory 

557 file="dockerfiles/health-monitor-dockerfile", 

558 platform=ecr_assets.Platform.LINUX_AMD64, 

559 ) 

560 

561 # Create ECR repository for manifest processor 

562 self.manifest_processor_repo = ecr.Repository( 

563 self, 

564 "ManifestProcessorRepo", 

565 # repository_name intentionally omitted - let CDK generate unique name 

566 removal_policy=RemovalPolicy.DESTROY, 

567 empty_on_delete=True, 

568 image_scan_on_push=True, # Enable vulnerability scanning on push 

569 ) 

570 

571 # Build and push manifest processor Docker image 

572 self.manifest_processor_image = ecr_assets.DockerImageAsset( 

573 self, 

574 "ManifestProcessorImage", 

575 directory=".", 

576 file="dockerfiles/manifest-processor-dockerfile", 

577 platform=ecr_assets.Platform.LINUX_AMD64, 

578 ) 

579 

580 # Output image URIs for reference 

581 CfnOutput( 

582 self, 

583 "HealthMonitorImageUri", 

584 value=self.health_monitor_image.image_uri, 

585 description="Health Monitor Docker image URI", 

586 ) 

587 

588 CfnOutput( 

589 self, 

590 "ManifestProcessorImageUri", 

591 value=self.manifest_processor_image.image_uri, 

592 description="Manifest Processor Docker image URI", 

593 ) 

594 

595 # Build and push inference monitor Docker image 

596 self.inference_monitor_image = ecr_assets.DockerImageAsset( 

597 self, 

598 "InferenceMonitorImage", 

599 directory=".", 

600 file="dockerfiles/inference-monitor-dockerfile", 

601 platform=ecr_assets.Platform.LINUX_AMD64, 

602 ) 

603 

604 CfnOutput( 

605 self, 

606 "InferenceMonitorImageUri", 

607 value=self.inference_monitor_image.image_uri, 

608 description="Inference Monitor Docker image URI", 

609 ) 

610 

611 # Build and push queue processor Docker image (if enabled). 

612 # The queue processor is a KEDA ScaledJob that consumes manifests from 

613 # the regional SQS queue. It can be disabled in cdk.json if users want 

614 # to implement their own consumer. When disabled, the post-helm-sqs-consumer.yaml

615 # manifest's template variables are left unreplaced, which causes the applier to skip it.

616 queue_processor_config = self.node.try_get_context("queue_processor") or {} 

617 self.queue_processor_enabled = queue_processor_config.get("enabled", True) 

618 

619 if self.queue_processor_enabled:  # coverage: branch 619 ↛ exit never taken; the condition on line 619 was always true, so '_create_container_images' never returned from here

620 self.queue_processor_image = ecr_assets.DockerImageAsset( 

621 self, 

622 "QueueProcessorImage", 

623 directory=".", 

624 file="dockerfiles/queue-processor-dockerfile", 

625 platform=ecr_assets.Platform.LINUX_AMD64, 

626 ) 

627 

628 CfnOutput( 

629 self, 

630 "QueueProcessorImageUri", 

631 value=self.queue_processor_image.image_uri, 

632 description="Queue Processor Docker image URI", 

633 ) 

634 
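# Illustrative cdk.json snippet (only the "queue_processor"/"enabled" keys are read
# above; the surrounding "context" block is standard cdk.json structure):
#
#   {
#     "context": {
#       "queue_processor": { "enabled": false }
#     }
#   }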

635 def _create_eks_cluster(self, cluster_config: Any) -> None: 

636 """Create the EKS cluster with auto mode and GPU node groups""" 

637 

638 # Create cluster admin role 

639 # role_name intentionally omitted - let CDK generate unique name 

640 cluster_admin_role = iam.Role( 

641 self, 

642 "ClusterAdminRole", 

643 assumed_by=iam.ServicePrincipal("eks.amazonaws.com"), 

644 managed_policies=[ 

645 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSClusterPolicy") 

646 ], 

647 ) 

648 

649 # Create node group role 

650 # role_name intentionally omitted - let CDK generate unique name 

651 iam.Role( 

652 self, 

653 "NodeGroupRole", 

654 assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"), 

655 managed_policies=[ 

656 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSWorkerNodePolicy"), 

657 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKS_CNI_Policy"), 

658 iam.ManagedPolicy.from_aws_managed_policy_name( 

659 "AmazonEC2ContainerRegistryReadOnly" 

660 ), 

661 ], 

662 ) 

663 

664 # Create EKS Auto Mode cluster with built-in system and general-purpose nodepools 

665 # Auto Mode automatically manages compute resources and comes with essential addons 

666 # Get endpoint access configuration 

667 eks_config = self.config.get_eks_cluster_config() 

668 endpoint_access_mode = eks_config.get("endpoint_access", "PRIVATE") 

669 

670 # Map config string to EKS EndpointAccess enum 

671 endpoint_access = ( 

672 eks.EndpointAccess.PRIVATE 

673 if endpoint_access_mode == "PRIVATE" 

674 else eks.EndpointAccess.PUBLIC_AND_PRIVATE 

675 ) 

676 

677 # Create KMS key for EKS secrets encryption 

678 self.eks_encryption_key = kms.Key( 

679 self, 

680 "EksSecretsEncryptionKey", 

681 description="KMS key for EKS Kubernetes secrets encryption", 

682 enable_key_rotation=True, 

683 removal_policy=RemovalPolicy.RETAIN, 

684 ) 

685 

686 # Get Kubernetes version - use custom version if not available in CDK enum 

687 k8s_version_str = cluster_config.kubernetes_version 

688 try: 

689 k8s_version = getattr(eks.KubernetesVersion, f"V{k8s_version_str.replace('.', '_')}") 

690 except AttributeError: 

691 # Version not in CDK enum yet, use custom version 

692 k8s_version = eks.KubernetesVersion.of(k8s_version_str) 

693 
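# For example (illustrative): "1.31" resolves via getattr to
# eks.KubernetesVersion.V1_31, while a version this CDK release doesn't know yet,
# say "1.33", falls through to eks.KubernetesVersion.of("1.33").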

694 self.cluster = eks.Cluster( 

695 self, 

696 "GCOEksCluster", 

697 cluster_name=cluster_config.cluster_name, 

698 version=k8s_version, # Use configured version for Auto Mode with DRA support 

699 vpc=self.vpc, 

700 compute=eks.ComputeConfig( 

701 # Enable both built-in node pools - Auto Mode manages these automatically 

702 node_pools=["system", "general-purpose"] 

703 ), 

704 # SECURITY: Endpoint access controlled via cdk.json eks_cluster.endpoint_access 

705 # PRIVATE (default): EKS API accessible only from within VPC - most secure 

706 # Job submission works via API Gateway → Lambda (in VPC) or SQS 

707 # For kubectl access, use a bastion host, VPN, or AWS SSM Session Manager 

708 # PUBLIC_AND_PRIVATE: EKS API accessible from internet and VPC 

709 # Allows direct kubectl access but less secure 

710 endpoint_access=endpoint_access, 

711 role=cluster_admin_role, 

712 vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)], 

713 # Enable all control plane logging for security and compliance 

714 cluster_logging=[ 

715 eks.ClusterLoggingTypes.API, 

716 eks.ClusterLoggingTypes.AUDIT, 

717 eks.ClusterLoggingTypes.AUTHENTICATOR, 

718 eks.ClusterLoggingTypes.CONTROLLER_MANAGER, 

719 eks.ClusterLoggingTypes.SCHEDULER, 

720 ], 

721 # SECURITY: Enable envelope encryption for Kubernetes secrets using KMS 

722 secrets_encryption_key=self.eks_encryption_key, 

723 ) 

724 

725 # Auto Mode comes with essential addons pre-configured: 

726 # - AWS Load Balancer Controller (for ALB/NLB integration) 

727 # - CoreDNS, kube-proxy, VPC CNI (standard Kubernetes components) 

728 

729 # OIDC provider for IRSA — the primary credential injection mechanism. 

730 # IRSA uses projected service-account tokens exchanged via the OIDC provider 

731 # for temporary AWS credentials. This works reliably on EKS Auto Mode. 

732 self.oidc_provider = eks.OidcProviderNative( 

733 self, 

734 "OidcProvider", 

735 url=self.cluster.cluster_open_id_connect_issuer_url, 

736 ) 

737 

738 # Pod Identity Agent add-on — registers the admission webhook that injects 

739 # Pod Identity credentials. On Auto Mode the DaemonSet schedules 0 pods 

740 # (the agent is built into the node), but the add-on registration is still 

741 # needed for the control-plane webhook. Kept as a secondary credential path. 

742 self._create_pod_identity_agent_addon() 

743 

744 # Add Metrics Server add-on for HPA and resource monitoring 

745 self._create_metrics_server_addon() 

746 

747 # Add EFS CSI Driver add-on for shared storage 

748 self._create_efs_csi_driver_addon() 

749 

750 # Add CloudWatch Observability add-on for Container Insights metrics 

751 self._create_cloudwatch_observability_addon() 

752 

753 # NOTE: GPU compute is configured via Karpenter NodePools (not managed node groups) 

754 # NodePool manifests are located in lambda/kubectl-applier-simple/manifests/: 

755 # - 40-nodepool-gpu-x86.yaml: x86_64 GPU instances (g4dn, g5, g6, g6e, p3) 

756 # - 41-nodepool-gpu-arm.yaml: ARM64 GPU instances (g5g) 

757 # - 42-nodepool-inference.yaml: inference-optimized GPU instances 

758 # - 43-nodepool-efa.yaml: EFA-enabled instances (p4d, p5, p6) 

759 # - 44-nodepool-neuron.yaml: Trainium/Inferentia instances 

760 # These will be applied by the kubectl Lambda custom resource (created below) 

761 

762 # Create IRSA role for service account to access secrets 

763 self._create_service_account_role() 

764 

765 # Create kubectl Lambda for applying Kubernetes manifests 

766 self._create_kubectl_lambda() 

767 

768 # ── Shared toleration config for EKS add-ons ────────────────────────── 

769 # All GCO nodepools apply taints (nvidia.com/gpu, aws.amazon.com/neuron, 

770 # vpc.amazonaws.com/efa) that prevent DaemonSet pods from scheduling. 

771 # Every add-on that runs a DaemonSet (or may schedule on tainted nodes) 

772 # must tolerate these taints so that storage drivers, metrics agents, and 

773 # other infrastructure components work on every node type. 

774 _ADDON_NODE_TOLERATIONS = [ 

775 {"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"}, 

776 {"key": "aws.amazon.com/neuron", "operator": "Exists", "effect": "NoSchedule"}, 

777 {"key": "vpc.amazonaws.com/efa", "operator": "Exists", "effect": "NoSchedule"}, 

778 ] 

779 
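# Illustrative sketch (assumption; the real taints live in the NodePool manifests
# under lambda/kubectl-applier-simple/manifests/): a GPU NodePool carries a taint
# that only pods tolerating it, including the add-on DaemonSets configured above,
# can schedule past:
#
#   spec:
#     template:
#       spec:
#         taints:
#           - key: nvidia.com/gpu
#             effect: NoSchedule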

780 def _create_pod_identity_agent_addon(self) -> None: 

781 """Create EKS Pod Identity Agent add-on. 

782 

783 On Auto Mode the DaemonSet schedules 0 pods (the agent is built into 

784 the node runtime), but the add-on registration is still required for 

785 the control-plane admission webhook that injects Pod Identity tokens. 

786 """ 

787 eks.Addon( 

788 self, 

789 "PodIdentityAgentAddon", 

790 cluster=self.cluster, # type: ignore[arg-type] 

791 addon_name="eks-pod-identity-agent", 

792 addon_version=EKS_ADDON_POD_IDENTITY_AGENT, 

793 preserve_on_delete=False, 

794 configuration_values={ 

795 "tolerations": self._ADDON_NODE_TOLERATIONS, 

796 }, 

797 ) 

798 

799 def _create_metrics_server_addon(self) -> None: 

800 """Create Metrics Server add-on for resource metrics. 

801 

802 The Metrics Server collects resource metrics from kubelets and exposes 

803 them via the Kubernetes API server. This is required for: 

804 - Horizontal Pod Autoscaler (HPA) 

805 - Vertical Pod Autoscaler (VPA) 

806 - kubectl top commands 

807 - Resource monitoring dashboards 

808 

809 Note: Metrics Server doesn't require an IRSA role as it only needs 

810 in-cluster permissions which are handled by its service account. 

811 """ 

812 eks.Addon( 

813 self, 

814 "MetricsServerAddon", 

815 cluster=self.cluster, # type: ignore[arg-type] 

816 addon_name="metrics-server", 

817 addon_version=EKS_ADDON_METRICS_SERVER, 

818 preserve_on_delete=False, 

819 configuration_values={ 

820 "tolerations": self._ADDON_NODE_TOLERATIONS, 

821 }, 

822 ) 

823 

824 def _create_efs_csi_driver_addon(self) -> None: 

825 """Create EFS CSI Driver add-on for shared storage support. 

826 

827 The EFS CSI driver enables Kubernetes pods to mount EFS file systems 

828 as persistent volumes. This is required for the shared storage feature. 

829 

830 We create a Pod Identity role for the EFS CSI driver and update the add-on 

831 to use it via a custom resource after the add-on is created. 

832 """ 

833 # Create IAM role for EFS CSI Driver using IRSA + Pod Identity 

834 self.efs_csi_role = GCORegionalStack._create_irsa_role( 

835 self, 

836 "EfsCsiDriverRole", 

837 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn, 

838 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url, 

839 service_account_names=["efs-csi-controller-sa"], 

840 namespaces=["kube-system"], 

841 ) 

842 

843 # Add EFS CSI driver permissions 

844 self.efs_csi_role.add_managed_policy( 

845 iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEFSCSIDriverPolicy") 

846 ) 

847 

848 # Create EFS CSI Driver add-on 

849 efs_addon = eks.Addon( 

850 self, 

851 "EfsCsiDriverAddon", 

852 cluster=self.cluster, # type: ignore[arg-type] 

853 addon_name="aws-efs-csi-driver", 

854 addon_version=EKS_ADDON_EFS_CSI_DRIVER, 

855 preserve_on_delete=False, 

856 configuration_values={ 

857 "node": { 

858 "tolerations": self._ADDON_NODE_TOLERATIONS, 

859 }, 

860 "controller": { 

861 "tolerations": self._ADDON_NODE_TOLERATIONS, 

862 }, 

863 }, 

864 ) 

865 

866 # Append the PassRole statement for the EFS CSI role to the shared 

867 # AwsCustomResource execution role. See the role's creation in 

868 # _create_aws_custom_resource_role for the full rationale on why 

869 # we pre-create + attach up-front instead of letting CDK 

870 # auto-generate per-CR roles. 

871 self.aws_custom_resource_role.add_to_policy( 

872 iam.PolicyStatement( 

873 effect=iam.Effect.ALLOW, 

874 actions=["iam:PassRole"], 

875 resources=[self.efs_csi_role.role_arn], 

876 ) 

877 ) 

878 

879 # Update the add-on to use the IRSA role via custom resource 

880 # This is needed because the eks v2 alpha Addon doesn't support service_account_role directly 

881 update_addon = cr.AwsCustomResource( 

882 self, 

883 "UpdateEfsCsiAddonRole", 

884 on_create=cr.AwsSdkCall( 

885 service="EKS", 

886 action="updateAddon", 

887 parameters={ 

888 "clusterName": self.cluster.cluster_name, 

889 "addonName": "aws-efs-csi-driver", 

890 "serviceAccountRoleArn": self.efs_csi_role.role_arn, 

891 }, 

892 physical_resource_id=cr.PhysicalResourceId.of( 

893 f"{self.cluster.cluster_name}-efs-csi-role-update" 

894 ), 

895 ), 

896 on_update=cr.AwsSdkCall( 

897 service="EKS", 

898 action="updateAddon", 

899 parameters={ 

900 "clusterName": self.cluster.cluster_name, 

901 "addonName": "aws-efs-csi-driver", 

902 "serviceAccountRoleArn": self.efs_csi_role.role_arn, 

903 }, 

904 ), 

905 role=self.aws_custom_resource_role, 

906 ) 

907 

908 # Ensure the update happens after the add-on is created. We also 

909 # depend on the shared execution role so CloudFormation has fully 

910 # attached + replicated its inline policy before the Lambda fires. 

911 update_addon.node.add_dependency(efs_addon) 

912 update_addon.node.add_dependency(self.efs_csi_role) 

913 update_addon.node.add_dependency(self.aws_custom_resource_role) 

914 

915 # Expose the update-addon resource so _apply_kubernetes_manifests can 

916 # make the kubectl Lambda wait for the IRSA annotation patch to land 

917 # before it tries to rollout-restart the efs-csi-controller. Without 

918 # this ordering, the restart could fire before EKS has re-attached 

919 # the role ARN, leaving the new pods just as credential-less as the 

920 # old ones and causing every EFS CreateAccessPoint to fail with a 

921 # 401 from IMDS. 

922 self._efs_csi_addon_role_update = update_addon 

923 

924 def _create_cloudwatch_observability_addon(self) -> None: 

925 """Create CloudWatch Observability add-on for Container Insights. 

926 

927 The CloudWatch Observability add-on enables Container Insights metrics 

928 for the EKS cluster, providing visibility into: 

929 - Cluster CPU and memory utilization 

930 - Node-level metrics 

931 - Pod and container metrics 

932 - Application logs (optional) 

933 

934 These metrics are used by the monitoring dashboard to display 

935 cluster health and resource utilization. 

936 """ 

937 

938 # Create IAM role for CloudWatch agent using IRSA + Pod Identity 

939 self.cloudwatch_role = GCORegionalStack._create_irsa_role( 

940 self, 

941 "CloudWatchObservabilityRole", 

942 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn, 

943 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url, 

944 service_account_names=["cloudwatch-agent"], 

945 namespaces=["amazon-cloudwatch"], 

946 ) 

947 

948 # Add CloudWatch agent permissions 

949 self.cloudwatch_role.add_managed_policy( 

950 iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy") 

951 ) 

952 self.cloudwatch_role.add_managed_policy( 

953 iam.ManagedPolicy.from_aws_managed_policy_name("AWSXrayWriteOnlyAccess") 

954 ) 

955 

956 # Create CloudWatch Observability add-on 

957 cw_addon = eks.Addon( 

958 self, 

959 "CloudWatchObservabilityAddon", 

960 cluster=self.cluster, # type: ignore[arg-type] 

961 addon_name="amazon-cloudwatch-observability", 

962 addon_version=EKS_ADDON_CLOUDWATCH_OBSERVABILITY, 

963 preserve_on_delete=False, 

964 configuration_values={ 

965 "tolerations": self._ADDON_NODE_TOLERATIONS, 

966 # Enable Container Insights with application log collection 

967 # Logs are sent to /aws/containerinsights/{cluster}/application 

968 "containerLogs": { 

969 "enabled": True, 

970 }, 

971 }, 

972 ) 

973 

974 # Append the PassRole statement for the CloudWatch Observability 

975 # role to the shared AwsCustomResource execution role. See 

976 # _create_aws_custom_resource_role for the full rationale. 

977 self.aws_custom_resource_role.add_to_policy( 

978 iam.PolicyStatement( 

979 effect=iam.Effect.ALLOW, 

980 actions=["iam:PassRole"], 

981 resources=[self.cloudwatch_role.role_arn], 

982 ) 

983 ) 

984 

985 # Update the add-on to use the IRSA role via custom resource 

986 update_cw_addon = cr.AwsCustomResource( 

987 self, 

988 "UpdateCloudWatchAddonRole", 

989 on_create=cr.AwsSdkCall( 

990 service="EKS", 

991 action="updateAddon", 

992 parameters={ 

993 "clusterName": self.cluster.cluster_name, 

994 "addonName": "amazon-cloudwatch-observability", 

995 "serviceAccountRoleArn": self.cloudwatch_role.role_arn, 

996 }, 

997 physical_resource_id=cr.PhysicalResourceId.of( 

998 f"{self.cluster.cluster_name}-cw-obs-role-update" 

999 ), 

1000 ), 

1001 on_update=cr.AwsSdkCall( 

1002 service="EKS", 

1003 action="updateAddon", 

1004 parameters={ 

1005 "clusterName": self.cluster.cluster_name, 

1006 "addonName": "amazon-cloudwatch-observability", 

1007 "serviceAccountRoleArn": self.cloudwatch_role.role_arn, 

1008 }, 

1009 ), 

1010 role=self.aws_custom_resource_role, 

1011 ) 

1012 

1013 # Ensure the update happens after the add-on is created. Depend on 

1014 # the shared execution role so CFN has fully attached + replicated 

1015 # its inline policy before the Lambda fires. No CR→CR dependency 

1016 # chain needed anymore — the race it was serializing against is 

1017 # eliminated by pre-creating the role. 

1018 update_cw_addon.node.add_dependency(cw_addon) 

1019 update_cw_addon.node.add_dependency(self.cloudwatch_role) 

1020 update_cw_addon.node.add_dependency(self.aws_custom_resource_role) 

1021 

1022 # Expose the update-addon resource so _apply_kubernetes_manifests can 

1023 # make the kubectl Lambda wait for the IRSA annotation patch to land 

1024 # before it rollout-restarts the cloudwatch-agent DaemonSet. See the 

1025 # EFS CSI equivalent for the full rationale — same race, same fix. 

1026 self._cloudwatch_addon_role_update = update_cw_addon 

1027 

1028 def _create_service_account_role(self) -> None: 

1029 """Create IAM role for Kubernetes service account using EKS Pod Identity. 

1030 

1031 Pod Identity is the recommended mechanism for EKS Auto Mode. It's simpler 

1032 and more reliable than IRSA — no OIDC provider, no webhook injection, no 

1033 projected tokens. EKS manages the credential injection automatically. 

1034 

1035 This role can be assumed by the gco-service-account in: 

1036 - gco-system namespace (for system services like health-monitor, manifest-processor) 

1037 - gco-jobs namespace (for user jobs that need SQS access for KEDA scaling) 

1038 - gco-inference namespace (for inference endpoints) 

1039 """ 

1040 # Create IAM role with IRSA (OIDC) trust + Pod Identity trust 

1041 # 

1042 # The trust policy's `sub` condition must list every ServiceAccount 

1043 # that needs to assume this role. Keep in sync with: 

1044 # - lambda/kubectl-applier-simple/manifests/01-serviceaccounts.yaml 

1045 # (gco-service-account) 

1046 # - lambda/kubectl-applier-simple/manifests/02-rbac.yaml 

1047 # (gco-health-monitor-sa, gco-manifest-processor-sa, 

1048 # gco-inference-monitor-sa) 

1049 # - lambda/kubectl-applier-simple/manifests/04a-jobs-serviceaccount.yaml 

1050 # (gco-service-account in gco-jobs) 

1051 self.service_account_role = GCORegionalStack._create_irsa_role( 

1052 self, 

1053 "ServiceAccountRole", 

1054 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn, 

1055 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url, 

1056 service_account_names=[ 

1057 "gco-service-account", 

1058 "gco-health-monitor-sa", 

1059 "gco-manifest-processor-sa", 

1060 "gco-inference-monitor-sa", 

1061 ], 

1062 namespaces=["gco-system", "gco-jobs", "gco-inference"], 

1063 ) 

1064 

1065 # Grant permission to read the auth secret 

1066 # Note: We use an explicit IAM policy statement with a wildcard (*) because: 

1067 # 1. The secret is in a different region (API Gateway region) 

1068 # 2. CDK's grant_read() generates a policy with a ?????? suffix, which requires

1069 # exactly 6 characters, but the SDK can call GetSecretValue with either 

1070 # the full ARN (with suffix) or partial ARN (without suffix) 

1071 # 3. Using * ensures both forms work correctly 

1072 self.service_account_role.add_to_policy( 

1073 iam.PolicyStatement( 

1074 effect=iam.Effect.ALLOW, 

1075 actions=[ 

1076 "secretsmanager:GetSecretValue", 

1077 "secretsmanager:DescribeSecret", 

1078 ], 

1079 resources=[f"{self.auth_secret_arn}*"], # Wildcard to match with or without suffix 

1080 ) 

1081 ) 

1082 
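# Illustrative example (hypothetical account id and suffix): the trailing wildcard
# lets both ARN forms the SDK may send resolve to the same secret:
#   arn:aws:secretsmanager:us-east-1:123456789012:secret:gco-auth          (partial ARN)
#   arn:aws:secretsmanager:us-east-1:123456789012:secret:gco-auth-AbC123   (full ARN, random suffix)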

1083 # cdk-nag suppression: the trailing ``*`` on the auth secret 

1084 # ARN above is intentional and is NOT a broad wildcard. Secrets 

1085 # Manager appends a random 6-character suffix to every secret 

1086 # ARN at creation time (``arn:...:secret:my-secret-AbC123``). 

1087 # The secret lives in a separate stack (api_gateway_global_stack) 

1088 # and is referenced here via a cross-stack token, so the actual 

1089 # suffix is unknown at synth time. The wildcard matches the 

1090 # suffix only — every finding under this rule is still scoped 

1091 # to this single secret. 

1092 from cdk_nag import NagSuppressions 

1093 

1094 NagSuppressions.add_resource_suppressions( 

1095 self.service_account_role, 

1096 [ 

1097 { 

1098 "id": "AwsSolutions-IAM5", 

1099 "reason": ( 

1100 "The trailing ``*`` matches the 6-character " 

1101 "random suffix Secrets Manager appends to secret " 

1102 "ARNs. The secret is created in a different stack " 

1103 "(api_gateway_global_stack) and referenced here " 

1104 "via a cross-stack token, so the actual suffix " 

1105 "isn't known at synth time. The wildcard is " 

1106 "bounded to a single secret — it does not grant " 

1107 "access to any other secret in the account." 

1108 ), 

1109 "appliesTo": [ 

1110 {"regex": "/^Resource::<GCOAuthSecret.*>\\*$/"}, 

1111 ], 

1112 }, 

1113 ], 

1114 apply_to_children=True, 

1115 ) 

1116 

1117 # cdk-nag suppression: the ServiceAccountRole grants ec2:Describe* 

1118 # and elasticloadbalancing:Describe* for the AWS Load Balancer 

1119 # Controller. These AWS APIs do not support resource-level IAM 

1120 # scoping — Resource: * is the only valid form. 

1121 NagSuppressions.add_resource_suppressions( 

1122 self.service_account_role, 

1123 [ 

1124 { 

1125 "id": "AwsSolutions-IAM5", 

1126 "reason": ( 

1127 "The ServiceAccountRole grants ec2:Describe* and " 

1128 "elasticloadbalancing:Describe* for the AWS Load Balancer " 

1129 "Controller. These AWS APIs do not support resource-level " 

1130 "IAM scoping — Resource: * is the only valid form. See " 

1131 "https://docs.aws.amazon.com/service-authorization/latest/" 

1132 "reference/list_amazonec2.html" 

1133 ), 

1134 "appliesTo": ["Resource::*"], 

1135 }, 

1136 ], 

1137 apply_to_children=True, 

1138 ) 

1139 

1140 # Add permissions for AWS Load Balancer Controller 

1141 self.service_account_role.add_to_policy( 

1142 iam.PolicyStatement( 

1143 effect=iam.Effect.ALLOW, 

1144 actions=[ 

1145 "ec2:DescribeAccountAttributes", 

1146 "ec2:DescribeAddresses", 

1147 "ec2:DescribeAvailabilityZones", 

1148 "ec2:DescribeInternetGateways", 

1149 "ec2:DescribeVpcs", 

1150 "ec2:DescribeVpcPeeringConnections", 

1151 "ec2:DescribeSubnets", 

1152 "ec2:DescribeSecurityGroups", 

1153 "ec2:DescribeInstances", 

1154 "ec2:DescribeNetworkInterfaces", 

1155 "ec2:DescribeTags", 

1156 "ec2:GetCoipPoolUsage", 

1157 "ec2:DescribeCoipPools", 

1158 "elasticloadbalancing:DescribeLoadBalancers", 

1159 "elasticloadbalancing:DescribeLoadBalancerAttributes", 

1160 "elasticloadbalancing:DescribeListeners", 

1161 "elasticloadbalancing:DescribeListenerCertificates", 

1162 "elasticloadbalancing:DescribeSSLPolicies", 

1163 "elasticloadbalancing:DescribeRules", 

1164 "elasticloadbalancing:DescribeTargetGroups", 

1165 "elasticloadbalancing:DescribeTargetGroupAttributes", 

1166 "elasticloadbalancing:DescribeTargetHealth", 

1167 "elasticloadbalancing:DescribeTags", 

1168 ], 

1169 resources=["*"], 

1170 ) 

1171 ) 

1172 

1173 self.service_account_role.add_to_policy( 

1174 iam.PolicyStatement( 

1175 effect=iam.Effect.ALLOW, 

1176 actions=[ 

1177 "elasticloadbalancing:CreateLoadBalancer", 

1178 "elasticloadbalancing:CreateTargetGroup", 

1179 "elasticloadbalancing:CreateListener", 

1180 "elasticloadbalancing:DeleteLoadBalancer", 

1181 "elasticloadbalancing:DeleteTargetGroup", 

1182 "elasticloadbalancing:DeleteListener", 

1183 "elasticloadbalancing:ModifyLoadBalancerAttributes", 

1184 "elasticloadbalancing:ModifyTargetGroup", 

1185 "elasticloadbalancing:ModifyTargetGroupAttributes", 

1186 "elasticloadbalancing:ModifyListener", 

1187 "elasticloadbalancing:RegisterTargets", 

1188 "elasticloadbalancing:DeregisterTargets", 

1189 "elasticloadbalancing:SetWebAcl", 

1190 "elasticloadbalancing:SetSecurityGroups", 

1191 "elasticloadbalancing:SetSubnets", 

1192 "elasticloadbalancing:AddTags", 

1193 "elasticloadbalancing:RemoveTags", 

1194 ], 

1195 resources=["*"], 

1196 ) 

1197 ) 

1198 

1199 self.service_account_role.add_to_policy( 

1200 iam.PolicyStatement( 

1201 effect=iam.Effect.ALLOW, 

1202 actions=[ 

1203 "ec2:CreateSecurityGroup", 

1204 "ec2:CreateTags", 

1205 "ec2:DeleteTags", 

1206 "ec2:AuthorizeSecurityGroupIngress", 

1207 "ec2:RevokeSecurityGroupIngress", 

1208 "ec2:DeleteSecurityGroup", 

1209 ], 

1210 resources=["*"], 

1211 ) 

1212 ) 

1213 

1214 self.service_account_role.add_to_policy( 

1215 iam.PolicyStatement( 

1216 effect=iam.Effect.ALLOW, 

1217 actions=["iam:CreateServiceLinkedRole"], 

1218 resources=["*"], 

1219 conditions={ 

1220 "StringEquals": {"iam:AWSServiceName": "elasticloadbalancing.amazonaws.com"} 

1221 }, 

1222 ) 

1223 ) 

1224 

1225 self.service_account_role.add_to_policy( 

1226 iam.PolicyStatement( 

1227 effect=iam.Effect.ALLOW, 

1228 actions=[ 

1229 "wafv2:GetWebACL", 

1230 "wafv2:GetWebACLForResource", 

1231 "wafv2:AssociateWebACL", 

1232 "wafv2:DisassociateWebACL", 

1233 ], 

1234 resources=["*"], 

1235 ) 

1236 ) 

1237 

1238 self.service_account_role.add_to_policy( 

1239 iam.PolicyStatement( 

1240 effect=iam.Effect.ALLOW, 

1241 actions=[ 

1242 "shield:GetSubscriptionState", 

1243 "shield:DescribeProtection", 

1244 "shield:CreateProtection", 

1245 "shield:DeleteProtection", 

1246 ], 

1247 resources=["*"], 

1248 ) 

1249 ) 

1250 

1251 self.service_account_role.add_to_policy( 

1252 iam.PolicyStatement( 

1253 effect=iam.Effect.ALLOW, 

1254 actions=["acm:ListCertificates", "acm:DescribeCertificate"], 

1255 resources=["*"], 

1256 ) 

1257 ) 

1258 

1259 self.service_account_role.add_to_policy( 

1260 iam.PolicyStatement( 

1261 effect=iam.Effect.ALLOW, 

1262 actions=["cognito-idp:DescribeUserPoolClient"], 

1263 resources=["*"], 

1264 ) 

1265 ) 

1266 

1267 # Add SQS permissions for KEDA to scale based on queue depth 

1268 self.service_account_role.add_to_policy( 

1269 iam.PolicyStatement( 

1270 effect=iam.Effect.ALLOW, 

1271 actions=[ 

1272 "sqs:GetQueueAttributes", 

1273 "sqs:GetQueueUrl", 

1274 "sqs:ReceiveMessage", 

1275 "sqs:DeleteMessage", 

1276 "sqs:SendMessage", 

1277 ], 

1278 resources=[ 

1279 self.job_queue.queue_arn, 

1280 self.job_dlq.queue_arn, 

1281 ], 

1282 ) 

1283 ) 

1284 

1285 # Add CloudWatch permissions for publishing custom metrics 

1286 # Used by health-monitor and manifest-processor to publish metrics 

1287 self.service_account_role.add_to_policy( 

1288 iam.PolicyStatement( 

1289 effect=iam.Effect.ALLOW, 

1290 actions=["cloudwatch:PutMetricData"], 

1291 resources=["*"], 

1292 conditions={ 

1293 "StringEquals": { 

1294 "cloudwatch:namespace": [ 

1295 "GCO/HealthMonitor", 

1296 "GCO/ManifestProcessor", 

1297 ] 

1298 } 

1299 }, 

1300 ) 

1301 ) 

1302 

1303 # Add DynamoDB permissions for templates, webhooks, and job queue 

1304 # Tables are created in the global stack and accessed from all regions 

1305 project_name = self.config.get_project_name() 

1306 global_region = self.config.get_global_region() 

1307 

1308 self.service_account_role.add_to_policy( 

1309 iam.PolicyStatement( 

1310 effect=iam.Effect.ALLOW, 

1311 actions=[ 

1312 "dynamodb:GetItem", 

1313 "dynamodb:PutItem", 

1314 "dynamodb:UpdateItem", 

1315 "dynamodb:DeleteItem", 

1316 "dynamodb:Query", 

1317 "dynamodb:Scan", 

1318 ], 

1319 resources=[ 

1320 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates", 

1321 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates/index/*", 

1322 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks", 

1323 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks/index/*", 

1324 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs", 

1325 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs/index/*", 

1326 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints", 

1327 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints/index/*", 

1328 ], 

1329 ) 

1330 ) 

1331 

1332 # Add S3 permissions for model weights bucket (used by inference init containers) 

1333 self.service_account_role.add_to_policy( 

1334 iam.PolicyStatement( 

1335 effect=iam.Effect.ALLOW, 

1336 actions=[ 

1337 "s3:GetObject", 

1338 "s3:ListBucket", 

1339 ], 

1340 resources=[ 

1341 f"arn:aws:s3:::{project_name}-*", 

1342 f"arn:aws:s3:::{project_name}-*/*", 

1343 ], 

1344 ) 

1345 ) 

1346 

1347 # KMS decrypt for model weights bucket (S3-scoped) 

1348 self.service_account_role.add_to_policy( 

1349 iam.PolicyStatement( 

1350 effect=iam.Effect.ALLOW, 

1351 actions=["kms:Decrypt", "kms:GenerateDataKey"], 

1352 resources=[f"arn:aws:kms:*:{self.account}:key/*"], 

1353 conditions={ 

1354 "StringLike": { 

1355 "kms:ViaService": "s3.*.amazonaws.com", 

1356 } 

1357 }, 

1358 ) 

1359 ) 

1360 

1361 # Create KEDA operator IAM role for SQS access 

1362 self._create_keda_operator_role() 

1363 

1364 # Create Pod Identity Associations for all service accounts 

1365 self._create_pod_identity_associations() 

1366 

1367 def _create_keda_operator_role(self) -> None: 

1368 """Create IAM role for KEDA operator service account using EKS Pod Identity. 

1369 

1370 This role allows the KEDA operator to access SQS queues for scaling 

1371 based on queue depth. The role is assumed by the keda-operator service 

1372 account in the keda namespace. 

1373 """ 

1374 # Create IAM role with IRSA (OIDC) trust + Pod Identity trust 

1375 self.keda_operator_role = GCORegionalStack._create_irsa_role( 

1376 self, 

1377 "KedaOperatorRole", 

1378 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn, 

1379 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url, 

1380 service_account_names=["keda-operator"], 

1381 namespaces=["keda"], 

1382 ) 

1383 

1384 # Add SQS permissions for KEDA to read queue metrics 

1385 self.keda_operator_role.add_to_policy( 

1386 iam.PolicyStatement( 

1387 effect=iam.Effect.ALLOW, 

1388 actions=[ 

1389 "sqs:GetQueueAttributes", 

1390 "sqs:GetQueueUrl", 

1391 ], 

1392 resources=[ 

1393 self.job_queue.queue_arn, 

1394 self.job_dlq.queue_arn, 

1395 ], 

1396 ) 

1397 ) 

1398 

1399 def _create_pod_identity_associations(self) -> None: 

1400 """Create EKS Pod Identity Associations for all service accounts. 

1401 

1402 Each association links an IAM role to a Kubernetes service account in a

1403 specific namespace, and EKS manages credential injection for associated

1404 pods. On Auto Mode these are a secondary credential path alongside IRSA.

1405 

1406 Stores associations in self._pod_identity_associations so the 

1407 kubectl-applier custom resource can declare an explicit dependency, 

1408 ensuring credentials are available before workloads start. 

1409 """ 

1410 self._pod_identity_associations: list[Any] = [] 

1411 

1412 # GCO service account — used by health-monitor, manifest-processor, inference-monitor 

1413 for namespace in ["gco-system", "gco-jobs", "gco-inference"]: 

1414 assoc = eks_l1.CfnPodIdentityAssociation( 

1415 self, 

1416 f"PodIdentity-gco-sa-{namespace}", 

1417 cluster_name=self.cluster.cluster_name, 

1418 namespace=namespace, 

1419 service_account="gco-service-account", 

1420 role_arn=self.service_account_role.role_arn, 

1421 ) 

1422 self._pod_identity_associations.append(assoc) 

1423 

1424 # KEDA operator — needs SQS access for queue-based scaling 

1425 keda_assoc = eks_l1.CfnPodIdentityAssociation( 

1426 self, 

1427 "PodIdentity-keda-operator", 

1428 cluster_name=self.cluster.cluster_name, 

1429 namespace="keda", 

1430 service_account="keda-operator", 

1431 role_arn=self.keda_operator_role.role_arn, 

1432 ) 

1433 self._pod_identity_associations.append(keda_assoc) 

1434 

1435 # EFS CSI driver — needs EFS access for shared storage 

1436 efs_assoc = eks_l1.CfnPodIdentityAssociation( 

1437 self, 

1438 "PodIdentity-efs-csi", 

1439 cluster_name=self.cluster.cluster_name, 

1440 namespace="kube-system", 

1441 service_account="efs-csi-controller-sa", 

1442 role_arn=self.efs_csi_role.role_arn, 

1443 ) 

1444 self._pod_identity_associations.append(efs_assoc) 

1445 

1446 # CloudWatch agent — needs CloudWatch access for observability 

1447 cw_assoc = eks_l1.CfnPodIdentityAssociation( 

1448 self, 

1449 "PodIdentity-cloudwatch", 

1450 cluster_name=self.cluster.cluster_name, 

1451 namespace="amazon-cloudwatch", 

1452 service_account="cloudwatch-agent", 

1453 role_arn=self.cloudwatch_role.role_arn, 

1454 ) 

1455 self._pod_identity_associations.append(cw_assoc) 

1456 

1457 # FSx CSI driver — only when FSx is enabled (created later in _create_fsx_lustre) 

1458 # The FSx Pod Identity association is added in _create_fsx_lustre instead 

1459 

1460 def _create_kubectl_lambda(self) -> None: 

1461 """Create Lambda function to apply Kubernetes manifests using Python client. 

1462 

1463 Note: This creates the Lambda and provider but does NOT create the custom resource. 

1464 The custom resource is created in _apply_kubernetes_manifests() after ALB is created, 

1465 so that target group ARNs can be passed to the manifests. 

1466 """ 

1467 project_name = self.config.get_project_name() 

1468 

1469 # Create IAM role for kubectl Lambda 

1470 kubectl_lambda_role = iam.Role( 

1471 self, 

1472 "KubectlLambdaRole", 

1473 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), 

1474 managed_policies=[ 

1475 iam.ManagedPolicy.from_aws_managed_policy_name( 

1476 "service-role/AWSLambdaVPCAccessExecutionRole" 

1477 ), 

1478 iam.ManagedPolicy.from_aws_managed_policy_name( 

1479 "service-role/AWSLambdaBasicExecutionRole" 

1480 ), 

1481 ], 

1482 ) 

1483 

1484 # Add EKS permissions 

1485 kubectl_lambda_role.add_to_policy( 

1486 iam.PolicyStatement( 

1487 actions=[ 

1488 "eks:DescribeCluster", 

1489 "eks:ListClusters", 

1490 ], 

1491 resources=[self.cluster.cluster_arn], 

1492 ) 

1493 ) 

1494 

1495 # Add permissions to assume cluster admin role 

1496 kubectl_lambda_role.add_to_policy( 

1497 iam.PolicyStatement(actions=["sts:AssumeRole"], resources=["*"]) 

1498 ) 

1499 

1500 # Create security group for kubectl Lambda 

1501 kubectl_lambda_sg = ec2.SecurityGroup( 

1502 self, 

1503 "KubectlLambdaSG", 

1504 vpc=self.vpc, 

1505 description="Security group for kubectl Lambda to access EKS cluster", 

1506 security_group_name=f"{self.config.get_project_name()}-kubectl-lambda-sg-{self.deployment_region}", 

1507 allow_all_outbound=True, # Lambda needs outbound access to EKS API 

1508 ) 

1509 

1510 # Allow Lambda security group to access EKS cluster security group on port 443 

1511 # The EKS cluster security group is automatically created by EKS 

1512 self.cluster.cluster_security_group.add_ingress_rule( 

1513 peer=kubectl_lambda_sg, 

1514 connection=ec2.Port.tcp(443), 

1515 description="Allow kubectl Lambda to access EKS API", 

1516 ) 

1517 

1518 # Create Lambda function (Python-only, no Docker!) 

1519 # Store function name as string attribute for cross-stack references 

1520 # This avoids CDK cross-environment resolution issues when account is unresolved 

1521 self.kubectl_lambda_function_name = f"{project_name}-kubectl-{self.deployment_region}" 

1522 self.kubectl_lambda = lambda_.Function( 

1523 self, 

1524 "KubectlApplierFunction", 

1525 function_name=self.kubectl_lambda_function_name, 

1526 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME), 

1527 handler="handler.lambda_handler", 

1528 code=lambda_.Code.from_asset("lambda/kubectl-applier-simple-build"), 

1529 timeout=Duration.minutes(15), # Max Lambda timeout 

1530 memory_size=512, 

1531 role=kubectl_lambda_role, 

1532 vpc=self.vpc, 

1533 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS), 

1534 security_groups=[kubectl_lambda_sg], # Use the security group we created 

1535 environment={ 

1536 "CLUSTER_NAME": self.cluster.cluster_name, 

1537 "REGION": self.deployment_region, 

1538 }, 

1539 tracing=lambda_.Tracing.ACTIVE, 

1540 ) 

1541 

1542 # Add EKS access entry for the Lambda role to authenticate with the cluster 

1543 # This grants the Lambda role cluster admin permissions 

1544 eks.AccessEntry( 

1545 self, 

1546 "KubectlLambdaAccessEntry", 

1547 cluster=self.cluster, # type: ignore[arg-type] 

1548 principal=kubectl_lambda_role.role_arn, 

1549 access_policies=[ 

1550 eks.AccessPolicy.from_access_policy_name( 

1551 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER 

1552 ) 

1553 ], 

1554 ) 

1555 

1556 # Create log group for kubectl provider 

1557 kubectl_provider_log_group = logs.LogGroup( 

1558 self, 

1559 "KubectlProviderLogGroup", 

1560 retention=logs.RetentionDays.ONE_WEEK, 

1561 removal_policy=RemovalPolicy.DESTROY, 

1562 ) 

1563 

1564 # Create custom resource provider (stored for use in _apply_kubernetes_manifests) 

1565 self.kubectl_provider = cr.Provider( 

1566 self, 

1567 "KubectlProvider", 

1568 on_event_handler=self.kubectl_lambda, 

1569 log_group=kubectl_provider_log_group, 

1570 ) 

1571 

1572 # cdk-nag suppression: the kubectl-applier Lambda requires broad 

1573 # EKS and Kubernetes API access to apply arbitrary manifests. 

1574 from cdk_nag import NagSuppressions 

1575 

1576 NagSuppressions.add_resource_suppressions( 

1577 kubectl_lambda_role, 

1578 [ 

1579 { 

1580 "id": "AwsSolutions-IAM5", 

1581 "reason": ( 

1582 "The kubectl-applier Lambda requires broad EKS and Kubernetes API " 

1583 "access to apply arbitrary manifests (RBAC, ServiceAccounts, " 

1584 "Deployments, Jobs, NetworkPolicies) across multiple namespaces. " 

1585 "Resource: * is required because the set of Kubernetes resources " 

1586 "is dynamic and not known at synth time." 

1587 ), 

1588 "appliesTo": ["Resource::*"], 

1589 }, 

1590 ], 

1591 apply_to_children=True, 

1592 ) 

1593 

1594 def _apply_kubernetes_manifests(self) -> None: 

1595 """Apply Kubernetes manifests using the kubectl Lambda custom resource. 

1596 

1597 This is called after the ALB security group and EFS are created. 

1598 The Ingress will use the security group ID to create the ALB. 

1599 """ 

1600 

1601 # Get public subnet IDs for Ingress annotation (currently unused but kept for future use) 

1602 # public_subnet_ids = ",".join([subnet.subnet_id for subnet in self.vpc.public_subnets]) 

1603 

1604 # Apply manifests using custom resource 

1605 # Build image replacements dict 

1606 # Include a deployment timestamp to force pod rollouts when code changes 

1607 from datetime import UTC, datetime 

1608 

1609 deployment_timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ") 

1610 

1611 # Get resource thresholds from config 

1612 thresholds = self.config.get_resource_thresholds() 

1613 

1614 # Get manifest processor resource quotas. 

1615 # Resource quotas and the security/image policy now live under the 

1616 # shared job_validation_policy section because both the REST 

1617 # manifest_processor and the SQS queue_processor read them. Service- 

1618 # specific knobs (replicas, validation_enabled, max_request_body_bytes, 

1619 # etc.) stay under manifest_processor. 

1620 mp_config = self.node.try_get_context("manifest_processor") or {} 

1621 job_policy = self.node.try_get_context("job_validation_policy") or {} 

1622 job_quotas = job_policy.get("resource_quotas", {}) 

1623 

1624 image_replacements = { 

1625 "{{HEALTH_MONITOR_IMAGE}}": self.health_monitor_image.image_uri, 

1626 "{{MANIFEST_PROCESSOR_IMAGE}}": self.manifest_processor_image.image_uri, 

1627 "{{INFERENCE_MONITOR_IMAGE}}": self.inference_monitor_image.image_uri, 

1628 "{{CLUSTER_NAME}}": self.cluster.cluster_name, 

1629 "{{REGION}}": self.deployment_region, 

1630 "{{AUTH_SECRET_ARN}}": self.auth_secret_arn, 

1631 "{{SERVICE_ACCOUNT_ROLE_ARN}}": self.service_account_role.role_arn, 

1632 "{{EFS_FILE_SYSTEM_ID}}": self.efs_file_system.file_system_id, 

1633 "{{EFS_ACCESS_POINT_ID}}": self.efs_access_point.access_point_id, 

1634 "{{JOB_QUEUE_URL}}": self.job_queue.queue_url, 

1635 "{{JOB_QUEUE_ARN}}": self.job_queue.queue_arn, 

1636 "{{DEPLOYMENT_TIMESTAMP}}": deployment_timestamp, 

1637 # Resource thresholds 

1638 "{{CPU_THRESHOLD}}": str(thresholds.cpu_threshold), 

1639 "{{MEMORY_THRESHOLD}}": str(thresholds.memory_threshold), 

1640 "{{GPU_THRESHOLD}}": str(thresholds.gpu_threshold), 

1641 "{{PENDING_PODS_THRESHOLD}}": str(thresholds.pending_pods_threshold), 

1642 "{{PENDING_REQUESTED_CPU_VCPUS}}": str(thresholds.pending_requested_cpu_vcpus), 

1643 "{{PENDING_REQUESTED_MEMORY_GB}}": str(thresholds.pending_requested_memory_gb), 

1644 "{{PENDING_REQUESTED_GPUS}}": str(thresholds.pending_requested_gpus), 

1645 # DynamoDB table names (from global stack) 

1646 "{{TEMPLATES_TABLE_NAME}}": f"{self.config.get_project_name()}-job-templates", 

1647 "{{WEBHOOKS_TABLE_NAME}}": f"{self.config.get_project_name()}-webhooks", 

1648 "{{JOBS_TABLE_NAME}}": f"{self.config.get_project_name()}-jobs", 

1649 # DynamoDB region (global stack region, may differ from cluster region) 

1650 "{{DYNAMODB_REGION}}": self.config.get_global_region(), 

1651 # Manifest processor resource quotas (sourced from shared policy). 

1652 "{{MP_MAX_CPU_PER_MANIFEST}}": str(job_quotas.get("max_cpu_per_manifest", "10")), 

1653 "{{MP_MAX_MEMORY_PER_MANIFEST}}": str( 

1654 job_quotas.get("max_memory_per_manifest", "32Gi") 

1655 ), 

1656 "{{MP_MAX_GPU_PER_MANIFEST}}": str(job_quotas.get("max_gpu_per_manifest", 4)), 

1657 # Manifest processor namespace allowlist (sourced from shared policy). 

1658 # Both the REST manifest processor and the SQS queue processor 

1659 # read from job_validation_policy.allowed_namespaces so a single 

1660 # edit takes effect on both submission paths at the next deploy. 

1661 "{{MP_ALLOWED_NAMESPACES}}": ",".join( 

1662 job_policy.get("allowed_namespaces", ["default", "gco-jobs"]) 

1663 ), 

1664 # Manifest processor request body size cap (HTTP 413 middleware). 

1665 # Lives at cdk.json::manifest_processor.max_request_body_bytes. 

1666 "{{MP_MAX_REQUEST_BODY_BYTES}}": str( 

1667 mp_config.get("max_request_body_bytes", 1_048_576) 

1668 ), 

1669 } 
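 # Illustrative cdk.json context for the shared policy keys read above. The key names
 # match the .get() calls in this method; the values shown are examples only, not
 # project defaults:
 #
 #   "job_validation_policy": {
 #     "allowed_namespaces": ["default", "gco-jobs"],
 #     "resource_quotas": {
 #       "max_cpu_per_manifest": "10",
 #       "max_memory_per_manifest": "32Gi",
 #       "max_gpu_per_manifest": 4
 #     }
 #   },
 #   "manifest_processor": {
 #     "max_request_body_bytes": 1048576
 #   }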

1670 

1671 # Add queue processor replacements if enabled 

1672 qp_config = self.node.try_get_context("queue_processor") or {} 

1673 

1674 # Add VPC endpoint CIDR replacements for network policy restrictions 

1675 # Generates a YAML block of ipBlock entries from the vpc_endpoint_cidrs array. 

1676 # The placeholder {{VPC_ENDPOINT_CIDR_BLOCKS}} sits at 8-space indentation in 

1677 # the manifest, so the first entry needs no leading indent (the manifest provides 

1678 # it) and subsequent entries are indented to align. 

1679 vpc_endpoint_cidrs = self.node.try_get_context("vpc_endpoint_cidrs") or ["10.0.0.0/16"] 

1680 cidr_lines = [] 

1681 for i, cidr in enumerate(vpc_endpoint_cidrs): 

1682 prefix = "" if i == 0 else " " 

1683 cidr_lines.append(f'{prefix}- ipBlock:\n cidr: "{cidr}"') 

1684 image_replacements["{{VPC_ENDPOINT_CIDR_BLOCKS}}"] = "\n".join(cidr_lines) 
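 # For example, with vpc_endpoint_cidrs = ["10.0.0.0/16", "10.1.0.0/16"] the placeholder
 # expands to something like the following once the manifest's own 8-space indentation is
 # applied (exact spacing comes from the prefix handling above):
 #
 #         - ipBlock:
 #             cidr: "10.0.0.0/16"
 #         - ipBlock:
 #             cidr: "10.1.0.0/16"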

1685 

1686 # Resource governance for gco-jobs namespace: ResourceQuota caps aggregate 

1687 # resource consumption across the namespace, LimitRange caps per-container 

1688 # maxima. Values come from cdk.json `resource_quota` context with defaults 

1689 # sized for a modest multi-tenant dev cluster. 

1690 resource_quota = self.node.try_get_context("resource_quota") or {} 

1691 image_replacements["{{QUOTA_MAX_CPU}}"] = str(resource_quota.get("max_cpu", "100")) 

1692 image_replacements["{{QUOTA_MAX_MEMORY}}"] = str(resource_quota.get("max_memory", "512Gi")) 

1693 image_replacements["{{QUOTA_MAX_GPU}}"] = str(resource_quota.get("max_gpu", "32")) 

1694 image_replacements["{{QUOTA_MAX_PODS}}"] = str(resource_quota.get("max_pods", "50")) 

1695 image_replacements["{{LIMIT_MAX_CPU}}"] = str(resource_quota.get("container_max_cpu", "10")) 

1696 image_replacements["{{LIMIT_MAX_MEMORY}}"] = str( 

1697 resource_quota.get("container_max_memory", "64Gi") 

1698 ) 

1699 image_replacements["{{LIMIT_MAX_GPU}}"] = str(resource_quota.get("container_max_gpu", "4")) 
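 # Illustrative cdk.json `resource_quota` block; the key names are the ones read above
 # and the values are only an example sizing:
 #
 #   "resource_quota": {
 #     "max_cpu": "200", "max_memory": "1024Gi", "max_gpu": "64", "max_pods": "100",
 #     "container_max_cpu": "16", "container_max_memory": "128Gi", "container_max_gpu": "8"
 #   }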

1700 

1701 if self.queue_processor_enabled: 1701 ↛ 1775: line 1701 didn't jump to line 1775 because the condition on line 1701 was always true

1702 image_replacements["{{QUEUE_PROCESSOR_IMAGE}}"] = self.queue_processor_image.image_uri 

1703 image_replacements["{{QP_POLLING_INTERVAL}}"] = str( 

1704 qp_config.get("polling_interval", 10) 

1705 ) 

1706 image_replacements["{{QP_MAX_CONCURRENT_JOBS}}"] = str( 

1707 qp_config.get("max_concurrent_jobs", 10) 

1708 ) 

1709 image_replacements["{{QP_MESSAGES_PER_JOB}}"] = str( 

1710 qp_config.get("messages_per_job", 1) 

1711 ) 

1712 image_replacements["{{QP_SUCCESSFUL_JOBS_HISTORY}}"] = str( 

1713 qp_config.get("successful_jobs_history", 20) 

1714 ) 

1715 image_replacements["{{QP_FAILED_JOBS_HISTORY}}"] = str( 

1716 qp_config.get("failed_jobs_history", 10) 

1717 ) 

1718 image_replacements["{{QP_ALLOWED_NAMESPACES}}"] = ",".join( 

1719 job_policy.get("allowed_namespaces", ["default", "gco-jobs"]) 

1720 ) 

1721 # Resource caps, image allowlist, and security policy are shared 

1722 # with the REST manifest processor. Source them from the 

1723 # job_validation_policy section so a single change in cdk.json 

1724 # takes effect on both submission paths at the next deploy. 

1725 image_replacements["{{QP_MAX_GPU_PER_MANIFEST}}"] = str( 

1726 job_quotas.get("max_gpu_per_manifest", 4) 

1727 ) 

1728 image_replacements["{{QP_MAX_CPU_PER_MANIFEST}}"] = str( 

1729 job_quotas.get("max_cpu_per_manifest", "10") 

1730 ) 

1731 image_replacements["{{QP_MAX_MEMORY_PER_MANIFEST}}"] = str( 

1732 job_quotas.get("max_memory_per_manifest", "32Gi") 

1733 ) 

1734 image_replacements["{{QP_TRUSTED_REGISTRIES}}"] = ",".join( 

1735 job_policy.get("trusted_registries", []) 

1736 ) 

1737 image_replacements["{{QP_TRUSTED_DOCKERHUB_ORGS}}"] = ",".join( 

1738 job_policy.get("trusted_dockerhub_orgs", []) 

1739 ) 

1740 

1741 # Security policy toggles — shared with the REST manifest_processor. 

1742 # Both services read the same cdk.json section so a single policy 

1743 # flip (e.g. block_run_as_root: true) takes effect on both paths. 

1744 security_policy = job_policy.get("manifest_security_policy", {}) 

1745 

1746 def _policy_str(v: object) -> str: 

1747 return "true" if v else "false" 

1748 

1749 image_replacements["{{QP_BLOCK_PRIVILEGED}}"] = _policy_str( 

1750 security_policy.get("block_privileged", True) 

1751 ) 

1752 image_replacements["{{QP_BLOCK_PRIVILEGE_ESCALATION}}"] = _policy_str( 

1753 security_policy.get("block_privilege_escalation", True) 

1754 ) 

1755 image_replacements["{{QP_BLOCK_HOST_NETWORK}}"] = _policy_str( 

1756 security_policy.get("block_host_network", True) 

1757 ) 

1758 image_replacements["{{QP_BLOCK_HOST_PID}}"] = _policy_str( 

1759 security_policy.get("block_host_pid", True) 

1760 ) 

1761 image_replacements["{{QP_BLOCK_HOST_IPC}}"] = _policy_str( 

1762 security_policy.get("block_host_ipc", True) 

1763 ) 

1764 image_replacements["{{QP_BLOCK_HOST_PATH}}"] = _policy_str( 

1765 security_policy.get("block_host_path", True) 

1766 ) 

1767 image_replacements["{{QP_BLOCK_ADDED_CAPABILITIES}}"] = _policy_str( 

1768 security_policy.get("block_added_capabilities", True) 

1769 ) 

1770 image_replacements["{{QP_BLOCK_RUN_AS_ROOT}}"] = _policy_str( 

1771 security_policy.get("block_run_as_root", False) 

1772 ) 
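 # Illustrative `manifest_security_policy` section under job_validation_policy; each key
 # maps to one of the toggles rendered above, and the values shown are just an example
 # (the in-code default for block_run_as_root is false):
 #
 #   "manifest_security_policy": {
 #     "block_privileged": true,
 #     "block_privilege_escalation": true,
 #     "block_host_network": true,
 #     "block_host_pid": true,
 #     "block_host_ipc": true,
 #     "block_host_path": true,
 #     "block_added_capabilities": true,
 #     "block_run_as_root": true
 #   }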

1773 

1774 # Add Valkey endpoint if enabled 

1775 if hasattr(self, "valkey_cache") and self.valkey_cache: 1775 ↛ 1776: line 1775 didn't jump to line 1776 because the condition on line 1775 was never true

1776 image_replacements["{{VALKEY_ENDPOINT}}"] = self.valkey_cache.attr_endpoint_address 

1777 image_replacements["{{VALKEY_PORT}}"] = self.valkey_cache.attr_endpoint_port 

1778 

1779 # Add Aurora pgvector endpoint if enabled 

1780 if hasattr(self, "aurora_cluster") and self.aurora_cluster: 

1781 image_replacements["{{AURORA_PGVECTOR_ENDPOINT}}"] = ( 

1782 self.aurora_cluster.cluster_endpoint.hostname 

1783 ) 

1784 image_replacements["{{AURORA_PGVECTOR_READER_ENDPOINT}}"] = ( 

1785 self.aurora_cluster.cluster_read_endpoint.hostname 

1786 ) 

1787 image_replacements["{{AURORA_PGVECTOR_PORT}}"] = str( 

1788 self.aurora_cluster.cluster_endpoint.port 

1789 ) 

1790 if self.aurora_cluster.secret: 1790 ↛ 1796: line 1790 didn't jump to line 1796 because the condition on line 1790 was always true

1791 image_replacements["{{AURORA_PGVECTOR_SECRET_ARN}}"] = ( 

1792 self.aurora_cluster.secret.secret_arn 

1793 ) 

1794 

1795 # Add FSx replacements if enabled 

1796 if self.fsx_file_system: 

1797 image_replacements["{{FSX_FILE_SYSTEM_ID}}"] = self.fsx_file_system.ref 

1798 image_replacements["{{FSX_DNS_NAME}}"] = self.fsx_file_system.attr_dns_name 

1799 image_replacements["{{FSX_MOUNT_NAME}}"] = self.fsx_file_system.attr_lustre_mount_name 

1800 image_replacements["{{PRIVATE_SUBNET_ID}}"] = self.vpc.private_subnets[0].subnet_id 

1801 image_replacements["{{FSX_SECURITY_GROUP_ID}}"] = ( 

1802 self.fsx_security_group.security_group_id 

1803 ) 

1804 

1805 kubectl_apply = CustomResource( 

1806 self, 

1807 "KubectlApplyManifests", 

1808 service_token=self.kubectl_provider.service_token, 

1809 properties={ 

1810 "ClusterName": self.cluster.cluster_name, 

1811 "Region": self.deployment_region, 

1812 "SkipDeletionOnStackDelete": "true", # Don't delete resources on stack deletion 

1813 "ImageReplacements": image_replacements, 

1814 # Include FSx file system ID directly to force update when FSx changes 

1815 "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none", 

1816 # Force update on each deployment to trigger pod rollouts 

1817 "DeploymentTimestamp": deployment_timestamp, 

1818 }, 

1819 ) 

1820 

1821 # Ensure manifests are applied after cluster, EFS, and FSx are ready 

1822 # Note: ALB is created by EKS Auto Mode when Ingress is applied 

1823 kubectl_apply.node.add_dependency(self.cluster) 

1824 kubectl_apply.node.add_dependency(self.efs_file_system) 

1825 if self.fsx_file_system: 

1826 kubectl_apply.node.add_dependency(self.fsx_file_system) 

1827 

1828 # Wait for EKS to have patched the IRSA role ARN onto each managed 

1829 # addon's service account before the kubectl Lambda rollout-restarts 

1830 # the controllers at the end of this invocation. Otherwise the 

1831 # restart sees the old (annotation-less) SA, the mutating webhook 

1832 # can't inject AWS_ROLE_ARN, and the new pods are just as 

1833 # credential-less as the ones they replaced. The symptom is 

1834 # controller pods silently failing with "no EC2 IMDS role found" — 

1835 # for EFS/FSx that manifests as PVCs stuck Pending forever, for 

1836 # CloudWatch as missing Container Insights metrics. See the 

1837 # UpdateEfsCsiAddonRole custom resource in _create_efs_csi_driver_addon 

1838 # for the full rationale. 

1839 for attr in ( 

1840 "_efs_csi_addon_role_update", 

1841 "_fsx_csi_addon_role_update", 

1842 "_cloudwatch_addon_role_update", 

1843 ): 

1844 update_cr = getattr(self, attr, None) 

1845 if update_cr is not None: 

1846 kubectl_apply.node.add_dependency(update_cr) 

1847 

1848 # Ensure Pod Identity associations exist before workloads start, 

1849 # so pods get IAM credentials on first launch 

1850 for assoc in self._pod_identity_associations: 

1851 kubectl_apply.node.add_dependency(assoc) 

1852 

1853 # Install Helm charts (KEDA, etc.) after base manifests are applied 

1854 # This ensures namespaces and RBAC are in place before Helm installations 

1855 helm_install = CustomResource( 

1856 self, 

1857 "HelmInstallCharts", 

1858 service_token=self.helm_installer_provider.service_token, 

1859 properties={ 

1860 "ClusterName": self.cluster.cluster_name, 

1861 "Region": self.deployment_region, 

1862 # Enable core AI/ML infrastructure charts by default 

1863 # NVIDIA Network Operator toggled via cdk.json nvidia_network_operator.enabled 

1864 "EnabledCharts": self._get_enabled_helm_charts(), 

1865 # Override chart values if needed 

1866 "Charts": {}, 

1867 # Pass IAM role ARNs for service account annotations 

1868 "KedaOperatorRoleArn": self.keda_operator_role.role_arn, 

1869 # Force re-invocation on every deployment to pick up charts.yaml changes 

1870 "DeploymentTimestamp": deployment_timestamp, 

1871 }, 

1872 ) 

1873 

1874 # Helm charts depend on kubectl manifests being applied first 

1875 helm_install.node.add_dependency(kubectl_apply) 

1876 

1877 # Apply CRD-dependent manifests after Helm installs the CRDs. 

1878 # KEDA ScaledJob/ScaledObject require the KEDA CRDs to exist first. 

1879 # This second kubectl pass runs after Helm and applies only those resources. 

1880 kubectl_apply_post_helm = CustomResource( 

1881 self, 

1882 "KubectlApplyPostHelmManifests", 

1883 service_token=self.kubectl_provider.service_token, 

1884 properties={ 

1885 "ClusterName": self.cluster.cluster_name, 

1886 "Region": self.deployment_region, 

1887 "SkipDeletionOnStackDelete": "true", 

1888 "ImageReplacements": image_replacements, 

1889 "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none", 

1890 "DeploymentTimestamp": deployment_timestamp, 

1891 # PostHelm: "true" tells the handler to apply only post-helm-* manifests 

1892 "PostHelm": "true", 

1893 }, 

1894 ) 

1895 

1896 # Must run after Helm has installed the CRDs 

1897 kubectl_apply_post_helm.node.add_dependency(helm_install) 

1898 

1899 # Create GA registration custom resource AFTER manifests are applied 

1900 # This waits for the Ingress to create the ALB and registers it with GA 

1901 # 

1902 # IMPORTANT: We include a deployment timestamp to force CloudFormation to 

1903 # re-invoke the Lambda on every deployment. This ensures the ALB is always 

1904 # registered with the Global Accelerator, even if other properties haven't changed. 

1905 # Without this, CloudFormation may skip the custom resource if it thinks 

1906 # nothing has changed, leaving the ALB unregistered after GA recreation. 

1907 deployment_timestamp = str(int(time.time())) 

1908 

1909 ga_registration = CustomResource( 

1910 self, 

1911 "GaRegistration", 

1912 service_token=self.ga_registration_provider.service_token, 

1913 properties={ 

1914 "ClusterName": self.cluster.cluster_name, 

1915 "Region": self.deployment_region, 

1916 "EndpointGroupArn": self.endpoint_group_arn, 

1917 "IngressName": "gco-ingress", 

1918 "Namespace": "gco-system", 

1919 # Pass global region and project name for SSM storage 

1920 "GlobalRegion": self.config.get_global_region(), 

1921 "ProjectName": self.config.get_project_name(), 

1922 # Force re-invocation on every deployment 

1923 "DeploymentTimestamp": deployment_timestamp, 

1924 }, 

1925 ) 

1926 

1927 # GA registration must happen after manifests are applied 

1928 ga_registration.node.add_dependency(kubectl_apply) 
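 # Ordering recap for the custom resources wired above (A -> B means A must complete
 # before B starts):
 #   Pod Identity associations, addon IRSA role updates, EFS/FSx -> KubectlApplyManifests
 #   KubectlApplyManifests -> HelmInstallCharts -> KubectlApplyPostHelmManifests
 #   KubectlApplyManifests -> GaRegistration (waits for the Ingress-created ALB)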

1929 

1930 def _create_ga_registration_lambda(self) -> None: 

1931 """Create Lambda function to register Ingress-created ALB with Global Accelerator. 

1932 

1933 This Lambda: 

1934 1. Waits for the Ingress to get an ALB address 

1935 2. Gets the ALB ARN from the address 

1936 3. Registers that ALB with Global Accelerator 

1937 

1938 This is necessary because the ALB is created by the AWS Load Balancer Controller 

1939 (not CDK), so we can't directly reference its ARN. 

1940 """ 

1941 project_name = self.config.get_project_name() 

1942 

1943 # Create Lambda function for GA registration using external handler 

1944 ga_registration_lambda = lambda_.Function( 

1945 self, 

1946 "GaRegistrationFunction", 

1947 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME), 

1948 handler="handler.lambda_handler", 

1949 code=lambda_.Code.from_asset("lambda/ga-registration"), 

1950 timeout=Duration.minutes(15), # Max Lambda timeout; handler uses 14 min budget 

1951 memory_size=256, 

1952 vpc=self.vpc, 

1953 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS), 

1954 environment={ 

1955 "CLUSTER_NAME": self.cluster.cluster_name, 

1956 "REGION": self.deployment_region, 

1957 }, 

1958 tracing=lambda_.Tracing.ACTIVE, 

1959 ) 

1960 

1961 # Grant permissions 

1962 ga_registration_lambda.add_to_role_policy( 

1963 iam.PolicyStatement( 

1964 effect=iam.Effect.ALLOW, 

1965 actions=["eks:DescribeCluster"], 

1966 resources=[self.cluster.cluster_arn], 

1967 ) 

1968 ) 

1969 ga_registration_lambda.add_to_role_policy( 

1970 iam.PolicyStatement( 

1971 effect=iam.Effect.ALLOW, 

1972 actions=[ 

1973 "elasticloadbalancing:DescribeLoadBalancers", 

1974 "elasticloadbalancing:DescribeTags", # Required for tag-based ALB detection 

1975 ], 

1976 resources=["*"], 

1977 ) 

1978 ) 

1979 ga_registration_lambda.add_to_role_policy( 

1980 iam.PolicyStatement( 

1981 effect=iam.Effect.ALLOW, 

1982 actions=[ 

1983 "globalaccelerator:AddEndpoints", 

1984 "globalaccelerator:RemoveEndpoints", 

1985 "globalaccelerator:UpdateEndpointGroup", 

1986 "globalaccelerator:DescribeEndpointGroup", 

1987 ], 

1988 resources=["*"], 

1989 ) 

1990 ) 

1991 ga_registration_lambda.add_to_role_policy( 

1992 iam.PolicyStatement( 

1993 effect=iam.Effect.ALLOW, 

1994 actions=["ssm:GetParameter", "ssm:PutParameter", "ssm:DeleteParameter"], 

1995 resources=[ 

1996 f"arn:aws:ssm:{self.config.get_global_region()}:{self.account}:parameter/{project_name}/*" 

1997 ], 

1998 ) 

1999 ) 

2000 

2001 # Add EKS access entry for the Lambda role 

2002 if ga_registration_lambda.role is not None: 2002 ↛ 2016: line 2002 didn't jump to line 2016 because the condition on line 2002 was always true

2003 eks.AccessEntry( 

2004 self, 

2005 "GaRegistrationLambdaAccessEntry", 

2006 cluster=self.cluster, # type: ignore[arg-type] 

2007 principal=ga_registration_lambda.role.role_arn, 

2008 access_policies=[ 

2009 eks.AccessPolicy.from_access_policy_name( 

2010 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER 

2011 ) 

2012 ], 

2013 ) 

2014 

2015 # Allow Lambda to access EKS API 

2016 self.cluster.cluster_security_group.add_ingress_rule( 

2017 peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block), 

2018 connection=ec2.Port.tcp(443), 

2019 description="Allow GA registration Lambda to access EKS API", 

2020 ) 

2021 

2022 # Get endpoint group ARN from SSM (stored in global region). 

2023 # Uses the shared AwsCustomResource execution role (pre-created in 

2024 # _create_aws_custom_resource_role) — the SSM GetParameter 

2025 # statement was attached there up-front so the Lambda never hits 

2026 # an IAM propagation race on cold deploys. 

2027 global_region = self.config.get_global_region() 

2028 get_endpoint_group_arn = cr.AwsCustomResource( 

2029 self, 

2030 "GetEndpointGroupArn", 

2031 on_create=cr.AwsSdkCall( 

2032 service="SSM", 

2033 action="getParameter", 

2034 parameters={"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"}, 

2035 region=global_region, 

2036 physical_resource_id=cr.PhysicalResourceId.of( 

2037 f"{project_name}-get-endpoint-group-arn-{self.deployment_region}" 

2038 ), 

2039 ), 

2040 on_update=cr.AwsSdkCall( 

2041 service="SSM", 

2042 action="getParameter", 

2043 parameters={"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"}, 

2044 region=global_region, 

2045 ), 

2046 role=self.aws_custom_resource_role, 

2047 ) 

2048 get_endpoint_group_arn.node.add_dependency(self.aws_custom_resource_role) 

2049 

2050 endpoint_group_arn = get_endpoint_group_arn.get_response_field("Parameter.Value") 

2051 

2052 # Create log group for GA registration provider 

2053 ga_provider_log_group = logs.LogGroup( 

2054 self, 

2055 "GaRegistrationProviderLogGroup", 

2056 retention=logs.RetentionDays.ONE_WEEK, 

2057 removal_policy=RemovalPolicy.DESTROY, 

2058 ) 

2059 

2060 # Create provider and custom resource 

2061 ga_provider = cr.Provider( 

2062 self, 

2063 "GaRegistrationProvider", 

2064 on_event_handler=ga_registration_lambda, 

2065 log_group=ga_provider_log_group, 

2066 ) 

2067 

2068 # Store for use after kubectl apply 

2069 self.ga_registration_provider = ga_provider 

2070 self.endpoint_group_arn = endpoint_group_arn 

2071 

2072 # cdk-nag suppression: the GA registration Lambda needs broad 

2073 # Global Accelerator and ELB Describe access with Resource: *. 

2074 from cdk_nag import NagSuppressions 

2075 

2076 NagSuppressions.add_resource_suppressions( 

2077 ga_registration_lambda, 

2078 [ 

2079 { 

2080 "id": "AwsSolutions-IAM5", 

2081 "reason": ( 

2082 "The GA registration Lambda needs elasticloadbalancing:Describe* " 

2083 "and globalaccelerator:* to discover the Ingress-created ALB and " 

2084 "register it with Global Accelerator. These APIs do not support " 

2085 "resource-level IAM scoping — Resource: * is the only valid form." 

2086 ), 

2087 "appliesTo": ["Resource::*"], 

2088 }, 

2089 ], 

2090 apply_to_children=True, 

2091 ) 

2092 

2093 def _get_enabled_helm_charts(self) -> list[str]: 

2094 """Return the list of Helm charts to install based on cdk.json helm config. 

2095 

2096 Reads the 'helm' section from cdk.json context. Each key maps to one or 

2097 more Helm chart names. Charts are returned in dependency order with Kueue 

2098 last (its webhook intercepts all Job/Deployment mutations). 

2099 """ 

2100 helm_config = self.node.try_get_context("helm") or {} 

2101 

2102 # Mapping from cdk.json helm key → Helm chart name(s) in charts.yaml 

2103 # Order matters: dependencies first, Kueue last 

2104 chart_map: list[tuple[str, list[str]]] = [ 

2105 ("keda", ["keda"]), 

2106 ("nvidia_gpu_operator", ["nvidia-gpu-operator"]), 

2107 ("nvidia_dra_driver", ["nvidia-dra-driver"]), 

2108 ("nvidia_network_operator", ["nvidia-network-operator"]), 

2109 ("aws_efa_device_plugin", ["aws-efa-device-plugin"]), 

2110 ("aws_neuron_device_plugin", ["aws-neuron-device-plugin"]), 

2111 ("volcano", ["volcano"]), 

2112 ("kuberay", ["kuberay-operator"]), 

2113 ("cert_manager", ["cert-manager"]), 

2114 ("slurm", ["slinky-slurm-operator", "slinky-slurm"]), 

2115 ("yunikorn", ["yunikorn"]), 

2116 ("kueue", ["kueue"]), # Must be last 

2117 ] 

2118 

2119 enabled_charts = [] 

2120 for config_key, chart_names in chart_map: 

2121 chart_config = helm_config.get(config_key, {}) 

2122 if chart_config.get("enabled", True): 2122 ↛ 2120: line 2122 didn't jump to line 2120 because the condition on line 2122 was always true

2123 enabled_charts.extend(chart_names) 

2124 

2125 return enabled_charts 
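 # Illustrative cdk.json `helm` section; any key omitted defaults to enabled. For
 # example, disabling Volcano and the Slurm charts:
 #
 #   "helm": {
 #     "volcano": {"enabled": false},
 #     "slurm": {"enabled": false}
 #   }
 #
 # With that context this method returns every chart in chart_map except "volcano",
 # "slinky-slurm-operator", and "slinky-slurm", still ending with "kueue".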

2126 

2127 def _create_helm_installer_lambda(self) -> None: 

2128 """Create Lambda function to install Helm charts (KEDA, NVIDIA DRA, etc.). 

2129 

2130 This Lambda uses Helm to install charts that require complex setup 

2131 (TLS certificates, CRDs, etc.) that are difficult to manage via raw manifests. 

2132 

2133 Charts installed: 

2134 - KEDA: Kubernetes Event-Driven Autoscaling (enabled by default) 

2135 - NVIDIA DRA Driver: Dynamic Resource Allocation for GPUs (disabled by default) 

2136 """ 

2137 project_name = self.config.get_project_name() 

2138 

2139 # Create IAM role for Helm installer Lambda 

2140 helm_lambda_role = iam.Role( 

2141 self, 

2142 "HelmInstallerLambdaRole", 

2143 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), 

2144 managed_policies=[ 

2145 iam.ManagedPolicy.from_aws_managed_policy_name( 

2146 "service-role/AWSLambdaVPCAccessExecutionRole" 

2147 ), 

2148 iam.ManagedPolicy.from_aws_managed_policy_name( 

2149 "service-role/AWSLambdaBasicExecutionRole" 

2150 ), 

2151 ], 

2152 ) 

2153 

2154 # Add EKS permissions 

2155 helm_lambda_role.add_to_policy( 

2156 iam.PolicyStatement( 

2157 actions=["eks:DescribeCluster", "eks:ListClusters"], 

2158 resources=[self.cluster.cluster_arn], 

2159 ) 

2160 ) 

2161 

2162 # Create security group for Helm installer Lambda 

2163 helm_lambda_sg = ec2.SecurityGroup( 

2164 self, 

2165 "HelmInstallerLambdaSG", 

2166 vpc=self.vpc, 

2167 description="Security group for Helm installer Lambda to access EKS cluster", 

2168 security_group_name=f"{project_name}-helm-lambda-sg-{self.deployment_region}", 

2169 allow_all_outbound=True, 

2170 ) 

2171 

2172 # Allow Lambda to access EKS cluster API 

2173 self.cluster.cluster_security_group.add_ingress_rule( 

2174 peer=helm_lambda_sg, 

2175 connection=ec2.Port.tcp(443), 

2176 description="Allow Helm installer Lambda to access EKS API", 

2177 ) 

2178 

2179 # Build Docker image for Helm installer Lambda 

2180 # Points at helm-installer-build/ which is rebuilt fresh every deploy 

2181 # by _build_helm_installer_lambda() in cli/stacks.py 

2182 ecr_assets.DockerImageAsset( 

2183 self, 

2184 "HelmInstallerImage", 

2185 directory="lambda/helm-installer-build", 

2186 platform=ecr_assets.Platform.LINUX_AMD64, 

2187 ) 

2188 

2189 # Create Lambda function using Docker image 

2190 # Store function name as string attribute for cross-stack references 

2191 # This avoids CDK cross-environment resolution issues when account is unresolved 

2192 self.helm_installer_lambda_function_name = f"{project_name}-helm-{self.deployment_region}" 

2193 self.helm_installer_lambda = lambda_.DockerImageFunction( 

2194 self, 

2195 "HelmInstallerFunction", 

2196 function_name=self.helm_installer_lambda_function_name, 

2197 code=lambda_.DockerImageCode.from_image_asset( 

2198 directory="lambda/helm-installer-build", 

2199 platform=ecr_assets.Platform.LINUX_AMD64, 

2200 ), 

2201 timeout=Duration.minutes(15), 

2202 memory_size=1024, 

2203 architecture=lambda_.Architecture.X86_64, 

2204 role=helm_lambda_role, 

2205 vpc=self.vpc, 

2206 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS), 

2207 security_groups=[helm_lambda_sg], 

2208 environment={ 

2209 "CLUSTER_NAME": self.cluster.cluster_name, 

2210 "REGION": self.deployment_region, 

2211 }, 

2212 tracing=lambda_.Tracing.ACTIVE, 

2213 ) 

2214 

2215 # Add EKS access entry for the Lambda role 

2216 eks.AccessEntry( 

2217 self, 

2218 "HelmInstallerLambdaAccessEntry", 

2219 cluster=self.cluster, # type: ignore[arg-type] 

2220 principal=helm_lambda_role.role_arn, 

2221 access_policies=[ 

2222 eks.AccessPolicy.from_access_policy_name( 

2223 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER 

2224 ) 

2225 ], 

2226 ) 

2227 

2228 # Create log group for Helm installer provider 

2229 helm_provider_log_group = logs.LogGroup( 

2230 self, 

2231 "HelmInstallerProviderLogGroup", 

2232 retention=logs.RetentionDays.ONE_WEEK, 

2233 removal_policy=RemovalPolicy.DESTROY, 

2234 ) 

2235 

2236 # Create custom resource provider 

2237 self.helm_installer_provider = cr.Provider( 

2238 self, 

2239 "HelmInstallerProvider", 

2240 on_event_handler=self.helm_installer_lambda, 

2241 log_group=helm_provider_log_group, 

2242 ) 

2243 

2244 # cdk-nag suppression: the Helm installer Lambda requires broad 

2245 # EKS and Kubernetes API access to install Helm charts. 

2246 from cdk_nag import NagSuppressions 

2247 

2248 NagSuppressions.add_resource_suppressions( 

2249 helm_lambda_role, 

2250 [ 

2251 { 

2252 "id": "AwsSolutions-IAM5", 

2253 "reason": ( 

2254 "The Helm installer Lambda requires broad EKS and Kubernetes API " 

2255 "access to install Helm charts (KEDA, NVIDIA DRA, etc.) that create " 

2256 "CRDs, RBAC rules, and workloads across multiple namespaces. " 

2257 "Resource: * is required because the set of Kubernetes resources " 

2258 "is dynamic and not known at synth time." 

2259 ), 

2260 "appliesTo": ["Resource::*"], 

2261 }, 

2262 ], 

2263 apply_to_children=True, 

2264 ) 

2265 

2266 def _create_efs(self) -> None: 

2267 """Create EFS file system for shared storage across jobs. 

2268 

2269 Creates an EFS file system with mount targets in each private subnet, 

2270 allowing pods to share data and persist outputs. The EFS is configured 

2271 with: 

2272 - Encryption at rest 

2273 - Automatic backups 

2274 - General Purpose performance mode (suitable for most workloads) 

2275 - Bursting throughput mode 

2276 

2277 Kubernetes resources (StorageClass, PV, PVC) are created via manifests. 

2278 """ 

2279 project_name = self.config.get_project_name() 

2280 

2281 # Create security group for EFS 

2282 self.efs_security_group = ec2.SecurityGroup( 

2283 self, 

2284 "EfsSecurityGroup", 

2285 vpc=self.vpc, 

2286 description=f"Security group for {project_name} EFS in {self.deployment_region}", 

2287 security_group_name=f"{project_name}-efs-sg-{self.deployment_region}", 

2288 allow_all_outbound=False, # EFS doesn't need outbound 

2289 ) 

2290 

2291 # Allow NFS traffic from EKS cluster security group 

2292 self.efs_security_group.add_ingress_rule( 

2293 peer=self.cluster.cluster_security_group, 

2294 connection=ec2.Port.tcp(2049), 

2295 description="Allow NFS from EKS cluster", 

2296 ) 

2297 

2298 # Create EFS file system 

2299 self.efs_file_system = efs.FileSystem( 

2300 self, 

2301 "GCOEfs", 

2302 vpc=self.vpc, 

2303 file_system_name=f"{project_name}-efs-{self.deployment_region}", 

2304 security_group=self.efs_security_group, 

2305 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS), 

2306 encrypted=True, 

2307 performance_mode=efs.PerformanceMode.GENERAL_PURPOSE, 

2308 throughput_mode=efs.ThroughputMode.BURSTING, 

2309 removal_policy=RemovalPolicy.DESTROY, # For dev/test; use RETAIN for production 

2310 enable_automatic_backups=True, 

2311 ) 

2312 

2313 # Add file system policy to allow mounting without IAM authorization 

2314 # This allows any client that can reach the mount target to mount the file system 

2315 self.efs_file_system.add_to_resource_policy( 

2316 iam.PolicyStatement( 

2317 effect=iam.Effect.ALLOW, 

2318 principals=[iam.AnyPrincipal()], 

2319 actions=[ 

2320 "elasticfilesystem:ClientMount", 

2321 "elasticfilesystem:ClientWrite", 

2322 "elasticfilesystem:ClientRootAccess", 

2323 ], 

2324 conditions={"Bool": {"elasticfilesystem:AccessedViaMountTarget": "true"}}, 

2325 ) 

2326 ) 

2327 

2328 # Create access point for the gco-jobs directory 

2329 self.efs_access_point = self.efs_file_system.add_access_point( 

2330 "JobsAccessPoint", 

2331 path="/gco-jobs", 

2332 create_acl=efs.Acl(owner_uid="1000", owner_gid="1000", permissions="755"), 

2333 posix_user=efs.PosixUser(uid="1000", gid="1000"), 

2334 ) 

2335 

2336 # Output EFS information 

2337 CfnOutput( 

2338 self, 

2339 "EfsFileSystemId", 

2340 value=self.efs_file_system.file_system_id, 

2341 description="EFS File System ID for shared job storage", 

2342 ) 

2343 

2344 CfnOutput( 

2345 self, 

2346 "EfsAccessPointId", 

2347 value=self.efs_access_point.access_point_id, 

2348 description="EFS Access Point ID for job outputs", 

2349 ) 
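 # The StorageClass/PV/PVC themselves live in the kubectl-applier manifests, not in this
 # stack. As a rough, abridged sketch (not the project's actual manifest), a statically
 # provisioned EFS CSI PersistentVolume wired to the IDs exported above would look
 # roughly like:
 #
 #   apiVersion: v1
 #   kind: PersistentVolume
 #   metadata: {name: gco-efs-pv}          # name is purely illustrative
 #   spec:
 #     capacity: {storage: 100Gi}
 #     accessModes: [ReadWriteMany]
 #     csi:
 #       driver: efs.csi.aws.com
 #       volumeHandle: "<EFS_FILE_SYSTEM_ID>::<EFS_ACCESS_POINT_ID>"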

2350 

2351 def _create_fsx_lustre(self) -> None: 

2352 """Create FSx for Lustre file system for high-performance storage. 

2353 

2354 FSx for Lustre provides high-performance parallel file system storage 

2355 ideal for ML training workloads that require high throughput and low latency. 

2356 

2357 This is optional and controlled by the fsx_lustre.enabled config setting. 

2358 

2359 Supported deployment types: 

2360 - SCRATCH_1: Temporary storage, no data replication 

2361 - SCRATCH_2: Temporary storage with better burst performance 

2362 - PERSISTENT_1: Persistent storage with data replication 

2363 - PERSISTENT_2: Latest persistent storage with higher throughput 

2364 """ 

2365 fsx_config = self.config.get_fsx_lustre_config(self.deployment_region) 

2366 

2367 if not fsx_config.get("enabled", False): 

2368 self.fsx_file_system = None 

2369 return 

2370 

2371 project_name = self.config.get_project_name() 

2372 

2373 # Create security group for FSx 

2374 self.fsx_security_group = ec2.SecurityGroup( 

2375 self, 

2376 "FsxSecurityGroup", 

2377 vpc=self.vpc, 

2378 description=f"Security group for {project_name} FSx Lustre in {self.deployment_region}", 

2379 security_group_name=f"{project_name}-fsx-sg-{self.deployment_region}", 

2380 allow_all_outbound=False, 

2381 ) 

2382 

2383 # Allow Lustre traffic from EKS cluster security group 

2384 # Lustre uses ports 988 (control) and 1021-1023 (data) 

2385 self.fsx_security_group.add_ingress_rule( 

2386 peer=self.cluster.cluster_security_group, 

2387 connection=ec2.Port.tcp(988), 

2388 description="Allow Lustre control traffic from EKS cluster", 

2389 ) 

2390 self.fsx_security_group.add_ingress_rule( 

2391 peer=self.cluster.cluster_security_group, 

2392 connection=ec2.Port.tcp_range(1021, 1023), 

2393 description="Allow Lustre data traffic from EKS cluster", 

2394 ) 

2395 

2396 # Allow self-referencing traffic for FSx Lustre internal communication 

2397 # FSx Lustre nodes need to communicate with each other on port 988 

2398 self.fsx_security_group.add_ingress_rule( 

2399 peer=self.fsx_security_group, 

2400 connection=ec2.Port.tcp(988), 

2401 description="Allow Lustre internal traffic on port 988", 

2402 ) 

2403 self.fsx_security_group.add_ingress_rule( 

2404 peer=self.fsx_security_group, 

2405 connection=ec2.Port.tcp_range(1021, 1023), 

2406 description="Allow Lustre internal traffic on ports 1021-1023", 

2407 ) 

2408 

2409 # Get deployment type 

2410 deployment_type = fsx_config.get("deployment_type", "SCRATCH_2") 

2411 storage_capacity = fsx_config.get("storage_capacity_gib", 1200) 

2412 

2413 # Build Lustre configuration based on deployment type 

2414 lustre_config = { 

2415 "deploymentType": deployment_type, 

2416 "dataCompressionType": fsx_config.get("data_compression_type", "LZ4"), 

2417 } 

2418 

2419 # Add throughput for PERSISTENT types 

2420 if deployment_type.startswith("PERSISTENT"): 

2421 lustre_config["perUnitStorageThroughput"] = fsx_config.get( 

2422 "per_unit_storage_throughput", 200 

2423 ) 

2424 

2425 # Add S3 import/export if configured 

2426 import_path = fsx_config.get("import_path") 

2427 export_path = fsx_config.get("export_path") 

2428 

2429 if import_path: 

2430 lustre_config["importPath"] = import_path 

2431 lustre_config["autoImportPolicy"] = fsx_config.get( 

2432 "auto_import_policy", "NEW_CHANGED_DELETED" 

2433 ) 

2434 

2435 if export_path: 

2436 lustre_config["exportPath"] = export_path 

2437 

2438 # Get file system type version (default to 2.15 for kernel 6.x compatibility) 

2439 # IMPORTANT: Lustre 2.10 is NOT compatible with kernel 6.x (AL2023, Bottlerocket 1.19+) 

2440 # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/lustre-client-matrix.html 

2441 file_system_type_version = fsx_config.get("file_system_type_version", "2.15") 
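 # Illustrative fsx_lustre settings; the keys mirror the fsx_config.get() calls above
 # (how they nest in cdk.json, e.g. per region, is defined by
 # config.get_fsx_lustre_config), and the values are only an example:
 #
 #   "fsx_lustre": {
 #     "enabled": true,
 #     "deployment_type": "SCRATCH_2",
 #     "storage_capacity_gib": 1200,
 #     "data_compression_type": "LZ4",
 #     "file_system_type_version": "2.15",
 #     "import_path": "s3://example-training-data"
 #   }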

2442 

2443 # Create FSx for Lustre file system 

2444 self.fsx_file_system = fsx.CfnFileSystem( 

2445 self, 

2446 "GCOFsxLustre", 

2447 file_system_type="LUSTRE", 

2448 file_system_type_version=file_system_type_version, 

2449 storage_capacity=storage_capacity, 

2450 subnet_ids=[self.vpc.private_subnets[0].subnet_id], 

2451 security_group_ids=[self.fsx_security_group.security_group_id], 

2452 lustre_configuration=lustre_config, 

2453 tags=[ 

2454 {"key": "Name", "value": f"{project_name}-fsx-{self.deployment_region}"}, 

2455 {"key": "Project", "value": project_name}, 

2456 ], 

2457 ) 

2458 

2459 # Ensure FSx file system waits for security group ingress rules to be created 

2460 # This prevents "security group does not permit Lustre LNET traffic" errors 

2461 self.fsx_file_system.node.add_dependency(self.fsx_security_group) 

2462 

2463 # Create FSx CSI Driver add-on for Kubernetes integration 

2464 self._create_fsx_csi_driver_addon() 

2465 

2466 # Output FSx information 

2467 CfnOutput( 

2468 self, 

2469 "FsxFileSystemId", 

2470 value=self.fsx_file_system.ref, 

2471 description="FSx for Lustre File System ID", 

2472 ) 

2473 

2474 CfnOutput( 

2475 self, 

2476 "FsxDnsName", 

2477 value=self.fsx_file_system.attr_dns_name, 

2478 description="FSx for Lustre DNS Name", 

2479 ) 

2480 

2481 CfnOutput( 

2482 self, 

2483 "FsxMountName", 

2484 value=self.fsx_file_system.attr_lustre_mount_name, 

2485 description="FSx for Lustre Mount Name", 

2486 ) 

2487 

2488 def _create_valkey_cache(self) -> None: 

2489 """Create an ElastiCache Serverless Valkey cache for K/V caching. 

2490 

2491 Provides a low-latency key-value store that inference endpoints and 

2492 jobs can use for prompt caching, session state, feature stores, or 

2493 any shared state across pods. Valkey Serverless auto-scales and 

2494 requires no node management. 

2495 

2496 The cache is placed in the VPC private subnets and accessible from 

2497 any pod via the cluster security group. 

2498 """ 

2499 valkey_config = self.config.get_valkey_config() 

2500 if not valkey_config.get("enabled", False): 2500 ↛ 2503: line 2500 didn't jump to line 2503 because the condition on line 2500 was always true

2501 return 

2502 

2503 from aws_cdk import aws_elasticache as elasticache 

2504 

2505 # Security group for Valkey (allow access from EKS cluster) 

2506 valkey_sg = ec2.SecurityGroup( 

2507 self, 

2508 "ValkeySG", 

2509 vpc=self.vpc, 

2510 description="Security group for Valkey Serverless cache", 

2511 allow_all_outbound=False, 

2512 ) 

2513 valkey_sg.add_ingress_rule( 

2514 ec2.Peer.ipv4(self.vpc.vpc_cidr_block), 

2515 ec2.Port.tcp(6379), 

2516 "Allow Valkey access from VPC", 

2517 ) 

2518 

2519 private_subnet_ids = [s.subnet_id for s in self.vpc.private_subnets] 

2520 

2521 self.valkey_cache = elasticache.CfnServerlessCache( 

2522 self, 

2523 "ValkeyCache", 

2524 engine="valkey", 

2525 serverless_cache_name=f"gco-{self.deployment_region}", 

2526 description=f"GCO K/V cache for {self.deployment_region}", 

2527 major_engine_version="8", 

2528 security_group_ids=[valkey_sg.security_group_id], 

2529 subnet_ids=private_subnet_ids, 

2530 cache_usage_limits=elasticache.CfnServerlessCache.CacheUsageLimitsProperty( 

2531 data_storage=elasticache.CfnServerlessCache.DataStorageProperty( 

2532 maximum=valkey_config.get("max_data_storage_gb", 5), 

2533 minimum=1, 

2534 unit="GB", 

2535 ), 

2536 ecpu_per_second=elasticache.CfnServerlessCache.ECPUPerSecondProperty( 

2537 maximum=valkey_config.get("max_ecpu_per_second", 5000), 

2538 minimum=1000, 

2539 ), 

2540 ), 

2541 snapshot_retention_limit=valkey_config.get("snapshot_retention_limit", 1), 

2542 tags=[ 

2543 CfnTag(key="Project", value="gco"), 

2544 CfnTag(key="Region", value=self.deployment_region), 

2545 ], 

2546 ) 

2547 

2548 CfnOutput( 

2549 self, 

2550 "ValkeyEndpoint", 

2551 value=self.valkey_cache.attr_endpoint_address, 

2552 description="Valkey Serverless cache endpoint", 

2553 ) 

2554 CfnOutput( 

2555 self, 

2556 "ValkeyPort", 

2557 value=self.valkey_cache.attr_endpoint_port, 

2558 description="Valkey Serverless cache port", 

2559 ) 

2560 

2561 # Store endpoint in SSM for discovery by pods 

2562 ssm.StringParameter( 

2563 self, 

2564 "ValkeyEndpointParam", 

2565 parameter_name=f"/{self.config.get_project_name()}/valkey-endpoint-{self.deployment_region}", 

2566 string_value=self.valkey_cache.attr_endpoint_address, 

2567 description=f"Valkey endpoint for {self.deployment_region}", 

2568 ) 
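 # A minimal in-pod connection sketch, assuming the redis-py client (Valkey is
 # protocol-compatible) and that VALKEY_ENDPOINT is read from the SSM parameter or the
 # rendered {{VALKEY_ENDPOINT}} placeholder; ElastiCache Serverless always requires TLS:
 #
 #   import redis
 #   r = redis.Redis(host=VALKEY_ENDPOINT, port=6379, ssl=True)
 #   r.set("prompt-cache:abc123", "cached-response", ex=3600)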

2569 

2570 def _create_aurora_pgvector(self) -> None: 

2571 """Create an Aurora Serverless v2 PostgreSQL cluster with pgvector. 

2572 

2573 Provides a fully managed vector database that inference endpoints and 

2574 jobs can use for RAG (retrieval-augmented generation), semantic search, 

2575 embedding storage, and similarity queries. Aurora Serverless v2 

2576 auto-scales capacity and requires no instance management. 

2577 

2578 The cluster is placed in the VPC private subnets and accessible from 

2579 any pod via the cluster security group. Credentials are stored in 

2580 Secrets Manager and the endpoint is published to SSM + a K8s ConfigMap 

2581 for automatic discovery. 

2582 

2583 See: https://aws.amazon.com/blogs/database/accelerate-generative-ai-workloads-on-amazon-aurora-with-optimized-reads-and-pgvector/ 

2584 """ 

2585 aurora_config = self.config.get_aurora_pgvector_config() 

2586 if not aurora_config.get("enabled", False): 

2587 return 

2588 

2589 from aws_cdk import aws_rds as rds 

2590 

2591 project_name = self.config.get_project_name() 

2592 

2593 # Security group for Aurora (allow PostgreSQL access from EKS cluster only) 

2594 aurora_sg = ec2.SecurityGroup( 

2595 self, 

2596 "AuroraPgvectorSG", 

2597 vpc=self.vpc, 

2598 description="Security group for Aurora Serverless v2 pgvector", 

2599 allow_all_outbound=False, 

2600 ) 

2601 aurora_sg.add_ingress_rule( 

2602 self.cluster.cluster_security_group, 

2603 ec2.Port.tcp(5432), 

2604 "Allow PostgreSQL access from EKS cluster", 

2605 ) 

2606 

2607 # Subnet group for Aurora (private subnets only) 

2608 subnet_group = rds.SubnetGroup( 

2609 self, 

2610 "AuroraPgvectorSubnetGroup", 

2611 description=f"Subnet group for GCO Aurora pgvector in {self.deployment_region}", 

2612 vpc=self.vpc, 

2613 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS), 

2614 ) 

2615 

2616 # Aurora Serverless v2 cluster with PostgreSQL 16 + pgvector 

2617 self.aurora_cluster = rds.DatabaseCluster( 

2618 self, 

2619 "AuroraPgvectorCluster", 

2620 engine=rds.DatabaseClusterEngine.aurora_postgres( 

2621 version=getattr(rds.AuroraPostgresEngineVersion, AURORA_POSTGRES_VERSION), 

2622 ), 

2623 serverless_v2_min_capacity=aurora_config.get("min_acu", 0), 

2624 serverless_v2_max_capacity=aurora_config.get("max_acu", 16), 

2625 writer=rds.ClusterInstance.serverless_v2( 

2626 "Writer", 

2627 auto_minor_version_upgrade=True, 

2628 ), 

2629 readers=[ 

2630 rds.ClusterInstance.serverless_v2( 

2631 "Reader", 

2632 auto_minor_version_upgrade=True, 

2633 scale_with_writer=True, 

2634 ), 

2635 ], 

2636 vpc=self.vpc, 

2637 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS), 

2638 subnet_group=subnet_group, 

2639 security_groups=[aurora_sg], 

2640 default_database_name="gco_vectors", 

2641 backup=rds.BackupProps( 

2642 retention=Duration.days(aurora_config.get("backup_retention_days", 7)), 

2643 ), 

2644 deletion_protection=aurora_config.get("deletion_protection", False), 

2645 removal_policy=RemovalPolicy.DESTROY, 

2646 storage_encrypted=True, 

2647 iam_authentication=True, 

2648 cloudwatch_logs_exports=["postgresql"], 

2649 monitoring_interval=Duration.seconds(60), 

2650 cluster_identifier=f"{project_name}-pgvector-{self.deployment_region}", 

2651 ) 

2652 

2653 # Construct-level cdk-nag suppressions for Aurora pgvector 

2654 from cdk_nag import NagPackSuppression, NagSuppressions 

2655 

2656 NagSuppressions.add_resource_suppressions( 

2657 self.aurora_cluster, 

2658 [ 

2659 NagPackSuppression( 

2660 id="AwsSolutions-RDS10", 

2661 reason=( 

2662 "Deletion protection is intentionally disabled for dev/test deployments. " 

2663 "Production deployments should set aurora_pgvector.deletion_protection=true " 

2664 "in cdk.json." 

2665 ), 

2666 ), 

2667 NagPackSuppression( 

2668 id="AwsSolutions-SMG4", 

2669 reason=( 

2670 "Aurora manages credential rotation via the RDS integration with Secrets " 

2671 "Manager. Manual Secrets Manager rotation is not required. " 

2672 "See: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/rds-secrets-manager.html" 

2673 ), 

2674 ), 

2675 NagPackSuppression( 

2676 id="HIPAA.Security-RDSInstanceDeletionProtectionEnabled", 

2677 reason=( 

2678 "Deletion protection is intentionally disabled for dev/test deployments. " 

2679 "Production deployments should set aurora_pgvector.deletion_protection=true " 

2680 "in cdk.json." 

2681 ), 

2682 ), 

2683 NagPackSuppression( 

2684 id="NIST.800.53.R5-RDSInstanceDeletionProtectionEnabled", 

2685 reason=( 

2686 "Deletion protection is intentionally disabled for dev/test deployments. " 

2687 "Production deployments should set aurora_pgvector.deletion_protection=true " 

2688 "in cdk.json." 

2689 ), 

2690 ), 

2691 NagPackSuppression( 

2692 id="PCI.DSS.321-SecretsManagerUsingKMSKey", 

2693 reason=( 

2694 "Aurora Serverless v2 credentials in Secrets Manager are encrypted with " 

2695 "AWS-managed keys by default. Customer-managed KMS can be enabled if " 

2696 "required for PCI compliance." 

2697 ), 

2698 ), 

2699 ], 

2700 apply_to_children=True, 

2701 ) 

2702 

2703 # Outputs 

2704 CfnOutput( 

2705 self, 

2706 "AuroraPgvectorEndpoint", 

2707 value=self.aurora_cluster.cluster_endpoint.hostname, 

2708 description="Aurora pgvector cluster writer endpoint", 

2709 ) 

2710 CfnOutput( 

2711 self, 

2712 "AuroraPgvectorReaderEndpoint", 

2713 value=self.aurora_cluster.cluster_read_endpoint.hostname, 

2714 description="Aurora pgvector cluster reader endpoint", 

2715 ) 

2716 CfnOutput( 

2717 self, 

2718 "AuroraPgvectorPort", 

2719 value=str(self.aurora_cluster.cluster_endpoint.port), 

2720 description="Aurora pgvector cluster port", 

2721 ) 

2722 CfnOutput( 

2723 self, 

2724 "AuroraPgvectorSecretArn", 

2725 value=self.aurora_cluster.secret.secret_arn if self.aurora_cluster.secret else "", 

2726 description="Aurora pgvector credentials secret ARN", 

2727 ) 

2728 

2729 # Store endpoint in SSM for discovery by pods and external tools 

2730 ssm.StringParameter( 

2731 self, 

2732 "AuroraPgvectorEndpointParam", 

2733 parameter_name=f"/{project_name}/aurora-pgvector-endpoint-{self.deployment_region}", 

2734 string_value=self.aurora_cluster.cluster_endpoint.hostname, 

2735 description=f"Aurora pgvector endpoint for {self.deployment_region}", 

2736 ) 

2737 

2738 # Grant the ServiceAccountRole read access to the Aurora secret 

2739 # so pods can retrieve credentials via the ConfigMap + Secrets Manager. 

2740 if self.aurora_cluster.secret: 2740 ↛ exit: line 2740 didn't return from function '_create_aurora_pgvector' because the condition on line 2740 was always true

2741 self.aurora_cluster.secret.grant_read(self.service_account_role) 
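 # A minimal usage sketch, assuming a PostgreSQL client (e.g. psycopg) and credentials
 # pulled from the Secrets Manager secret exported above; table, column, and dimension
 # are purely illustrative:
 #
 #   CREATE EXTENSION IF NOT EXISTS vector;
 #   CREATE TABLE docs (id bigserial PRIMARY KEY, embedding vector(3));
 #   SELECT id FROM docs ORDER BY embedding <-> '[0.1, 0.2, 0.3]' LIMIT 5;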

2742 

2743 def _create_fsx_csi_driver_addon(self) -> None: 

2744 """Create FSx CSI Driver add-on for Kubernetes integration. 

2745 

2746 The FSx CSI driver enables Kubernetes pods to mount FSx for Lustre 

2747 file systems as persistent volumes. 

2748 """ 

2749 # Create IAM role for FSx CSI Driver using IRSA + Pod Identity 

2750 self.fsx_csi_role = GCORegionalStack._create_irsa_role( 

2751 self, 

2752 "FsxCsiDriverRole", 

2753 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn, 

2754 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url, 

2755 service_account_names=["fsx-csi-controller-sa"], 

2756 namespaces=["kube-system"], 

2757 ) 

2758 

2759 # Add FSx CSI driver permissions 

2760 self.fsx_csi_role.add_to_policy( 

2761 iam.PolicyStatement( 

2762 effect=iam.Effect.ALLOW, 

2763 actions=[ 

2764 "fsx:DescribeFileSystems", 

2765 "fsx:DescribeVolumes", 

2766 "fsx:CreateVolume", 

2767 "fsx:DeleteVolume", 

2768 "fsx:TagResource", 

2769 ], 

2770 resources=["*"], 

2771 ) 

2772 ) 

2773 

2774 self.fsx_csi_role.add_to_policy( 

2775 iam.PolicyStatement( 

2776 effect=iam.Effect.ALLOW, 

2777 actions=[ 

2778 "ec2:DescribeInstances", 

2779 "ec2:DescribeVolumes", 

2780 "ec2:DescribeVpcs", 

2781 "ec2:DescribeSubnets", 

2782 "ec2:DescribeSecurityGroups", 

2783 ], 

2784 resources=["*"], 

2785 ) 

2786 ) 

2787 

2788 # cdk-nag suppression: the FSx CSI driver role grants 

2789 # ec2:Describe* APIs that don't support resource-level scoping. 

2790 from cdk_nag import NagSuppressions 

2791 

2792 NagSuppressions.add_resource_suppressions( 

2793 self.fsx_csi_role, 

2794 [ 

2795 { 

2796 "id": "AwsSolutions-IAM5", 

2797 "reason": ( 

2798 "The FSx CSI driver role grants ec2:Describe* for volume " 

2799 "and network discovery. These AWS APIs do not support " 

2800 "resource-level IAM scoping — Resource: * is the only " 

2801 "valid form." 

2802 ), 

2803 "appliesTo": ["Resource::*"], 

2804 }, 

2805 ], 

2806 apply_to_children=True, 

2807 ) 

2808 

2809 # Create FSx CSI Driver add-on 

2810 fsx_addon = eks.Addon( 

2811 self, 

2812 "FsxCsiDriverAddon", 

2813 cluster=self.cluster, # type: ignore[arg-type] 

2814 addon_name="aws-fsx-csi-driver", 

2815 addon_version=EKS_ADDON_FSX_CSI_DRIVER, 

2816 preserve_on_delete=False, 

2817 configuration_values={ 

2818 "node": { 

2819 "tolerations": self._ADDON_NODE_TOLERATIONS, 

2820 }, 

2821 "controller": { 

2822 "tolerations": self._ADDON_NODE_TOLERATIONS, 

2823 }, 

2824 }, 

2825 ) 

2826 

2827 # Append the PassRole statement for the FSx CSI role to the shared 

2828 # AwsCustomResource execution role. See 

2829 # _create_aws_custom_resource_role for the full rationale. 

2830 self.aws_custom_resource_role.add_to_policy( 

2831 iam.PolicyStatement( 

2832 effect=iam.Effect.ALLOW, 

2833 actions=["iam:PassRole"], 

2834 resources=[self.fsx_csi_role.role_arn], 

2835 ) 

2836 ) 

2837 

2838 # Update the add-on to use the IRSA role 

2839 update_fsx_addon = cr.AwsCustomResource( 

2840 self, 

2841 "UpdateFsxCsiAddonRole", 

2842 on_create=cr.AwsSdkCall( 

2843 service="EKS", 

2844 action="updateAddon", 

2845 parameters={ 

2846 "clusterName": self.cluster.cluster_name, 

2847 "addonName": "aws-fsx-csi-driver", 

2848 "serviceAccountRoleArn": self.fsx_csi_role.role_arn, 

2849 }, 

2850 physical_resource_id=cr.PhysicalResourceId.of( 

2851 f"{self.cluster.cluster_name}-fsx-csi-role-update" 

2852 ), 

2853 ), 

2854 on_update=cr.AwsSdkCall( 

2855 service="EKS", 

2856 action="updateAddon", 

2857 parameters={ 

2858 "clusterName": self.cluster.cluster_name, 

2859 "addonName": "aws-fsx-csi-driver", 

2860 "serviceAccountRoleArn": self.fsx_csi_role.role_arn, 

2861 }, 

2862 ), 

2863 role=self.aws_custom_resource_role, 

2864 ) 

2865 

2866 update_fsx_addon.node.add_dependency(fsx_addon) 

2867 update_fsx_addon.node.add_dependency(self.fsx_csi_role) 

2868 update_fsx_addon.node.add_dependency(self.aws_custom_resource_role) 

2869 

2870 # Expose the update-addon resource so _apply_kubernetes_manifests can 

2871 # make the kubectl Lambda wait for the IRSA annotation patch to land 

2872 # before it rollout-restarts the fsx-csi-controller. See the EFS CSI 

2873 # equivalent for the full rationale — same race, same fix, same 

2874 # symptom (PVCs stuck Pending with "no EC2 IMDS role found"). 

2875 self._fsx_csi_addon_role_update = update_fsx_addon 

2876 

2877 # Create Pod Identity Association for FSx CSI driver 

2878 eks_l1.CfnPodIdentityAssociation( 

2879 self, 

2880 "PodIdentity-fsx-csi", 

2881 cluster_name=self.cluster.cluster_name, 

2882 namespace="kube-system", 

2883 service_account="fsx-csi-controller-sa", 

2884 role_arn=self.fsx_csi_role.role_arn, 

2885 ) 

2886 
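
For context on the race mentioned above: it is avoided purely through construct ordering. A minimal sketch (the construct name kubectl_applier_cr is assumed for illustration and is not defined in this excerpt) of how _apply_kubernetes_manifests can make the kubectl Lambda wait on the add-on role update:

    # Hedged sketch: depend on the updateAddon custom resource so the CSI
    # controller is rollout-restarted only after its service account already
    # carries the IRSA role annotation.
    if getattr(self, "_fsx_csi_addon_role_update", None) is not None:
        kubectl_applier_cr.node.add_dependency(self._fsx_csi_addon_role_update)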

2887 def _create_drift_detection(self) -> None: 

2888 """Create CloudFormation drift detection on a daily schedule. 

2889 

2890 Creates: 

2891 - SNS topic (KMS-encrypted) for drift alerts 

2892 - Lambda function that initiates drift detection on this stack, polls 

2893 until detection completes, and publishes to SNS if drift is found 

2894 - EventBridge rule on a daily schedule (configurable via cdk.json 

2895 ``drift_detection.schedule_hours``) that invokes the Lambda 

2896 

2897 Operators can disable drift detection entirely by setting 

2898 ``drift_detection.enabled`` to ``false`` in cdk.json. When disabled, 

2899 no resources are created. 

2900 """ 

2901 drift_config = self.node.try_get_context("drift_detection") or {} 

2902 if not drift_config.get("enabled", True): 

2903 return 

2904 

2905 schedule_hours = int(drift_config.get("schedule_hours", 24)) 

2906 

2907 # KMS key for SNS topic encryption. SNS with AWS-managed keys doesn't 

2908 # allow CloudFormation/Lambda to publish, so we use a customer-managed 

2909 # key we can grant publish access on. 

2910 drift_topic_key = kms.Key( 

2911 self, 

2912 "DriftDetectionTopicKey", 

2913 description="KMS key for GCO drift detection SNS topic", 

2914 enable_key_rotation=True, 

2915 removal_policy=RemovalPolicy.DESTROY, 

2916 ) 

2917 

2918 self.drift_detection_topic = sns.Topic( 

2919 self, 

2920 "DriftDetectionTopic", 

2921 display_name="GCO CloudFormation Drift Alerts", 

2922 master_key=drift_topic_key, 

2923 ) 

2924 

2925 # IAM role for the drift detection Lambda 

2926 drift_lambda_role = iam.Role( 

2927 self, 

2928 "DriftDetectionLambdaRole", 

2929 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), 

2930 managed_policies=[ 

2931 iam.ManagedPolicy.from_aws_managed_policy_name( 

2932 "service-role/AWSLambdaBasicExecutionRole" 

2933 ), 

2934 ], 

2935 ) 

2936 

2937 # CloudFormation drift detection APIs operate at the stack level and do 

2938 # not support resource-level ARN scoping for these actions, so this 

2939 # statement uses a wildcard resource; the Lambda's env pins it to one stack. 

2940 drift_lambda_role.add_to_policy( 

2941 iam.PolicyStatement( 

2942 effect=iam.Effect.ALLOW, 

2943 actions=[ 

2944 "cloudformation:DetectStackDrift", 

2945 "cloudformation:DescribeStackDriftDetectionStatus", 

2946 "cloudformation:DescribeStackResourceDrifts", 

2947 "cloudformation:DescribeStackResource", 

2948 "cloudformation:DescribeStackResources", 

2949 ], 

2950 resources=["*"], 

2951 ) 

2952 ) 

2953 

2954 self.drift_detection_topic.grant_publish(drift_lambda_role) 

2955 

2956 # Lambda function — one per stack; stack name is baked into env vars 

2957 drift_lambda = lambda_.Function( 

2958 self, 

2959 "DriftDetectionFunction", 

2960 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME), 

2961 handler="handler.lambda_handler", 

2962 code=lambda_.Code.from_asset("lambda/drift-detection"), 

2963 timeout=Duration.minutes(14), # Leave headroom under Lambda 15-min cap 

2964 memory_size=256, 

2965 role=drift_lambda_role, 

2966 environment={ 

2967 "STACK_NAME": self.stack_name, 

2968 "SNS_TOPIC_ARN": self.drift_detection_topic.topic_arn, 

2969 "REGION": self.deployment_region, 

2970 }, 

2971 tracing=lambda_.Tracing.ACTIVE, 

2972 ) 

2973 

2974 # Dead-letter queue for EventBridge → Lambda target failures. 

2975 # Captures events that fail to reach the Lambda (e.g. due to 

2976 # throttling or permission issues) so operators can retry or 

2977 # investigate. Required by Serverless-EventBusDLQ cdk-nag rule. 

2978 drift_rule_dlq = sqs.Queue( 

2979 self, 

2980 "DriftDetectionRuleDlq", 

2981 retention_period=Duration.days(14), 

2982 enforce_ssl=True, 

2983 encryption=sqs.QueueEncryption.SQS_MANAGED, 

2984 removal_policy=RemovalPolicy.DESTROY, 

2985 ) 

2986 

2987 # DLQs themselves are terminal — they don't need their own DLQ. 

2988 # Suppress the circular AwsSolutions-SQS3 nag finding. 

2989 from cdk_nag import NagSuppressions as _DlqNagSuppressions 

2990 

2991 _DlqNagSuppressions.add_resource_suppressions( 

2992 drift_rule_dlq, 

2993 [ 

2994 { 

2995 "id": "AwsSolutions-SQS3", 

2996 "reason": ( 

2997 "This queue IS the dead-letter queue for the " 

2998 "DriftDetectionSchedule EventBridge rule. A DLQ for a " 

2999 "DLQ is circular; if events fail to reach this queue " 

3000 "they are captured by EventBridge's own retry metrics " 

3001 "(CloudWatch FailedInvocations)." 

3002 ), 

3003 }, 

3004 ], 

3005 ) 

3006 

3007 # EventBridge rule — daily schedule by default 

3008 events.Rule( 

3009 self, 

3010 "DriftDetectionSchedule", 

3011 description=(f"Daily CloudFormation drift detection for {self.stack_name}"), 

3012 schedule=events.Schedule.rate(Duration.hours(schedule_hours)), 

3013 targets=[ 

3014 events_targets.LambdaFunction( 

3015 drift_lambda, 

3016 dead_letter_queue=drift_rule_dlq, 

3017 retry_attempts=2, 

3018 ) 

3019 ], 

3020 ) 

3021 

3022 # Outputs for operators to subscribe to the topic 

3023 CfnOutput( 

3024 self, 

3025 "DriftDetectionTopicArn", 

3026 value=self.drift_detection_topic.topic_arn, 

3027 description=( 

3028 f"SNS topic ARN for CloudFormation drift alerts in " 

3029 f"{self.deployment_region}. Subscribe an endpoint (email, " 

3030 f"Slack, PagerDuty) to receive drift notifications." 

3031 ), 

3032 ) 

3033 

3034 # cdk-nag suppressions for this component 

3035 from cdk_nag import NagSuppressions 

3036 

3037 NagSuppressions.add_resource_suppressions( 

3038 drift_lambda_role, 

3039 [ 

3040 { 

3041 "id": "AwsSolutions-IAM4", 

3042 "reason": ( 

3043 "AWSLambdaBasicExecutionRole provides standard " 

3044 "CloudWatch Logs permissions required for Lambda " 

3045 "logging. This is the AWS-recommended managed policy." 

3046 ), 

3047 }, 

3048 { 

3049 "id": "AwsSolutions-IAM5", 

3050 "reason": ( 

3051 "CloudFormation drift detection APIs (DetectStackDrift, " 

3052 "DescribeStackDriftDetectionStatus, " 

3053 "DescribeStackResourceDrifts) cannot be scoped to a " 

3054 "specific stack resource via IAM; the action-level " 

3055 "scoping requires wildcard resources. The Lambda's " 

3056 "environment pins it to a single stack name, so the " 

3057 "effective blast radius is limited." 

3058 ), 

3059 }, 

3060 ], 

3061 apply_to_children=True, 

3062 ) 

3063 
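
The drift-detection handler itself lives in lambda/drift-detection and is not part of this file. A minimal sketch of the behavior the docstring describes (initiate, poll, publish on drift), assuming standard boto3 CloudFormation/SNS calls and the environment variables set above, might look like:

    # Hedged sketch of lambda/drift-detection/handler.py; not the shipped handler.
    import os
    import time

    import boto3

    def lambda_handler(event, context):
        region = os.environ["REGION"]
        stack_name = os.environ["STACK_NAME"]
        cfn = boto3.client("cloudformation", region_name=region)
        sns = boto3.client("sns", region_name=region)

        # Kick off drift detection for this stack.
        detection_id = cfn.detect_stack_drift(StackName=stack_name)[
            "StackDriftDetectionId"
        ]

        # Poll until CloudFormation finishes evaluating drift.
        while True:
            status = cfn.describe_stack_drift_detection_status(
                StackDriftDetectionId=detection_id
            )
            if status["DetectionStatus"] != "DETECTION_IN_PROGRESS":
                break
            time.sleep(10)

        # Alert only when drift was actually found.
        if status.get("StackDriftStatus") == "DRIFTED":
            drifts = cfn.describe_stack_resource_drifts(
                StackName=stack_name,
                StackResourceDriftStatusFilters=["MODIFIED", "DELETED"],
            )["StackResourceDrifts"]
            sns.publish(
                TopicArn=os.environ["SNS_TOPIC_ARN"],
                Subject=f"CloudFormation drift detected: {stack_name}",
                Message=f"{len(drifts)} resource(s) drifted in {stack_name}.",
            )
        return {"stack": stack_name, "driftStatus": status.get("StackDriftStatus")}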

3064 def _create_mcp_role(self) -> None: 

3065 """Create dedicated IAM role for the MCP server. 

3066 

3067 The MCP server exposes GCO CLI tools to LLM agents. Without a dedicated 

3068 role, the server would inherit the full ambient credentials of the user 

3069 who launches it (often an administrator). This method creates a 

3070 least-privilege role that the MCP server can assume at startup via 

3071 ``GCO_MCP_ROLE_ARN``. 

3072 

3073 Permissions are scoped to the minimum needed by the tools exposed: 

3074 

3075 - ``eks:DescribeCluster`` on this regional EKS cluster ARN only. 

3076 - ``s3:GetObject`` on model weights buckets. The model bucket lives in 

3077 the global stack, so we scope to the same name pattern used by the 

3078 service account role (``{project_name}-*``). This is a deliberate 

3079 compromise: a precise cross-stack ARN export would force a tight 

3080 dependency on the global stack, and cdk-nag will flag it anyway 

3081 because the bucket name is auto-generated. 

3082 - ``cloudwatch:GetMetricData`` / ``cloudwatch:ListMetrics``. These APIs 

3083 do not support resource-level IAM, so wildcard is required. Read-only. 

3084 - ``sqs:SendMessage`` scoped to this region's job queue ARN only. 

3085 

3086 The trust policy uses ``AccountRootPrincipal`` so any IAM user/role in 

3087 the account can assume it (gated by an explicit sts:AssumeRole 

3088 permission on the caller — standard AWS behavior). Operators who want 

3089 to restrict assumption further should add an external-id or principal 

3090 condition to the trust policy after deployment. 

3091 

3092 Operators can disable this component entirely by setting 

3093 ``mcp_server.enabled`` to ``false`` in cdk.json. 

3094 """ 

3095 mcp_config = self.node.try_get_context("mcp_server") or {} 

3096 if not mcp_config.get("enabled", True): 

3097 return 

3098 

3099 project_name = self.config.get_project_name() 

3100 

3101 self.mcp_server_role = iam.Role( 

3102 self, 

3103 "McpServerRole", 

3104 assumed_by=iam.AccountRootPrincipal(), 

3105 description=( 

3106 "Least-privilege role assumed by the GCO MCP server at startup. " 

3107 "Grants only the permissions needed by MCP tools: eks:DescribeCluster, " 

3108 "s3:GetObject on model buckets, cloudwatch read-only metrics, and " 

3109 "sqs:SendMessage to the regional job queue." 

3110 ), 

3111 max_session_duration=Duration.hours(12), 

3112 ) 

3113 

3114 # eks:DescribeCluster on this region's cluster only 

3115 self.mcp_server_role.add_to_policy( 

3116 iam.PolicyStatement( 

3117 effect=iam.Effect.ALLOW, 

3118 actions=["eks:DescribeCluster"], 

3119 resources=[self.cluster.cluster_arn], 

3120 ) 

3121 ) 

3122 

3123 # s3:GetObject on model weights buckets. Bucket name is auto-generated 

3124 # in the global stack, so we match the same prefix pattern used by the 

3125 # service account role. 

3126 self.mcp_server_role.add_to_policy( 

3127 iam.PolicyStatement( 

3128 effect=iam.Effect.ALLOW, 

3129 actions=["s3:GetObject", "s3:ListBucket"], 

3130 resources=[ 

3131 f"arn:aws:s3:::{project_name}-*", 

3132 f"arn:aws:s3:::{project_name}-*/*", 

3133 ], 

3134 ) 

3135 ) 

3136 

3137 # CloudWatch read-only metrics APIs. These APIs do not support 

3138 # resource-level IAM so wildcard is required. 

3139 self.mcp_server_role.add_to_policy( 

3140 iam.PolicyStatement( 

3141 effect=iam.Effect.ALLOW, 

3142 actions=[ 

3143 "cloudwatch:GetMetricData", 

3144 "cloudwatch:GetMetricStatistics", 

3145 "cloudwatch:ListMetrics", 

3146 ], 

3147 resources=["*"], 

3148 ) 

3149 ) 

3150 

3151 # sqs:SendMessage scoped to the regional job queue only 

3152 self.mcp_server_role.add_to_policy( 

3153 iam.PolicyStatement( 

3154 effect=iam.Effect.ALLOW, 

3155 actions=["sqs:SendMessage", "sqs:GetQueueUrl", "sqs:GetQueueAttributes"], 

3156 resources=[self.job_queue.queue_arn], 

3157 ) 

3158 ) 

3159 

3160 # Export the role ARN so operators can set GCO_MCP_ROLE_ARN in their 

3161 # MCP server environment. 

3162 CfnOutput( 

3163 self, 

3164 "McpServerRoleArn", 

3165 value=self.mcp_server_role.role_arn, 

3166 description=( 

3167 "IAM role ARN for the GCO MCP server. Set GCO_MCP_ROLE_ARN to " 

3168 "this value when launching the MCP server so it assumes a " 

3169 "least-privilege role instead of ambient credentials." 

3170 ), 

3171 export_name=f"{project_name}-mcp-server-role-arn-{self.deployment_region}", 

3172 ) 

3173 

3174 # cdk-nag suppressions: CloudWatch metrics APIs cannot be scoped. 

3175 from cdk_nag import NagSuppressions 

3176 

3177 NagSuppressions.add_resource_suppressions( 

3178 self.mcp_server_role, 

3179 [ 

3180 { 

3181 "id": "AwsSolutions-IAM5", 

3182 "reason": ( 

3183 "The CloudWatch metrics APIs (GetMetricData, " 

3184 "GetMetricStatistics, ListMetrics) do not support " 

3185 "resource-level IAM; wildcard resource is required. " 

3186 "The S3 permissions use the {project_name}-* prefix " 

3187 "pattern because the model weights bucket name is " 

3188 "auto-generated by CDK in the global stack and a " 

3189 "cross-stack ARN export would create tight stack " 

3190 "coupling. All actions are read-only or scoped " 

3191 "send-only (SQS)." 

3192 ), 

3193 }, 

3194 ], 

3195 apply_to_children=True, 

3196 ) 

3197 
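
On the consumer side, the MCP server is expected to assume this role at startup using the ARN from GCO_MCP_ROLE_ARN. A hedged sketch of that startup step with the standard STS API (the real server's code is not part of this file):

    # Hedged sketch: build a boto3 session from the exported MCP role rather
    # than the launcher's ambient credentials.
    import os

    import boto3

    def mcp_session() -> boto3.Session:
        role_arn = os.environ.get("GCO_MCP_ROLE_ARN")
        if not role_arn:
            # No role configured; fall back to ambient credentials.
            return boto3.Session()
        creds = boto3.client("sts").assume_role(
            RoleArn=role_arn,
            RoleSessionName="gco-mcp-server",
            DurationSeconds=3600,
        )["Credentials"]
        return boto3.Session(
            aws_access_key_id=creds["AccessKeyId"],
            aws_secret_access_key=creds["SecretAccessKey"],
            aws_session_token=creds["SessionToken"],
        )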

3198 def _create_outputs(self) -> None: 

3199 """Create CloudFormation outputs for cluster information""" 

3200 project_name = self.config.get_project_name() 

3201 

3202 # Export cluster information 

3203 CfnOutput( 

3204 self, 

3205 "ClusterName", 

3206 value=self.cluster.cluster_name, 

3207 description=f"EKS cluster name for {self.deployment_region}", 

3208 export_name=f"{project_name}-cluster-name-{self.deployment_region}", 

3209 ) 

3210 

3211 CfnOutput( 

3212 self, 

3213 "ClusterArn", 

3214 value=self.cluster.cluster_arn, 

3215 description=f"EKS cluster ARN for {self.deployment_region}", 

3216 export_name=f"{project_name}-cluster-arn-{self.deployment_region}", 

3217 ) 

3218 

3219 CfnOutput( 

3220 self, 

3221 "ClusterEndpoint", 

3222 value=self.cluster.cluster_endpoint, 

3223 description=f"EKS cluster endpoint for {self.deployment_region}", 

3224 export_name=f"{project_name}-cluster-endpoint-{self.deployment_region}", 

3225 ) 

3226 

3227 CfnOutput( 

3228 self, 

3229 "ClusterSecurityGroupId", 

3230 value=self.cluster.cluster_security_group_id, 

3231 description=f"EKS cluster security group ID for {self.deployment_region}", 

3232 export_name=f"{project_name}-cluster-sg-{self.deployment_region}", 

3233 ) 

3234 

3235 CfnOutput( 

3236 self, 

3237 "VpcId", 

3238 value=self.vpc.vpc_id, 

3239 description=f"VPC ID for {self.deployment_region}", 

3240 export_name=f"{project_name}-vpc-id-{self.deployment_region}", 

3241 ) 

3242 

3243 # Export public subnet IDs for ALB 

3244 public_subnet_ids = [subnet.subnet_id for subnet in self.vpc.public_subnets] 

3245 CfnOutput( 

3246 self, 

3247 "PublicSubnetIds", 

3248 value=Fn.join(",", public_subnet_ids), 

3249 description=f"Public subnet IDs for ALB in {self.deployment_region}", 

3250 export_name=f"{project_name}-public-subnets-{self.deployment_region}", 

3251 ) 

3252 

3253 # Note: ALB is created by AWS Load Balancer Controller via Ingress 

3254 # The ALB ARN is registered with Global Accelerator by the GA registration Lambda 

3255 
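
Other stacks or tooling in the same account and region can consume these exports by name. A brief sketch using Fn.import_value with the export-name pattern above (project_name and region are assumed to match the exporting stack):

    # Hedged sketch: importing the regional exports from another CDK stack.
    from aws_cdk import Fn

    cluster_name = Fn.import_value(f"{project_name}-cluster-name-{region}")
    vpc_id = Fn.import_value(f"{project_name}-vpc-id-{region}")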

3256 def get_cluster(self) -> eks.Cluster: 

3257 """Get the EKS cluster""" 

3258 return self.cluster 

3259 

3260 def get_vpc(self) -> ec2.Vpc: 

3261 """Get the VPC""" 

3262 return self.vpc