Coverage for gco/stacks/regional_stack.py: 94%

482 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-15 15:07 +0000

1""" 

2Regional stack for GCO (Global Capacity Orchestrator on AWS) - EKS cluster and ALB per region. 

3 

4This is the largest stack in the project (~3200 lines) and creates all regional 

5resources for a single AWS region. One instance is deployed per region defined 

6in cdk.json. 

7 

8Resources Created: 

9 VPC & Networking: 

10 - VPC with 3 AZs, public subnets (ALB), private subnets (EKS nodes) 

11 - 2 NAT Gateways for high availability 

12 - VPC endpoints for ECR, S3, STS, Secrets Manager, SSM, CloudWatch 

13 - VPC Flow Logs (CloudWatch Logs, 30-day retention) 

14 

15 EKS Cluster (Auto Mode): 

16 - Managed control plane with full logging (API, Audit, Authenticator, Controller Manager, Scheduler) 

17 - NodePools: system, general-purpose, gpu-x86, gpu-arm, inference, gpu-efa, neuron, cpu-general 

18 - IRSA roles for service accounts (Secrets Manager, SQS, DynamoDB, CloudWatch, S3, EFS) 

19 

20 Load Balancing: 

21 - ALB (created by Ingress via AWS Load Balancer Controller) 

22 - Internal NLB for regional API Gateway VPC Link 

23 - Global Accelerator endpoint registration (via ga-registration Lambda) 

24 

25 Storage: 

26 - EFS with dynamic provisioning (CSI driver, access points, encryption at rest + in transit) 

27 - FSx for Lustre (optional, toggled via cdk.json) 

28 - Valkey Serverless cache (optional) 

29 - Aurora Serverless v2 with pgvector (optional) 

30 

31 Lambda Functions: 

32 - kubectl-applier: applies K8s manifests during deployment 

33 - helm-installer: installs Helm charts (KEDA, Volcano, KubeRay, GPU Operator, etc.) 

34 - ga-registration: registers ALB with Global Accelerator 

35 - regional-api-proxy: proxies regional API Gateway to internal ALB 

36 

37 Container Images: 

38 - ECR repositories + Docker image builds for health-monitor, manifest-processor, 

39 inference-monitor, queue-processor 

40 

41 SQS: 

42 - Regional job queue + dead letter queue (for gco jobs submit-sqs) 

43 

44Key Design Decisions: 

45 - EKS Auto Mode handles node provisioning — no managed node groups or Karpenter provisioners 

46 - NodePools use WhenEmpty consolidation for inference to avoid disrupting long-running pods 

47 - IRSA (IAM Roles for Service Accounts) for least-privilege pod-level AWS access 

48 - All optional features (FSx, Valkey, Aurora) are toggled via cdk.json context variables 

49 - Template variables in K8s manifests ({{PLACEHOLDER}}) are replaced at deploy time 

50 

51Dependencies: 

52 - GCOGlobalStack (for Global Accelerator endpoint group ARN, DynamoDB table names, S3 bucket) 

53 - GCOApiGatewayGlobalStack (for auth secret ARN) 

54 

55Modification Guide: 

56 - To add a new NodePool: add a YAML manifest in lambda/kubectl-applier-simple/manifests/ (40-49 range) 

57 - To add a new service: add ECR image build here, Dockerfile in dockerfiles/, manifest in manifests/ 

58 - To add a new optional feature: add a cdk.json context toggle, guard with if/else in this file 

59 - To change EKS version: update KUBERNETES_VERSION in constants.py 

60""" 

61 

62from __future__ import annotations 

63 

64import time 

65from dataclasses import dataclass 

66from typing import Any 

67 

68import aws_cdk.aws_eks_v2 as eks 

69from aws_cdk import ( 

70 CfnJson, 

71 CfnOutput, 

72 CfnTag, 

73 CustomResource, 

74 Duration, 

75 Fn, 

76 RemovalPolicy, 

77 Stack, 

78) 

79from aws_cdk import aws_ec2 as ec2 

80from aws_cdk import aws_ecr as ecr 

81from aws_cdk import aws_ecr_assets as ecr_assets 

82from aws_cdk import aws_efs as efs 

83from aws_cdk import aws_eks as eks_l1 # L1 constructs (CfnPodIdentityAssociation) 

84from aws_cdk import aws_events as events 

85from aws_cdk import aws_events_targets as events_targets 

86from aws_cdk import aws_fsx as fsx 

87from aws_cdk import aws_iam as iam 

88from aws_cdk import aws_kms as kms 

89from aws_cdk import aws_lambda as lambda_ 

90from aws_cdk import aws_logs as logs 

91from aws_cdk import aws_sns as sns 

92from aws_cdk import aws_sqs as sqs 

93from aws_cdk import aws_ssm as ssm 

94from aws_cdk import custom_resources as cr 

95from constructs import Construct 

96 

97from gco.config.config_loader import ConfigLoader 

98from gco.stacks.constants import ( 

99 AURORA_POSTGRES_VERSION, 

100 CLUSTER_SHARED_SSM_PARAMETER_PREFIX, 

101 EKS_ADDON_CLOUDWATCH_OBSERVABILITY, 

102 EKS_ADDON_EFS_CSI_DRIVER, 

103 EKS_ADDON_FSX_CSI_DRIVER, 

104 EKS_ADDON_METRICS_SERVER, 

105 EKS_ADDON_POD_IDENTITY_AGENT, 

106 LAMBDA_PYTHON_RUNTIME, 

107) 

108 

109# <pyflowchart-code-diagram> BEGIN - auto-inserted, do not edit 

110# Flowchart(s) generated from this file: 

111# * ``GCORegionalStack.__init__`` -> ``diagrams/code_diagrams/gco/stacks/regional_stack.GCORegionalStack___init__.html`` 

112# (PNG: ``diagrams/code_diagrams/gco/stacks/regional_stack.GCORegionalStack___init__.png``) 

113# Regenerate with ``python diagrams/code_diagrams/generate.py``. 

114# <pyflowchart-code-diagram> END 

115 

116 

@dataclass(frozen=True)
class SharedBucketIdentity:
    """Immutable (name, ARN, region) triple for the ``Cluster_Shared_Bucket``.

    The bucket is always present and is owned by ``GCOGlobalStack``, which
    publishes its identity in the global region as the three SSM parameters
    ``/gco/cluster-shared-bucket/{name,arn,region}``. Each regional stack
    reads those parameters back into one of these objects and uses it to:

    * grant the regional job-pod role IAM permissions on the bucket, and
    * fill in the ``gco-cluster-shared-bucket`` ConfigMap that is applied to
      every regional EKS cluster.

    The dataclass is frozen so one instance can be passed between helper
    methods without any of them being able to mutate it by accident.
    """

    # Value of the ``.../name`` SSM parameter.
    name: str
    # Value of the ``.../arn`` SSM parameter.
    arn: str
    # Value of the ``.../region`` SSM parameter.
    region: str

133 

134 

def _compute_kubectl_cluster_shared_replacements(
    shared: SharedBucketIdentity,
) -> dict[str, str]:
    """Map each ``{{CLUSTER_SHARED_BUCKET*}}`` placeholder to its value.

    This is a pure, module-level helper so that tests can check the mapping
    directly instead of synthesizing a whole regional stack. All three
    placeholders are always emitted: the ``gco-cluster-shared-bucket``
    ConfigMap is applied on every regional cluster, so there is no feature
    toggle that could leave one of them out.

    Args:
        shared: Identity of the cluster-shared bucket; its ``name``, ``arn``
            and ``region`` attributes are read.

    Returns:
        A new dict keyed by placeholder, in name / ARN / region order.
    """
    replacements: dict[str, str] = {}
    replacements["{{CLUSTER_SHARED_BUCKET}}"] = shared.name
    replacements["{{CLUSTER_SHARED_BUCKET_ARN}}"] = shared.arn
    replacements["{{CLUSTER_SHARED_BUCKET_REGION}}"] = shared.region
    return replacements

151 

152 

def _augment_trusted_registries_with_project_ecr(
    base: list[str],
    *,
    account: str,
    regions: list[str],
    global_region: str,
) -> list[str]:
    """Extend the trusted-registry list with this project's own ECR hosts.

    ``gco images build`` pushes to a per-account ECR registry at
    ``<account>.dkr.ecr.<region>.amazonaws.com/gco/<name>``. Unless those
    hostnames are trusted, the queue/manifest validators would reject every
    job that references such an image, so they are appended here.

    Args:
        base: Operator-configured registries. Copied, never mutated; entries
            and their order are preserved as given.
        account: AWS account id used to build the hostnames. When falsy,
            nothing is appended.
        regions: Deployed regions; one hostname is considered per region.
        global_region: Global region, considered first (this is where
            ``gco-global`` provisions the source repo).

    Returns:
        A new list: ``base`` followed by each project ECR hostname that is
        not already present, global region first and then ``regions`` in
        order. The ordering is deterministic so the rendered ConfigMap does
        not churn between deploys.
    """
    result: list[str] = list(base)
    known = set(result)
    # dict.fromkeys de-duplicates the regions while keeping first-seen order.
    candidate_regions = tuple(dict.fromkeys([global_region, *regions]))

    if not account:
        return result

    for ecr_region in candidate_regions:
        registry = f"{account}.dkr.ecr.{ecr_region}.amazonaws.com"
        if registry in known:
            continue
        known.add(registry)
        result.append(registry)
    return result

184 

185 

186class GCORegionalStack(Stack): 

187 """ 

188 Regional resources stack for a single AWS region. 

189 

190 Creates EKS cluster, load balancers, and supporting infrastructure 

191 for running GCO services in a specific region. 

192 

193 Attributes: 

194 vpc: VPC with public/private subnets 

195 cluster: EKS Auto Mode cluster 

196 """ 

197 

@staticmethod
def _create_irsa_role(
    scope: GCORegionalStack,
    id: str,
    oidc_provider_arn: str,
    oidc_issuer_url: str,
    service_account_names: list[str],
    namespaces: list[str],
) -> iam.Role:
    """Build an IAM role assumable through IRSA (OIDC) and EKS Pod Identity.

    IRSA is the primary path: the pod's projected service-account token is
    exchanged for temporary credentials through the cluster's OIDC provider.
    A second trust statement for ``pods.eks.amazonaws.com`` is added so the
    same role is already usable if Pod Identity injection is relied on later.

    The OIDC condition keys are ``<issuer>:aud`` and ``<issuer>:sub``. The
    issuer URL is a CloudFormation token, which cannot serve as a Python
    dict key at synth time, so the condition map is wrapped in ``CfnJson``
    to defer key resolution to deploy time.

    Args:
        scope: Stack in which the role and the ``CfnJson`` are created.
        id: Construct id of the role; ``f"{id}OidcConditions"`` is used for
            the accompanying ``CfnJson``.
        oidc_provider_arn: ARN of the OIDC provider used as the federated
            principal.
        oidc_issuer_url: Issuer URL including scheme; everything after the
            first ``//`` is used in the condition keys.
        service_account_names: Service accounts allowed to assume the role.
        namespaces: Namespaces those service accounts may live in. Every
            namespace/service-account combination is allowed.

    Returns:
        The created role.
    """
    # Drop the "https://" scheme: condition keys use the bare issuer host/path.
    issuer_host = Fn.select(1, Fn.split("//", oidc_issuer_url))
    audience_key = Fn.join("", [issuer_host, ":aud"])
    subject_key = Fn.join("", [issuer_host, ":sub"])

    # Cartesian product, namespace-major: ns1:sa1, ns1:sa2, ns2:sa1, ...
    allowed_subjects = [
        f"system:serviceaccount:{namespace}:{account_name}"
        for namespace in namespaces
        for account_name in service_account_names
    ]

    # CfnJson lets the token-valued keys resolve at deploy time.
    oidc_conditions = CfnJson(
        scope,
        f"{id}OidcConditions",
        value={
            audience_key: "sts.amazonaws.com",
            subject_key: allowed_subjects,
        },
    )

    web_identity_principal = iam.FederatedPrincipal(
        federated=oidc_provider_arn,
        conditions={"StringEquals": oidc_conditions},
        assume_role_action="sts:AssumeRoleWithWebIdentity",
    )
    role = iam.Role(scope, id, assumed_by=web_identity_principal)

    # Secondary trust path: EKS Pod Identity.
    trust_policy = role.assume_role_policy
    assert trust_policy is not None  # always set because assumed_by was given
    pod_identity_trust = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        principals=[iam.ServicePrincipal("pods.eks.amazonaws.com")],
        actions=["sts:AssumeRole", "sts:TagSession"],
    )
    trust_policy.add_statements(pod_identity_trust)
    return role

263 

def __init__(
    self,
    scope: Construct,
    construct_id: str,
    config: ConfigLoader,
    region: str,
    auth_secret_arn: str,
    **kwargs: Any,
) -> None:
    """Create every regional resource for ``region``, in dependency order.

    Args:
        scope: Parent construct.
        construct_id: Construct id of this stack.
        config: Loaded project configuration; stored as ``self.config`` and
            queried for this region's cluster configuration.
        region: Region this stack targets; stored as
            ``self.deployment_region``.
        auth_secret_arn: ARN of the auth secret; stored as
            ``self.auth_secret_arn``.
        **kwargs: Forwarded unchanged to ``Stack.__init__``.

    The helper calls below are order-sensitive; do not reorder them without
    checking the notes next to each step.
    """
    super().__init__(scope, construct_id, **kwargs)

    # Inputs and shared state used by the helper methods.
    self.config = config
    self.deployment_region = region
    self.auth_secret_arn = auth_secret_arn
    self.alb_arn: str | None = None
    self.cluster_config = self.config.get_cluster_config(region)

    # VPC for the EKS cluster: one public and one private-with-egress /24
    # tier, spread over up to 3 AZs, with 2 NAT gateways for availability.
    # vpc_name is deliberately not set so CDK generates a unique name.
    public_tier = ec2.SubnetConfiguration(
        name="PublicSubnet",
        subnet_type=ec2.SubnetType.PUBLIC,
        cidr_mask=24,
    )
    private_tier = ec2.SubnetConfiguration(
        name="PrivateSubnet",
        subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,
        cidr_mask=24,
    )
    self.vpc = ec2.Vpc(
        self,
        "GCOVpc",
        max_azs=3,
        nat_gateways=2,
        subnet_configuration=[public_tier, private_tier],
    )

    # Network traffic logging for security monitoring.
    self._create_vpc_flow_logs()

    # Job ingestion queue and its dead-letter queue.
    self._create_sqs_queue()

    # ECR repositories and Docker image builds.
    self._create_container_images()

    # One long-lived execution role shared by every cr.AwsCustomResource in
    # this stack. Creating it up front (and passing role= instead of policy=
    # to each custom resource) avoids the IAM propagation race described in
    # _create_aws_custom_resource_role.
    self._create_aws_custom_resource_role()

    # EKS cluster.
    self._create_eks_cluster(self.cluster_config)

    # Resolve the always-on Cluster_Shared_Bucket identity from SSM (owned by
    # GCOGlobalStack) and grant the job-pod role RW + KMS access. This runs
    # unconditionally. It must come after _create_pod_identity_associations,
    # which creates service_account_role (not called directly here;
    # presumably reached via _create_eks_cluster — confirm), and before
    # _apply_kubernetes_manifests, which consumes the replacements.
    self.cluster_shared_identity = self._resolve_cluster_shared_bucket_from_ssm()
    self._grant_cluster_shared_bucket_to_job_role(self.cluster_shared_identity)

    # Storage and data services; each optional one checks its own toggle.
    self._create_efs()
    self._create_fsx_lustre()
    self._create_valkey_cache()
    self._create_aurora_pgvector()

    # Lambda that registers the Ingress-created ALB with Global Accelerator.
    self._create_ga_registration_lambda()

    # Lambda that installs KEDA and other Helm-based components.
    self._create_helm_installer_lambda()

    # Kubernetes manifests go after EFS so the EFS IDs are available.
    self._apply_kubernetes_manifests()

    # Daily CloudFormation drift detection with SNS alerts.
    self._create_drift_detection()

    # Dedicated IAM role for the MCP server.
    self._create_mcp_role()

    # Stack outputs describing the cluster.
    self._create_outputs()

    # cdk-nag suppressions for this stack.
    self._apply_nag_suppressions()

371 

def _create_vpc_flow_logs(self) -> None:
    """Send VPC flow logs for ``self.vpc`` to a CloudWatch log group.

    Flow logs record IP traffic to and from the network interfaces in the
    VPC; they are kept for security monitoring and compliance (HIPAA, SOC2,
    etc.). Three resources are created: the destination log group, the IAM
    role the flow-logs service assumes to write into it, and the flow log
    itself, which captures all traffic (accepted and rejected).
    """
    # Destination log group: one month retention, deleted with the stack.
    # log_group_name is deliberately not set so CDK generates a unique name.
    destination_group = logs.LogGroup(
        self,
        "VpcFlowLogGroup",
        retention=logs.RetentionDays.ONE_MONTH,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Role assumed by the VPC Flow Logs service.
    delivery_role = iam.Role(
        self,
        "VpcFlowLogRole",
        assumed_by=iam.ServicePrincipal("vpc-flow-logs.amazonaws.com"),
    )

    # Write access limited to this log group and the streams inside it.
    group_arn = destination_group.log_group_arn
    write_access = iam.PolicyStatement(
        actions=[
            "logs:CreateLogStream",
            "logs:PutLogEvents",
            "logs:DescribeLogGroups",
            "logs:DescribeLogStreams",
        ],
        resources=[group_arn, f"{group_arn}:*"],
    )
    delivery_role.add_to_policy(write_access)

    # The flow log itself.
    ec2.FlowLog(
        self,
        "VpcFlowLog",
        resource_type=ec2.FlowLogResourceType.from_vpc(self.vpc),
        destination=ec2.FlowLogDestination.to_cloud_watch_logs(
            destination_group, delivery_role
        ),
        traffic_type=ec2.FlowLogTrafficType.ALL,
    )

415 

def _apply_nag_suppressions(self) -> None:
    """Register the cdk-nag suppressions for this regional stack.

    Delegates to ``apply_all_suppressions`` with ``stack_type="regional"``
    and the region list / global region taken from ``self.config``.
    """
    # Imported at call time rather than at module load (the reason is not
    # visible here; presumably to avoid an import cycle — confirm).
    from gco.stacks.nag_suppressions import apply_all_suppressions

    deployed_regions = self.config.get_regions()
    home_region = self.config.get_global_region()
    apply_all_suppressions(
        self,
        stack_type="regional",
        regions=deployed_regions,
        global_region=home_region,
    )

426 

def _create_sqs_queue(self) -> None:
    """Create the regional job queue, its dead-letter queue, and outputs.

    The job queue is the default ingestion point for this region: submitted
    jobs are processed by the manifest processor, and KEDA scales on queue
    depth. A message that has been received 3 times without being deleted is
    moved to the dead-letter queue.

    Both queues require SSL, use SQS-managed server-side encryption
    (``QueueEncryption.SQS_MANAGED``) and are destroyed with the stack.

    Sets ``self.job_dlq`` and ``self.job_queue`` and exports the job queue
    URL and ARN plus the dead-letter queue URL as stack outputs.
    """
    project_name = self.config.get_project_name()
    region = self.deployment_region

    # Properties common to both queues.
    hardening: dict[str, Any] = {
        "removal_policy": RemovalPolicy.DESTROY,
        "enforce_ssl": True,
        "encryption": sqs.QueueEncryption.SQS_MANAGED,
    }

    # Dead-letter queue first, because the main queue references it.
    self.job_dlq = sqs.Queue(
        self,
        "JobDeadLetterQueue",
        queue_name=f"{project_name}-jobs-dlq-{region}",
        retention_period=Duration.days(14),
        **hardening,
    )

    # Main job queue.
    redrive = sqs.DeadLetterQueue(max_receive_count=3, queue=self.job_dlq)
    self.job_queue = sqs.Queue(
        self,
        "JobQueue",
        queue_name=f"{project_name}-jobs-{region}",
        # Per the original author's note this matches the Lambda timeout
        # (which Lambda is not visible from this method).
        visibility_timeout=Duration.minutes(5),
        retention_period=Duration.days(7),
        dead_letter_queue=redrive,
        **hardening,
    )

    # (output id, value, description prefix, export-name slug)
    exported = (
        ("JobQueueUrl", self.job_queue.queue_url, "SQS Job Queue URL", "job-queue-url"),
        ("JobQueueArn", self.job_queue.queue_arn, "SQS Job Queue ARN", "job-queue-arn"),
        ("JobDlqUrl", self.job_dlq.queue_url, "SQS Dead Letter Queue URL", "job-dlq-url"),
    )
    for output_id, value, label, slug in exported:
        CfnOutput(
            self,
            output_id,
            value=value,
            description=f"{label} for {region}",
            export_name=f"{project_name}-{slug}-{region}",
        )

490 

def _create_aws_custom_resource_role(self) -> None:
    """Create the single execution role used by every ``AwsCustomResource``.

    Why the role is pre-created
        By default ``cr.AwsCustomResource`` derives a Lambda execution role
        from its ``policy=`` argument. CDK funnels all of those onto one
        *singleton* provider Lambda (logical id prefix
        ``AWS679f53fac002430cb0da5b7982bd22872``) and merges each custom
        resource's statements onto that Lambda's role while the stack is
        being created. On a cold deploy CloudFormation invokes the Lambda
        2-3 seconds after a statement is attached, which is quicker than
        IAM propagates it. The visible failure is ``iam:PassRole NOT
        authorized`` on whichever addon-role update runs right after its
        ``iam:PassRole`` statement was attached.

        Creating the role first, and passing
        ``role=self.aws_custom_resource_role`` instead of ``policy=`` to
        every ``AwsCustomResource``, gives the inline policy minutes to
        replicate before any custom resource fires, which removes the race.

    What is attached here
        Only statements that can be computed without a cluster reference:

        * ``eks:UpdateAddon`` / ``eks:DescribeAddon`` on this cluster's
          addons, and
        * ``ssm:GetParameter`` on this project's parameters in the global
          region (used to read the Global Accelerator endpoint group ARN).

        ``iam:PassRole`` statements for the individual addon roles (EFS CSI,
        FSx CSI, CloudWatch Observability) are added later by the matching
        ``_create_*_addon`` methods once each role exists, so every PassRole
        resource list stays exact and wildcard-free.

    Sets ``self.aws_custom_resource_role``.
    """
    project = self.config.get_project_name()
    home_region = self.config.get_global_region()
    cluster_name = self.cluster_config.cluster_name

    shared_role = iam.Role(
        self,
        "AwsCustomResourceRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        description=(
            "Shared execution role for every cr.AwsCustomResource in this "
            "stack. Pre-created to avoid the IAM policy propagation race "
            "that occurs when CDK auto-generates per-CR roles and the "
            "singleton provider Lambda fires before the freshly-attached "
            "policy has replicated globally."
        ),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"
            ),
        ],
    )
    self.aws_custom_resource_role = shared_role

    # Addon management for the three updateAddon custom resources (EFS CSI,
    # FSx CSI, CloudWatch Observability), limited to this cluster's addons.
    addon_arn_pattern = (
        f"arn:aws:eks:{self.deployment_region}:{self.account}:addon/{cluster_name}/*"
    )
    shared_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["eks:UpdateAddon", "eks:DescribeAddon"],
            resources=[addon_arn_pattern],
        )
    )

    # Parameter reads for the GetEndpointGroupArn custom resource in
    # _create_ga_registration_lambda, which fetches the Global Accelerator
    # endpoint group ARN published by the global stack.
    parameter_arn_pattern = f"arn:aws:ssm:{home_region}:{self.account}:parameter/{project}/*"
    shared_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["ssm:GetParameter"],
            resources=[parameter_arn_pattern],
        )
    )

    # cdk-nag: both wildcards above are intentional.
    #
    # - addon/<cluster>/* : one shared role serves three updateAddon custom
    #   resources, each with its own addon ARN. The wildcard is still pinned
    #   to one cluster in one region in one account.
    # - parameter/<project>/* : restricts reads to parameters under this
    #   project's prefix.
    from cdk_nag import NagSuppressions

    wildcard_justification = (
        "Scoped to a single EKS cluster's addons "
        "(addon/<cluster>/*) and this project's SSM "
        "parameters (parameter/<project>/*). Both wildcards "
        "are as tight as AWS IAM permits: addon names and "
        "parameter names are not known at stack synthesis "
        "time because the addons are created later in the "
        "same stack and the GA endpoint group ARN is "
        "published by a separate stack during deploy. The "
        "shared role pattern itself is deliberate — see "
        "_create_aws_custom_resource_role docstring for why "
        "we pre-create instead of letting CDK auto-generate "
        "per-CR roles."
    )
    # appliesTo must match the findings as cdk-nag renders them, i.e. with
    # the account shown as <AWS::AccountId>.
    suppressed_findings = [
        f"Resource::arn:aws:eks:{self.deployment_region}"
        f":<AWS::AccountId>:addon/{cluster_name}/*",
        f"Resource::arn:aws:ssm:{home_region}:<AWS::AccountId>:parameter/{project}/*",
    ]
    NagSuppressions.add_resource_suppressions(
        shared_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": wildcard_justification,
                "appliesTo": suppressed_findings,
            },
        ],
        apply_to_children=True,
    )

624 

def _create_container_images(self) -> None:
    """Create the service ECR repositories and Docker image assets.

    Creates two ECR repositories (health monitor, manifest processor) and
    Docker image assets for the health monitor, manifest processor,
    inference monitor and, when enabled, the queue processor. Each image
    URI is published as a stack output.

    All images are built for ``linux/amd64`` to match EKS Auto Mode's
    default system nodepool, using the current directory as build context.

    The queue processor is a KEDA ScaledJob that consumes manifests from the
    regional SQS queue. It is controlled by the ``queue_processor`` context
    entry (``enabled``, default ``True``) so users can supply their own
    consumer. When disabled, no image is built and the
    ``post-helm-sqs-consumer.yaml`` manifest is skipped because its template
    variables stay unreplaced.
    """

    def new_repository(construct_id: str) -> ecr.Repository:
        # repository_name is deliberately not set so CDK generates a unique
        # name. DESTROY + empty_on_delete suit dev/test (use RETAIN for
        # production); scan-on-push enables vulnerability scanning.
        return ecr.Repository(
            self,
            construct_id,
            removal_policy=RemovalPolicy.DESTROY,
            empty_on_delete=True,
            image_scan_on_push=True,
        )

    def build_image(construct_id: str, dockerfile: str) -> ecr_assets.DockerImageAsset:
        # Build context is the current directory; always target AMD64.
        return ecr_assets.DockerImageAsset(
            self,
            construct_id,
            directory=".",
            file=dockerfile,
            platform=ecr_assets.Platform.LINUX_AMD64,
        )

    def publish_uri(
        output_id: str, image: ecr_assets.DockerImageAsset, description: str
    ) -> None:
        CfnOutput(self, output_id, value=image.image_uri, description=description)

    # Health monitor.
    self.health_monitor_repo = new_repository("HealthMonitorRepo")
    self.health_monitor_image = build_image(
        "HealthMonitorImage", "dockerfiles/health-monitor-dockerfile"
    )

    # Manifest processor.
    self.manifest_processor_repo = new_repository("ManifestProcessorRepo")
    self.manifest_processor_image = build_image(
        "ManifestProcessorImage", "dockerfiles/manifest-processor-dockerfile"
    )

    publish_uri(
        "HealthMonitorImageUri",
        self.health_monitor_image,
        "Health Monitor Docker image URI",
    )
    publish_uri(
        "ManifestProcessorImageUri",
        self.manifest_processor_image,
        "Manifest Processor Docker image URI",
    )

    # Inference monitor.
    self.inference_monitor_image = build_image(
        "InferenceMonitorImage", "dockerfiles/inference-monitor-dockerfile"
    )
    publish_uri(
        "InferenceMonitorImageUri",
        self.inference_monitor_image,
        "Inference Monitor Docker image URI",
    )

    # Queue processor (optional; on unless the context says otherwise).
    queue_processor_config = self.node.try_get_context("queue_processor") or {}
    self.queue_processor_enabled = queue_processor_config.get("enabled", True)
    if not self.queue_processor_enabled:
        return

    self.queue_processor_image = build_image(
        "QueueProcessorImage", "dockerfiles/queue-processor-dockerfile"
    )
    publish_uri(
        "QueueProcessorImageUri",
        self.queue_processor_image,
        "Queue Processor Docker image URI",
    )

723 

724 def _create_eks_cluster(self, cluster_config: Any) -> None: 

725 """Create the EKS cluster with auto mode and GPU node groups""" 

726 

727 # Create cluster admin role 

728 # role_name intentionally omitted - let CDK generate unique name 

729 cluster_admin_role = iam.Role( 

730 self, 

731 "ClusterAdminRole", 

732 assumed_by=iam.ServicePrincipal("eks.amazonaws.com"), 

733 managed_policies=[ 

734 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSClusterPolicy") 

735 ], 

736 ) 

737 

738 # Create node group role 

739 # role_name intentionally omitted - let CDK generate unique name 

740 iam.Role( 

741 self, 

742 "NodeGroupRole", 

743 assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"), 

744 managed_policies=[ 

745 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSWorkerNodePolicy"), 

746 iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKS_CNI_Policy"), 

747 iam.ManagedPolicy.from_aws_managed_policy_name( 

748 "AmazonEC2ContainerRegistryReadOnly" 

749 ), 

750 ], 

751 ) 

752 

753 # Create EKS Auto Mode cluster with built-in system and general-purpose nodepools 

754 # Auto Mode automatically manages compute resources and comes with essential addons 

755 # Get endpoint access configuration 

756 eks_config = self.config.get_eks_cluster_config() 

757 endpoint_access_mode = eks_config.get("endpoint_access", "PRIVATE") 

758 

759 # Map config string to EKS EndpointAccess enum 

760 endpoint_access = ( 

761 eks.EndpointAccess.PRIVATE 

762 if endpoint_access_mode == "PRIVATE" 

763 else eks.EndpointAccess.PUBLIC_AND_PRIVATE 

764 ) 

765 

766 # Create KMS key for EKS secrets encryption 

767 self.eks_encryption_key = kms.Key( 

768 self, 

769 "EksSecretsEncryptionKey", 

770 description="KMS key for EKS Kubernetes secrets encryption", 

771 enable_key_rotation=True, 

772 removal_policy=RemovalPolicy.RETAIN, 

773 ) 

774 

775 # Get Kubernetes version - use custom version if not available in CDK enum 

776 k8s_version_str = cluster_config.kubernetes_version 

777 try: 

778 k8s_version = getattr(eks.KubernetesVersion, f"V{k8s_version_str.replace('.', '_')}") 

779 except AttributeError: 

780 # Version not in CDK enum yet, use custom version 

781 k8s_version = eks.KubernetesVersion.of(k8s_version_str) 

782 

783 self.cluster = eks.Cluster( 

784 self, 

785 "GCOEksCluster", 

786 cluster_name=cluster_config.cluster_name, 

787 version=k8s_version, # Use configured version for Auto Mode with DRA support 

788 vpc=self.vpc, 

789 compute=eks.ComputeConfig( 

790 # Enable both built-in node pools - Auto Mode manages these automatically 

791 node_pools=["system", "general-purpose"] 

792 ), 

793 # SECURITY: Endpoint access controlled via cdk.json eks_cluster.endpoint_access 

794 # PRIVATE (default): EKS API accessible only from within VPC - most secure 

795 # Job submission works via API Gateway → Lambda (in VPC) or SQS 

796 # For kubectl access, use a bastion host, VPN, or AWS SSM Session Manager 

797 # PUBLIC_AND_PRIVATE: EKS API accessible from internet and VPC 

798 # Allows direct kubectl access but less secure 

799 endpoint_access=endpoint_access, 

800 role=cluster_admin_role, 

801 vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)], 

802 # Enable all control plane logging for security and compliance 

803 cluster_logging=[ 

804 eks.ClusterLoggingTypes.API, 

805 eks.ClusterLoggingTypes.AUDIT, 

806 eks.ClusterLoggingTypes.AUTHENTICATOR, 

807 eks.ClusterLoggingTypes.CONTROLLER_MANAGER, 

808 eks.ClusterLoggingTypes.SCHEDULER, 

809 ], 

810 # SECURITY: Enable envelope encryption for Kubernetes secrets using KMS 

811 secrets_encryption_key=self.eks_encryption_key, 

812 ) 

813 

814 # Auto Mode comes with essential addons pre-configured: 

815 # - AWS Load Balancer Controller (for ALB/NLB integration) 

816 # - CoreDNS, kube-proxy, VPC CNI (standard Kubernetes components) 

817 

818 # OIDC provider for IRSA — the primary credential injection mechanism. 

819 # IRSA uses projected service-account tokens exchanged via the OIDC provider 

820 # for temporary AWS credentials. This works reliably on EKS Auto Mode. 

821 self.oidc_provider = eks.OidcProviderNative( 

822 self, 

823 "OidcProvider", 

824 url=self.cluster.cluster_open_id_connect_issuer_url, 

825 ) 

826 

827 # Pod Identity Agent add-on — registers the admission webhook that injects 

828 # Pod Identity credentials. On Auto Mode the DaemonSet schedules 0 pods 

829 # (the agent is built into the node), but the add-on registration is still 

830 # needed for the control-plane webhook. Kept as a secondary credential path. 

831 self._create_pod_identity_agent_addon() 

832 

833 # Add Metrics Server add-on for HPA and resource monitoring 

834 self._create_metrics_server_addon() 

835 

836 # Add EFS CSI Driver add-on for shared storage 

837 self._create_efs_csi_driver_addon() 

838 

839 # Add CloudWatch Observability add-on for Container Insights metrics 

840 self._create_cloudwatch_observability_addon() 

841 

842 # NOTE: GPU compute is configured via Karpenter NodePools (not managed node groups) 

843 # NodePool manifests are located in lambda/kubectl-applier-simple/manifests/: 

844 # - 40-nodepool-gpu-x86.yaml: x86_64 GPU instances (g4dn, g5, g6, g6e, p3) 

845 # - 41-nodepool-gpu-arm.yaml: ARM64 GPU instances (g5g) 

846 # - 42-nodepool-inference.yaml: inference-optimized GPU instances 

847 # - 43-nodepool-efa.yaml: EFA-enabled instances (p4d, p5/p5e/p5en, p6-b200/p6-b300/p6e-gb200) 

848 # - 44-nodepool-neuron.yaml: Trainium/Inferentia instances 

849 # These will be applied by the kubectl Lambda custom resource (created below) 

850 

851 # Create IRSA role for service account to access secrets 

852 self._create_service_account_role() 

853 

854 # Create kubectl Lambda for applying Kubernetes manifests 

855 self._create_kubectl_lambda() 

856 

# ── Shared toleration config for EKS add-ons ──────────────────────────
# GCO nodepools carry the taints listed below (nvidia.com/gpu,
# aws.amazon.com/neuron, vpc.amazonaws.com/efa), and those taints keep
# DaemonSet pods from scheduling. Any add-on that runs a DaemonSet (or may
# land on a tainted node) is given these tolerations so storage drivers,
# metrics agents, and other infrastructure components work on every node
# type. Each entry tolerates the taint key regardless of its value
# ("Exists") for the "NoSchedule" effect.
_ADDON_NODE_TOLERATIONS = [
    {"key": taint_key, "operator": "Exists", "effect": "NoSchedule"}
    for taint_key in (
        "nvidia.com/gpu",
        "aws.amazon.com/neuron",
        "vpc.amazonaws.com/efa",
    )
]

868 

def _create_pod_identity_agent_addon(self) -> None:
    """Register the ``eks-pod-identity-agent`` add-on on the cluster.

    The add-on is pinned to ``EKS_ADDON_POD_IDENTITY_AGENT``, is not
    preserved when the construct is deleted, and receives the shared
    ``_ADDON_NODE_TOLERATIONS`` as its configuration.

    Design note carried over from the original documentation: on Auto Mode
    the add-on's DaemonSet schedules 0 pods (the agent is built into the
    node runtime), but the registration is still required for the
    control-plane admission webhook that injects Pod Identity tokens.
    """
    addon_config = {"tolerations": self._ADDON_NODE_TOLERATIONS}

    eks.Addon(
        self,
        "PodIdentityAgentAddon",
        addon_name="eks-pod-identity-agent",
        addon_version=EKS_ADDON_POD_IDENTITY_AGENT,
        cluster=self.cluster,  # type: ignore[arg-type]
        configuration_values=addon_config,
        preserve_on_delete=False,
    )

887 

def _create_metrics_server_addon(self) -> None:
    """Register the ``metrics-server`` add-on on the cluster.

    The add-on is pinned to ``EKS_ADDON_METRICS_SERVER``, is not preserved
    when the construct is deleted, and receives the shared
    ``_ADDON_NODE_TOLERATIONS`` as its configuration.

    Metrics Server collects resource metrics from kubelets and serves them
    through the Kubernetes API server, which is what the following rely on:
        - Horizontal Pod Autoscaler (HPA)
        - Vertical Pod Autoscaler (VPA)
        - ``kubectl top``
        - resource monitoring dashboards

    No IAM role is created here: the add-on only needs in-cluster
    permissions, which its own service account provides.
    """
    addon_config = {"tolerations": self._ADDON_NODE_TOLERATIONS}

    eks.Addon(
        self,
        "MetricsServerAddon",
        addon_name="metrics-server",
        addon_version=EKS_ADDON_METRICS_SERVER,
        cluster=self.cluster,  # type: ignore[arg-type]
        configuration_values=addon_config,
        preserve_on_delete=False,
    )

912 

def _create_efs_csi_driver_addon(self) -> None:
    """Install the EFS CSI driver add-on and attach its IAM role.

    Steps, in order:
        1. Build ``self.efs_csi_role`` via ``_create_irsa_role`` for the
           ``efs-csi-controller-sa`` service account in ``kube-system``
           and attach the ``service-role/AmazonEFSCSIDriverPolicy``
           managed policy.
        2. Create the ``aws-efs-csi-driver`` add-on with the shared
           tolerations applied to both its ``node`` and ``controller``
           sections.
        3. Allow the shared custom-resource execution role to
           ``iam:PassRole`` the new role.
        4. Create an ``AwsCustomResource`` that calls EKS ``updateAddon``
           with ``serviceAccountRoleArn`` on create and on update, and
           store it as ``self._efs_csi_addon_role_update``.

    The EFS CSI driver lets pods mount EFS file systems as persistent
    volumes; it backs the shared storage feature.
    """
    addon_name = "aws-efs-csi-driver"
    tolerations = self._ADDON_NODE_TOLERATIONS

    # IAM role for the driver's controller service account.
    self.efs_csi_role = csi_role = GCORegionalStack._create_irsa_role(
        self,
        "EfsCsiDriverRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=["efs-csi-controller-sa"],
        namespaces=["kube-system"],
    )
    csi_role.add_managed_policy(
        iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEFSCSIDriverPolicy")
    )

    # The add-on itself; both workloads it ships get the tolerations.
    driver_addon = eks.Addon(
        self,
        "EfsCsiDriverAddon",
        addon_name=addon_name,
        addon_version=EKS_ADDON_EFS_CSI_DRIVER,
        cluster=self.cluster,  # type: ignore[arg-type]
        configuration_values={
            "node": {"tolerations": tolerations},
            "controller": {"tolerations": tolerations},
        },
        preserve_on_delete=False,
    )

    # The shared AwsCustomResource execution role must be able to pass the
    # CSI role to EKS. The role is pre-created and extended here rather
    # than letting CDK generate one role per custom resource; the full
    # rationale is documented where that role is created
    # (_create_aws_custom_resource_role).
    pass_role = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["iam:PassRole"],
        resources=[csi_role.role_arn],
    )
    self.aws_custom_resource_role.add_to_policy(pass_role)

    # The eks v2 alpha Addon construct has no service_account_role
    # argument, so the role ARN is attached afterwards with an SDK call.
    # The same parameters are sent on create and on update; only the
    # create call fixes the physical resource id.
    update_params = {
        "clusterName": self.cluster.cluster_name,
        "addonName": addon_name,
        "serviceAccountRoleArn": csi_role.role_arn,
    }
    role_update = cr.AwsCustomResource(
        self,
        "UpdateEfsCsiAddonRole",
        on_create=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{self.cluster.cluster_name}-efs-csi-role-update"
            ),
        ),
        on_update=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
        ),
        role=self.aws_custom_resource_role,
    )

    # Order the SDK call after the add-on, the CSI role, and the shared
    # execution role. Depending on the execution role is intended to let
    # CloudFormation finish attaching its inline policy before the Lambda
    # behind the custom resource fires.
    for prerequisite in (driver_addon, csi_role, self.aws_custom_resource_role):
        role_update.node.add_dependency(prerequisite)

    # Kept on the stack so later steps can depend on it. Per the original
    # notes, _apply_kubernetes_manifests uses it to make the kubectl Lambda
    # wait for the role attachment before rollout-restarting the
    # efs-csi-controller; otherwise the restarted pods could come up
    # without credentials and every EFS CreateAccessPoint would fail with
    # a 401 from IMDS.
    self._efs_csi_addon_role_update = role_update

1012 

def _create_cloudwatch_observability_addon(self) -> None:
    """Install the CloudWatch Observability add-on and attach its IAM role.

    Steps, in order:
        1. Build ``self.cloudwatch_role`` via ``_create_irsa_role`` for the
           ``cloudwatch-agent`` service account in ``amazon-cloudwatch``
           and attach the ``CloudWatchAgentServerPolicy`` and
           ``AWSXrayWriteOnlyAccess`` managed policies.
        2. Create the ``amazon-cloudwatch-observability`` add-on with the
           shared tolerations and container log collection enabled.
        3. Allow the shared custom-resource execution role to
           ``iam:PassRole`` the new role.
        4. Create an ``AwsCustomResource`` that calls EKS ``updateAddon``
           with ``serviceAccountRoleArn`` on create and on update, and
           store it as ``self._cloudwatch_addon_role_update``.

    The add-on enables Container Insights for the cluster (cluster, node,
    pod and container metrics, plus application logs), which the
    monitoring dashboard uses to show cluster health and utilization.
    """
    addon_name = "amazon-cloudwatch-observability"

    # IAM role for the CloudWatch agent's service account.
    self.cloudwatch_role = agent_role = GCORegionalStack._create_irsa_role(
        self,
        "CloudWatchObservabilityRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=["cloudwatch-agent"],
        namespaces=["amazon-cloudwatch"],
    )
    for managed_policy_name in ("CloudWatchAgentServerPolicy", "AWSXrayWriteOnlyAccess"):
        agent_role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name(managed_policy_name)
        )

    observability_addon = eks.Addon(
        self,
        "CloudWatchObservabilityAddon",
        addon_name=addon_name,
        addon_version=EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
        cluster=self.cluster,  # type: ignore[arg-type]
        configuration_values={
            "tolerations": self._ADDON_NODE_TOLERATIONS,
            # Container Insights with application log collection; logs go
            # to /aws/containerinsights/{cluster}/application.
            "containerLogs": {
                "enabled": True,
            },
        },
        preserve_on_delete=False,
    )

    # The shared AwsCustomResource execution role must be able to pass the
    # agent role to EKS; see _create_aws_custom_resource_role for why that
    # role is shared and pre-created.
    pass_role = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["iam:PassRole"],
        resources=[agent_role.role_arn],
    )
    self.aws_custom_resource_role.add_to_policy(pass_role)

    # Attach the role ARN to the add-on with an SDK call. The same
    # parameters are sent on create and on update; only the create call
    # fixes the physical resource id.
    update_params = {
        "clusterName": self.cluster.cluster_name,
        "addonName": addon_name,
        "serviceAccountRoleArn": agent_role.role_arn,
    }
    role_update = cr.AwsCustomResource(
        self,
        "UpdateCloudWatchAddonRole",
        on_create=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{self.cluster.cluster_name}-cw-obs-role-update"
            ),
        ),
        on_update=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_params,
        ),
        role=self.aws_custom_resource_role,
    )

    # Order the SDK call after the add-on, the agent role, and the shared
    # execution role (so its inline policy is attached before the Lambda
    # fires). Per the original notes, no dependency between the individual
    # custom resources is needed: pre-creating the shared role removed the
    # race that such a chain used to serialize.
    for prerequisite in (observability_addon, agent_role, self.aws_custom_resource_role):
        role_update.node.add_dependency(prerequisite)

    # Kept on the stack so later steps can depend on it. Per the original
    # notes, _apply_kubernetes_manifests uses it to make the kubectl Lambda
    # wait for the role attachment before rollout-restarting the
    # cloudwatch-agent DaemonSet.
    self._cloudwatch_addon_role_update = role_update

1116 

def _create_service_account_role(self) -> None:
    """Create ``self.service_account_role`` and grant it its permissions.

    The role is built by ``_create_irsa_role`` from the cluster's OIDC
    provider ARN and issuer URL, for these service accounts:

        - ``gco-service-account``
        - ``gco-health-monitor-sa``
        - ``gco-manifest-processor-sa``
        - ``gco-inference-monitor-sa``

    in the ``gco-system``, ``gco-jobs`` and ``gco-inference`` namespaces.
    NOTE(review): the original comments describe the trust policy as
    "IRSA (OIDC) trust + Pod Identity trust"; the trust policy itself is
    produced inside ``_create_irsa_role`` — confirm there.

    Permissions added, in this order: read access to the auth secret, the
    AWS Load Balancer Controller permission set (EC2/ELB describe and
    mutate, service-linked role creation, WAFv2, Shield, ACM, Cognito),
    SQS access to the job queue and DLQ, namespaced CloudWatch
    ``PutMetricData``, DynamoDB access to the project tables in the global
    region, S3 read access to project-prefixed buckets, and S3-scoped KMS
    decrypt.

    Finishes by creating the KEDA operator role and the Pod Identity
    associations.
    """
    from cdk_nag import NagSuppressions

    # The service-account list below must stay in sync with:
    #   - lambda/kubectl-applier-simple/manifests/01-serviceaccounts.yaml
    #     (gco-service-account)
    #   - lambda/kubectl-applier-simple/manifests/02-rbac.yaml
    #     (gco-health-monitor-sa, gco-manifest-processor-sa,
    #     gco-inference-monitor-sa)
    #   - lambda/kubectl-applier-simple/manifests/04a-jobs-serviceaccount.yaml
    #     (gco-service-account in gco-jobs)
    self.service_account_role = role = GCORegionalStack._create_irsa_role(
        self,
        "ServiceAccountRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=[
            "gco-service-account",
            "gco-health-monitor-sa",
            "gco-manifest-processor-sa",
            "gco-inference-monitor-sa",
        ],
        namespaces=["gco-system", "gco-jobs", "gco-inference"],
    )

    def allow(actions: list[str], resources: list[str], conditions: Any = None) -> None:
        """Append one ALLOW statement to the service account role."""
        role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=actions,
                resources=resources,
                conditions=conditions,
            )
        )

    # ── Auth secret ───────────────────────────────────────────────────
    # An explicit statement with a trailing wildcard is used because:
    #   1. The secret is in a different region (API Gateway region).
    #   2. CDK's grant_read() emits a ?????? suffix that requires exactly
    #      6 characters, while the SDK may call GetSecretValue with either
    #      the full ARN (with suffix) or the partial ARN (without it).
    #   3. A trailing * makes both forms match.
    # This grant stays ahead of the cdk-nag suppressions below, which are
    # applied with apply_to_children=True.
    allow(
        ["secretsmanager:GetSecretValue", "secretsmanager:DescribeSecret"],
        [f"{self.auth_secret_arn}*"],  # Wildcard to match with or without suffix
    )

    # ── cdk-nag suppressions ──────────────────────────────────────────
    # The trailing * on the auth secret ARN only covers the random
    # 6-character suffix Secrets Manager appends at creation time
    # (arn:...:secret:my-secret-AbC123). The secret lives in another stack
    # (api_gateway_global_stack) and arrives as a cross-stack token, so
    # the suffix is unknown at synth time.
    secret_suffix_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The trailing ``*`` matches the 6-character "
            "random suffix Secrets Manager appends to secret "
            "ARNs. The secret is created in a different stack "
            "(api_gateway_global_stack) and referenced here "
            "via a cross-stack token, so the actual suffix "
            "isn't known at synth time. The wildcard is "
            "bounded to a single secret — it does not grant "
            "access to any other secret in the account."
        ),
        "appliesTo": [
            {"regex": "/^Resource::<GCOAuthSecret.*>\\*$/"},
        ],
    }
    NagSuppressions.add_resource_suppressions(
        role,
        [secret_suffix_suppression],
        apply_to_children=True,
    )

    # ec2:Describe* and elasticloadbalancing:Describe* for the AWS Load
    # Balancer Controller do not support resource-level scoping.
    # NOTE(review): "Resource::*" matches every wildcard-resource finding
    # on this role, not only the Describe* statement the reason text
    # mentions — confirm that breadth is intended.
    describe_wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The ServiceAccountRole grants ec2:Describe* and "
            "elasticloadbalancing:Describe* for the AWS Load Balancer "
            "Controller. These AWS APIs do not support resource-level "
            "IAM scoping — Resource: * is the only valid form. See "
            "https://docs.aws.amazon.com/service-authorization/latest/"
            "reference/list_amazonec2.html"
        ),
        "appliesTo": ["Resource::*"],
    }
    NagSuppressions.add_resource_suppressions(
        role,
        [describe_wildcard_suppression],
        apply_to_children=True,
    )

    # ── AWS Load Balancer Controller ──────────────────────────────────
    # (actions, conditions) pairs; every statement targets Resource "*".
    # The tuple order is the order in which statements are added.
    load_balancer_controller_grants = (
        (
            [
                "ec2:DescribeAccountAttributes",
                "ec2:DescribeAddresses",
                "ec2:DescribeAvailabilityZones",
                "ec2:DescribeInternetGateways",
                "ec2:DescribeVpcs",
                "ec2:DescribeVpcPeeringConnections",
                "ec2:DescribeSubnets",
                "ec2:DescribeSecurityGroups",
                "ec2:DescribeInstances",
                "ec2:DescribeNetworkInterfaces",
                "ec2:DescribeTags",
                "ec2:GetCoipPoolUsage",
                "ec2:DescribeCoipPools",
                "elasticloadbalancing:DescribeLoadBalancers",
                "elasticloadbalancing:DescribeLoadBalancerAttributes",
                "elasticloadbalancing:DescribeListeners",
                "elasticloadbalancing:DescribeListenerCertificates",
                "elasticloadbalancing:DescribeSSLPolicies",
                "elasticloadbalancing:DescribeRules",
                "elasticloadbalancing:DescribeTargetGroups",
                "elasticloadbalancing:DescribeTargetGroupAttributes",
                "elasticloadbalancing:DescribeTargetHealth",
                "elasticloadbalancing:DescribeTags",
            ],
            None,
        ),
        (
            [
                "elasticloadbalancing:CreateLoadBalancer",
                "elasticloadbalancing:CreateTargetGroup",
                "elasticloadbalancing:CreateListener",
                "elasticloadbalancing:DeleteLoadBalancer",
                "elasticloadbalancing:DeleteTargetGroup",
                "elasticloadbalancing:DeleteListener",
                "elasticloadbalancing:ModifyLoadBalancerAttributes",
                "elasticloadbalancing:ModifyTargetGroup",
                "elasticloadbalancing:ModifyTargetGroupAttributes",
                "elasticloadbalancing:ModifyListener",
                "elasticloadbalancing:RegisterTargets",
                "elasticloadbalancing:DeregisterTargets",
                "elasticloadbalancing:SetWebAcl",
                "elasticloadbalancing:SetSecurityGroups",
                "elasticloadbalancing:SetSubnets",
                "elasticloadbalancing:AddTags",
                "elasticloadbalancing:RemoveTags",
            ],
            None,
        ),
        (
            [
                "ec2:CreateSecurityGroup",
                "ec2:CreateTags",
                "ec2:DeleteTags",
                "ec2:AuthorizeSecurityGroupIngress",
                "ec2:RevokeSecurityGroupIngress",
                "ec2:DeleteSecurityGroup",
            ],
            None,
        ),
        (
            ["iam:CreateServiceLinkedRole"],
            # Only the Elastic Load Balancing service-linked role.
            {"StringEquals": {"iam:AWSServiceName": "elasticloadbalancing.amazonaws.com"}},
        ),
        (
            [
                "wafv2:GetWebACL",
                "wafv2:GetWebACLForResource",
                "wafv2:AssociateWebACL",
                "wafv2:DisassociateWebACL",
            ],
            None,
        ),
        (
            [
                "shield:GetSubscriptionState",
                "shield:DescribeProtection",
                "shield:CreateProtection",
                "shield:DeleteProtection",
            ],
            None,
        ),
        (["acm:ListCertificates", "acm:DescribeCertificate"], None),
        (["cognito-idp:DescribeUserPoolClient"], None),
    )
    for lb_actions, lb_conditions in load_balancer_controller_grants:
        allow(lb_actions, ["*"], lb_conditions)

    # ── SQS (KEDA queue-depth scaling) ────────────────────────────────
    allow(
        [
            "sqs:GetQueueAttributes",
            "sqs:GetQueueUrl",
            "sqs:ReceiveMessage",
            "sqs:DeleteMessage",
            "sqs:SendMessage",
        ],
        [
            self.job_queue.queue_arn,
            self.job_dlq.queue_arn,
        ],
    )

    # ── CloudWatch custom metrics ─────────────────────────────────────
    # Used by health-monitor and manifest-processor; PutMetricData is
    # limited to their two namespaces by the condition.
    allow(
        ["cloudwatch:PutMetricData"],
        ["*"],
        {
            "StringEquals": {
                "cloudwatch:namespace": [
                    "GCO/HealthMonitor",
                    "GCO/ManifestProcessor",
                ]
            }
        },
    )

    # ── DynamoDB (templates, webhooks, job queue, inference endpoints) ─
    # The tables are created in the global stack and accessed from all
    # regions, hence the global region in every ARN.
    project_name = self.config.get_project_name()
    global_region = self.config.get_global_region()

    allow(
        [
            "dynamodb:GetItem",
            "dynamodb:PutItem",
            "dynamodb:UpdateItem",
            "dynamodb:DeleteItem",
            "dynamodb:Query",
            "dynamodb:Scan",
        ],
        [
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates/index/*",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks/index/*",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs/index/*",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints",
            f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints/index/*",
        ],
    )

    # ── S3 model weights (used by inference init containers) ──────────
    allow(
        [
            "s3:GetObject",
            "s3:ListBucket",
        ],
        [
            f"arn:aws:s3:::{project_name}-*",
            f"arn:aws:s3:::{project_name}-*/*",
        ],
    )

    # ── KMS for the model weights bucket ──────────────────────────────
    # Any key in this account, but only when the call comes via S3.
    allow(
        ["kms:Decrypt", "kms:GenerateDataKey"],
        [f"arn:aws:kms:*:{self.account}:key/*"],
        {
            "StringLike": {
                "kms:ViaService": "s3.*.amazonaws.com",
            }
        },
    )

    # KEDA operator IAM role for SQS access.
    self._create_keda_operator_role()

    # Pod Identity associations for all service accounts.
    self._create_pod_identity_associations()

1455 

def _create_keda_operator_role(self) -> None:
    """Create ``self.keda_operator_role`` for the KEDA operator.

    The role is built by ``_create_irsa_role`` for the ``keda-operator``
    service account in the ``keda`` namespace and may only read queue
    attributes and URLs of the job queue and its dead-letter queue, which
    is what KEDA needs to scale on queue depth.
    """
    operator_role = GCORegionalStack._create_irsa_role(
        self,
        "KedaOperatorRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=["keda-operator"],
        namespaces=["keda"],
    )
    self.keda_operator_role = operator_role

    # Read-only view of queue metrics; no message-level actions.
    queue_metrics_access = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=[
            "sqs:GetQueueAttributes",
            "sqs:GetQueueUrl",
        ],
        resources=[
            self.job_queue.queue_arn,
            self.job_dlq.queue_arn,
        ],
    )
    operator_role.add_to_policy(queue_metrics_access)

1487 

def _create_pod_identity_associations(self) -> None:
    """Create one EKS Pod Identity association per service-account binding.

    Each association links an IAM role to a Kubernetes service account in
    one namespace of this cluster. The created constructs are collected,
    in creation order, in ``self._pod_identity_associations`` so that
    other resources (per the original notes, the kubectl-applier custom
    resource) can declare an explicit dependency on them and credentials
    are in place before workloads start.
    """
    # (construct id, namespace, service account, role) — one row per
    # association, in creation order.
    # GCO service account — used by health-monitor, manifest-processor,
    # inference-monitor; bound once per GCO namespace.
    bindings = [
        (
            f"PodIdentity-gco-sa-{namespace}",
            namespace,
            "gco-service-account",
            self.service_account_role,
        )
        for namespace in ["gco-system", "gco-jobs", "gco-inference"]
    ]
    bindings += [
        # KEDA operator — needs SQS access for queue-based scaling
        ("PodIdentity-keda-operator", "keda", "keda-operator", self.keda_operator_role),
        # EFS CSI driver — needs EFS access for shared storage
        ("PodIdentity-efs-csi", "kube-system", "efs-csi-controller-sa", self.efs_csi_role),
        # CloudWatch agent — needs CloudWatch access for observability
        ("PodIdentity-cloudwatch", "amazon-cloudwatch", "cloudwatch-agent", self.cloudwatch_role),
    ]

    self._pod_identity_associations: list[Any] = [
        eks_l1.CfnPodIdentityAssociation(
            self,
            construct_id,
            cluster_name=self.cluster.cluster_name,
            namespace=target_namespace,
            service_account=service_account,
            role_arn=bound_role.role_arn,
        )
        for construct_id, target_namespace, service_account, bound_role in bindings
    ]

    # FSx CSI driver — only when FSx is enabled (created later in _create_fsx_lustre)
    # The FSx Pod Identity association is added in _create_fsx_lustre instead

1548 

def _resolve_cluster_shared_bucket_from_ssm(self) -> SharedBucketIdentity:
    """Resolve the ``Cluster_Shared_Bucket`` identity from cross-region SSM.

    For each of the suffixes ``name``, ``arn`` and ``region`` (in that
    order) this creates one ``cr.AwsCustomResource`` that calls SSM
    ``getParameter`` in the configured global region for
    ``<CLUSTER_SHARED_SSM_PARAMETER_PREFIX>/<suffix>``, both on create and
    on update, and reads ``Parameter.Value`` from the response.

    Per the original notes, the parameters are published by
    ``GCOGlobalStack`` and the same cross-region read pattern is used in
    ``_create_ga_registration_lambda`` for the Global Accelerator endpoint
    group ARN.

    Returns:
        :class:`SharedBucketIdentity` whose ``name``, ``arn`` and
        ``region`` are the custom resources' response fields (CDK tokens
        that resolve at deploy time).
    """
    from cdk_nag import NagSuppressions

    global_region = self.config.get_global_region()

    def read_parameter(suffix: str) -> str:
        """Create the reader for one parameter and return its value token."""
        parameter_name = f"{CLUSTER_SHARED_SSM_PARAMETER_PREFIX}/{suffix}"

        # Create and update issue the exact same call, so one definition
        # serves both hooks.
        get_parameter = cr.AwsSdkCall(
            service="SSM",
            action="getParameter",
            parameters={"Name": parameter_name},
            region=global_region,
            physical_resource_id=cr.PhysicalResourceId.of(f"cluster-shared-{suffix}"),
        )

        # Rationale recorded in the original code: the calling principal
        # lives in this stack's region while the parameter lives in the
        # global region, so the policy uses ANY_RESOURCE and the resulting
        # AwsSolutions-IAM5 finding is suppressed right below.
        # NOTE(review): the parameter ARN in the global region looks
        # constructible from values available here — confirm whether the
        # policy could be narrowed to it instead of ANY_RESOURCE.
        reader = cr.AwsCustomResource(
            self,
            f"ReadClusterSharedBucket{suffix.capitalize()}",
            on_create=get_parameter,
            on_update=get_parameter,
            policy=cr.AwsCustomResourcePolicy.from_sdk_calls(
                resources=cr.AwsCustomResourcePolicy.ANY_RESOURCE
            ),
        )

        # Suppression scoped to this one custom resource: the action is
        # fixed to ssm:GetParameter and the parameter Name is a fixed
        # string, so the call only ever reads this one parameter.
        wildcard_suppression = {
            "id": "AwsSolutions-IAM5",
            "reason": (
                "Cross-region ssm:GetParameter for "
                f"{parameter_name} in the global region. The "
                "AwsCustomResource SDK-call policy is scoped to "
                "a single fixed action (ssm:GetParameter) with "
                "a fixed parameter Name — the Resource: * is "
                "the CDK-documented escape hatch because the "
                "parameter ARN is not known to the calling "
                "principal's region. Effective blast radius: "
                "one parameter."
            ),
            "appliesTo": ["Resource::*"],
        }
        NagSuppressions.add_resource_suppressions(
            reader,
            [wildcard_suppression],
            apply_to_children=True,
        )

        return reader.get_response_field("Parameter.Value")

    # Keyword arguments are evaluated left to right, so the readers are
    # created in the order name, arn, region.
    return SharedBucketIdentity(
        name=read_parameter("name"),
        arn=read_parameter("arn"),
        region=read_parameter("region"),
    )

1640 

def _grant_cluster_shared_bucket_to_job_role(self, shared: SharedBucketIdentity) -> None:
    """Give ``self.service_account_role`` read/write access to ``Cluster_Shared_Bucket``.

    Two ALLOW statements are appended to the role policy, in this order:

    1. S3 ``GetObject`` / ``PutObject`` / ``DeleteObject`` / ``ListBucket`` /
       ``GetBucketLocation`` on ``shared.arn`` and ``<shared.arn>/*``.
    2. KMS ``Decrypt`` / ``GenerateDataKey`` on ``Resource: *``, constrained
       by a ``StringEquals`` condition that pins ``kms:ViaService`` to
       ``s3.<shared.region>.amazonaws.com``. The key ARN is not passed to
       this method, hence the wildcard resource.

    Afterwards a cdk-nag ``AwsSolutions-IAM5`` suppression is registered on
    the role (and its children), limited by regex to the ``<arn>/*``
    object-key wildcard of the bucket ARN read from SSM.

    The method contains no feature-flag guard: whenever it is called the
    grant is added.

    Args:
        shared: Identity of the cluster-shared bucket. Only ``arn`` and
            ``region`` are read here; both may be unresolved CDK tokens.
    """
    from cdk_nag import NagSuppressions

    job_role = self.service_account_role
    bucket_arn = shared.arn

    # Bucket-level actions match the bare ARN, object-level actions match
    # the ``/*`` form, so both resources are listed on a single statement.
    bucket_rw_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=[
            "s3:GetObject",
            "s3:PutObject",
            "s3:DeleteObject",
            "s3:ListBucket",
            "s3:GetBucketLocation",
        ],
        resources=[bucket_arn, f"{bucket_arn}/*"],
    )

    # The wildcard resource is narrowed only by the ViaService condition.
    via_s3_only = {
        "StringEquals": {
            "kms:ViaService": f"s3.{shared.region}.amazonaws.com",
        }
    }
    bucket_kms_statement = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        actions=["kms:Decrypt", "kms:GenerateDataKey"],
        resources=["*"],
        conditions=via_s3_only,
    )

    for statement in (bucket_rw_statement, bucket_kms_statement):
        job_role.add_to_policy(statement)

    # cdk-nag reports the ``<arn>/*`` object-key wildcard as a wildcard
    # resource. The suppression below is restricted to exactly that shape.
    object_wildcard_reason = (
        "The Cluster_Shared_Bucket RW grant uses an <arn>/* "
        "object-key wildcard on the literal cluster-shared "
        "bucket ARN resolved from SSM. The wildcard covers "
        "object keys within a single bucket (the always-on "
        "gco-cluster-shared-<account>-<region> bucket) — "
        "this is the standard shape for a bucket-scoped "
        "RW grant and is what the allow-list assertion "
        "is written against."
    )
    object_wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": object_wildcard_reason,
        "appliesTo": [
            {"regex": r"/^Resource::<ReadClusterSharedBucketArn.*>\/\*$/"},
        ],
    }
    NagSuppressions.add_resource_suppressions(
        job_role,
        [object_wildcard_suppression],
        apply_to_children=True,
    )

1720 

def _create_kubectl_lambda(self) -> None:
    """Provision the kubectl-applier Lambda and the provider that fronts it.

    Created here, in order:

    * an execution role with the two AWS-managed Lambda execution policies,
      an EKS describe/list statement on the cluster ARN, and an
      ``sts:AssumeRole`` statement on ``*``;
    * a dedicated security group, admitted to the cluster security group on
      TCP 443;
    * the Lambda function (VPC-attached, private-with-egress subnets);
    * an EKS access entry binding the role to ``AmazonEKSClusterAdminPolicy``
      at cluster scope;
    * a one-week-retention log group and the ``cr.Provider``.

    No custom resource is created in this method. The provider is kept on
    ``self`` so ``_apply_kubernetes_manifests()`` can create the custom
    resource later.

    Sets:
        ``self.kubectl_lambda_function_name``, ``self.kubectl_lambda`` and
        ``self.kubectl_provider``.
    """
    from cdk_nag import NagSuppressions

    project_name = self.config.get_project_name()

    # --- Execution role -------------------------------------------------
    applier_role = iam.Role(
        self,
        "KubectlLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(managed_policy_name)
            for managed_policy_name in (
                "service-role/AWSLambdaVPCAccessExecutionRole",
                "service-role/AWSLambdaBasicExecutionRole",
            )
        ],
    )

    # EKS lookup permissions first, then permission to assume the cluster
    # admin role. Statements are attached in this order.
    eks_lookup_statement = iam.PolicyStatement(
        actions=[
            "eks:DescribeCluster",
            "eks:ListClusters",
        ],
        resources=[self.cluster.cluster_arn],
    )
    assume_role_statement = iam.PolicyStatement(actions=["sts:AssumeRole"], resources=["*"])
    for statement in (eks_lookup_statement, assume_role_statement):
        applier_role.add_to_policy(statement)

    # --- Networking -----------------------------------------------------
    applier_sg = ec2.SecurityGroup(
        self,
        "KubectlLambdaSG",
        vpc=self.vpc,
        description="Security group for kubectl Lambda to access EKS cluster",
        security_group_name=f"{project_name}-kubectl-lambda-sg-{self.deployment_region}",
        allow_all_outbound=True,  # outbound is needed to reach the EKS API
    )

    # The cluster security group is created by EKS itself; open 443 on it
    # for traffic originating from the Lambda's security group only.
    self.cluster.cluster_security_group.add_ingress_rule(
        peer=applier_sg,
        connection=ec2.Port.tcp(443),
        description="Allow kubectl Lambda to access EKS API",
    )

    # --- Function -------------------------------------------------------
    # The name is kept as a plain string attribute so other stacks can refer
    # to it without a cross-environment token lookup.
    function_name = f"{project_name}-kubectl-{self.deployment_region}"
    self.kubectl_lambda_function_name = function_name

    private_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)
    handler_environment = {
        "CLUSTER_NAME": self.cluster.cluster_name,
        "REGION": self.deployment_region,
    }
    self.kubectl_lambda = lambda_.Function(
        self,
        "KubectlApplierFunction",
        function_name=function_name,
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/kubectl-applier-simple-build"),
        timeout=Duration.minutes(15),  # Lambda's maximum
        memory_size=512,
        role=applier_role,
        vpc=self.vpc,
        vpc_subnets=private_subnets,
        security_groups=[applier_sg],
        environment=handler_environment,
        tracing=lambda_.Tracing.ACTIVE,
    )

    # --- Cluster access -------------------------------------------------
    # The access entry lets the role authenticate to the cluster and grants
    # it cluster-admin through the managed access policy.
    cluster_admin_policy = eks.AccessPolicy.from_access_policy_name(
        "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
    )
    eks.AccessEntry(
        self,
        "KubectlLambdaAccessEntry",
        cluster=self.cluster,  # type: ignore[arg-type]
        principal=applier_role.role_arn,
        access_policies=[cluster_admin_policy],
    )

    # --- Provider -------------------------------------------------------
    provider_log_group = logs.LogGroup(
        self,
        "KubectlProviderLogGroup",
        retention=logs.RetentionDays.ONE_WEEK,
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.kubectl_provider = cr.Provider(
        self,
        "KubectlProvider",
        on_event_handler=self.kubectl_lambda,
        log_group=provider_log_group,
    )

    # --- cdk-nag --------------------------------------------------------
    # Suppress the Resource::* finding raised by the role's statements.
    broad_access_reason = (
        "The kubectl-applier Lambda requires broad EKS and Kubernetes API "
        "access to apply arbitrary manifests (RBAC, ServiceAccounts, "
        "Deployments, Jobs, NetworkPolicies) across multiple namespaces. "
        "Resource: * is required because the set of Kubernetes resources "
        "is dynamic and not known at synth time."
    )
    NagSuppressions.add_resource_suppressions(
        applier_role,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": broad_access_reason,
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )

1854 

def _apply_kubernetes_manifests(self) -> None:
    """Create the custom resources that deploy workloads onto the cluster.

    The method first builds ``image_replacements``, a map from
    ``{{PLACEHOLDER}}`` strings to values, which is passed to the kubectl
    Lambda as the ``ImageReplacements`` property. It then creates four
    custom resources, in this order:

    1. ``KubectlApplyManifests`` - kubectl pass over the base manifests.
       Depends on the cluster, EFS, FSx (when present), any addon
       role-update custom resources found on ``self``, and every Pod
       Identity association in ``self._pod_identity_associations``.
    2. ``HelmInstallCharts`` - depends on (1).
    3. ``KubectlApplyPostHelmManifests`` - same properties as (1) plus
       ``PostHelm: "true"``; depends on (2).
    4. ``GaRegistration`` - depends on (1).

    All four carry a ``DeploymentTimestamp`` property computed at synth
    time, so the property value differs between deployments.

    Per the original author's note, this is expected to be called after the
    ALB security group and EFS have been created. It also relies on
    ``self.kubectl_provider``, ``self.helm_installer_provider`` and
    ``self.ga_registration_provider`` already being set.
    """
    from datetime import UTC, datetime

    # ISO-8601 stamp shared by the kubectl and Helm custom resources; also
    # substituted into manifests to force pod rollouts.
    deployment_timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

    project_name = self.config.get_project_name()
    global_region = self.config.get_global_region()
    thresholds = self.config.get_resource_thresholds()

    # cdk.json context sections. Resource quotas and the security/image
    # policy live under the shared ``job_validation_policy`` section because
    # both the REST manifest_processor and the SQS queue_processor read
    # them; service-specific knobs stay under their own section.
    mp_config = self.node.try_get_context("manifest_processor") or {}
    qp_config = self.node.try_get_context("queue_processor") or {}
    job_policy = self.node.try_get_context("job_validation_policy") or {}
    job_quotas = job_policy.get("resource_quotas", {})
    resource_quota = self.node.try_get_context("resource_quota") or {}

    # Values used by both the MP_* and QP_* placeholders. Computing them
    # once keeps the two submission paths from drifting apart.
    allowed_namespaces = ",".join(job_policy.get("allowed_namespaces", ["default", "gco-jobs"]))
    # The configured registry allowlist is augmented with the project's own
    # ECR registry hostnames by the helper.
    trusted_registries = ",".join(
        _augment_trusted_registries_with_project_ecr(
            job_policy.get("trusted_registries", []),
            account=self.account,
            regions=self.config.get_regions(),
            global_region=global_region,
        )
    )
    trusted_dockerhub_orgs = ",".join(job_policy.get("trusted_dockerhub_orgs", []))
    max_cpu_per_manifest = str(job_quotas.get("max_cpu_per_manifest", "10"))
    max_memory_per_manifest = str(job_quotas.get("max_memory_per_manifest", "32Gi"))
    max_gpu_per_manifest = str(job_quotas.get("max_gpu_per_manifest", 4))

    image_replacements = {
        "{{HEALTH_MONITOR_IMAGE}}": self.health_monitor_image.image_uri,
        "{{MANIFEST_PROCESSOR_IMAGE}}": self.manifest_processor_image.image_uri,
        "{{INFERENCE_MONITOR_IMAGE}}": self.inference_monitor_image.image_uri,
        "{{CLUSTER_NAME}}": self.cluster.cluster_name,
        "{{REGION}}": self.deployment_region,
        "{{AUTH_SECRET_ARN}}": self.auth_secret_arn,
        "{{SERVICE_ACCOUNT_ROLE_ARN}}": self.service_account_role.role_arn,
        "{{EFS_FILE_SYSTEM_ID}}": self.efs_file_system.file_system_id,
        "{{EFS_ACCESS_POINT_ID}}": self.efs_access_point.access_point_id,
        "{{JOB_QUEUE_URL}}": self.job_queue.queue_url,
        "{{JOB_QUEUE_ARN}}": self.job_queue.queue_arn,
        "{{DEPLOYMENT_TIMESTAMP}}": deployment_timestamp,
        # Resource thresholds
        "{{CPU_THRESHOLD}}": str(thresholds.cpu_threshold),
        "{{MEMORY_THRESHOLD}}": str(thresholds.memory_threshold),
        "{{GPU_THRESHOLD}}": str(thresholds.gpu_threshold),
        "{{PENDING_PODS_THRESHOLD}}": str(thresholds.pending_pods_threshold),
        "{{PENDING_REQUESTED_CPU_VCPUS}}": str(thresholds.pending_requested_cpu_vcpus),
        "{{PENDING_REQUESTED_MEMORY_GB}}": str(thresholds.pending_requested_memory_gb),
        "{{PENDING_REQUESTED_GPUS}}": str(thresholds.pending_requested_gpus),
        # DynamoDB table names (from global stack)
        "{{TEMPLATES_TABLE_NAME}}": f"{project_name}-job-templates",
        "{{WEBHOOKS_TABLE_NAME}}": f"{project_name}-webhooks",
        "{{JOBS_TABLE_NAME}}": f"{project_name}-jobs",
        # DynamoDB region (global stack region, may differ from cluster region)
        "{{DYNAMODB_REGION}}": global_region,
        # Manifest processor limits and allowlists (shared policy values).
        "{{MP_MAX_CPU_PER_MANIFEST}}": max_cpu_per_manifest,
        "{{MP_MAX_MEMORY_PER_MANIFEST}}": max_memory_per_manifest,
        "{{MP_MAX_GPU_PER_MANIFEST}}": max_gpu_per_manifest,
        "{{MP_ALLOWED_NAMESPACES}}": allowed_namespaces,
        "{{MP_TRUSTED_REGISTRIES}}": trusted_registries,
        "{{MP_TRUSTED_DOCKERHUB_ORGS}}": trusted_dockerhub_orgs,
        # Request body size cap; lives at
        # cdk.json::manifest_processor.max_request_body_bytes.
        "{{MP_MAX_REQUEST_BODY_BYTES}}": str(mp_config.get("max_request_body_bytes", 1_048_576)),
    }

    # Cluster_Shared_Bucket placeholders. Added unconditionally from the
    # identity stored on ``self.cluster_shared_identity``.
    image_replacements.update(
        _compute_kubectl_cluster_shared_replacements(self.cluster_shared_identity)
    )

    # VPC endpoint CIDRs -> YAML ``ipBlock`` entries for the network policy.
    # The placeholder sits at 8-space indentation in the manifest, so the
    # first entry starts without indent (the manifest supplies it) and each
    # following entry is prefixed with that indent. ``cidr`` is nested four
    # further spaces so it parses as a child of ``ipBlock``.
    # NOTE(review): the 8-space figure comes from the original comment; the
    # 4-space nesting of ``cidr`` is a reconstruction - confirm both against
    # the manifest.
    placeholder_indent = " " * 8
    vpc_endpoint_cidrs = self.node.try_get_context("vpc_endpoint_cidrs") or ["10.0.0.0/16"]
    cidr_entries = [
        f'- ipBlock:\n{placeholder_indent}    cidr: "{cidr}"' for cidr in vpc_endpoint_cidrs
    ]
    image_replacements["{{VPC_ENDPOINT_CIDR_BLOCKS}}"] = f"\n{placeholder_indent}".join(
        cidr_entries
    )

    # Namespace-wide ResourceQuota (QUOTA_*) and per-container LimitRange
    # (LIMIT_*) values from the cdk.json ``resource_quota`` section.
    for placeholder, key, default in (
        ("{{QUOTA_MAX_CPU}}", "max_cpu", "100"),
        ("{{QUOTA_MAX_MEMORY}}", "max_memory", "512Gi"),
        ("{{QUOTA_MAX_GPU}}", "max_gpu", "32"),
        ("{{QUOTA_MAX_PODS}}", "max_pods", "50"),
        ("{{LIMIT_MAX_CPU}}", "container_max_cpu", "10"),
        ("{{LIMIT_MAX_MEMORY}}", "container_max_memory", "64Gi"),
        ("{{LIMIT_MAX_GPU}}", "container_max_gpu", "4"),
    ):
        image_replacements[placeholder] = str(resource_quota.get(key, default))

    if self.queue_processor_enabled:
        image_replacements["{{QUEUE_PROCESSOR_IMAGE}}"] = self.queue_processor_image.image_uri

        # Queue-processor-only tuning knobs.
        for placeholder, key, default in (
            ("{{QP_POLLING_INTERVAL}}", "polling_interval", 10),
            ("{{QP_MAX_CONCURRENT_JOBS}}", "max_concurrent_jobs", 10),
            ("{{QP_MESSAGES_PER_JOB}}", "messages_per_job", 1),
            ("{{QP_SUCCESSFUL_JOBS_HISTORY}}", "successful_jobs_history", 20),
            ("{{QP_FAILED_JOBS_HISTORY}}", "failed_jobs_history", 10),
        ):
            image_replacements[placeholder] = str(qp_config.get(key, default))

        # Limits and allowlists shared with the REST manifest processor.
        image_replacements["{{QP_ALLOWED_NAMESPACES}}"] = allowed_namespaces
        image_replacements["{{QP_MAX_GPU_PER_MANIFEST}}"] = max_gpu_per_manifest
        image_replacements["{{QP_MAX_CPU_PER_MANIFEST}}"] = max_cpu_per_manifest
        image_replacements["{{QP_MAX_MEMORY_PER_MANIFEST}}"] = max_memory_per_manifest
        image_replacements["{{QP_TRUSTED_REGISTRIES}}"] = trusted_registries
        image_replacements["{{QP_TRUSTED_DOCKERHUB_ORGS}}"] = trusted_dockerhub_orgs

        # Security policy toggles, rendered as the strings "true"/"false"
        # by truthiness. Every toggle defaults to on except
        # block_run_as_root.
        security_policy = job_policy.get("manifest_security_policy", {})
        for placeholder, key, default in (
            ("{{QP_BLOCK_PRIVILEGED}}", "block_privileged", True),
            ("{{QP_BLOCK_PRIVILEGE_ESCALATION}}", "block_privilege_escalation", True),
            ("{{QP_BLOCK_HOST_NETWORK}}", "block_host_network", True),
            ("{{QP_BLOCK_HOST_PID}}", "block_host_pid", True),
            ("{{QP_BLOCK_HOST_IPC}}", "block_host_ipc", True),
            ("{{QP_BLOCK_HOST_PATH}}", "block_host_path", True),
            ("{{QP_BLOCK_ADDED_CAPABILITIES}}", "block_added_capabilities", True),
            ("{{QP_BLOCK_RUN_AS_ROOT}}", "block_run_as_root", False),
        ):
            enabled = security_policy.get(key, default)
            image_replacements[placeholder] = "true" if enabled else "false"

    # Optional data stores: the attributes may be absent or falsy when the
    # feature is disabled, so read them defensively.
    valkey_cache = getattr(self, "valkey_cache", None)
    if valkey_cache:
        image_replacements["{{VALKEY_ENDPOINT}}"] = valkey_cache.attr_endpoint_address
        image_replacements["{{VALKEY_PORT}}"] = valkey_cache.attr_endpoint_port

    aurora_cluster = getattr(self, "aurora_cluster", None)
    if aurora_cluster:
        writer_endpoint = aurora_cluster.cluster_endpoint
        image_replacements["{{AURORA_PGVECTOR_ENDPOINT}}"] = writer_endpoint.hostname
        image_replacements["{{AURORA_PGVECTOR_READER_ENDPOINT}}"] = (
            aurora_cluster.cluster_read_endpoint.hostname
        )
        image_replacements["{{AURORA_PGVECTOR_PORT}}"] = str(writer_endpoint.port)
        if aurora_cluster.secret:
            image_replacements["{{AURORA_PGVECTOR_SECRET_ARN}}"] = (
                aurora_cluster.secret.secret_arn
            )

    fsx_file_system = self.fsx_file_system
    if fsx_file_system:
        image_replacements["{{FSX_FILE_SYSTEM_ID}}"] = fsx_file_system.ref
        image_replacements["{{FSX_DNS_NAME}}"] = fsx_file_system.attr_dns_name
        image_replacements["{{FSX_MOUNT_NAME}}"] = fsx_file_system.attr_lustre_mount_name
        image_replacements["{{PRIVATE_SUBNET_ID}}"] = self.vpc.private_subnets[0].subnet_id
        image_replacements["{{FSX_SECURITY_GROUP_ID}}"] = (
            self.fsx_security_group.security_group_id
        )

    # Properties common to both kubectl passes.
    kubectl_properties = {
        "ClusterName": self.cluster.cluster_name,
        "Region": self.deployment_region,
        # Don't delete the applied resources when the stack is deleted.
        "SkipDeletionOnStackDelete": "true",
        "ImageReplacements": image_replacements,
        # Passed directly so an FSx change forces an update.
        "FsxFileSystemId": fsx_file_system.ref if fsx_file_system else "none",
        # Differs on each deployment, to trigger pod rollouts.
        "DeploymentTimestamp": deployment_timestamp,
    }

    kubectl_apply = CustomResource(
        self,
        "KubectlApplyManifests",
        service_token=self.kubectl_provider.service_token,
        properties=kubectl_properties,
    )

    # Manifests go on only after the cluster, EFS and (optional) FSx exist.
    # The ALB itself is created later, when the Ingress is applied.
    kubectl_apply.node.add_dependency(self.cluster)
    kubectl_apply.node.add_dependency(self.efs_file_system)
    if fsx_file_system:
        kubectl_apply.node.add_dependency(fsx_file_system)

    # Wait for the addon role-update custom resources (when they exist) so
    # that each managed addon's service account already carries its role ARN
    # before the kubectl Lambda rollout-restarts the controllers. Otherwise
    # the restarted pods come up without credentials - per the original
    # note, seen as PVCs stuck Pending (EFS/FSx) or missing Container
    # Insights metrics (CloudWatch).
    for attr in (
        "_efs_csi_addon_role_update",
        "_fsx_csi_addon_role_update",
        "_cloudwatch_addon_role_update",
    ):
        update_cr = getattr(self, attr, None)
        if update_cr is not None:
            kubectl_apply.node.add_dependency(update_cr)

    # Pod Identity associations must exist before workloads start so pods
    # get IAM credentials on first launch.
    for assoc in self._pod_identity_associations:
        kubectl_apply.node.add_dependency(assoc)

    # Helm charts are installed after the base manifests so namespaces and
    # RBAC are already in place.
    helm_install = CustomResource(
        self,
        "HelmInstallCharts",
        service_token=self.helm_installer_provider.service_token,
        properties={
            "ClusterName": self.cluster.cluster_name,
            "Region": self.deployment_region,
            "EnabledCharts": self._get_enabled_helm_charts(),
            # No per-chart value overrides.
            "Charts": {},
            # IAM role ARN for the KEDA operator service account annotation.
            "KedaOperatorRoleArn": self.keda_operator_role.role_arn,
            # Differs on each deployment, to force re-invocation.
            "DeploymentTimestamp": deployment_timestamp,
        },
    )
    helm_install.node.add_dependency(kubectl_apply)

    # Second kubectl pass for CRD-dependent manifests (e.g. KEDA
    # ScaledJob/ScaledObject). ``PostHelm: "true"`` tells the handler to
    # apply only the post-helm manifests; it must run after Helm has
    # installed the CRDs.
    kubectl_apply_post_helm = CustomResource(
        self,
        "KubectlApplyPostHelmManifests",
        service_token=self.kubectl_provider.service_token,
        properties={**kubectl_properties, "PostHelm": "true"},
    )
    kubectl_apply_post_helm.node.add_dependency(helm_install)

    # GA registration gets its own epoch-seconds stamp so CloudFormation
    # re-invokes the Lambda on every deployment. Kept under a separate name
    # so the ISO stamp above is not silently replaced.
    ga_deployment_timestamp = str(int(time.time()))

    ga_registration = CustomResource(
        self,
        "GaRegistration",
        service_token=self.ga_registration_provider.service_token,
        properties={
            "ClusterName": self.cluster.cluster_name,
            "Region": self.deployment_region,
            "EndpointGroupArn": self.endpoint_group_arn,
            "IngressName": "gco-ingress",
            "Namespace": "gco-system",
            # Global region and project name for SSM storage.
            "GlobalRegion": global_region,
            "ProjectName": project_name,
            # Force re-invocation on every deployment.
            "DeploymentTimestamp": ga_deployment_timestamp,
        },
    )

    # Registration must wait until the base manifests are applied.
    ga_registration.node.add_dependency(kubectl_apply)

2218 

def _create_ga_registration_lambda(self) -> None:
    """Provision the Lambda and provider that register the ALB with Global Accelerator.

    According to the original notes, the handler waits for the Ingress to
    receive an ALB address, resolves the ALB ARN from it, and registers that
    ALB with Global Accelerator. This indirection exists because the ALB is
    created by the load balancer controller rather than by CDK, so its ARN
    cannot be referenced directly.

    Created here, in order: the Lambda function and its four policy
    statements, an EKS access entry for its role, an ingress rule on the
    cluster security group for the VPC CIDR on TCP 443, an
    ``AwsCustomResource`` that reads the endpoint-group ARN from SSM in the
    global region, and the ``cr.Provider``. The registration custom resource
    itself is not created in this method.

    Sets:
        ``self.endpoint_group_arn`` and ``self.ga_registration_provider``.
    """
    from cdk_nag import NagSuppressions

    project_name = self.config.get_project_name()
    global_region = self.config.get_global_region()

    registrar_fn = lambda_.Function(
        self,
        "GaRegistrationFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/ga-registration"),
        timeout=Duration.minutes(15),  # Lambda's maximum; handler budgets 14 min
        memory_size=256,
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        environment={
            "CLUSTER_NAME": self.cluster.cluster_name,
            "REGION": self.deployment_region,
        },
        tracing=lambda_.Tracing.ACTIVE,
    )

    # Statements are attached in this order: EKS, ELB, Global Accelerator,
    # then SSM parameters under the project prefix in the global region.
    project_parameters_arn = (
        f"arn:aws:ssm:{global_region}:{self.account}:parameter/{project_name}/*"
    )
    permission_statements = [
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["eks:DescribeCluster"],
            resources=[self.cluster.cluster_arn],
        ),
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "elasticloadbalancing:DescribeLoadBalancers",
                "elasticloadbalancing:DescribeTags",  # tag-based ALB detection
            ],
            resources=["*"],
        ),
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "globalaccelerator:AddEndpoints",
                "globalaccelerator:RemoveEndpoints",
                "globalaccelerator:UpdateEndpointGroup",
                "globalaccelerator:DescribeEndpointGroup",
            ],
            resources=["*"],
        ),
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["ssm:GetParameter", "ssm:PutParameter", "ssm:DeleteParameter"],
            resources=[project_parameters_arn],
        ),
    ]
    for statement in permission_statements:
        registrar_fn.add_to_role_policy(statement)

    # EKS access entry for the function's role, when it has one.
    registrar_role = registrar_fn.role
    if registrar_role is not None:
        cluster_admin_policy = eks.AccessPolicy.from_access_policy_name(
            "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
        )
        eks.AccessEntry(
            self,
            "GaRegistrationLambdaAccessEntry",
            cluster=self.cluster,  # type: ignore[arg-type]
            principal=registrar_role.role_arn,
            access_policies=[cluster_admin_policy],
        )

    # Opens the EKS API port to the whole VPC CIDR (no dedicated security
    # group is created for this function here).
    self.cluster.cluster_security_group.add_ingress_rule(
        peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
        connection=ec2.Port.tcp(443),
        description="Allow GA registration Lambda to access EKS API",
    )

    # Read the endpoint-group ARN from SSM in the global region. The lookup
    # runs under the shared ``self.aws_custom_resource_role``; per the
    # original note, its SSM GetParameter statement is attached up-front to
    # avoid an IAM propagation race on cold deploys.
    ssm_lookup = {
        "service": "SSM",
        "action": "getParameter",
        "parameters": {"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"},
        "region": global_region,
    }
    get_endpoint_group_arn = cr.AwsCustomResource(
        self,
        "GetEndpointGroupArn",
        on_create=cr.AwsSdkCall(
            **ssm_lookup,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{project_name}-get-endpoint-group-arn-{self.deployment_region}"
            ),
        ),
        on_update=cr.AwsSdkCall(**ssm_lookup),
        role=self.aws_custom_resource_role,
    )
    get_endpoint_group_arn.node.add_dependency(self.aws_custom_resource_role)

    # Kept on ``self`` for the registration custom resource that is created
    # after the kubectl apply.
    self.endpoint_group_arn = get_endpoint_group_arn.get_response_field("Parameter.Value")

    provider_log_group = logs.LogGroup(
        self,
        "GaRegistrationProviderLogGroup",
        retention=logs.RetentionDays.ONE_WEEK,
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.ga_registration_provider = cr.Provider(
        self,
        "GaRegistrationProvider",
        on_event_handler=registrar_fn,
        log_group=provider_log_group,
    )

    # cdk-nag: suppress the Resource::* finding raised by the ELB and Global
    # Accelerator statements above.
    wildcard_reason = (
        "The GA registration Lambda needs elasticloadbalancing:Describe* "
        "and globalaccelerator:* to discover the Ingress-created ALB and "
        "register it with Global Accelerator. These APIs do not support "
        "resource-level IAM scoping — Resource: * is the only valid form."
    )
    NagSuppressions.add_resource_suppressions(
        registrar_fn,
        [
            {
                "id": "AwsSolutions-IAM5",
                "reason": wildcard_reason,
                "appliesTo": ["Resource::*"],
            },
        ],
        apply_to_children=True,
    )

2381 

def _get_enabled_helm_charts(self) -> list[str]:
    """Return the Helm chart names to install, in installation order.

    Reads the ``helm`` mapping from the CDK context (cdk.json). Each known
    config key maps to one or more chart names. A chart group is included
    unless its entry sets ``enabled`` to a falsy value; a missing entry, or
    an entry explicitly set to ``null``, counts as enabled.

    Returns:
        Chart names in dependency order, with Kueue last (its webhook
        intercepts all Job/Deployment mutations).
    """
    helm_config = self.node.try_get_context("helm") or {}

    # Mapping from cdk.json helm key → Helm chart name(s) in charts.yaml.
    # Order matters: dependencies first, Kueue last.
    chart_map: list[tuple[str, list[str]]] = [
        ("keda", ["keda"]),
        ("nvidia_gpu_operator", ["nvidia-gpu-operator"]),
        ("nvidia_dra_driver", ["nvidia-dra-driver"]),
        ("nvidia_network_operator", ["nvidia-network-operator"]),
        ("aws_efa_device_plugin", ["aws-efa-device-plugin"]),
        ("aws_neuron_device_plugin", ["aws-neuron-device-plugin"]),
        ("volcano", ["volcano"]),
        ("kuberay", ["kuberay-operator"]),
        ("cert_manager", ["cert-manager"]),
        ("slurm", ["slinky-slurm-operator", "slinky-slurm"]),
        ("yunikorn", ["yunikorn"]),
        ("kueue", ["kueue"]),  # Must be last
    ]

    enabled_charts: list[str] = []
    for config_key, chart_names in chart_map:
        chart_config = helm_config.get(config_key)
        if chart_config is None:
            # A key written as `"keda": null` reaches us as None; without
            # this guard the .get() below would raise AttributeError.
            # Treat it exactly like a key that is not present at all.
            chart_config = {}
        if chart_config.get("enabled", True):
            enabled_charts.extend(chart_names)

    return enabled_charts

2415 

def _create_helm_installer_lambda(self) -> None:
    """Provision the Lambda-backed custom resource provider that runs Helm.

    Helm is used for charts whose setup (TLS certificates, CRDs, etc.) is
    awkward to express as raw manifests. This method only builds the
    installer; it does not decide which charts are installed.

    Created, in order: an IAM execution role, a security group that may
    reach the cluster API on TCP 443, a Docker image asset, the
    container-image Lambda function, an EKS access entry binding the role
    to ``AmazonEKSClusterAdminPolicy`` at cluster scope, a one-week log
    group, and a ``cr.Provider`` wrapping the function.

    Attributes set on ``self``:
        helm_installer_lambda_function_name: Function name as a plain string.
        helm_installer_lambda: The ``DockerImageFunction``.
        helm_installer_provider: The custom resource provider.
    """
    from cdk_nag import NagSuppressions

    project = self.config.get_project_name()
    # helm-installer-build/ is rebuilt fresh every deploy by
    # _build_helm_installer_lambda() in cli/stacks.py.
    image_dir = "lambda/helm-installer-build"

    # Execution role: VPC networking + basic logging managed policies.
    installer_role = iam.Role(
        self,
        "HelmInstallerLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(policy_name)
            for policy_name in (
                "service-role/AWSLambdaVPCAccessExecutionRole",
                "service-role/AWSLambdaBasicExecutionRole",
            )
        ],
    )

    # The function must be able to look up the cluster it talks to.
    eks_statement = iam.PolicyStatement(
        actions=["eks:DescribeCluster", "eks:ListClusters"],
        resources=[self.cluster.cluster_arn],
    )
    installer_role.add_to_policy(eks_statement)

    # Dedicated security group so the cluster SG can admit just this Lambda.
    installer_sg = ec2.SecurityGroup(
        self,
        "HelmInstallerLambdaSG",
        vpc=self.vpc,
        description="Security group for Helm installer Lambda to access EKS cluster",
        security_group_name=f"{project}-helm-lambda-sg-{self.deployment_region}",
        allow_all_outbound=True,
    )
    self.cluster.cluster_security_group.add_ingress_rule(
        peer=installer_sg,
        connection=ec2.Port.tcp(443),
        description="Allow Helm installer Lambda to access EKS API",
    )

    # Stand-alone image asset for the installer build directory.
    ecr_assets.DockerImageAsset(
        self,
        "HelmInstallerImage",
        directory=image_dir,
        platform=ecr_assets.Platform.LINUX_AMD64,
    )

    # The function name is kept as a plain string attribute for cross-stack
    # references; this avoids CDK cross-environment resolution issues when
    # the account is unresolved.
    function_name = f"{project}-helm-{self.deployment_region}"
    self.helm_installer_lambda_function_name = function_name

    image_code = lambda_.DockerImageCode.from_image_asset(
        directory=image_dir,
        platform=ecr_assets.Platform.LINUX_AMD64,
    )
    runtime_env = {
        "CLUSTER_NAME": self.cluster.cluster_name,
        "REGION": self.deployment_region,
    }
    self.helm_installer_lambda = lambda_.DockerImageFunction(
        self,
        "HelmInstallerFunction",
        function_name=function_name,
        code=image_code,
        timeout=Duration.minutes(15),
        memory_size=1024,
        architecture=lambda_.Architecture.X86_64,
        role=installer_role,
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        security_groups=[installer_sg],
        environment=runtime_env,
        tracing=lambda_.Tracing.ACTIVE,
    )

    # Map the Lambda role into the cluster with cluster-admin rights.
    cluster_admin = eks.AccessPolicy.from_access_policy_name(
        "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
    )
    eks.AccessEntry(
        self,
        "HelmInstallerLambdaAccessEntry",
        cluster=self.cluster,  # type: ignore[arg-type]
        principal=installer_role.role_arn,
        access_policies=[cluster_admin],
    )

    # Provider framework logs: short retention, removed with the stack.
    provider_logs = logs.LogGroup(
        self,
        "HelmInstallerProviderLogGroup",
        retention=logs.RetentionDays.ONE_WEEK,
        removal_policy=RemovalPolicy.DESTROY,
    )
    self.helm_installer_provider = cr.Provider(
        self,
        "HelmInstallerProvider",
        on_event_handler=self.helm_installer_lambda,
        log_group=provider_logs,
    )

    # cdk-nag suppression: the Helm installer Lambda requires broad
    # EKS and Kubernetes API access to install Helm charts.
    wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The Helm installer Lambda requires broad EKS and Kubernetes API "
            "access to install Helm charts (KEDA, NVIDIA DRA, etc.) that create "
            "CRDs, RBAC rules, and workloads across multiple namespaces. "
            "Resource: * is required because the set of Kubernetes resources "
            "is dynamic and not known at synth time."
        ),
        "appliesTo": ["Resource::*"],
    }
    NagSuppressions.add_resource_suppressions(
        installer_role,
        [wildcard_suppression],
        apply_to_children=True,
    )

2554 

def _create_efs(self) -> None:
    """Create the EFS file system used as shared storage across jobs.

    The file system is encrypted, uses General Purpose performance mode and
    Bursting throughput mode, has automatic backups enabled, and is placed
    in the VPC's PRIVATE_WITH_EGRESS subnets. NFS (TCP 2049) is admitted
    only from the EKS cluster security group.

    Also creates an access point rooted at ``/gco-jobs`` (uid/gid 1000,
    mode 755) and emits the file system and access point IDs as stack
    outputs. Kubernetes resources (StorageClass, PV, PVC) are created via
    manifests, not here.

    Attributes set on ``self``:
        efs_security_group, efs_file_system, efs_access_point.
    """
    project = self.config.get_project_name()
    region = self.deployment_region

    # Security group for the mount targets; EFS doesn't need outbound.
    efs_sg = ec2.SecurityGroup(
        self,
        "EfsSecurityGroup",
        vpc=self.vpc,
        description=f"Security group for {project} EFS in {region}",
        security_group_name=f"{project}-efs-sg-{region}",
        allow_all_outbound=False,
    )
    self.efs_security_group = efs_sg

    # NFS from the EKS cluster security group only.
    efs_sg.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp(2049),
        description="Allow NFS from EKS cluster",
    )

    file_system = efs.FileSystem(
        self,
        "GCOEfs",
        vpc=self.vpc,
        file_system_name=f"{project}-efs-{region}",
        security_group=efs_sg,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        encrypted=True,
        performance_mode=efs.PerformanceMode.GENERAL_PURPOSE,
        throughput_mode=efs.ThroughputMode.BURSTING,
        removal_policy=RemovalPolicy.DESTROY,  # For dev/test; use RETAIN for production
        enable_automatic_backups=True,
    )
    self.efs_file_system = file_system

    # File system policy: mounting needs no IAM authorization. Any client
    # that can reach a mount target may mount, write, and act as root.
    mount_without_iam = iam.PolicyStatement(
        effect=iam.Effect.ALLOW,
        principals=[iam.AnyPrincipal()],
        actions=[
            "elasticfilesystem:ClientMount",
            "elasticfilesystem:ClientWrite",
            "elasticfilesystem:ClientRootAccess",
        ],
        conditions={"Bool": {"elasticfilesystem:AccessedViaMountTarget": "true"}},
    )
    file_system.add_to_resource_policy(mount_without_iam)

    # Access point for the gco-jobs directory.
    jobs_acl = efs.Acl(owner_uid="1000", owner_gid="1000", permissions="755")
    jobs_user = efs.PosixUser(uid="1000", gid="1000")
    self.efs_access_point = file_system.add_access_point(
        "JobsAccessPoint",
        path="/gco-jobs",
        create_acl=jobs_acl,
        posix_user=jobs_user,
    )

    # Stack outputs, emitted in a fixed order.
    outputs = (
        (
            "EfsFileSystemId",
            file_system.file_system_id,
            "EFS File System ID for shared job storage",
        ),
        (
            "EfsAccessPointId",
            self.efs_access_point.access_point_id,
            "EFS Access Point ID for job outputs",
        ),
    )
    for output_id, output_value, output_description in outputs:
        CfnOutput(self, output_id, value=output_value, description=output_description)

2639 

def _create_fsx_lustre(self) -> None:
    """Create the optional FSx for Lustre file system for high-performance storage.

    Controlled by the ``enabled`` flag of the per-region FSx config. When
    disabled, ``self.fsx_file_system`` is set to ``None`` and nothing else
    is created.

    When enabled this creates a security group (Lustre ports from the EKS
    cluster security group and from itself), the file system in the first
    private subnet, the FSx CSI driver add-on, and three stack outputs.

    Supported deployment types:
    - SCRATCH_1: Temporary storage, no data replication
    - SCRATCH_2: Temporary storage with better burst performance
    - PERSISTENT_1: Persistent storage with data replication
    - PERSISTENT_2: Latest persistent storage with higher throughput

    Config keys read (with defaults): ``deployment_type`` (SCRATCH_2),
    ``storage_capacity_gib`` (1200), ``data_compression_type`` (LZ4),
    ``per_unit_storage_throughput`` (PERSISTENT types only; 200 for
    PERSISTENT_1, 250 for PERSISTENT_2), ``import_path``, ``export_path``,
    ``auto_import_policy`` (NEW_CHANGED_DELETED), and
    ``file_system_type_version`` (2.15).
    """
    fsx_config = self.config.get_fsx_lustre_config(self.deployment_region)

    if not fsx_config.get("enabled", False):
        self.fsx_file_system = None
        return

    project_name = self.config.get_project_name()

    # Create security group for FSx
    self.fsx_security_group = ec2.SecurityGroup(
        self,
        "FsxSecurityGroup",
        vpc=self.vpc,
        description=f"Security group for {project_name} FSx Lustre in {self.deployment_region}",
        security_group_name=f"{project_name}-fsx-sg-{self.deployment_region}",
        allow_all_outbound=False,
    )

    # Allow Lustre traffic from EKS cluster security group
    # Lustre uses ports 988 (control) and 1021-1023 (data)
    # NOTE(review): current FSx for Lustre VPC guidance appears to list
    # 1018-1023 rather than 1021-1023 — confirm before widening the range.
    self.fsx_security_group.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp(988),
        description="Allow Lustre control traffic from EKS cluster",
    )
    self.fsx_security_group.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp_range(1021, 1023),
        description="Allow Lustre data traffic from EKS cluster",
    )

    # Allow self-referencing traffic for FSx Lustre internal communication
    # FSx Lustre nodes need to communicate with each other on port 988
    self.fsx_security_group.add_ingress_rule(
        peer=self.fsx_security_group,
        connection=ec2.Port.tcp(988),
        description="Allow Lustre internal traffic on port 988",
    )
    self.fsx_security_group.add_ingress_rule(
        peer=self.fsx_security_group,
        connection=ec2.Port.tcp_range(1021, 1023),
        description="Allow Lustre internal traffic on ports 1021-1023",
    )

    # Get deployment type
    deployment_type = fsx_config.get("deployment_type", "SCRATCH_2")
    storage_capacity = fsx_config.get("storage_capacity_gib", 1200)

    # Build Lustre configuration based on deployment type
    lustre_config = {
        "deploymentType": deployment_type,
        "dataCompressionType": fsx_config.get("data_compression_type", "LZ4"),
    }

    # Add throughput for PERSISTENT types
    if deployment_type.startswith("PERSISTENT"):
        # The valid MB/s/TiB tiers differ per deployment type: PERSISTENT_1
        # (SSD) accepts 50/100/200, PERSISTENT_2 accepts 125/250/500/1000.
        # A single fallback of 200 is rejected for PERSISTENT_2, so pick a
        # tier that is valid for the selected type. An explicit
        # per_unit_storage_throughput in the config always wins.
        default_throughput = 250 if deployment_type == "PERSISTENT_2" else 200
        lustre_config["perUnitStorageThroughput"] = fsx_config.get(
            "per_unit_storage_throughput", default_throughput
        )

    # Add S3 import/export if configured
    import_path = fsx_config.get("import_path")
    export_path = fsx_config.get("export_path")

    if import_path:
        lustre_config["importPath"] = import_path
        lustre_config["autoImportPolicy"] = fsx_config.get(
            "auto_import_policy", "NEW_CHANGED_DELETED"
        )

    if export_path:
        lustre_config["exportPath"] = export_path

    # Get file system type version (default to 2.15 for kernel 6.x compatibility)
    # IMPORTANT: Lustre 2.10 is NOT compatible with kernel 6.x (AL2023, Bottlerocket 1.19+)
    # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/lustre-client-matrix.html
    file_system_type_version = fsx_config.get("file_system_type_version", "2.15")

    # Create FSx for Lustre file system (single subnet: the first private one)
    self.fsx_file_system = fsx.CfnFileSystem(
        self,
        "GCOFsxLustre",
        file_system_type="LUSTRE",
        file_system_type_version=file_system_type_version,
        storage_capacity=storage_capacity,
        subnet_ids=[self.vpc.private_subnets[0].subnet_id],
        security_group_ids=[self.fsx_security_group.security_group_id],
        lustre_configuration=lustre_config,
        tags=[
            {"key": "Name", "value": f"{project_name}-fsx-{self.deployment_region}"},
            {"key": "Project", "value": project_name},
        ],
    )

    # Ensure FSx file system waits for security group ingress rules to be created
    # This prevents "security group does not permit Lustre LNET traffic" errors
    self.fsx_file_system.node.add_dependency(self.fsx_security_group)

    # Create FSx CSI Driver add-on for Kubernetes integration
    self._create_fsx_csi_driver_addon()

    # Output FSx information
    CfnOutput(
        self,
        "FsxFileSystemId",
        value=self.fsx_file_system.ref,
        description="FSx for Lustre File System ID",
    )

    CfnOutput(
        self,
        "FsxDnsName",
        value=self.fsx_file_system.attr_dns_name,
        description="FSx for Lustre DNS Name",
    )

    CfnOutput(
        self,
        "FsxMountName",
        value=self.fsx_file_system.attr_lustre_mount_name,
        description="FSx for Lustre Mount Name",
    )

2776 

def _create_valkey_cache(self) -> None:
    """Create the optional ElastiCache Serverless Valkey cache.

    Does nothing unless the Valkey config has ``enabled`` set. When enabled
    it creates a security group admitting TCP 6379 from the whole VPC CIDR,
    a serverless Valkey (major version 8) cache across the VPC's private
    subnets, endpoint/port stack outputs, and an SSM parameter holding the
    endpoint address.

    Config keys read (with defaults): ``max_data_storage_gb`` (5),
    ``max_ecpu_per_second`` (5000), ``snapshot_retention_limit`` (1).

    Attributes set on ``self`` (only when enabled):
        valkey_cache: The ``CfnServerlessCache``.
    """
    settings = self.config.get_valkey_config()
    if not settings.get("enabled", False):
        return

    # Imported lazily so the module is only loaded when the cache is enabled.
    from aws_cdk import aws_elasticache as elasticache

    region = self.deployment_region
    cache_cls = elasticache.CfnServerlessCache

    # Security group for the cache: no outbound, Valkey port from the VPC.
    cache_sg = ec2.SecurityGroup(
        self,
        "ValkeySG",
        vpc=self.vpc,
        description="Security group for Valkey Serverless cache",
        allow_all_outbound=False,
    )
    cache_sg.add_ingress_rule(
        peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
        connection=ec2.Port.tcp(6379),
        description="Allow Valkey access from VPC",
    )

    subnet_ids = [subnet.subnet_id for subnet in self.vpc.private_subnets]

    # Upper bounds come from config; lower bounds are fixed.
    storage_limits = cache_cls.DataStorageProperty(
        maximum=settings.get("max_data_storage_gb", 5),
        minimum=1,
        unit="GB",
    )
    ecpu_limits = cache_cls.ECPUPerSecondProperty(
        maximum=settings.get("max_ecpu_per_second", 5000),
        minimum=1000,
    )
    usage_limits = cache_cls.CacheUsageLimitsProperty(
        data_storage=storage_limits,
        ecpu_per_second=ecpu_limits,
    )

    self.valkey_cache = cache_cls(
        self,
        "ValkeyCache",
        engine="valkey",
        serverless_cache_name=f"gco-{region}",
        description=f"GCO K/V cache for {region}",
        major_engine_version="8",
        security_group_ids=[cache_sg.security_group_id],
        subnet_ids=subnet_ids,
        cache_usage_limits=usage_limits,
        snapshot_retention_limit=settings.get("snapshot_retention_limit", 1),
        tags=[
            CfnTag(key="Project", value="gco"),
            CfnTag(key="Region", value=region),
        ],
    )

    # Stack outputs, emitted in a fixed order.
    outputs = (
        (
            "ValkeyEndpoint",
            self.valkey_cache.attr_endpoint_address,
            "Valkey Serverless cache endpoint",
        ),
        (
            "ValkeyPort",
            self.valkey_cache.attr_endpoint_port,
            "Valkey Serverless cache port",
        ),
    )
    for output_id, output_value, output_description in outputs:
        CfnOutput(self, output_id, value=output_value, description=output_description)

    # Store endpoint in SSM for discovery by pods.
    ssm.StringParameter(
        self,
        "ValkeyEndpointParam",
        parameter_name=f"/{self.config.get_project_name()}/valkey-endpoint-{region}",
        string_value=self.valkey_cache.attr_endpoint_address,
        description=f"Valkey endpoint for {region}",
    )

2858 

def _create_aurora_pgvector(self) -> None:
    """Create the optional Aurora Serverless v2 PostgreSQL cluster for pgvector.

    Does nothing unless the Aurora pgvector config has ``enabled`` set.
    When enabled it creates a security group admitting TCP 5432 from the
    EKS cluster security group, a subnet group over the
    PRIVATE_WITH_EGRESS subnets, and a cluster with one serverless writer
    and one serverless reader (storage encryption, IAM authentication,
    PostgreSQL log export, and 60-second monitoring all on).

    It then records cdk-nag suppressions, emits endpoint/port/secret
    outputs, stores the writer endpoint in SSM, and grants
    ``self.service_account_role`` read access to the credentials secret
    when one exists.

    Config keys read (with defaults): ``min_acu`` (0), ``max_acu`` (16),
    ``backup_retention_days`` (7), ``deletion_protection`` (False).

    Attributes set on ``self`` (only when enabled):
        aurora_cluster: The ``rds.DatabaseCluster``.

    See: https://aws.amazon.com/blogs/database/accelerate-generative-ai-workloads-on-amazon-aurora-with-optimized-reads-and-pgvector/
    """
    settings = self.config.get_aurora_pgvector_config()
    if not settings.get("enabled", False):
        return

    # Imported lazily so these modules are only loaded when Aurora is enabled.
    from aws_cdk import aws_rds as rds
    from cdk_nag import NagPackSuppression, NagSuppressions

    project = self.config.get_project_name()
    region = self.deployment_region

    # Security group for Aurora: PostgreSQL from the EKS cluster SG only.
    db_sg = ec2.SecurityGroup(
        self,
        "AuroraPgvectorSG",
        vpc=self.vpc,
        description="Security group for Aurora Serverless v2 pgvector",
        allow_all_outbound=False,
    )
    db_sg.add_ingress_rule(
        peer=self.cluster.cluster_security_group,
        connection=ec2.Port.tcp(5432),
        description="Allow PostgreSQL access from EKS cluster",
    )

    # Subnet group for Aurora (private subnets only).
    db_subnets = rds.SubnetGroup(
        self,
        "AuroraPgvectorSubnetGroup",
        description=f"Subnet group for GCO Aurora pgvector in {region}",
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
    )

    # Engine version is resolved by name from the AURORA_POSTGRES_VERSION constant.
    engine = rds.DatabaseClusterEngine.aurora_postgres(
        version=getattr(rds.AuroraPostgresEngineVersion, AURORA_POSTGRES_VERSION),
    )
    writer_instance = rds.ClusterInstance.serverless_v2(
        "Writer",
        auto_minor_version_upgrade=True,
    )
    reader_instance = rds.ClusterInstance.serverless_v2(
        "Reader",
        auto_minor_version_upgrade=True,
        scale_with_writer=True,
    )
    backup_props = rds.BackupProps(
        retention=Duration.days(settings.get("backup_retention_days", 7)),
    )

    self.aurora_cluster = rds.DatabaseCluster(
        self,
        "AuroraPgvectorCluster",
        engine=engine,
        serverless_v2_min_capacity=settings.get("min_acu", 0),
        serverless_v2_max_capacity=settings.get("max_acu", 16),
        writer=writer_instance,
        readers=[reader_instance],
        vpc=self.vpc,
        vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        subnet_group=db_subnets,
        security_groups=[db_sg],
        default_database_name="gco_vectors",
        backup=backup_props,
        deletion_protection=settings.get("deletion_protection", False),
        removal_policy=RemovalPolicy.DESTROY,
        storage_encrypted=True,
        iam_authentication=True,
        cloudwatch_logs_exports=["postgresql"],
        monitoring_interval=Duration.seconds(60),
        cluster_identifier=f"{project}-pgvector-{region}",
    )

    # Construct-level cdk-nag suppressions. Three rule packs flag the same
    # deletion-protection setting, so they share one justification.
    deletion_protection_reason = (
        "Deletion protection is intentionally disabled for dev/test deployments. "
        "Production deployments should set aurora_pgvector.deletion_protection=true "
        "in cdk.json."
    )
    rotation_reason = (
        "Aurora manages credential rotation via the RDS integration with Secrets "
        "Manager. Manual Secrets Manager rotation is not required. "
        "See: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/rds-secrets-manager.html"
    )
    kms_reason = (
        "Aurora Serverless v2 credentials in Secrets Manager are encrypted with "
        "AWS-managed keys by default. Customer-managed KMS can be enabled if "
        "required for PCI compliance."
    )
    suppression_specs = (
        ("AwsSolutions-RDS10", deletion_protection_reason),
        ("AwsSolutions-SMG4", rotation_reason),
        ("HIPAA.Security-RDSInstanceDeletionProtectionEnabled", deletion_protection_reason),
        ("NIST.800.53.R5-RDSInstanceDeletionProtectionEnabled", deletion_protection_reason),
        ("PCI.DSS.321-SecretsManagerUsingKMSKey", kms_reason),
    )
    NagSuppressions.add_resource_suppressions(
        self.aurora_cluster,
        [
            NagPackSuppression(id=rule_id, reason=rule_reason)
            for rule_id, rule_reason in suppression_specs
        ],
        apply_to_children=True,
    )

    # Outputs
    credentials = self.aurora_cluster.secret
    outputs = (
        (
            "AuroraPgvectorEndpoint",
            self.aurora_cluster.cluster_endpoint.hostname,
            "Aurora pgvector cluster writer endpoint",
        ),
        (
            "AuroraPgvectorReaderEndpoint",
            self.aurora_cluster.cluster_read_endpoint.hostname,
            "Aurora pgvector cluster reader endpoint",
        ),
        (
            "AuroraPgvectorPort",
            str(self.aurora_cluster.cluster_endpoint.port),
            "Aurora pgvector cluster port",
        ),
        (
            "AuroraPgvectorSecretArn",
            credentials.secret_arn if credentials else "",
            "Aurora pgvector credentials secret ARN",
        ),
    )
    for output_id, output_value, output_description in outputs:
        CfnOutput(self, output_id, value=output_value, description=output_description)

    # Store endpoint in SSM for discovery by pods and external tools.
    ssm.StringParameter(
        self,
        "AuroraPgvectorEndpointParam",
        parameter_name=f"/{project}/aurora-pgvector-endpoint-{region}",
        string_value=self.aurora_cluster.cluster_endpoint.hostname,
        description=f"Aurora pgvector endpoint for {region}",
    )

    # Grant the ServiceAccountRole read access to the Aurora secret
    # so pods can retrieve credentials via the ConfigMap + Secrets Manager.
    if not self.aurora_cluster.secret:
        return
    self.aurora_cluster.secret.grant_read(self.service_account_role)

3031 

def _create_fsx_csi_driver_addon(self) -> None:
    """Install the FSx CSI driver add-on and wire up its IAM role.

    The FSx CSI driver enables Kubernetes pods to mount FSx for Lustre
    file systems as persistent volumes.

    Steps, in order: create the driver's IAM role for the
    ``kube-system/fsx-csi-controller-sa`` service account, attach FSx and
    EC2 describe permissions, install the ``aws-fsx-csi-driver`` add-on,
    allow the shared AwsCustomResource role to pass the driver role, call
    EKS ``updateAddon`` to set the add-on's service account role, and add
    a Pod Identity association for the same service account.

    Attributes set on ``self``:
        fsx_csi_role: The driver's IAM role.
        _fsx_csi_addon_role_update: The ``updateAddon`` custom resource.
    """
    from cdk_nag import NagSuppressions

    addon_name = "aws-fsx-csi-driver"
    controller_sa = "fsx-csi-controller-sa"
    controller_namespace = "kube-system"

    # Create IAM role for FSx CSI Driver using IRSA + Pod Identity.
    self.fsx_csi_role = GCORegionalStack._create_irsa_role(
        self,
        "FsxCsiDriverRole",
        oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
        oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
        service_account_names=[controller_sa],
        namespaces=[controller_namespace],
    )
    driver_role = self.fsx_csi_role

    # Two wildcard-resource statements: FSx volume management first, then
    # the EC2 describe calls.
    fsx_actions = [
        "fsx:DescribeFileSystems",
        "fsx:DescribeVolumes",
        "fsx:CreateVolume",
        "fsx:DeleteVolume",
        "fsx:TagResource",
    ]
    ec2_actions = [
        "ec2:DescribeInstances",
        "ec2:DescribeVolumes",
        "ec2:DescribeVpcs",
        "ec2:DescribeSubnets",
        "ec2:DescribeSecurityGroups",
    ]
    for action_group in (fsx_actions, ec2_actions):
        driver_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=action_group,
                resources=["*"],
            )
        )

    # cdk-nag suppression: the FSx CSI driver role grants
    # ec2:Describe* APIs that don't support resource-level scoping.
    wildcard_suppression = {
        "id": "AwsSolutions-IAM5",
        "reason": (
            "The FSx CSI driver role grants ec2:Describe* for volume "
            "and network discovery. These AWS APIs do not support "
            "resource-level IAM scoping — Resource: * is the only "
            "valid form."
        ),
        "appliesTo": ["Resource::*"],
    }
    NagSuppressions.add_resource_suppressions(
        driver_role,
        [wildcard_suppression],
        apply_to_children=True,
    )

    # Create FSx CSI Driver add-on; node and controller pods get the same
    # tolerations.
    tolerations = self._ADDON_NODE_TOLERATIONS
    addon = eks.Addon(
        self,
        "FsxCsiDriverAddon",
        cluster=self.cluster,  # type: ignore[arg-type]
        addon_name=addon_name,
        addon_version=EKS_ADDON_FSX_CSI_DRIVER,
        preserve_on_delete=False,
        configuration_values={
            "node": {
                "tolerations": tolerations,
            },
            "controller": {
                "tolerations": tolerations,
            },
        },
    )

    # Append the PassRole statement for the FSx CSI role to the shared
    # AwsCustomResource execution role. See
    # _create_aws_custom_resource_role for the full rationale.
    self.aws_custom_resource_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["iam:PassRole"],
            resources=[driver_role.role_arn],
        )
    )

    # Update the add-on to use the IRSA role. Create and update issue the
    # same call with the same parameters.
    update_parameters = {
        "clusterName": self.cluster.cluster_name,
        "addonName": addon_name,
        "serviceAccountRoleArn": driver_role.role_arn,
    }
    role_update = cr.AwsCustomResource(
        self,
        "UpdateFsxCsiAddonRole",
        on_create=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_parameters,
            physical_resource_id=cr.PhysicalResourceId.of(
                f"{self.cluster.cluster_name}-fsx-csi-role-update"
            ),
        ),
        on_update=cr.AwsSdkCall(
            service="EKS",
            action="updateAddon",
            parameters=update_parameters,
        ),
        role=self.aws_custom_resource_role,
    )

    # The update may only run once the add-on, the driver role, and the
    # execution role all exist.
    for prerequisite in (addon, driver_role, self.aws_custom_resource_role):
        role_update.node.add_dependency(prerequisite)

    # Expose the update-addon resource so _apply_kubernetes_manifests can
    # make the kubectl Lambda wait for the IRSA annotation patch to land
    # before it rollout-restarts the fsx-csi-controller. See the EFS CSI
    # equivalent for the full rationale — same race, same fix, same
    # symptom (PVCs stuck Pending with "no EC2 IMDS role found").
    self._fsx_csi_addon_role_update = role_update

    # Create Pod Identity Association for FSx CSI driver.
    eks_l1.CfnPodIdentityAssociation(
        self,
        "PodIdentity-fsx-csi",
        cluster_name=self.cluster.cluster_name,
        namespace=controller_namespace,
        service_account=controller_sa,
        role_arn=driver_role.role_arn,
    )

3175 

def _create_drift_detection(self) -> None:
    """Create scheduled CloudFormation drift detection for this stack.

    Creates:
    - A customer-managed KMS key and an SNS topic encrypted with it
      (stored on ``self.drift_detection_topic``) for drift alerts
    - An IAM role and a Lambda function (asset ``lambda/drift-detection``)
      that receives ``STACK_NAME``, ``SNS_TOPIC_ARN`` and ``REGION`` through
      its environment
    - An SQS dead-letter queue for EventBridge deliveries that fail
    - An EventBridge rule that invokes the Lambda every
      ``drift_detection.schedule_hours`` hours (cdk.json, default 24)
    - A ``DriftDetectionTopicArn`` stack output for subscribers

    Operators can disable drift detection entirely by setting
    ``drift_detection.enabled`` to ``false`` in cdk.json. When disabled,
    no resources are created.

    Raises:
        ValueError: If ``drift_detection.schedule_hours`` is not a whole
            number of hours greater than or equal to 1.
    """
    drift_config = self.node.try_get_context("drift_detection") or {}
    if not drift_config.get("enabled", True):
        return

    # Validate the interval before creating anything so a bad cdk.json value
    # fails with a message that names the offending setting.
    raw_schedule_hours = drift_config.get("schedule_hours", 24)
    try:
        schedule_hours = int(raw_schedule_hours)
    except (TypeError, ValueError) as err:
        raise ValueError(
            "drift_detection.schedule_hours must be an integer number of "
            f"hours, got {raw_schedule_hours!r}"
        ) from err
    if schedule_hours < 1:
        raise ValueError(
            f"drift_detection.schedule_hours must be >= 1, got {schedule_hours}"
        )

    # KMS key for SNS topic encryption. SNS with AWS-managed keys doesn't
    # allow CloudFormation/Lambda to publish, so we use a customer-managed
    # key we can grant publish access on.
    drift_topic_key = kms.Key(
        self,
        "DriftDetectionTopicKey",
        description="KMS key for GCO drift detection SNS topic",
        enable_key_rotation=True,
        removal_policy=RemovalPolicy.DESTROY,
    )

    self.drift_detection_topic = sns.Topic(
        self,
        "DriftDetectionTopic",
        display_name="GCO CloudFormation Drift Alerts",
        master_key=drift_topic_key,
    )

    # IAM role for the drift detection Lambda
    drift_lambda_role = iam.Role(
        self,
        "DriftDetectionLambdaRole",
        assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
        managed_policies=[
            iam.ManagedPolicy.from_aws_managed_policy_name(
                "service-role/AWSLambdaBasicExecutionRole"
            ),
        ],
    )

    # Stack-level drift APIs accept the stack ARN as the IAM resource, so
    # they are limited to this stack (``stack_id`` resolves to its ARN).
    # NOTE(review): CloudFormation's drift documentation also lists read
    # access to the stack's resources among the required permissions —
    # confirm this role is sufficient for the resource types deployed here.
    drift_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=[
                "cloudformation:DetectStackDrift",
                "cloudformation:DescribeStackResourceDrifts",
                "cloudformation:DescribeStackResource",
                "cloudformation:DescribeStackResources",
            ],
            resources=[self.stack_id],
        )
    )

    # DescribeStackDriftDetectionStatus is addressed by detection ID rather
    # than by stack and has no resource-level scoping, so "*" is required.
    drift_lambda_role.add_to_policy(
        iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["cloudformation:DescribeStackDriftDetectionStatus"],
            resources=["*"],
        )
    )

    self.drift_detection_topic.grant_publish(drift_lambda_role)

    # Lambda function — one per stack; stack name is baked into env vars
    drift_lambda = lambda_.Function(
        self,
        "DriftDetectionFunction",
        runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
        handler="handler.lambda_handler",
        code=lambda_.Code.from_asset("lambda/drift-detection"),
        timeout=Duration.minutes(14),  # Leave headroom under Lambda 15-min cap
        memory_size=256,
        role=drift_lambda_role,
        environment={
            "STACK_NAME": self.stack_name,
            "SNS_TOPIC_ARN": self.drift_detection_topic.topic_arn,
            "REGION": self.deployment_region,
        },
        tracing=lambda_.Tracing.ACTIVE,
    )

    # Dead-letter queue for EventBridge → Lambda target failures.
    # Captures events that fail to reach the Lambda (e.g. due to
    # throttling or permission issues) so operators can retry or
    # investigate. Required by Serverless-EventBusDLQ cdk-nag rule.
    drift_rule_dlq = sqs.Queue(
        self,
        "DriftDetectionRuleDlq",
        retention_period=Duration.days(14),
        enforce_ssl=True,
        encryption=sqs.QueueEncryption.SQS_MANAGED,
        removal_policy=RemovalPolicy.DESTROY,
    )

    # Imported locally and once; used for both suppression calls below.
    from cdk_nag import NagSuppressions

    # DLQs themselves are terminal — they don't need their own DLQ.
    # Suppress the circular AwsSolutions-SQS3 nag finding.
    NagSuppressions.add_resource_suppressions(
        drift_rule_dlq,
        [
            {
                "id": "AwsSolutions-SQS3",
                "reason": (
                    "This queue IS the dead-letter queue for the "
                    "DriftDetectionSchedule EventBridge rule. A DLQ for a "
                    "DLQ is circular; if events fail to reach this queue "
                    "they are captured by EventBridge's own retry metrics "
                    "(CloudWatch FailedInvocations)."
                ),
            },
        ],
    )

    # Describe the interval that is actually configured; the default
    # (24 hours) keeps the original "Daily" wording.
    if schedule_hours == 24:
        rule_description = f"Daily CloudFormation drift detection for {self.stack_name}"
    else:
        rule_description = (
            f"CloudFormation drift detection every {schedule_hours} hours "
            f"for {self.stack_name}"
        )

    # EventBridge rule — daily schedule by default
    events.Rule(
        self,
        "DriftDetectionSchedule",
        description=rule_description,
        schedule=events.Schedule.rate(Duration.hours(schedule_hours)),
        targets=[
            events_targets.LambdaFunction(
                drift_lambda,
                dead_letter_queue=drift_rule_dlq,
                retry_attempts=2,
            )
        ],
    )

    # Outputs for operators to subscribe to the topic
    CfnOutput(
        self,
        "DriftDetectionTopicArn",
        value=self.drift_detection_topic.topic_arn,
        description=(
            f"SNS topic ARN for CloudFormation drift alerts in "
            f"{self.deployment_region}. Subscribe an endpoint (email, "
            f"Slack, PagerDuty) to receive drift notifications."
        ),
    )

    # cdk-nag suppressions for this component
    NagSuppressions.add_resource_suppressions(
        drift_lambda_role,
        [
            {
                "id": "AwsSolutions-IAM4",
                "reason": (
                    "AWSLambdaBasicExecutionRole provides standard "
                    "CloudWatch Logs permissions required for Lambda "
                    "logging. This is the AWS-recommended managed policy."
                ),
            },
            {
                "id": "AwsSolutions-IAM5",
                "reason": (
                    "cloudformation:DescribeStackDriftDetectionStatus is "
                    "addressed by drift detection ID and does not support "
                    "resource-level permissions, so it requires a wildcard "
                    "resource. The stack-level drift actions "
                    "(DetectStackDrift, DescribeStackResourceDrifts, "
                    "DescribeStackResource, DescribeStackResources) are "
                    "scoped to this stack's ARN."
                ),
            },
        ],
        apply_to_children=True,
    )

3352 

def _create_mcp_role(self) -> None:
    """Provision the dedicated IAM role the GCO MCP server can assume.

    The MCP server exposes GCO CLI tools to LLM agents. Giving it its own
    role (handed to the server through ``GCO_MCP_ROLE_ARN``) keeps it from
    running with the ambient credentials of whoever launches it.

    Policy statements attached to the role, in order:

    - ``eks:DescribeCluster`` on this stack's EKS cluster ARN.
    - ``s3:GetObject`` and ``s3:ListBucket`` on buckets (and their objects)
      whose name starts with ``{project_name}-``. A name-prefix pattern is
      used instead of an exact cross-stack bucket ARN.
    - ``cloudwatch:GetMetricData``, ``cloudwatch:GetMetricStatistics`` and
      ``cloudwatch:ListMetrics`` on ``*``.
    - ``sqs:SendMessage``, ``sqs:GetQueueUrl`` and
      ``sqs:GetQueueAttributes`` on this stack's job queue ARN.

    The role trusts ``AccountRootPrincipal`` with a 12-hour maximum session.
    Its ARN is published as the ``McpServerRoleArn`` output, exported per
    region, and the AwsSolutions-IAM5 cdk-nag finding is suppressed on the
    role and its children.

    Setting ``mcp_server.enabled`` to ``false`` in cdk.json skips the whole
    component; nothing is created in that case.
    """
    mcp_settings = self.node.try_get_context("mcp_server") or {}
    component_enabled = mcp_settings.get("enabled", True)
    if not component_enabled:
        return

    name_prefix = self.config.get_project_name()

    role_summary = (
        "Least-privilege role assumed by the GCO MCP server at startup. "
        "Grants only the permissions needed by MCP tools: eks:DescribeCluster, "
        "s3:GetObject on model buckets, cloudwatch read-only metrics, and "
        "sqs:SendMessage to the regional job queue."
    )
    mcp_role = iam.Role(
        self,
        "McpServerRole",
        assumed_by=iam.AccountRootPrincipal(),
        description=role_summary,
        max_session_duration=Duration.hours(12),
    )
    self.mcp_server_role = mcp_role

    # (actions, resources) pairs, attached in this order.
    permission_table = (
        # Cluster lookup, limited to this region's cluster.
        (
            ["eks:DescribeCluster"],
            [self.cluster.cluster_arn],
        ),
        # Model weights buckets, matched by project-name prefix.
        (
            ["s3:GetObject", "s3:ListBucket"],
            [
                f"arn:aws:s3:::{name_prefix}-*",
                f"arn:aws:s3:::{name_prefix}-*/*",
            ],
        ),
        # CloudWatch metric reads, granted on a wildcard resource.
        (
            [
                "cloudwatch:GetMetricData",
                "cloudwatch:GetMetricStatistics",
                "cloudwatch:ListMetrics",
            ],
            ["*"],
        ),
        # Job submission, limited to the regional job queue.
        (
            ["sqs:SendMessage", "sqs:GetQueueUrl", "sqs:GetQueueAttributes"],
            [self.job_queue.queue_arn],
        ),
    )
    for allowed_actions, target_arns in permission_table:
        mcp_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=allowed_actions,
                resources=target_arns,
            )
        )

    # Publish the ARN so operators can copy it into GCO_MCP_ROLE_ARN.
    output_help = (
        "IAM role ARN for the GCO MCP server. Set GCO_MCP_ROLE_ARN to "
        "this value when launching the MCP server so it assumes a "
        "least-privilege role instead of ambient credentials."
    )
    CfnOutput(
        self,
        "McpServerRoleArn",
        value=mcp_role.role_arn,
        description=output_help,
        export_name=f"{name_prefix}-mcp-server-role-arn-{self.deployment_region}",
    )

    # cdk-nag: record why the wildcard resources above are accepted.
    from cdk_nag import NagSuppressions

    wildcard_justification = (
        "The CloudWatch metrics APIs (GetMetricData, "
        "GetMetricStatistics, ListMetrics) do not support "
        "resource-level IAM; wildcard resource is required. "
        "The S3 permissions use the {project_name}-* prefix "
        "pattern because the model weights bucket name is "
        "auto-generated by CDK in the global stack and a "
        "cross-stack ARN export would create tight stack "
        "coupling. All actions are read-only or scoped "
        "send-only (SQS)."
    )
    NagSuppressions.add_resource_suppressions(
        mcp_role,
        [{"id": "AwsSolutions-IAM5", "reason": wildcard_justification}],
        apply_to_children=True,
    )

3486 

def _create_outputs(self) -> None:
    """Publish cluster and network identifiers as exported stack outputs.

    One ``CfnOutput`` is created for each of: cluster name, cluster ARN,
    cluster endpoint, cluster security group ID, VPC ID, and the
    comma-joined public subnet IDs. Every export name follows
    ``{project_name}-<slug>-{deployment_region}``.
    """
    name_prefix = self.config.get_project_name()
    region = self.deployment_region

    # Public subnet IDs are joined into a single comma-separated value.
    joined_public_subnets = Fn.join(
        ",", [subnet.subnet_id for subnet in self.vpc.public_subnets]
    )

    # (construct id, value, description lead-in, export-name slug)
    output_specs = (
        ("ClusterName", self.cluster.cluster_name, "EKS cluster name for", "cluster-name"),
        ("ClusterArn", self.cluster.cluster_arn, "EKS cluster ARN for", "cluster-arn"),
        (
            "ClusterEndpoint",
            self.cluster.cluster_endpoint,
            "EKS cluster endpoint for",
            "cluster-endpoint",
        ),
        (
            "ClusterSecurityGroupId",
            self.cluster.cluster_security_group_id,
            "EKS cluster security group ID for",
            "cluster-sg",
        ),
        ("VpcId", self.vpc.vpc_id, "VPC ID for", "vpc-id"),
        (
            "PublicSubnetIds",
            joined_public_subnets,
            "Public subnet IDs for ALB in",
            "public-subnets",
        ),
    )
    for output_id, output_value, lead_in, slug in output_specs:
        CfnOutput(
            self,
            output_id,
            value=output_value,
            description=f"{lead_in} {region}",
            export_name=f"{name_prefix}-{slug}-{region}",
        )

    # Note: ALB is created by AWS Load Balancer Controller via Ingress
    # The ALB ARN is registered with Global Accelerator by the GA registration Lambda

3544 

def get_cluster(self) -> eks.Cluster:
    """Return the EKS cluster held by this stack.

    Returns:
        The object stored in ``self.cluster``.
    """
    cluster = self.cluster
    return cluster

3548 

def get_vpc(self) -> ec2.Vpc:
    """Return the VPC held by this stack.

    Returns:
        The object stored in ``self.vpc``.
    """
    vpc = self.vpc
    return vpc