Coverage for gco / stacks / regional_stack.py: 93%
445 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 21:47 +0000
1"""
2Regional stack for GCO (Global Capacity Orchestrator on AWS) - EKS cluster and ALB per region.
4This is the largest stack in the project (~3200 lines) and creates all regional
5resources for a single AWS region. One instance is deployed per region defined
6in cdk.json.
8Resources Created:
9 VPC & Networking:
10 - VPC with 3 AZs, public subnets (ALB), private subnets (EKS nodes)
11 - 2 NAT Gateways for high availability
12 - VPC endpoints for ECR, S3, STS, Secrets Manager, SSM, CloudWatch
13 - VPC Flow Logs (CloudWatch Logs, 30-day retention)
15 EKS Cluster (Auto Mode):
16 - Managed control plane with full logging (API, Audit, Authenticator, Controller Manager, Scheduler)
17 - NodePools: system, general-purpose, gpu-x86, gpu-arm, inference, gpu-efa, neuron, cpu-general
18 - IRSA roles for service accounts (Secrets Manager, SQS, DynamoDB, CloudWatch, S3, EFS)
20 Load Balancing:
21 - ALB (created by Ingress via AWS Load Balancer Controller)
22 - Internal NLB for regional API Gateway VPC Link
23 - Global Accelerator endpoint registration (via ga-registration Lambda)
25 Storage:
26 - EFS with dynamic provisioning (CSI driver, access points, encryption at rest + in transit)
27 - FSx for Lustre (optional, toggled via cdk.json)
28 - Valkey Serverless cache (optional)
29 - Aurora Serverless v2 with pgvector (optional)
31 Lambda Functions:
32 - kubectl-applier: applies K8s manifests during deployment
33 - helm-installer: installs Helm charts (KEDA, Volcano, KubeRay, GPU Operator, etc.)
34 - ga-registration: registers ALB with Global Accelerator
35 - regional-api-proxy: proxies regional API Gateway to internal ALB
37 Container Images:
38 - ECR repositories + Docker image builds for health-monitor, manifest-processor,
39 inference-monitor, queue-processor
41 SQS:
42 - Regional job queue + dead letter queue (for gco jobs submit-sqs)
44Key Design Decisions:
45 - EKS Auto Mode handles node provisioning — no managed node groups or Karpenter provisioners
46 - NodePools use WhenEmpty consolidation for inference to avoid disrupting long-running pods
47 - IRSA (IAM Roles for Service Accounts) for least-privilege pod-level AWS access
48 - All optional features (FSx, Valkey, Aurora) are toggled via cdk.json context variables
49 - Template variables in K8s manifests ({{PLACEHOLDER}}) are replaced at deploy time
51Dependencies:
52 - GCOGlobalStack (for Global Accelerator endpoint group ARN, DynamoDB table names, S3 bucket)
53 - GCOApiGatewayGlobalStack (for auth secret ARN)
55Modification Guide:
56 - To add a new NodePool: add a YAML manifest in lambda/kubectl-applier-simple/manifests/ (40-49 range)
57 - To add a new service: add ECR image build here, Dockerfile in dockerfiles/, manifest in manifests/
58 - To add a new optional feature: add a cdk.json context toggle, guard with if/else in this file
59 - To change EKS version: update KUBERNETES_VERSION in constants.py
60"""
62from __future__ import annotations
64import time
65from typing import Any
67import aws_cdk.aws_eks_v2 as eks
68from aws_cdk import (
69 CfnJson,
70 CfnOutput,
71 CfnTag,
72 CustomResource,
73 Duration,
74 Fn,
75 RemovalPolicy,
76 Stack,
77)
78from aws_cdk import aws_ec2 as ec2
79from aws_cdk import aws_ecr as ecr
80from aws_cdk import aws_ecr_assets as ecr_assets
81from aws_cdk import aws_efs as efs
82from aws_cdk import aws_eks as eks_l1 # L1 constructs (CfnPodIdentityAssociation)
83from aws_cdk import aws_events as events
84from aws_cdk import aws_events_targets as events_targets
85from aws_cdk import aws_fsx as fsx
86from aws_cdk import aws_iam as iam
87from aws_cdk import aws_kms as kms
88from aws_cdk import aws_lambda as lambda_
89from aws_cdk import aws_logs as logs
90from aws_cdk import aws_sns as sns
91from aws_cdk import aws_sqs as sqs
92from aws_cdk import aws_ssm as ssm
93from aws_cdk import custom_resources as cr
94from constructs import Construct
96from gco.config.config_loader import ConfigLoader
97from gco.stacks.constants import (
98 AURORA_POSTGRES_VERSION,
99 EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
100 EKS_ADDON_EFS_CSI_DRIVER,
101 EKS_ADDON_FSX_CSI_DRIVER,
102 EKS_ADDON_METRICS_SERVER,
103 EKS_ADDON_POD_IDENTITY_AGENT,
104 LAMBDA_PYTHON_RUNTIME,
105)
108class GCORegionalStack(Stack):
109 """
110 Regional resources stack for a single AWS region.
112 Creates EKS cluster, load balancers, and supporting infrastructure
113 for running GCO services in a specific region.
115 Attributes:
116 vpc: VPC with public/private subnets
117 cluster: EKS Auto Mode cluster
118 """
120 @staticmethod
121 def _create_irsa_role(
122 scope: GCORegionalStack,
123 id: str,
124 oidc_provider_arn: str,
125 oidc_issuer_url: str,
126 service_account_names: list[str],
127 namespaces: list[str],
128 ) -> iam.Role:
129 """Create an IAM role trusted by both IRSA (OIDC) and EKS Pod Identity.
131 IRSA is the primary credential mechanism — it works reliably on EKS Auto
132 Mode by projecting a service-account token that the AWS SDK exchanges for
133 temporary credentials via the OIDC provider.
135 Pod Identity trust is added as a secondary path so the role is ready if/when
136 Pod Identity injection starts working on Auto Mode nodes.
138 Uses CfnJson to defer OIDC condition key resolution to deploy time,
139 because the issuer URL is a CloudFormation token that can't be used
140 as a Python dict key at synth time.
141 """
142 # Strip https:// from issuer URL for the OIDC condition
143 issuer = Fn.select(1, Fn.split("//", oidc_issuer_url))
145 # Build OIDC conditions using CfnJson to defer token resolution
146 # The issuer URL is a CFN token — can't be used as a dict key at synth time
147 aud_key = Fn.join("", [issuer, ":aud"])
148 sub_key = Fn.join("", [issuer, ":sub"])
150 conditions_json = CfnJson(
151 scope,
152 f"{id}OidcConditions",
153 value={
154 aud_key: "sts.amazonaws.com",
155 sub_key: [
156 f"system:serviceaccount:{ns}:{sa}"
157 for ns in namespaces
158 for sa in service_account_names
159 ],
160 },
161 )
163 role = iam.Role(
164 scope,
165 id,
166 assumed_by=iam.FederatedPrincipal(
167 federated=oidc_provider_arn,
168 conditions={
169 "StringEquals": conditions_json,
170 },
171 assume_role_action="sts:AssumeRoleWithWebIdentity",
172 ),
173 )
175 # Also allow Pod Identity (secondary path for future use)
176 assert role.assume_role_policy is not None # guaranteed by assumed_by parameter above
177 role.assume_role_policy.add_statements(
178 iam.PolicyStatement(
179 effect=iam.Effect.ALLOW,
180 principals=[iam.ServicePrincipal("pods.eks.amazonaws.com")],
181 actions=["sts:AssumeRole", "sts:TagSession"],
182 )
183 )
184 return role
    def __init__(
        self,
        scope: Construct,
        construct_id: str,
        config: ConfigLoader,
        region: str,
        auth_secret_arn: str,
        **kwargs: Any,
    ) -> None:
        """Create all regional resources for a single AWS region.

        Args:
            scope: Parent construct (the CDK app).
            construct_id: Logical ID for this stack.
            config: Accessor over the cdk.json configuration.
            region: AWS region this stack instance deploys into.
            auth_secret_arn: ARN of the auth secret from the API Gateway
                global stack.
            **kwargs: Forwarded to ``Stack`` (e.g. ``env``).

        Note:
            The creation order below is deliberate: the shared
            AwsCustomResource role is created before the EKS cluster (whose
            addon helpers attach PassRole statements to it), and Kubernetes
            manifests are applied only after EFS so file-system IDs are
            available for template substitution.
        """
        super().__init__(scope, construct_id, **kwargs)

        self.config = config
        self.deployment_region = region
        self.auth_secret_arn = auth_secret_arn
        # Populated later once the Ingress-created ALB is known.
        self.alb_arn: str | None = None

        # Get cluster configuration for this region
        cluster_config = self.config.get_cluster_config(region)
        self.cluster_config = cluster_config

        # Create VPC for the EKS cluster
        self.vpc = ec2.Vpc(
            self,
            "GCOVpc",
            # vpc_name intentionally omitted - let CDK generate unique name
            max_azs=3,
            nat_gateways=2,  # For high availability
            subnet_configuration=[
                ec2.SubnetConfiguration(
                    name="PublicSubnet", subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24
                ),
                ec2.SubnetConfiguration(
                    name="PrivateSubnet",
                    subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,
                    cidr_mask=24,
                ),
            ],
        )

        # Enable VPC Flow Logs for network traffic analysis and security monitoring
        self._create_vpc_flow_logs()

        # Create SQS queue for job ingestion
        self._create_sqs_queue()

        # Create ECR repositories and build Docker images
        self._create_container_images()

        # Pre-create the execution role shared by every ``cr.AwsCustomResource``
        # in this stack. See ``_create_aws_custom_resource_role`` for the full
        # rationale — in short, CDK's default behavior of auto-generating a
        # Lambda role per ``AwsCustomResource`` (and then merging all the
        # ``policy=`` statements onto it during deploy) triggers an IAM
        # propagation race on cold creates. We sidestep the race by creating
        # a single long-lived role up front and attaching policies to it as
        # each consumer is built; every ``AwsCustomResource`` then passes
        # ``role=self.aws_custom_resource_role`` instead of ``policy=``, so
        # the singleton Lambda runs against a role whose inline policy has
        # already replicated globally.
        self._create_aws_custom_resource_role()

        # Create EKS cluster
        self._create_eks_cluster(cluster_config)

        # Create EFS for shared storage
        self._create_efs()

        # Create FSx for Lustre (if enabled) for high-performance storage
        self._create_fsx_lustre()

        # Create Valkey Serverless cache (if enabled) for K/V caching
        self._create_valkey_cache()

        # Create Aurora Serverless v2 + pgvector (if enabled) for vector DB
        self._create_aurora_pgvector()

        # Create GA registration Lambda for registering Ingress-created ALB
        self._create_ga_registration_lambda()

        # Create Helm installer Lambda for KEDA and other Helm-based installations
        self._create_helm_installer_lambda()

        # Apply Kubernetes manifests (after EFS so IDs are available)
        self._apply_kubernetes_manifests()

        # Create CloudFormation drift detection (daily schedule + SNS alerts)
        self._create_drift_detection()

        # Create dedicated IAM role for MCP server
        self._create_mcp_role()

        # Export cluster information
        self._create_outputs()

        # Apply cdk-nag suppressions for this stack
        self._apply_nag_suppressions()
283 def _create_vpc_flow_logs(self) -> None:
284 """Create VPC Flow Logs for network traffic monitoring.
286 Flow logs capture information about IP traffic going to and from
287 network interfaces in the VPC. This is required for security
288 monitoring and compliance (HIPAA, SOC2, etc.).
289 """
290 # Create CloudWatch Log Group for flow logs
291 flow_log_group = logs.LogGroup(
292 self,
293 "VpcFlowLogGroup",
294 # log_group_name intentionally omitted - let CDK generate unique name
295 retention=logs.RetentionDays.ONE_MONTH,
296 removal_policy=RemovalPolicy.DESTROY,
297 )
299 # Create IAM role for VPC Flow Logs
300 flow_log_role = iam.Role(
301 self,
302 "VpcFlowLogRole",
303 assumed_by=iam.ServicePrincipal("vpc-flow-logs.amazonaws.com"),
304 )
306 flow_log_role.add_to_policy(
307 iam.PolicyStatement(
308 actions=[
309 "logs:CreateLogStream",
310 "logs:PutLogEvents",
311 "logs:DescribeLogGroups",
312 "logs:DescribeLogStreams",
313 ],
314 resources=[flow_log_group.log_group_arn, f"{flow_log_group.log_group_arn}:*"],
315 )
316 )
318 # Create VPC Flow Log
319 ec2.FlowLog(
320 self,
321 "VpcFlowLog",
322 resource_type=ec2.FlowLogResourceType.from_vpc(self.vpc),
323 destination=ec2.FlowLogDestination.to_cloud_watch_logs(flow_log_group, flow_log_role),
324 traffic_type=ec2.FlowLogTrafficType.ALL,
325 )
327 def _apply_nag_suppressions(self) -> None:
328 """Apply cdk-nag suppressions for this stack."""
329 from gco.stacks.nag_suppressions import apply_all_suppressions
331 apply_all_suppressions(
332 self,
333 stack_type="regional",
334 regions=self.config.get_regions(),
335 global_region=self.config.get_global_region(),
336 )
338 def _create_sqs_queue(self) -> None:
339 """Create SQS queue for job ingestion.
341 Creates an SQS queue that serves as the default job ingestion point
342 for this region. Jobs submitted to this queue are processed by the
343 manifest processor and KEDA scales based on queue depth.
345 Also creates a dead-letter queue for failed messages.
346 Both queues use server-side encryption with AWS managed keys.
347 """
348 project_name = self.config.get_project_name()
350 # Create dead-letter queue for failed messages
351 self.job_dlq = sqs.Queue(
352 self,
353 "JobDeadLetterQueue",
354 queue_name=f"{project_name}-jobs-dlq-{self.deployment_region}",
355 retention_period=Duration.days(14),
356 removal_policy=RemovalPolicy.DESTROY,
357 enforce_ssl=True, # Require SSL for all requests
358 encryption=sqs.QueueEncryption.SQS_MANAGED, # Server-side encryption
359 )
361 # Create main job queue
362 self.job_queue = sqs.Queue(
363 self,
364 "JobQueue",
365 queue_name=f"{project_name}-jobs-{self.deployment_region}",
366 visibility_timeout=Duration.minutes(5), # Match Lambda timeout
367 retention_period=Duration.days(7),
368 dead_letter_queue=sqs.DeadLetterQueue(
369 max_receive_count=3, # Move to DLQ after 3 failed attempts
370 queue=self.job_dlq,
371 ),
372 removal_policy=RemovalPolicy.DESTROY,
373 enforce_ssl=True, # Require SSL for all requests
374 encryption=sqs.QueueEncryption.SQS_MANAGED, # Server-side encryption
375 )
377 # Output queue information
378 CfnOutput(
379 self,
380 "JobQueueUrl",
381 value=self.job_queue.queue_url,
382 description=f"SQS Job Queue URL for {self.deployment_region}",
383 export_name=f"{project_name}-job-queue-url-{self.deployment_region}",
384 )
386 CfnOutput(
387 self,
388 "JobQueueArn",
389 value=self.job_queue.queue_arn,
390 description=f"SQS Job Queue ARN for {self.deployment_region}",
391 export_name=f"{project_name}-job-queue-arn-{self.deployment_region}",
392 )
394 CfnOutput(
395 self,
396 "JobDlqUrl",
397 value=self.job_dlq.queue_url,
398 description=f"SQS Dead Letter Queue URL for {self.deployment_region}",
399 export_name=f"{project_name}-job-dlq-url-{self.deployment_region}",
400 )
    def _create_aws_custom_resource_role(self) -> None:
        """Pre-create the execution role shared by every ``AwsCustomResource``.

        CDK's ``cr.AwsCustomResource`` defaults to auto-generating a per-
        construct Lambda execution role from the ``policy=`` parameter.
        Internally, CDK deduplicates those auto-generated roles onto a
        single *singleton* provider Lambda (logical id prefix
        ``AWS679f53fac002430cb0da5b7982bd22872``), and merges each custom
        resource's policy statements onto that Lambda's role at stack
        create time. On cold deploys, CloudFormation invokes the Lambda
        within 2-3 seconds of attaching a new policy statement, which is
        faster than IAM's global propagation window. The symptom is a
        ``iam:PassRole NOT authorized`` failure on whichever addon role
        update happens to run right after its ``iam:PassRole`` policy
        statement was attached but before it had replicated.

        The fix is to create the role up front, attach every policy
        statement the stack will need during stack creation, and pass
        ``role=self.aws_custom_resource_role`` to every
        ``AwsCustomResource`` instead of ``policy=``. Because the role
        already exists — and its inline policy has had minutes to
        replicate by the time any ``AwsCustomResource`` actually fires —
        the race disappears entirely.

        This method creates the role with the statements we can compute
        without a cluster reference (EKS ``UpdateAddon`` / ``DescribeAddon``
        scoped to this cluster, and SSM ``GetParameter`` for the endpoint
        group ARN). ``iam:PassRole`` statements for individual addon
        roles (EFS CSI, FSx CSI, CloudWatch Observability) are appended
        by each ``_create_*_addon`` method after the corresponding IRSA
        role has been created, so every PassRole ``resources=`` list
        stays precise (no wildcards) and cdk-nag stays happy.

        Sets:
            self.aws_custom_resource_role: the shared ``iam.Role`` that
                every ``cr.AwsCustomResource`` in this stack must pass as
                its ``role=``.
        """
        project_name = self.config.get_project_name()
        global_region = self.config.get_global_region()

        self.aws_custom_resource_role = iam.Role(
            self,
            "AwsCustomResourceRole",
            assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
            description=(
                "Shared execution role for every cr.AwsCustomResource in this "
                "stack. Pre-created to avoid the IAM policy propagation race "
                "that occurs when CDK auto-generates per-CR roles and the "
                "singleton provider Lambda fires before the freshly-attached "
                "policy has replicated globally."
            ),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "service-role/AWSLambdaBasicExecutionRole"
                ),
            ],
        )

        # EKS UpdateAddon / DescribeAddon — used by the three updateAddon
        # custom resources (EFS CSI, FSx CSI, CloudWatch Observability).
        # Scoped to this cluster's addons by ARN.
        self.aws_custom_resource_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["eks:UpdateAddon", "eks:DescribeAddon"],
                resources=[
                    f"arn:aws:eks:{self.deployment_region}:{self.account}"
                    f":addon/{self.cluster_config.cluster_name}/*"
                ],
            )
        )

        # SSM GetParameter — used by the GetEndpointGroupArn custom
        # resource in _create_ga_registration_lambda to read the ARN of
        # the Global Accelerator endpoint group published by the global
        # stack during its deploy.
        self.aws_custom_resource_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["ssm:GetParameter"],
                resources=[
                    f"arn:aws:ssm:{global_region}:{self.account}" f":parameter/{project_name}/*"
                ],
            )
        )

        # cdk-nag suppressions: the two wildcard-bearing ARNs above are
        # intentional and both scoped as tightly as AWS IAM permits.
        #
        # - The ``eks:UpdateAddon`` / ``eks:DescribeAddon`` statement uses
        #   ``addon/<cluster>/*`` as its resource because the same shared
        #   role is consumed by three different updateAddon custom
        #   resources (EFS CSI, FSx CSI, CloudWatch Observability). Each
        #   addon has its own ARN and we'd otherwise need three separate
        #   statements that each grant access to a known addon name. The
        #   wildcard is scoped to a single cluster in a single region in
        #   a single account — it cannot be used against any addon
        #   belonging to a different cluster or a different service.
        #
        # - The ``ssm:GetParameter`` statement uses
        #   ``parameter/<project>/*`` because the exact parameter name
        #   (``endpoint-group-<region>-arn``) is only known at Global
        #   Accelerator registration time and the endpoint path
        #   structure is ``<project>/<parameter>``. Scoping to the
        #   project prefix restricts access to parameters owned by this
        #   project only.
        # Imported lazily, matching _apply_nag_suppressions.
        from cdk_nag import NagSuppressions

        NagSuppressions.add_resource_suppressions(
            self.aws_custom_resource_role,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "Scoped to a single EKS cluster's addons "
                        "(addon/<cluster>/*) and this project's SSM "
                        "parameters (parameter/<project>/*). Both wildcards "
                        "are as tight as AWS IAM permits: addon names and "
                        "parameter names are not known at stack synthesis "
                        "time because the addons are created later in the "
                        "same stack and the GA endpoint group ARN is "
                        "published by a separate stack during deploy. The "
                        "shared role pattern itself is deliberate — see "
                        "_create_aws_custom_resource_role docstring for why "
                        "we pre-create instead of letting CDK auto-generate "
                        "per-CR roles."
                    ),
                    "appliesTo": [
                        f"Resource::arn:aws:eks:{self.deployment_region}"
                        f":<AWS::AccountId>:addon/{self.cluster_config.cluster_name}/*",
                        f"Resource::arn:aws:ssm:{global_region}"
                        f":<AWS::AccountId>:parameter/{project_name}/*",
                    ],
                },
            ],
            apply_to_children=True,
        )
536 def _create_container_images(self) -> None:
537 """Create ECR repositories and build Docker images for services"""
539 # Create ECR repository for health monitor
540 self.health_monitor_repo = ecr.Repository(
541 self,
542 "HealthMonitorRepo",
543 # repository_name intentionally omitted - let CDK generate unique name
544 removal_policy=RemovalPolicy.DESTROY, # For dev/test; use RETAIN for production
545 empty_on_delete=True, # Clean up images on stack deletion
546 image_scan_on_push=True, # Enable vulnerability scanning on push
547 )
549 # All Docker images target AMD64 (x86_64) to match EKS Auto Mode's
550 # default system nodepool.
552 # Build and push health monitor Docker image
553 self.health_monitor_image = ecr_assets.DockerImageAsset(
554 self,
555 "HealthMonitorImage",
556 directory=".", # Root directory
557 file="dockerfiles/health-monitor-dockerfile",
558 platform=ecr_assets.Platform.LINUX_AMD64,
559 )
561 # Create ECR repository for manifest processor
562 self.manifest_processor_repo = ecr.Repository(
563 self,
564 "ManifestProcessorRepo",
565 # repository_name intentionally omitted - let CDK generate unique name
566 removal_policy=RemovalPolicy.DESTROY,
567 empty_on_delete=True,
568 image_scan_on_push=True, # Enable vulnerability scanning on push
569 )
571 # Build and push manifest processor Docker image
572 self.manifest_processor_image = ecr_assets.DockerImageAsset(
573 self,
574 "ManifestProcessorImage",
575 directory=".",
576 file="dockerfiles/manifest-processor-dockerfile",
577 platform=ecr_assets.Platform.LINUX_AMD64,
578 )
580 # Output image URIs for reference
581 CfnOutput(
582 self,
583 "HealthMonitorImageUri",
584 value=self.health_monitor_image.image_uri,
585 description="Health Monitor Docker image URI",
586 )
588 CfnOutput(
589 self,
590 "ManifestProcessorImageUri",
591 value=self.manifest_processor_image.image_uri,
592 description="Manifest Processor Docker image URI",
593 )
595 # Build and push inference monitor Docker image
596 self.inference_monitor_image = ecr_assets.DockerImageAsset(
597 self,
598 "InferenceMonitorImage",
599 directory=".",
600 file="dockerfiles/inference-monitor-dockerfile",
601 platform=ecr_assets.Platform.LINUX_AMD64,
602 )
604 CfnOutput(
605 self,
606 "InferenceMonitorImageUri",
607 value=self.inference_monitor_image.image_uri,
608 description="Inference Monitor Docker image URI",
609 )
611 # Build and push queue processor Docker image (if enabled).
612 # The queue processor is a KEDA ScaledJob that consumes manifests from
613 # the regional SQS queue. It can be disabled in cdk.json if users want
614 # to implement their own consumer. When disabled, the post-helm-sqs-consumer.yaml
615 # manifest is skipped (unreplaced template variables cause it to be skipped).
616 queue_processor_config = self.node.try_get_context("queue_processor") or {}
617 self.queue_processor_enabled = queue_processor_config.get("enabled", True)
619 if self.queue_processor_enabled: 619 ↛ exitline 619 didn't return from function '_create_container_images' because the condition on line 619 was always true
620 self.queue_processor_image = ecr_assets.DockerImageAsset(
621 self,
622 "QueueProcessorImage",
623 directory=".",
624 file="dockerfiles/queue-processor-dockerfile",
625 platform=ecr_assets.Platform.LINUX_AMD64,
626 )
628 CfnOutput(
629 self,
630 "QueueProcessorImageUri",
631 value=self.queue_processor_image.image_uri,
632 description="Queue Processor Docker image URI",
633 )
    def _create_eks_cluster(self, cluster_config: Any) -> None:
        """Create the EKS Auto Mode cluster, its IAM roles, and core add-ons.

        Args:
            cluster_config: Per-region cluster settings loaded from cdk.json
                (provides at least ``cluster_name`` and
                ``kubernetes_version`` as used below).
        """

        # Create cluster admin role
        # role_name intentionally omitted - let CDK generate unique name
        cluster_admin_role = iam.Role(
            self,
            "ClusterAdminRole",
            assumed_by=iam.ServicePrincipal("eks.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSClusterPolicy")
            ],
        )

        # Create node group role
        # role_name intentionally omitted - let CDK generate unique name
        # NOTE(review): this role is constructed but never assigned or
        # referenced anywhere in this method — presumably intended for node
        # bootstrap or legacy tooling; confirm it is still needed.
        iam.Role(
            self,
            "NodeGroupRole",
            assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKSWorkerNodePolicy"),
                iam.ManagedPolicy.from_aws_managed_policy_name("AmazonEKS_CNI_Policy"),
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AmazonEC2ContainerRegistryReadOnly"
                ),
            ],
        )

        # Create EKS Auto Mode cluster with built-in system and general-purpose nodepools
        # Auto Mode automatically manages compute resources and comes with essential addons
        # Get endpoint access configuration
        eks_config = self.config.get_eks_cluster_config()
        endpoint_access_mode = eks_config.get("endpoint_access", "PRIVATE")

        # Map config string to EKS EndpointAccess enum
        # (any value other than "PRIVATE" falls back to PUBLIC_AND_PRIVATE)
        endpoint_access = (
            eks.EndpointAccess.PRIVATE
            if endpoint_access_mode == "PRIVATE"
            else eks.EndpointAccess.PUBLIC_AND_PRIVATE
        )

        # Create KMS key for EKS secrets encryption
        # RETAIN: deleting the key would make existing encrypted secrets unreadable
        self.eks_encryption_key = kms.Key(
            self,
            "EksSecretsEncryptionKey",
            description="KMS key for EKS Kubernetes secrets encryption",
            enable_key_rotation=True,
            removal_policy=RemovalPolicy.RETAIN,
        )

        # Get Kubernetes version - use custom version if not available in CDK enum
        # e.g. "1.31" -> eks.KubernetesVersion.V1_31
        k8s_version_str = cluster_config.kubernetes_version
        try:
            k8s_version = getattr(eks.KubernetesVersion, f"V{k8s_version_str.replace('.', '_')}")
        except AttributeError:
            # Version not in CDK enum yet, use custom version
            k8s_version = eks.KubernetesVersion.of(k8s_version_str)

        self.cluster = eks.Cluster(
            self,
            "GCOEksCluster",
            cluster_name=cluster_config.cluster_name,
            version=k8s_version,  # Use configured version for Auto Mode with DRA support
            vpc=self.vpc,
            compute=eks.ComputeConfig(
                # Enable both built-in node pools - Auto Mode manages these automatically
                node_pools=["system", "general-purpose"]
            ),
            # SECURITY: Endpoint access controlled via cdk.json eks_cluster.endpoint_access
            #   PRIVATE (default): EKS API accessible only from within VPC - most secure
            #     Job submission works via API Gateway → Lambda (in VPC) or SQS
            #     For kubectl access, use a bastion host, VPN, or AWS SSM Session Manager
            #   PUBLIC_AND_PRIVATE: EKS API accessible from internet and VPC
            #     Allows direct kubectl access but less secure
            endpoint_access=endpoint_access,
            role=cluster_admin_role,
            vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)],
            # Enable all control plane logging for security and compliance
            cluster_logging=[
                eks.ClusterLoggingTypes.API,
                eks.ClusterLoggingTypes.AUDIT,
                eks.ClusterLoggingTypes.AUTHENTICATOR,
                eks.ClusterLoggingTypes.CONTROLLER_MANAGER,
                eks.ClusterLoggingTypes.SCHEDULER,
            ],
            # SECURITY: Enable envelope encryption for Kubernetes secrets using KMS
            secrets_encryption_key=self.eks_encryption_key,
        )

        # Auto Mode comes with essential addons pre-configured:
        # - AWS Load Balancer Controller (for ALB/NLB integration)
        # - CoreDNS, kube-proxy, VPC CNI (standard Kubernetes components)

        # OIDC provider for IRSA — the primary credential injection mechanism.
        # IRSA uses projected service-account tokens exchanged via the OIDC provider
        # for temporary AWS credentials. This works reliably on EKS Auto Mode.
        self.oidc_provider = eks.OidcProviderNative(
            self,
            "OidcProvider",
            url=self.cluster.cluster_open_id_connect_issuer_url,
        )

        # Pod Identity Agent add-on — registers the admission webhook that injects
        # Pod Identity credentials. On Auto Mode the DaemonSet schedules 0 pods
        # (the agent is built into the node), but the add-on registration is still
        # needed for the control-plane webhook. Kept as a secondary credential path.
        self._create_pod_identity_agent_addon()

        # Add Metrics Server add-on for HPA and resource monitoring
        self._create_metrics_server_addon()

        # Add EFS CSI Driver add-on for shared storage
        self._create_efs_csi_driver_addon()

        # Add CloudWatch Observability add-on for Container Insights metrics
        self._create_cloudwatch_observability_addon()

        # NOTE: GPU compute is configured via Karpenter NodePools (not managed node groups)
        # NodePool manifests are located in lambda/kubectl-applier-simple/manifests/:
        # - 40-nodepool-gpu-x86.yaml: x86_64 GPU instances (g4dn, g5, g6, g6e, p3)
        # - 41-nodepool-gpu-arm.yaml: ARM64 GPU instances (g5g)
        # - 42-nodepool-inference.yaml: inference-optimized GPU instances
        # - 43-nodepool-efa.yaml: EFA-enabled instances (p4d, p5, p6)
        # - 44-nodepool-neuron.yaml: Trainium/Inferentia instances
        # These will be applied by the kubectl Lambda custom resource (created below)

        # Create IRSA role for service account to access secrets
        self._create_service_account_role()

        # Create kubectl Lambda for applying Kubernetes manifests
        self._create_kubectl_lambda()
    # ── Shared toleration config for EKS add-ons ──────────────────────────
    # All GCO nodepools apply taints (nvidia.com/gpu, aws.amazon.com/neuron,
    # vpc.amazonaws.com/efa) that prevent DaemonSet pods from scheduling.
    # Every add-on that runs a DaemonSet (or may schedule on tainted nodes)
    # must tolerate these taints so that storage drivers, metrics agents, and
    # other infrastructure components work on every node type.
    # Class-level constant, consumed by the _create_*_addon methods via
    # self._ADDON_NODE_TOLERATIONS.
    _ADDON_NODE_TOLERATIONS: list[dict[str, str]] = [
        {"key": "nvidia.com/gpu", "operator": "Exists", "effect": "NoSchedule"},
        {"key": "aws.amazon.com/neuron", "operator": "Exists", "effect": "NoSchedule"},
        {"key": "vpc.amazonaws.com/efa", "operator": "Exists", "effect": "NoSchedule"},
    ]
780 def _create_pod_identity_agent_addon(self) -> None:
781 """Create EKS Pod Identity Agent add-on.
783 On Auto Mode the DaemonSet schedules 0 pods (the agent is built into
784 the node runtime), but the add-on registration is still required for
785 the control-plane admission webhook that injects Pod Identity tokens.
786 """
787 eks.Addon(
788 self,
789 "PodIdentityAgentAddon",
790 cluster=self.cluster, # type: ignore[arg-type]
791 addon_name="eks-pod-identity-agent",
792 addon_version=EKS_ADDON_POD_IDENTITY_AGENT,
793 preserve_on_delete=False,
794 configuration_values={
795 "tolerations": self._ADDON_NODE_TOLERATIONS,
796 },
797 )
799 def _create_metrics_server_addon(self) -> None:
800 """Create Metrics Server add-on for resource metrics.
802 The Metrics Server collects resource metrics from kubelets and exposes
803 them via the Kubernetes API server. This is required for:
804 - Horizontal Pod Autoscaler (HPA)
805 - Vertical Pod Autoscaler (VPA)
806 - kubectl top commands
807 - Resource monitoring dashboards
809 Note: Metrics Server doesn't require an IRSA role as it only needs
810 in-cluster permissions which are handled by its service account.
811 """
812 eks.Addon(
813 self,
814 "MetricsServerAddon",
815 cluster=self.cluster, # type: ignore[arg-type]
816 addon_name="metrics-server",
817 addon_version=EKS_ADDON_METRICS_SERVER,
818 preserve_on_delete=False,
819 configuration_values={
820 "tolerations": self._ADDON_NODE_TOLERATIONS,
821 },
822 )
824 def _create_efs_csi_driver_addon(self) -> None:
825 """Create EFS CSI Driver add-on for shared storage support.
827 The EFS CSI driver enables Kubernetes pods to mount EFS file systems
828 as persistent volumes. This is required for the shared storage feature.
830 We create a Pod Identity role for the EFS CSI driver and update the add-on
831 to use it via a custom resource after the add-on is created.
832 """
833 # Create IAM role for EFS CSI Driver using IRSA + Pod Identity
834 self.efs_csi_role = GCORegionalStack._create_irsa_role(
835 self,
836 "EfsCsiDriverRole",
837 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
838 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
839 service_account_names=["efs-csi-controller-sa"],
840 namespaces=["kube-system"],
841 )
843 # Add EFS CSI driver permissions
844 self.efs_csi_role.add_managed_policy(
845 iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEFSCSIDriverPolicy")
846 )
848 # Create EFS CSI Driver add-on
849 efs_addon = eks.Addon(
850 self,
851 "EfsCsiDriverAddon",
852 cluster=self.cluster, # type: ignore[arg-type]
853 addon_name="aws-efs-csi-driver",
854 addon_version=EKS_ADDON_EFS_CSI_DRIVER,
855 preserve_on_delete=False,
856 configuration_values={
857 "node": {
858 "tolerations": self._ADDON_NODE_TOLERATIONS,
859 },
860 "controller": {
861 "tolerations": self._ADDON_NODE_TOLERATIONS,
862 },
863 },
864 )
866 # Append the PassRole statement for the EFS CSI role to the shared
867 # AwsCustomResource execution role. See the role's creation in
868 # _create_aws_custom_resource_role for the full rationale on why
869 # we pre-create + attach up-front instead of letting CDK
870 # auto-generate per-CR roles.
871 self.aws_custom_resource_role.add_to_policy(
872 iam.PolicyStatement(
873 effect=iam.Effect.ALLOW,
874 actions=["iam:PassRole"],
875 resources=[self.efs_csi_role.role_arn],
876 )
877 )
879 # Update the add-on to use the IRSA role via custom resource
880 # This is needed because the eks v2 alpha Addon doesn't support service_account_role directly
881 update_addon = cr.AwsCustomResource(
882 self,
883 "UpdateEfsCsiAddonRole",
884 on_create=cr.AwsSdkCall(
885 service="EKS",
886 action="updateAddon",
887 parameters={
888 "clusterName": self.cluster.cluster_name,
889 "addonName": "aws-efs-csi-driver",
890 "serviceAccountRoleArn": self.efs_csi_role.role_arn,
891 },
892 physical_resource_id=cr.PhysicalResourceId.of(
893 f"{self.cluster.cluster_name}-efs-csi-role-update"
894 ),
895 ),
896 on_update=cr.AwsSdkCall(
897 service="EKS",
898 action="updateAddon",
899 parameters={
900 "clusterName": self.cluster.cluster_name,
901 "addonName": "aws-efs-csi-driver",
902 "serviceAccountRoleArn": self.efs_csi_role.role_arn,
903 },
904 ),
905 role=self.aws_custom_resource_role,
906 )
908 # Ensure the update happens after the add-on is created. We also
909 # depend on the shared execution role so CloudFormation has fully
910 # attached + replicated its inline policy before the Lambda fires.
911 update_addon.node.add_dependency(efs_addon)
912 update_addon.node.add_dependency(self.efs_csi_role)
913 update_addon.node.add_dependency(self.aws_custom_resource_role)
915 # Expose the update-addon resource so _apply_kubernetes_manifests can
916 # make the kubectl Lambda wait for the IRSA annotation patch to land
917 # before it tries to rollout-restart the efs-csi-controller. Without
918 # this ordering, the restart could fire before EKS has re-attached
919 # the role ARN, leaving the new pods just as credential-less as the
920 # old ones and causing every EFS CreateAccessPoint to fail with a
921 # 401 from IMDS.
922 self._efs_csi_addon_role_update = update_addon
924 def _create_cloudwatch_observability_addon(self) -> None:
925 """Create CloudWatch Observability add-on for Container Insights.
927 The CloudWatch Observability add-on enables Container Insights metrics
928 for the EKS cluster, providing visibility into:
929 - Cluster CPU and memory utilization
930 - Node-level metrics
931 - Pod and container metrics
932 - Application logs (optional)
934 These metrics are used by the monitoring dashboard to display
935 cluster health and resource utilization.
936 """
938 # Create IAM role for CloudWatch agent using IRSA + Pod Identity
939 self.cloudwatch_role = GCORegionalStack._create_irsa_role(
940 self,
941 "CloudWatchObservabilityRole",
942 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
943 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
944 service_account_names=["cloudwatch-agent"],
945 namespaces=["amazon-cloudwatch"],
946 )
948 # Add CloudWatch agent permissions
949 self.cloudwatch_role.add_managed_policy(
950 iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchAgentServerPolicy")
951 )
952 self.cloudwatch_role.add_managed_policy(
953 iam.ManagedPolicy.from_aws_managed_policy_name("AWSXrayWriteOnlyAccess")
954 )
956 # Create CloudWatch Observability add-on
957 cw_addon = eks.Addon(
958 self,
959 "CloudWatchObservabilityAddon",
960 cluster=self.cluster, # type: ignore[arg-type]
961 addon_name="amazon-cloudwatch-observability",
962 addon_version=EKS_ADDON_CLOUDWATCH_OBSERVABILITY,
963 preserve_on_delete=False,
964 configuration_values={
965 "tolerations": self._ADDON_NODE_TOLERATIONS,
966 # Enable Container Insights with application log collection
967 # Logs are sent to /aws/containerinsights/{cluster}/application
968 "containerLogs": {
969 "enabled": True,
970 },
971 },
972 )
974 # Append the PassRole statement for the CloudWatch Observability
975 # role to the shared AwsCustomResource execution role. See
976 # _create_aws_custom_resource_role for the full rationale.
977 self.aws_custom_resource_role.add_to_policy(
978 iam.PolicyStatement(
979 effect=iam.Effect.ALLOW,
980 actions=["iam:PassRole"],
981 resources=[self.cloudwatch_role.role_arn],
982 )
983 )
985 # Update the add-on to use the IRSA role via custom resource
986 update_cw_addon = cr.AwsCustomResource(
987 self,
988 "UpdateCloudWatchAddonRole",
989 on_create=cr.AwsSdkCall(
990 service="EKS",
991 action="updateAddon",
992 parameters={
993 "clusterName": self.cluster.cluster_name,
994 "addonName": "amazon-cloudwatch-observability",
995 "serviceAccountRoleArn": self.cloudwatch_role.role_arn,
996 },
997 physical_resource_id=cr.PhysicalResourceId.of(
998 f"{self.cluster.cluster_name}-cw-obs-role-update"
999 ),
1000 ),
1001 on_update=cr.AwsSdkCall(
1002 service="EKS",
1003 action="updateAddon",
1004 parameters={
1005 "clusterName": self.cluster.cluster_name,
1006 "addonName": "amazon-cloudwatch-observability",
1007 "serviceAccountRoleArn": self.cloudwatch_role.role_arn,
1008 },
1009 ),
1010 role=self.aws_custom_resource_role,
1011 )
1013 # Ensure the update happens after the add-on is created. Depend on
1014 # the shared execution role so CFN has fully attached + replicated
1015 # its inline policy before the Lambda fires. No CR→CR dependency
1016 # chain needed anymore — the race it was serializing against is
1017 # eliminated by pre-creating the role.
1018 update_cw_addon.node.add_dependency(cw_addon)
1019 update_cw_addon.node.add_dependency(self.cloudwatch_role)
1020 update_cw_addon.node.add_dependency(self.aws_custom_resource_role)
1022 # Expose the update-addon resource so _apply_kubernetes_manifests can
1023 # make the kubectl Lambda wait for the IRSA annotation patch to land
1024 # before it rollout-restarts the cloudwatch-agent DaemonSet. See the
1025 # EFS CSI equivalent for the full rationale — same race, same fix.
1026 self._cloudwatch_addon_role_update = update_cw_addon
1028 def _create_service_account_role(self) -> None:
1029 """Create IAM role for Kubernetes service account using EKS Pod Identity.
1031 Pod Identity is the recommended mechanism for EKS Auto Mode. It's simpler
1032 and more reliable than IRSA — no OIDC provider, no webhook injection, no
1033 projected tokens. EKS manages the credential injection automatically.
1035 This role can be assumed by the gco-service-account in:
1036 - gco-system namespace (for system services like health-monitor, manifest-processor)
1037 - gco-jobs namespace (for user jobs that need SQS access for KEDA scaling)
1038 - gco-inference namespace (for inference endpoints)
1039 """
1040 # Create IAM role with IRSA (OIDC) trust + Pod Identity trust
1041 #
1042 # The trust policy's `sub` condition must list every ServiceAccount
1043 # that needs to assume this role. Keep in sync with:
1044 # - lambda/kubectl-applier-simple/manifests/01-serviceaccounts.yaml
1045 # (gco-service-account)
1046 # - lambda/kubectl-applier-simple/manifests/02-rbac.yaml
1047 # (gco-health-monitor-sa, gco-manifest-processor-sa,
1048 # gco-inference-monitor-sa)
1049 # - lambda/kubectl-applier-simple/manifests/04a-jobs-serviceaccount.yaml
1050 # (gco-service-account in gco-jobs)
1051 self.service_account_role = GCORegionalStack._create_irsa_role(
1052 self,
1053 "ServiceAccountRole",
1054 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
1055 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
1056 service_account_names=[
1057 "gco-service-account",
1058 "gco-health-monitor-sa",
1059 "gco-manifest-processor-sa",
1060 "gco-inference-monitor-sa",
1061 ],
1062 namespaces=["gco-system", "gco-jobs", "gco-inference"],
1063 )
1065 # Grant permission to read the auth secret
1066 # Note: We use an explicit IAM policy statement with a wildcard (*) because:
1067 # 1. The secret is in a different region (API Gateway region)
1068 # 2. CDK's grant_read() generates a policy with ?????? suffix which requires
1069 # exactly 6 characters, but the SDK can call GetSecretValue with either
1070 # the full ARN (with suffix) or partial ARN (without suffix)
1071 # 3. Using * ensures both forms work correctly
1072 self.service_account_role.add_to_policy(
1073 iam.PolicyStatement(
1074 effect=iam.Effect.ALLOW,
1075 actions=[
1076 "secretsmanager:GetSecretValue",
1077 "secretsmanager:DescribeSecret",
1078 ],
1079 resources=[f"{self.auth_secret_arn}*"], # Wildcard to match with or without suffix
1080 )
1081 )
1083 # cdk-nag suppression: the trailing ``*`` on the auth secret
1084 # ARN above is intentional and is NOT a broad wildcard. Secrets
1085 # Manager appends a random 6-character suffix to every secret
1086 # ARN at creation time (``arn:...:secret:my-secret-AbC123``).
1087 # The secret lives in a separate stack (api_gateway_global_stack)
1088 # and is referenced here via a cross-stack token, so the actual
1089 # suffix is unknown at synth time. The wildcard matches the
1090 # suffix only — every finding under this rule is still scoped
1091 # to this single secret.
1092 from cdk_nag import NagSuppressions
1094 NagSuppressions.add_resource_suppressions(
1095 self.service_account_role,
1096 [
1097 {
1098 "id": "AwsSolutions-IAM5",
1099 "reason": (
1100 "The trailing ``*`` matches the 6-character "
1101 "random suffix Secrets Manager appends to secret "
1102 "ARNs. The secret is created in a different stack "
1103 "(api_gateway_global_stack) and referenced here "
1104 "via a cross-stack token, so the actual suffix "
1105 "isn't known at synth time. The wildcard is "
1106 "bounded to a single secret — it does not grant "
1107 "access to any other secret in the account."
1108 ),
1109 "appliesTo": [
1110 {"regex": "/^Resource::<GCOAuthSecret.*>\\*$/"},
1111 ],
1112 },
1113 ],
1114 apply_to_children=True,
1115 )
1117 # cdk-nag suppression: the ServiceAccountRole grants ec2:Describe*
1118 # and elasticloadbalancing:Describe* for the AWS Load Balancer
1119 # Controller. These AWS APIs do not support resource-level IAM
1120 # scoping — Resource: * is the only valid form.
1121 NagSuppressions.add_resource_suppressions(
1122 self.service_account_role,
1123 [
1124 {
1125 "id": "AwsSolutions-IAM5",
1126 "reason": (
1127 "The ServiceAccountRole grants ec2:Describe* and "
1128 "elasticloadbalancing:Describe* for the AWS Load Balancer "
1129 "Controller. These AWS APIs do not support resource-level "
1130 "IAM scoping — Resource: * is the only valid form. See "
1131 "https://docs.aws.amazon.com/service-authorization/latest/"
1132 "reference/list_amazonec2.html"
1133 ),
1134 "appliesTo": ["Resource::*"],
1135 },
1136 ],
1137 apply_to_children=True,
1138 )
1140 # Add permissions for AWS Load Balancer Controller
1141 self.service_account_role.add_to_policy(
1142 iam.PolicyStatement(
1143 effect=iam.Effect.ALLOW,
1144 actions=[
1145 "ec2:DescribeAccountAttributes",
1146 "ec2:DescribeAddresses",
1147 "ec2:DescribeAvailabilityZones",
1148 "ec2:DescribeInternetGateways",
1149 "ec2:DescribeVpcs",
1150 "ec2:DescribeVpcPeeringConnections",
1151 "ec2:DescribeSubnets",
1152 "ec2:DescribeSecurityGroups",
1153 "ec2:DescribeInstances",
1154 "ec2:DescribeNetworkInterfaces",
1155 "ec2:DescribeTags",
1156 "ec2:GetCoipPoolUsage",
1157 "ec2:DescribeCoipPools",
1158 "elasticloadbalancing:DescribeLoadBalancers",
1159 "elasticloadbalancing:DescribeLoadBalancerAttributes",
1160 "elasticloadbalancing:DescribeListeners",
1161 "elasticloadbalancing:DescribeListenerCertificates",
1162 "elasticloadbalancing:DescribeSSLPolicies",
1163 "elasticloadbalancing:DescribeRules",
1164 "elasticloadbalancing:DescribeTargetGroups",
1165 "elasticloadbalancing:DescribeTargetGroupAttributes",
1166 "elasticloadbalancing:DescribeTargetHealth",
1167 "elasticloadbalancing:DescribeTags",
1168 ],
1169 resources=["*"],
1170 )
1171 )
1173 self.service_account_role.add_to_policy(
1174 iam.PolicyStatement(
1175 effect=iam.Effect.ALLOW,
1176 actions=[
1177 "elasticloadbalancing:CreateLoadBalancer",
1178 "elasticloadbalancing:CreateTargetGroup",
1179 "elasticloadbalancing:CreateListener",
1180 "elasticloadbalancing:DeleteLoadBalancer",
1181 "elasticloadbalancing:DeleteTargetGroup",
1182 "elasticloadbalancing:DeleteListener",
1183 "elasticloadbalancing:ModifyLoadBalancerAttributes",
1184 "elasticloadbalancing:ModifyTargetGroup",
1185 "elasticloadbalancing:ModifyTargetGroupAttributes",
1186 "elasticloadbalancing:ModifyListener",
1187 "elasticloadbalancing:RegisterTargets",
1188 "elasticloadbalancing:DeregisterTargets",
1189 "elasticloadbalancing:SetWebAcl",
1190 "elasticloadbalancing:SetSecurityGroups",
1191 "elasticloadbalancing:SetSubnets",
1192 "elasticloadbalancing:AddTags",
1193 "elasticloadbalancing:RemoveTags",
1194 ],
1195 resources=["*"],
1196 )
1197 )
1199 self.service_account_role.add_to_policy(
1200 iam.PolicyStatement(
1201 effect=iam.Effect.ALLOW,
1202 actions=[
1203 "ec2:CreateSecurityGroup",
1204 "ec2:CreateTags",
1205 "ec2:DeleteTags",
1206 "ec2:AuthorizeSecurityGroupIngress",
1207 "ec2:RevokeSecurityGroupIngress",
1208 "ec2:DeleteSecurityGroup",
1209 ],
1210 resources=["*"],
1211 )
1212 )
1214 self.service_account_role.add_to_policy(
1215 iam.PolicyStatement(
1216 effect=iam.Effect.ALLOW,
1217 actions=["iam:CreateServiceLinkedRole"],
1218 resources=["*"],
1219 conditions={
1220 "StringEquals": {"iam:AWSServiceName": "elasticloadbalancing.amazonaws.com"}
1221 },
1222 )
1223 )
1225 self.service_account_role.add_to_policy(
1226 iam.PolicyStatement(
1227 effect=iam.Effect.ALLOW,
1228 actions=[
1229 "wafv2:GetWebACL",
1230 "wafv2:GetWebACLForResource",
1231 "wafv2:AssociateWebACL",
1232 "wafv2:DisassociateWebACL",
1233 ],
1234 resources=["*"],
1235 )
1236 )
1238 self.service_account_role.add_to_policy(
1239 iam.PolicyStatement(
1240 effect=iam.Effect.ALLOW,
1241 actions=[
1242 "shield:GetSubscriptionState",
1243 "shield:DescribeProtection",
1244 "shield:CreateProtection",
1245 "shield:DeleteProtection",
1246 ],
1247 resources=["*"],
1248 )
1249 )
1251 self.service_account_role.add_to_policy(
1252 iam.PolicyStatement(
1253 effect=iam.Effect.ALLOW,
1254 actions=["acm:ListCertificates", "acm:DescribeCertificate"],
1255 resources=["*"],
1256 )
1257 )
1259 self.service_account_role.add_to_policy(
1260 iam.PolicyStatement(
1261 effect=iam.Effect.ALLOW,
1262 actions=["cognito-idp:DescribeUserPoolClient"],
1263 resources=["*"],
1264 )
1265 )
1267 # Add SQS permissions for KEDA to scale based on queue depth
1268 self.service_account_role.add_to_policy(
1269 iam.PolicyStatement(
1270 effect=iam.Effect.ALLOW,
1271 actions=[
1272 "sqs:GetQueueAttributes",
1273 "sqs:GetQueueUrl",
1274 "sqs:ReceiveMessage",
1275 "sqs:DeleteMessage",
1276 "sqs:SendMessage",
1277 ],
1278 resources=[
1279 self.job_queue.queue_arn,
1280 self.job_dlq.queue_arn,
1281 ],
1282 )
1283 )
1285 # Add CloudWatch permissions for publishing custom metrics
1286 # Used by health-monitor and manifest-processor to publish metrics
1287 self.service_account_role.add_to_policy(
1288 iam.PolicyStatement(
1289 effect=iam.Effect.ALLOW,
1290 actions=["cloudwatch:PutMetricData"],
1291 resources=["*"],
1292 conditions={
1293 "StringEquals": {
1294 "cloudwatch:namespace": [
1295 "GCO/HealthMonitor",
1296 "GCO/ManifestProcessor",
1297 ]
1298 }
1299 },
1300 )
1301 )
1303 # Add DynamoDB permissions for templates, webhooks, and job queue
1304 # Tables are created in the global stack and accessed from all regions
1305 project_name = self.config.get_project_name()
1306 global_region = self.config.get_global_region()
1308 self.service_account_role.add_to_policy(
1309 iam.PolicyStatement(
1310 effect=iam.Effect.ALLOW,
1311 actions=[
1312 "dynamodb:GetItem",
1313 "dynamodb:PutItem",
1314 "dynamodb:UpdateItem",
1315 "dynamodb:DeleteItem",
1316 "dynamodb:Query",
1317 "dynamodb:Scan",
1318 ],
1319 resources=[
1320 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates",
1321 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-job-templates/index/*",
1322 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks",
1323 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-webhooks/index/*",
1324 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs",
1325 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-jobs/index/*",
1326 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints",
1327 f"arn:aws:dynamodb:{global_region}:{self.account}:table/{project_name}-inference-endpoints/index/*",
1328 ],
1329 )
1330 )
1332 # Add S3 permissions for model weights bucket (used by inference init containers)
1333 self.service_account_role.add_to_policy(
1334 iam.PolicyStatement(
1335 effect=iam.Effect.ALLOW,
1336 actions=[
1337 "s3:GetObject",
1338 "s3:ListBucket",
1339 ],
1340 resources=[
1341 f"arn:aws:s3:::{project_name}-*",
1342 f"arn:aws:s3:::{project_name}-*/*",
1343 ],
1344 )
1345 )
1347 # KMS decrypt for model weights bucket (S3-scoped)
1348 self.service_account_role.add_to_policy(
1349 iam.PolicyStatement(
1350 effect=iam.Effect.ALLOW,
1351 actions=["kms:Decrypt", "kms:GenerateDataKey"],
1352 resources=[f"arn:aws:kms:*:{self.account}:key/*"],
1353 conditions={
1354 "StringLike": {
1355 "kms:ViaService": "s3.*.amazonaws.com",
1356 }
1357 },
1358 )
1359 )
1361 # Create KEDA operator IAM role for SQS access
1362 self._create_keda_operator_role()
1364 # Create Pod Identity Associations for all service accounts
1365 self._create_pod_identity_associations()
1367 def _create_keda_operator_role(self) -> None:
1368 """Create IAM role for KEDA operator service account using EKS Pod Identity.
1370 This role allows the KEDA operator to access SQS queues for scaling
1371 based on queue depth. The role is assumed by the keda-operator service
1372 account in the keda namespace.
1373 """
1374 # Create IAM role with IRSA (OIDC) trust + Pod Identity trust
1375 self.keda_operator_role = GCORegionalStack._create_irsa_role(
1376 self,
1377 "KedaOperatorRole",
1378 oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
1379 oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
1380 service_account_names=["keda-operator"],
1381 namespaces=["keda"],
1382 )
1384 # Add SQS permissions for KEDA to read queue metrics
1385 self.keda_operator_role.add_to_policy(
1386 iam.PolicyStatement(
1387 effect=iam.Effect.ALLOW,
1388 actions=[
1389 "sqs:GetQueueAttributes",
1390 "sqs:GetQueueUrl",
1391 ],
1392 resources=[
1393 self.job_queue.queue_arn,
1394 self.job_dlq.queue_arn,
1395 ],
1396 )
1397 )
1399 def _create_pod_identity_associations(self) -> None:
1400 """Create EKS Pod Identity Associations for all service accounts.
1402 Pod Identity is the recommended mechanism for EKS Auto Mode. Each
1403 association links an IAM role to a Kubernetes service account in a
1404 specific namespace. EKS manages credential injection automatically.
1406 Stores associations in self._pod_identity_associations so the
1407 kubectl-applier custom resource can declare an explicit dependency,
1408 ensuring credentials are available before workloads start.
1409 """
1410 self._pod_identity_associations: list[Any] = []
1412 # GCO service account — used by health-monitor, manifest-processor, inference-monitor
1413 for namespace in ["gco-system", "gco-jobs", "gco-inference"]:
1414 assoc = eks_l1.CfnPodIdentityAssociation(
1415 self,
1416 f"PodIdentity-gco-sa-{namespace}",
1417 cluster_name=self.cluster.cluster_name,
1418 namespace=namespace,
1419 service_account="gco-service-account",
1420 role_arn=self.service_account_role.role_arn,
1421 )
1422 self._pod_identity_associations.append(assoc)
1424 # KEDA operator — needs SQS access for queue-based scaling
1425 keda_assoc = eks_l1.CfnPodIdentityAssociation(
1426 self,
1427 "PodIdentity-keda-operator",
1428 cluster_name=self.cluster.cluster_name,
1429 namespace="keda",
1430 service_account="keda-operator",
1431 role_arn=self.keda_operator_role.role_arn,
1432 )
1433 self._pod_identity_associations.append(keda_assoc)
1435 # EFS CSI driver — needs EFS access for shared storage
1436 efs_assoc = eks_l1.CfnPodIdentityAssociation(
1437 self,
1438 "PodIdentity-efs-csi",
1439 cluster_name=self.cluster.cluster_name,
1440 namespace="kube-system",
1441 service_account="efs-csi-controller-sa",
1442 role_arn=self.efs_csi_role.role_arn,
1443 )
1444 self._pod_identity_associations.append(efs_assoc)
1446 # CloudWatch agent — needs CloudWatch access for observability
1447 cw_assoc = eks_l1.CfnPodIdentityAssociation(
1448 self,
1449 "PodIdentity-cloudwatch",
1450 cluster_name=self.cluster.cluster_name,
1451 namespace="amazon-cloudwatch",
1452 service_account="cloudwatch-agent",
1453 role_arn=self.cloudwatch_role.role_arn,
1454 )
1455 self._pod_identity_associations.append(cw_assoc)
1457 # FSx CSI driver — only when FSx is enabled (created later in _create_fsx_lustre)
1458 # The FSx Pod Identity association is added in _create_fsx_lustre instead
1460 def _create_kubectl_lambda(self) -> None:
1461 """Create Lambda function to apply Kubernetes manifests using Python client.
1463 Note: This creates the Lambda and provider but does NOT create the custom resource.
1464 The custom resource is created in _apply_kubernetes_manifests() after ALB is created,
1465 so that target group ARNs can be passed to the manifests.
1466 """
1467 project_name = self.config.get_project_name()
1469 # Create IAM role for kubectl Lambda
1470 kubectl_lambda_role = iam.Role(
1471 self,
1472 "KubectlLambdaRole",
1473 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
1474 managed_policies=[
1475 iam.ManagedPolicy.from_aws_managed_policy_name(
1476 "service-role/AWSLambdaVPCAccessExecutionRole"
1477 ),
1478 iam.ManagedPolicy.from_aws_managed_policy_name(
1479 "service-role/AWSLambdaBasicExecutionRole"
1480 ),
1481 ],
1482 )
1484 # Add EKS permissions
1485 kubectl_lambda_role.add_to_policy(
1486 iam.PolicyStatement(
1487 actions=[
1488 "eks:DescribeCluster",
1489 "eks:ListClusters",
1490 ],
1491 resources=[self.cluster.cluster_arn],
1492 )
1493 )
1495 # Add permissions to assume cluster admin role
1496 kubectl_lambda_role.add_to_policy(
1497 iam.PolicyStatement(actions=["sts:AssumeRole"], resources=["*"])
1498 )
1500 # Create security group for kubectl Lambda
1501 kubectl_lambda_sg = ec2.SecurityGroup(
1502 self,
1503 "KubectlLambdaSG",
1504 vpc=self.vpc,
1505 description="Security group for kubectl Lambda to access EKS cluster",
1506 security_group_name=f"{self.config.get_project_name()}-kubectl-lambda-sg-{self.deployment_region}",
1507 allow_all_outbound=True, # Lambda needs outbound access to EKS API
1508 )
1510 # Allow Lambda security group to access EKS cluster security group on port 443
1511 # The EKS cluster security group is automatically created by EKS
1512 self.cluster.cluster_security_group.add_ingress_rule(
1513 peer=kubectl_lambda_sg,
1514 connection=ec2.Port.tcp(443),
1515 description="Allow kubectl Lambda to access EKS API",
1516 )
1518 # Create Lambda function (Python-only, no Docker!)
1519 # Store function name as string attribute for cross-stack references
1520 # This avoids CDK cross-environment resolution issues when account is unresolved
1521 self.kubectl_lambda_function_name = f"{project_name}-kubectl-{self.deployment_region}"
1522 self.kubectl_lambda = lambda_.Function(
1523 self,
1524 "KubectlApplierFunction",
1525 function_name=self.kubectl_lambda_function_name,
1526 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
1527 handler="handler.lambda_handler",
1528 code=lambda_.Code.from_asset("lambda/kubectl-applier-simple-build"),
1529 timeout=Duration.minutes(15), # Max Lambda timeout
1530 memory_size=512,
1531 role=kubectl_lambda_role,
1532 vpc=self.vpc,
1533 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
1534 security_groups=[kubectl_lambda_sg], # Use the security group we created
1535 environment={
1536 "CLUSTER_NAME": self.cluster.cluster_name,
1537 "REGION": self.deployment_region,
1538 },
1539 tracing=lambda_.Tracing.ACTIVE,
1540 )
1542 # Add EKS access entry for the Lambda role to authenticate with the cluster
1543 # This grants the Lambda role cluster admin permissions
1544 eks.AccessEntry(
1545 self,
1546 "KubectlLambdaAccessEntry",
1547 cluster=self.cluster, # type: ignore[arg-type]
1548 principal=kubectl_lambda_role.role_arn,
1549 access_policies=[
1550 eks.AccessPolicy.from_access_policy_name(
1551 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
1552 )
1553 ],
1554 )
1556 # Create log group for kubectl provider
1557 kubectl_provider_log_group = logs.LogGroup(
1558 self,
1559 "KubectlProviderLogGroup",
1560 retention=logs.RetentionDays.ONE_WEEK,
1561 removal_policy=RemovalPolicy.DESTROY,
1562 )
1564 # Create custom resource provider (stored for use in _apply_kubernetes_manifests)
1565 self.kubectl_provider = cr.Provider(
1566 self,
1567 "KubectlProvider",
1568 on_event_handler=self.kubectl_lambda,
1569 log_group=kubectl_provider_log_group,
1570 )
1572 # cdk-nag suppression: the kubectl-applier Lambda requires broad
1573 # EKS and Kubernetes API access to apply arbitrary manifests.
1574 from cdk_nag import NagSuppressions
1576 NagSuppressions.add_resource_suppressions(
1577 kubectl_lambda_role,
1578 [
1579 {
1580 "id": "AwsSolutions-IAM5",
1581 "reason": (
1582 "The kubectl-applier Lambda requires broad EKS and Kubernetes API "
1583 "access to apply arbitrary manifests (RBAC, ServiceAccounts, "
1584 "Deployments, Jobs, NetworkPolicies) across multiple namespaces. "
1585 "Resource: * is required because the set of Kubernetes resources "
1586 "is dynamic and not known at synth time."
1587 ),
1588 "appliesTo": ["Resource::*"],
1589 },
1590 ],
1591 apply_to_children=True,
1592 )
1594 def _apply_kubernetes_manifests(self) -> None:
1595 """Apply Kubernetes manifests using the kubectl Lambda custom resource.
1597 This is called after ALB security group and EFS are created.
1598 The Ingress will use the security group ID to create the ALB.
1599 """
1601 # Get public subnet IDs for Ingress annotation (currently unused but kept for future use)
1602 # public_subnet_ids = ",".join([subnet.subnet_id for subnet in self.vpc.public_subnets])
1604 # Apply manifests using custom resource
1605 # Build image replacements dict
1606 # Include a deployment timestamp to force pod rollouts when code changes
1607 from datetime import UTC, datetime
1609 deployment_timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
1611 # Get resource thresholds from config
1612 thresholds = self.config.get_resource_thresholds()
1614 # Get manifest processor resource quotas.
1615 # Resource quotas and the security/image policy now live under the
1616 # shared job_validation_policy section because both the REST
1617 # manifest_processor and the SQS queue_processor read them. Service-
1618 # specific knobs (replicas, validation_enabled, max_request_body_bytes,
1619 # etc.) stay under manifest_processor.
1620 mp_config = self.node.try_get_context("manifest_processor") or {}
1621 job_policy = self.node.try_get_context("job_validation_policy") or {}
1622 job_quotas = job_policy.get("resource_quotas", {})
1624 image_replacements = {
1625 "{{HEALTH_MONITOR_IMAGE}}": self.health_monitor_image.image_uri,
1626 "{{MANIFEST_PROCESSOR_IMAGE}}": self.manifest_processor_image.image_uri,
1627 "{{INFERENCE_MONITOR_IMAGE}}": self.inference_monitor_image.image_uri,
1628 "{{CLUSTER_NAME}}": self.cluster.cluster_name,
1629 "{{REGION}}": self.deployment_region,
1630 "{{AUTH_SECRET_ARN}}": self.auth_secret_arn,
1631 "{{SERVICE_ACCOUNT_ROLE_ARN}}": self.service_account_role.role_arn,
1632 "{{EFS_FILE_SYSTEM_ID}}": self.efs_file_system.file_system_id,
1633 "{{EFS_ACCESS_POINT_ID}}": self.efs_access_point.access_point_id,
1634 "{{JOB_QUEUE_URL}}": self.job_queue.queue_url,
1635 "{{JOB_QUEUE_ARN}}": self.job_queue.queue_arn,
1636 "{{DEPLOYMENT_TIMESTAMP}}": deployment_timestamp,
1637 # Resource thresholds
1638 "{{CPU_THRESHOLD}}": str(thresholds.cpu_threshold),
1639 "{{MEMORY_THRESHOLD}}": str(thresholds.memory_threshold),
1640 "{{GPU_THRESHOLD}}": str(thresholds.gpu_threshold),
1641 "{{PENDING_PODS_THRESHOLD}}": str(thresholds.pending_pods_threshold),
1642 "{{PENDING_REQUESTED_CPU_VCPUS}}": str(thresholds.pending_requested_cpu_vcpus),
1643 "{{PENDING_REQUESTED_MEMORY_GB}}": str(thresholds.pending_requested_memory_gb),
1644 "{{PENDING_REQUESTED_GPUS}}": str(thresholds.pending_requested_gpus),
1645 # DynamoDB table names (from global stack)
1646 "{{TEMPLATES_TABLE_NAME}}": f"{self.config.get_project_name()}-job-templates",
1647 "{{WEBHOOKS_TABLE_NAME}}": f"{self.config.get_project_name()}-webhooks",
1648 "{{JOBS_TABLE_NAME}}": f"{self.config.get_project_name()}-jobs",
1649 # DynamoDB region (global stack region, may differ from cluster region)
1650 "{{DYNAMODB_REGION}}": self.config.get_global_region(),
1651 # Manifest processor resource quotas (sourced from shared policy).
1652 "{{MP_MAX_CPU_PER_MANIFEST}}": str(job_quotas.get("max_cpu_per_manifest", "10")),
1653 "{{MP_MAX_MEMORY_PER_MANIFEST}}": str(
1654 job_quotas.get("max_memory_per_manifest", "32Gi")
1655 ),
1656 "{{MP_MAX_GPU_PER_MANIFEST}}": str(job_quotas.get("max_gpu_per_manifest", 4)),
1657 # Manifest processor namespace allowlist (sourced from shared policy).
1658 # Both the REST manifest processor and the SQS queue processor
1659 # read from job_validation_policy.allowed_namespaces so a single
1660 # edit takes effect on both submission paths at the next deploy.
1661 "{{MP_ALLOWED_NAMESPACES}}": ",".join(
1662 job_policy.get("allowed_namespaces", ["default", "gco-jobs"])
1663 ),
1664 # Manifest processor request body size cap (HTTP 413 middleware).
1665 # Lives at cdk.json::manifest_processor.max_request_body_bytes.
1666 "{{MP_MAX_REQUEST_BODY_BYTES}}": str(
1667 mp_config.get("max_request_body_bytes", 1_048_576)
1668 ),
1669 }
1671 # Add queue processor replacements if enabled
1672 qp_config = self.node.try_get_context("queue_processor") or {}
1674 # Add VPC endpoint CIDR replacements for network policy restrictions
1675 # Generates a YAML block of ipBlock entries from the vpc_endpoint_cidrs array.
1676 # The placeholder {{VPC_ENDPOINT_CIDR_BLOCKS}} sits at 8-space indentation in
1677 # the manifest, so the first entry needs no leading indent (the manifest provides
1678 # it) and subsequent entries are indented to align.
1679 vpc_endpoint_cidrs = self.node.try_get_context("vpc_endpoint_cidrs") or ["10.0.0.0/16"]
1680 cidr_lines = []
1681 for i, cidr in enumerate(vpc_endpoint_cidrs):
1682 prefix = "" if i == 0 else " "
1683 cidr_lines.append(f'{prefix}- ipBlock:\n cidr: "{cidr}"')
1684 image_replacements["{{VPC_ENDPOINT_CIDR_BLOCKS}}"] = "\n".join(cidr_lines)
1686 # Resource governance for gco-jobs namespace: ResourceQuota caps aggregate
1687 # resource consumption across the namespace, LimitRange caps per-container
1688 # maxima. Values come from cdk.json `resource_quota` context with defaults
1689 # sized for a modest multi-tenant dev cluster.
1690 resource_quota = self.node.try_get_context("resource_quota") or {}
1691 image_replacements["{{QUOTA_MAX_CPU}}"] = str(resource_quota.get("max_cpu", "100"))
1692 image_replacements["{{QUOTA_MAX_MEMORY}}"] = str(resource_quota.get("max_memory", "512Gi"))
1693 image_replacements["{{QUOTA_MAX_GPU}}"] = str(resource_quota.get("max_gpu", "32"))
1694 image_replacements["{{QUOTA_MAX_PODS}}"] = str(resource_quota.get("max_pods", "50"))
1695 image_replacements["{{LIMIT_MAX_CPU}}"] = str(resource_quota.get("container_max_cpu", "10"))
1696 image_replacements["{{LIMIT_MAX_MEMORY}}"] = str(
1697 resource_quota.get("container_max_memory", "64Gi")
1698 )
1699 image_replacements["{{LIMIT_MAX_GPU}}"] = str(resource_quota.get("container_max_gpu", "4"))
1701 if self.queue_processor_enabled: 1701 ↛ 1775line 1701 didn't jump to line 1775 because the condition on line 1701 was always true
1702 image_replacements["{{QUEUE_PROCESSOR_IMAGE}}"] = self.queue_processor_image.image_uri
1703 image_replacements["{{QP_POLLING_INTERVAL}}"] = str(
1704 qp_config.get("polling_interval", 10)
1705 )
1706 image_replacements["{{QP_MAX_CONCURRENT_JOBS}}"] = str(
1707 qp_config.get("max_concurrent_jobs", 10)
1708 )
1709 image_replacements["{{QP_MESSAGES_PER_JOB}}"] = str(
1710 qp_config.get("messages_per_job", 1)
1711 )
1712 image_replacements["{{QP_SUCCESSFUL_JOBS_HISTORY}}"] = str(
1713 qp_config.get("successful_jobs_history", 20)
1714 )
1715 image_replacements["{{QP_FAILED_JOBS_HISTORY}}"] = str(
1716 qp_config.get("failed_jobs_history", 10)
1717 )
1718 image_replacements["{{QP_ALLOWED_NAMESPACES}}"] = ",".join(
1719 job_policy.get("allowed_namespaces", ["default", "gco-jobs"])
1720 )
1721 # Resource caps, image allowlist, and security policy are shared
1722 # with the REST manifest processor. Source them from the
1723 # job_validation_policy section so a single change in cdk.json
1724 # takes effect on both submission paths at the next deploy.
1725 image_replacements["{{QP_MAX_GPU_PER_MANIFEST}}"] = str(
1726 job_quotas.get("max_gpu_per_manifest", 4)
1727 )
1728 image_replacements["{{QP_MAX_CPU_PER_MANIFEST}}"] = str(
1729 job_quotas.get("max_cpu_per_manifest", "10")
1730 )
1731 image_replacements["{{QP_MAX_MEMORY_PER_MANIFEST}}"] = str(
1732 job_quotas.get("max_memory_per_manifest", "32Gi")
1733 )
1734 image_replacements["{{QP_TRUSTED_REGISTRIES}}"] = ",".join(
1735 job_policy.get("trusted_registries", [])
1736 )
1737 image_replacements["{{QP_TRUSTED_DOCKERHUB_ORGS}}"] = ",".join(
1738 job_policy.get("trusted_dockerhub_orgs", [])
1739 )
1741 # Security policy toggles — shared with the REST manifest_processor.
1742 # Both services read the same cdk.json section so a single policy
1743 # flip (e.g. block_run_as_root: true) takes effect on both paths.
1744 security_policy = job_policy.get("manifest_security_policy", {})
1746 def _policy_str(v: object) -> str:
1747 return "true" if v else "false"
1749 image_replacements["{{QP_BLOCK_PRIVILEGED}}"] = _policy_str(
1750 security_policy.get("block_privileged", True)
1751 )
1752 image_replacements["{{QP_BLOCK_PRIVILEGE_ESCALATION}}"] = _policy_str(
1753 security_policy.get("block_privilege_escalation", True)
1754 )
1755 image_replacements["{{QP_BLOCK_HOST_NETWORK}}"] = _policy_str(
1756 security_policy.get("block_host_network", True)
1757 )
1758 image_replacements["{{QP_BLOCK_HOST_PID}}"] = _policy_str(
1759 security_policy.get("block_host_pid", True)
1760 )
1761 image_replacements["{{QP_BLOCK_HOST_IPC}}"] = _policy_str(
1762 security_policy.get("block_host_ipc", True)
1763 )
1764 image_replacements["{{QP_BLOCK_HOST_PATH}}"] = _policy_str(
1765 security_policy.get("block_host_path", True)
1766 )
1767 image_replacements["{{QP_BLOCK_ADDED_CAPABILITIES}}"] = _policy_str(
1768 security_policy.get("block_added_capabilities", True)
1769 )
1770 image_replacements["{{QP_BLOCK_RUN_AS_ROOT}}"] = _policy_str(
1771 security_policy.get("block_run_as_root", False)
1772 )
1774 # Add Valkey endpoint if enabled
1775 if hasattr(self, "valkey_cache") and self.valkey_cache: 1775 ↛ 1776line 1775 didn't jump to line 1776 because the condition on line 1775 was never true
1776 image_replacements["{{VALKEY_ENDPOINT}}"] = self.valkey_cache.attr_endpoint_address
1777 image_replacements["{{VALKEY_PORT}}"] = self.valkey_cache.attr_endpoint_port
1779 # Add Aurora pgvector endpoint if enabled
1780 if hasattr(self, "aurora_cluster") and self.aurora_cluster:
1781 image_replacements["{{AURORA_PGVECTOR_ENDPOINT}}"] = (
1782 self.aurora_cluster.cluster_endpoint.hostname
1783 )
1784 image_replacements["{{AURORA_PGVECTOR_READER_ENDPOINT}}"] = (
1785 self.aurora_cluster.cluster_read_endpoint.hostname
1786 )
1787 image_replacements["{{AURORA_PGVECTOR_PORT}}"] = str(
1788 self.aurora_cluster.cluster_endpoint.port
1789 )
1790 if self.aurora_cluster.secret: 1790 ↛ 1796line 1790 didn't jump to line 1796 because the condition on line 1790 was always true
1791 image_replacements["{{AURORA_PGVECTOR_SECRET_ARN}}"] = (
1792 self.aurora_cluster.secret.secret_arn
1793 )
1795 # Add FSx replacements if enabled
1796 if self.fsx_file_system:
1797 image_replacements["{{FSX_FILE_SYSTEM_ID}}"] = self.fsx_file_system.ref
1798 image_replacements["{{FSX_DNS_NAME}}"] = self.fsx_file_system.attr_dns_name
1799 image_replacements["{{FSX_MOUNT_NAME}}"] = self.fsx_file_system.attr_lustre_mount_name
1800 image_replacements["{{PRIVATE_SUBNET_ID}}"] = self.vpc.private_subnets[0].subnet_id
1801 image_replacements["{{FSX_SECURITY_GROUP_ID}}"] = (
1802 self.fsx_security_group.security_group_id
1803 )
1805 kubectl_apply = CustomResource(
1806 self,
1807 "KubectlApplyManifests",
1808 service_token=self.kubectl_provider.service_token,
1809 properties={
1810 "ClusterName": self.cluster.cluster_name,
1811 "Region": self.deployment_region,
1812 "SkipDeletionOnStackDelete": "true", # Don't delete resources on stack deletion
1813 "ImageReplacements": image_replacements,
1814 # Include FSx file system ID directly to force update when FSx changes
1815 "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none",
1816 # Force update on each deployment to trigger pod rollouts
1817 "DeploymentTimestamp": deployment_timestamp,
1818 },
1819 )
1821 # Ensure manifests are applied after cluster, EFS, and FSx are ready
1822 # Note: ALB is created by EKS Auto Mode when Ingress is applied
1823 kubectl_apply.node.add_dependency(self.cluster)
1824 kubectl_apply.node.add_dependency(self.efs_file_system)
1825 if self.fsx_file_system:
1826 kubectl_apply.node.add_dependency(self.fsx_file_system)
1828 # Wait for EKS to have patched the IRSA role ARN onto each managed
1829 # addon's service account before the kubectl Lambda rollout-restarts
1830 # the controllers at the end of this invocation. Otherwise the
1831 # restart sees the old (annotation-less) SA, the mutating webhook
1832 # can't inject AWS_ROLE_ARN, and the new pods are just as
1833 # credential-less as the ones they replaced. The symptom is
1834 # controller pods silently failing with "no EC2 IMDS role found" —
1835 # for EFS/FSx that manifests as PVCs stuck Pending forever, for
1836 # CloudWatch as missing Container Insights metrics. See the
1837 # UpdateEfsCsiAddonRole custom resource in _create_efs_csi_driver_addon
1838 # for the full rationale.
1839 for attr in (
1840 "_efs_csi_addon_role_update",
1841 "_fsx_csi_addon_role_update",
1842 "_cloudwatch_addon_role_update",
1843 ):
1844 update_cr = getattr(self, attr, None)
1845 if update_cr is not None:
1846 kubectl_apply.node.add_dependency(update_cr)
1848 # Ensure Pod Identity associations exist before workloads start,
1849 # so pods get IAM credentials on first launch
1850 for assoc in self._pod_identity_associations:
1851 kubectl_apply.node.add_dependency(assoc)
1853 # Install Helm charts (KEDA, etc.) after base manifests are applied
1854 # This ensures namespaces and RBAC are in place before Helm installations
1855 helm_install = CustomResource(
1856 self,
1857 "HelmInstallCharts",
1858 service_token=self.helm_installer_provider.service_token,
1859 properties={
1860 "ClusterName": self.cluster.cluster_name,
1861 "Region": self.deployment_region,
1862 # Enable core AI/ML infrastructure charts by default
1863 # NVIDIA Network Operator toggled via cdk.json nvidia_network_operator.enabled
1864 "EnabledCharts": self._get_enabled_helm_charts(),
1865 # Override chart values if needed
1866 "Charts": {},
1867 # Pass IAM role ARNs for service account annotations
1868 "KedaOperatorRoleArn": self.keda_operator_role.role_arn,
1869 # Force re-invocation on every deployment to pick up charts.yaml changes
1870 "DeploymentTimestamp": deployment_timestamp,
1871 },
1872 )
1874 # Helm charts depend on kubectl manifests being applied first
1875 helm_install.node.add_dependency(kubectl_apply)
1877 # Apply CRD-dependent manifests after Helm installs the CRDs.
1878 # KEDA ScaledJob/ScaledObject require the KEDA CRDs to exist first.
1879 # This second kubectl pass runs after Helm and applies only those resources.
1880 kubectl_apply_post_helm = CustomResource(
1881 self,
1882 "KubectlApplyPostHelmManifests",
1883 service_token=self.kubectl_provider.service_token,
1884 properties={
1885 "ClusterName": self.cluster.cluster_name,
1886 "Region": self.deployment_region,
1887 "SkipDeletionOnStackDelete": "true",
1888 "ImageReplacements": image_replacements,
1889 "FsxFileSystemId": self.fsx_file_system.ref if self.fsx_file_system else "none",
1890 "DeploymentTimestamp": deployment_timestamp,
1891 # PostHelm: "true" tells the handler to apply only post-helm-* manifests
1892 "PostHelm": "true",
1893 },
1894 )
1896 # Must run after Helm has installed the CRDs
1897 kubectl_apply_post_helm.node.add_dependency(helm_install)
1899 # Create GA registration custom resource AFTER manifests are applied
1900 # This waits for the Ingress to create the ALB and registers it with GA
1901 #
1902 # IMPORTANT: We include a deployment timestamp to force CloudFormation to
1903 # re-invoke the Lambda on every deployment. This ensures the ALB is always
1904 # registered with the Global Accelerator, even if other properties haven't changed.
1905 # Without this, CloudFormation may skip the custom resource if it thinks
1906 # nothing has changed, leaving the ALB unregistered after GA recreation.
1907 deployment_timestamp = str(int(time.time()))
1909 ga_registration = CustomResource(
1910 self,
1911 "GaRegistration",
1912 service_token=self.ga_registration_provider.service_token,
1913 properties={
1914 "ClusterName": self.cluster.cluster_name,
1915 "Region": self.deployment_region,
1916 "EndpointGroupArn": self.endpoint_group_arn,
1917 "IngressName": "gco-ingress",
1918 "Namespace": "gco-system",
1919 # Pass global region and project name for SSM storage
1920 "GlobalRegion": self.config.get_global_region(),
1921 "ProjectName": self.config.get_project_name(),
1922 # Force re-invocation on every deployment
1923 "DeploymentTimestamp": deployment_timestamp,
1924 },
1925 )
1927 # GA registration must happen after manifests are applied
1928 ga_registration.node.add_dependency(kubectl_apply)
    def _create_ga_registration_lambda(self) -> None:
        """Create Lambda function to register Ingress-created ALB with Global Accelerator.

        This Lambda:
        1. Waits for the Ingress to get an ALB address
        2. Gets the ALB ARN from the address
        3. Registers that ALB with Global Accelerator

        This is necessary because the ALB is created by the AWS Load Balancer Controller
        (not CDK), so we can't directly reference its ARN.

        Side effects (attributes consumed later when the GaRegistration custom
        resource is created):
        - ``self.ga_registration_provider``: cr.Provider wrapping this Lambda
        - ``self.endpoint_group_arn``: GA endpoint group ARN read from SSM
        """
        project_name = self.config.get_project_name()

        # Create Lambda function for GA registration using external handler
        # (lambda/ga-registration). Placed in the VPC so it can reach the
        # EKS API endpoint over private networking.
        ga_registration_lambda = lambda_.Function(
            self,
            "GaRegistrationFunction",
            runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
            handler="handler.lambda_handler",
            code=lambda_.Code.from_asset("lambda/ga-registration"),
            timeout=Duration.minutes(15),  # Max Lambda timeout; handler uses 14 min budget
            memory_size=256,
            vpc=self.vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
            environment={
                "CLUSTER_NAME": self.cluster.cluster_name,
                "REGION": self.deployment_region,
            },
            tracing=lambda_.Tracing.ACTIVE,
        )

        # Grant permissions
        # DescribeCluster: the handler needs cluster details to build a
        # kubeconfig and read the Ingress status.
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["eks:DescribeCluster"],
                resources=[self.cluster.cluster_arn],
            )
        )
        # ELB Describe* calls do not support resource-level scoping, hence
        # Resource: * (cdk-nag suppression added at the bottom of this method).
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "elasticloadbalancing:DescribeLoadBalancers",
                    "elasticloadbalancing:DescribeTags",  # Required for tag-based ALB detection
                ],
                resources=["*"],
            )
        )
        # Endpoint (de)registration against the Global Accelerator endpoint group.
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "globalaccelerator:AddEndpoints",
                    "globalaccelerator:RemoveEndpoints",
                    "globalaccelerator:UpdateEndpointGroup",
                    "globalaccelerator:DescribeEndpointGroup",
                ],
                resources=["*"],
            )
        )
        # SSM access is scoped to this project's parameter namespace in the
        # global region (where GA/ALB discovery parameters live).
        ga_registration_lambda.add_to_role_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["ssm:GetParameter", "ssm:PutParameter", "ssm:DeleteParameter"],
                resources=[
                    f"arn:aws:ssm:{self.config.get_global_region()}:{self.account}:parameter/{project_name}/*"
                ],
            )
        )

        # Add EKS access entry for the Lambda role so it can query the
        # Kubernetes API (Ingress status) as cluster admin.
        if ga_registration_lambda.role is not None:
            eks.AccessEntry(
                self,
                "GaRegistrationLambdaAccessEntry",
                cluster=self.cluster,  # type: ignore[arg-type]
                principal=ga_registration_lambda.role.role_arn,
                access_policies=[
                    eks.AccessPolicy.from_access_policy_name(
                        "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
                    )
                ],
            )

        # Allow Lambda to access EKS API
        # NOTE(review): this opens 443 from the entire VPC CIDR rather than a
        # dedicated Lambda security group (compare _create_helm_installer_lambda,
        # which peers on its own SG) — confirm the broader rule is intentional.
        self.cluster.cluster_security_group.add_ingress_rule(
            peer=ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
            connection=ec2.Port.tcp(443),
            description="Allow GA registration Lambda to access EKS API",
        )

        # Get endpoint group ARN from SSM (stored in global region).
        # Uses the shared AwsCustomResource execution role (pre-created in
        # _create_aws_custom_resource_role) — the SSM GetParameter
        # statement was attached there up-front so the Lambda never hits
        # an IAM propagation race on cold deploys.
        global_region = self.config.get_global_region()
        get_endpoint_group_arn = cr.AwsCustomResource(
            self,
            "GetEndpointGroupArn",
            on_create=cr.AwsSdkCall(
                service="SSM",
                action="getParameter",
                parameters={"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"},
                region=global_region,
                physical_resource_id=cr.PhysicalResourceId.of(
                    f"{project_name}-get-endpoint-group-arn-{self.deployment_region}"
                ),
            ),
            # on_update repeats the read so stack updates pick up a re-created
            # endpoint group's new ARN.
            on_update=cr.AwsSdkCall(
                service="SSM",
                action="getParameter",
                parameters={"Name": f"/{project_name}/endpoint-group-{self.deployment_region}-arn"},
                region=global_region,
            ),
            role=self.aws_custom_resource_role,
        )
        get_endpoint_group_arn.node.add_dependency(self.aws_custom_resource_role)

        # Deploy-time token resolving to the SSM parameter's value.
        endpoint_group_arn = get_endpoint_group_arn.get_response_field("Parameter.Value")

        # Create log group for GA registration provider (short retention;
        # destroyed with the stack).
        ga_provider_log_group = logs.LogGroup(
            self,
            "GaRegistrationProviderLogGroup",
            retention=logs.RetentionDays.ONE_WEEK,
            removal_policy=RemovalPolicy.DESTROY,
        )

        # Create provider and custom resource
        ga_provider = cr.Provider(
            self,
            "GaRegistrationProvider",
            on_event_handler=ga_registration_lambda,
            log_group=ga_provider_log_group,
        )

        # Store for use after kubectl apply (the GaRegistration custom
        # resource is created elsewhere, once manifests are applied).
        self.ga_registration_provider = ga_provider
        self.endpoint_group_arn = endpoint_group_arn

        # cdk-nag suppression: the GA registration Lambda needs broad
        # Global Accelerator and ELB Describe access with Resource: *.
        from cdk_nag import NagSuppressions

        NagSuppressions.add_resource_suppressions(
            ga_registration_lambda,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "The GA registration Lambda needs elasticloadbalancing:Describe* "
                        "and globalaccelerator:* to discover the Ingress-created ALB and "
                        "register it with Global Accelerator. These APIs do not support "
                        "resource-level IAM scoping — Resource: * is the only valid form."
                    ),
                    "appliesTo": ["Resource::*"],
                },
            ],
            apply_to_children=True,
        )
2093 def _get_enabled_helm_charts(self) -> list[str]:
2094 """Return the list of Helm charts to install based on cdk.json helm config.
2096 Reads the 'helm' section from cdk.json context. Each key maps to one or
2097 more Helm chart names. Charts are returned in dependency order with Kueue
2098 last (its webhook intercepts all Job/Deployment mutations).
2099 """
2100 helm_config = self.node.try_get_context("helm") or {}
2102 # Mapping from cdk.json helm key → Helm chart name(s) in charts.yaml
2103 # Order matters: dependencies first, Kueue last
2104 chart_map: list[tuple[str, list[str]]] = [
2105 ("keda", ["keda"]),
2106 ("nvidia_gpu_operator", ["nvidia-gpu-operator"]),
2107 ("nvidia_dra_driver", ["nvidia-dra-driver"]),
2108 ("nvidia_network_operator", ["nvidia-network-operator"]),
2109 ("aws_efa_device_plugin", ["aws-efa-device-plugin"]),
2110 ("aws_neuron_device_plugin", ["aws-neuron-device-plugin"]),
2111 ("volcano", ["volcano"]),
2112 ("kuberay", ["kuberay-operator"]),
2113 ("cert_manager", ["cert-manager"]),
2114 ("slurm", ["slinky-slurm-operator", "slinky-slurm"]),
2115 ("yunikorn", ["yunikorn"]),
2116 ("kueue", ["kueue"]), # Must be last
2117 ]
2119 enabled_charts = []
2120 for config_key, chart_names in chart_map:
2121 chart_config = helm_config.get(config_key, {})
2122 if chart_config.get("enabled", True): 2122 ↛ 2120line 2122 didn't jump to line 2120 because the condition on line 2122 was always true
2123 enabled_charts.extend(chart_names)
2125 return enabled_charts
2127 def _create_helm_installer_lambda(self) -> None:
2128 """Create Lambda function to install Helm charts (KEDA, NVIDIA DRA, etc.).
2130 This Lambda uses Helm to install charts that require complex setup
2131 (TLS certificates, CRDs, etc.) that are difficult to manage via raw manifests.
2133 Charts installed:
2134 - KEDA: Kubernetes Event-Driven Autoscaling (enabled by default)
2135 - NVIDIA DRA Driver: Dynamic Resource Allocation for GPUs (disabled by default)
2136 """
2137 project_name = self.config.get_project_name()
2139 # Create IAM role for Helm installer Lambda
2140 helm_lambda_role = iam.Role(
2141 self,
2142 "HelmInstallerLambdaRole",
2143 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
2144 managed_policies=[
2145 iam.ManagedPolicy.from_aws_managed_policy_name(
2146 "service-role/AWSLambdaVPCAccessExecutionRole"
2147 ),
2148 iam.ManagedPolicy.from_aws_managed_policy_name(
2149 "service-role/AWSLambdaBasicExecutionRole"
2150 ),
2151 ],
2152 )
2154 # Add EKS permissions
2155 helm_lambda_role.add_to_policy(
2156 iam.PolicyStatement(
2157 actions=["eks:DescribeCluster", "eks:ListClusters"],
2158 resources=[self.cluster.cluster_arn],
2159 )
2160 )
2162 # Create security group for Helm installer Lambda
2163 helm_lambda_sg = ec2.SecurityGroup(
2164 self,
2165 "HelmInstallerLambdaSG",
2166 vpc=self.vpc,
2167 description="Security group for Helm installer Lambda to access EKS cluster",
2168 security_group_name=f"{project_name}-helm-lambda-sg-{self.deployment_region}",
2169 allow_all_outbound=True,
2170 )
2172 # Allow Lambda to access EKS cluster API
2173 self.cluster.cluster_security_group.add_ingress_rule(
2174 peer=helm_lambda_sg,
2175 connection=ec2.Port.tcp(443),
2176 description="Allow Helm installer Lambda to access EKS API",
2177 )
2179 # Build Docker image for Helm installer Lambda
2180 # Points at helm-installer-build/ which is rebuilt fresh every deploy
2181 # by _build_helm_installer_lambda() in cli/stacks.py
2182 ecr_assets.DockerImageAsset(
2183 self,
2184 "HelmInstallerImage",
2185 directory="lambda/helm-installer-build",
2186 platform=ecr_assets.Platform.LINUX_AMD64,
2187 )
2189 # Create Lambda function using Docker image
2190 # Store function name as string attribute for cross-stack references
2191 # This avoids CDK cross-environment resolution issues when account is unresolved
2192 self.helm_installer_lambda_function_name = f"{project_name}-helm-{self.deployment_region}"
2193 self.helm_installer_lambda = lambda_.DockerImageFunction(
2194 self,
2195 "HelmInstallerFunction",
2196 function_name=self.helm_installer_lambda_function_name,
2197 code=lambda_.DockerImageCode.from_image_asset(
2198 directory="lambda/helm-installer-build",
2199 platform=ecr_assets.Platform.LINUX_AMD64,
2200 ),
2201 timeout=Duration.minutes(15),
2202 memory_size=1024,
2203 architecture=lambda_.Architecture.X86_64,
2204 role=helm_lambda_role,
2205 vpc=self.vpc,
2206 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
2207 security_groups=[helm_lambda_sg],
2208 environment={
2209 "CLUSTER_NAME": self.cluster.cluster_name,
2210 "REGION": self.deployment_region,
2211 },
2212 tracing=lambda_.Tracing.ACTIVE,
2213 )
2215 # Add EKS access entry for the Lambda role
2216 eks.AccessEntry(
2217 self,
2218 "HelmInstallerLambdaAccessEntry",
2219 cluster=self.cluster, # type: ignore[arg-type]
2220 principal=helm_lambda_role.role_arn,
2221 access_policies=[
2222 eks.AccessPolicy.from_access_policy_name(
2223 "AmazonEKSClusterAdminPolicy", access_scope_type=eks.AccessScopeType.CLUSTER
2224 )
2225 ],
2226 )
2228 # Create log group for Helm installer provider
2229 helm_provider_log_group = logs.LogGroup(
2230 self,
2231 "HelmInstallerProviderLogGroup",
2232 retention=logs.RetentionDays.ONE_WEEK,
2233 removal_policy=RemovalPolicy.DESTROY,
2234 )
2236 # Create custom resource provider
2237 self.helm_installer_provider = cr.Provider(
2238 self,
2239 "HelmInstallerProvider",
2240 on_event_handler=self.helm_installer_lambda,
2241 log_group=helm_provider_log_group,
2242 )
2244 # cdk-nag suppression: the Helm installer Lambda requires broad
2245 # EKS and Kubernetes API access to install Helm charts.
2246 from cdk_nag import NagSuppressions
2248 NagSuppressions.add_resource_suppressions(
2249 helm_lambda_role,
2250 [
2251 {
2252 "id": "AwsSolutions-IAM5",
2253 "reason": (
2254 "The Helm installer Lambda requires broad EKS and Kubernetes API "
2255 "access to install Helm charts (KEDA, NVIDIA DRA, etc.) that create "
2256 "CRDs, RBAC rules, and workloads across multiple namespaces. "
2257 "Resource: * is required because the set of Kubernetes resources "
2258 "is dynamic and not known at synth time."
2259 ),
2260 "appliesTo": ["Resource::*"],
2261 },
2262 ],
2263 apply_to_children=True,
2264 )
2266 def _create_efs(self) -> None:
2267 """Create EFS file system for shared storage across jobs.
2269 Creates an EFS file system with mount targets in each private subnet,
2270 allowing pods to share data and persist outputs. The EFS is configured
2271 with:
2272 - Encryption at rest
2273 - Automatic backups
2274 - General Purpose performance mode (suitable for most workloads)
2275 - Bursting throughput mode
2277 Kubernetes resources (StorageClass, PV, PVC) are created via manifests.
2278 """
2279 project_name = self.config.get_project_name()
2281 # Create security group for EFS
2282 self.efs_security_group = ec2.SecurityGroup(
2283 self,
2284 "EfsSecurityGroup",
2285 vpc=self.vpc,
2286 description=f"Security group for {project_name} EFS in {self.deployment_region}",
2287 security_group_name=f"{project_name}-efs-sg-{self.deployment_region}",
2288 allow_all_outbound=False, # EFS doesn't need outbound
2289 )
2291 # Allow NFS traffic from EKS cluster security group
2292 self.efs_security_group.add_ingress_rule(
2293 peer=self.cluster.cluster_security_group,
2294 connection=ec2.Port.tcp(2049),
2295 description="Allow NFS from EKS cluster",
2296 )
2298 # Create EFS file system
2299 self.efs_file_system = efs.FileSystem(
2300 self,
2301 "GCOEfs",
2302 vpc=self.vpc,
2303 file_system_name=f"{project_name}-efs-{self.deployment_region}",
2304 security_group=self.efs_security_group,
2305 vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
2306 encrypted=True,
2307 performance_mode=efs.PerformanceMode.GENERAL_PURPOSE,
2308 throughput_mode=efs.ThroughputMode.BURSTING,
2309 removal_policy=RemovalPolicy.DESTROY, # For dev/test; use RETAIN for production
2310 enable_automatic_backups=True,
2311 )
2313 # Add file system policy to allow mounting without IAM authorization
2314 # This allows any client that can reach the mount target to mount the file system
2315 self.efs_file_system.add_to_resource_policy(
2316 iam.PolicyStatement(
2317 effect=iam.Effect.ALLOW,
2318 principals=[iam.AnyPrincipal()],
2319 actions=[
2320 "elasticfilesystem:ClientMount",
2321 "elasticfilesystem:ClientWrite",
2322 "elasticfilesystem:ClientRootAccess",
2323 ],
2324 conditions={"Bool": {"elasticfilesystem:AccessedViaMountTarget": "true"}},
2325 )
2326 )
2328 # Create access point for the gco-jobs directory
2329 self.efs_access_point = self.efs_file_system.add_access_point(
2330 "JobsAccessPoint",
2331 path="/gco-jobs",
2332 create_acl=efs.Acl(owner_uid="1000", owner_gid="1000", permissions="755"),
2333 posix_user=efs.PosixUser(uid="1000", gid="1000"),
2334 )
2336 # Output EFS information
2337 CfnOutput(
2338 self,
2339 "EfsFileSystemId",
2340 value=self.efs_file_system.file_system_id,
2341 description="EFS File System ID for shared job storage",
2342 )
2344 CfnOutput(
2345 self,
2346 "EfsAccessPointId",
2347 value=self.efs_access_point.access_point_id,
2348 description="EFS Access Point ID for job outputs",
2349 )
2351 def _create_fsx_lustre(self) -> None:
2352 """Create FSx for Lustre file system for high-performance storage.
2354 FSx for Lustre provides high-performance parallel file system storage
2355 ideal for ML training workloads that require high throughput and low latency.
2357 This is optional and controlled by the fsx_lustre.enabled config setting.
2359 Supported deployment types:
2360 - SCRATCH_1: Temporary storage, no data replication
2361 - SCRATCH_2: Temporary storage with better burst performance
2362 - PERSISTENT_1: Persistent storage with data replication
2363 - PERSISTENT_2: Latest persistent storage with higher throughput
2364 """
2365 fsx_config = self.config.get_fsx_lustre_config(self.deployment_region)
2367 if not fsx_config.get("enabled", False):
2368 self.fsx_file_system = None
2369 return
2371 project_name = self.config.get_project_name()
2373 # Create security group for FSx
2374 self.fsx_security_group = ec2.SecurityGroup(
2375 self,
2376 "FsxSecurityGroup",
2377 vpc=self.vpc,
2378 description=f"Security group for {project_name} FSx Lustre in {self.deployment_region}",
2379 security_group_name=f"{project_name}-fsx-sg-{self.deployment_region}",
2380 allow_all_outbound=False,
2381 )
2383 # Allow Lustre traffic from EKS cluster security group
2384 # Lustre uses ports 988 (control) and 1021-1023 (data)
2385 self.fsx_security_group.add_ingress_rule(
2386 peer=self.cluster.cluster_security_group,
2387 connection=ec2.Port.tcp(988),
2388 description="Allow Lustre control traffic from EKS cluster",
2389 )
2390 self.fsx_security_group.add_ingress_rule(
2391 peer=self.cluster.cluster_security_group,
2392 connection=ec2.Port.tcp_range(1021, 1023),
2393 description="Allow Lustre data traffic from EKS cluster",
2394 )
2396 # Allow self-referencing traffic for FSx Lustre internal communication
2397 # FSx Lustre nodes need to communicate with each other on port 988
2398 self.fsx_security_group.add_ingress_rule(
2399 peer=self.fsx_security_group,
2400 connection=ec2.Port.tcp(988),
2401 description="Allow Lustre internal traffic on port 988",
2402 )
2403 self.fsx_security_group.add_ingress_rule(
2404 peer=self.fsx_security_group,
2405 connection=ec2.Port.tcp_range(1021, 1023),
2406 description="Allow Lustre internal traffic on ports 1021-1023",
2407 )
2409 # Get deployment type
2410 deployment_type = fsx_config.get("deployment_type", "SCRATCH_2")
2411 storage_capacity = fsx_config.get("storage_capacity_gib", 1200)
2413 # Build Lustre configuration based on deployment type
2414 lustre_config = {
2415 "deploymentType": deployment_type,
2416 "dataCompressionType": fsx_config.get("data_compression_type", "LZ4"),
2417 }
2419 # Add throughput for PERSISTENT types
2420 if deployment_type.startswith("PERSISTENT"):
2421 lustre_config["perUnitStorageThroughput"] = fsx_config.get(
2422 "per_unit_storage_throughput", 200
2423 )
2425 # Add S3 import/export if configured
2426 import_path = fsx_config.get("import_path")
2427 export_path = fsx_config.get("export_path")
2429 if import_path:
2430 lustre_config["importPath"] = import_path
2431 lustre_config["autoImportPolicy"] = fsx_config.get(
2432 "auto_import_policy", "NEW_CHANGED_DELETED"
2433 )
2435 if export_path:
2436 lustre_config["exportPath"] = export_path
2438 # Get file system type version (default to 2.15 for kernel 6.x compatibility)
2439 # IMPORTANT: Lustre 2.10 is NOT compatible with kernel 6.x (AL2023, Bottlerocket 1.19+)
2440 # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/lustre-client-matrix.html
2441 file_system_type_version = fsx_config.get("file_system_type_version", "2.15")
2443 # Create FSx for Lustre file system
2444 self.fsx_file_system = fsx.CfnFileSystem(
2445 self,
2446 "GCOFsxLustre",
2447 file_system_type="LUSTRE",
2448 file_system_type_version=file_system_type_version,
2449 storage_capacity=storage_capacity,
2450 subnet_ids=[self.vpc.private_subnets[0].subnet_id],
2451 security_group_ids=[self.fsx_security_group.security_group_id],
2452 lustre_configuration=lustre_config,
2453 tags=[
2454 {"key": "Name", "value": f"{project_name}-fsx-{self.deployment_region}"},
2455 {"key": "Project", "value": project_name},
2456 ],
2457 )
2459 # Ensure FSx file system waits for security group ingress rules to be created
2460 # This prevents "security group does not permit Lustre LNET traffic" errors
2461 self.fsx_file_system.node.add_dependency(self.fsx_security_group)
2463 # Create FSx CSI Driver add-on for Kubernetes integration
2464 self._create_fsx_csi_driver_addon()
2466 # Output FSx information
2467 CfnOutput(
2468 self,
2469 "FsxFileSystemId",
2470 value=self.fsx_file_system.ref,
2471 description="FSx for Lustre File System ID",
2472 )
2474 CfnOutput(
2475 self,
2476 "FsxDnsName",
2477 value=self.fsx_file_system.attr_dns_name,
2478 description="FSx for Lustre DNS Name",
2479 )
2481 CfnOutput(
2482 self,
2483 "FsxMountName",
2484 value=self.fsx_file_system.attr_lustre_mount_name,
2485 description="FSx for Lustre Mount Name",
2486 )
    def _create_valkey_cache(self) -> None:
        """Create an ElastiCache Serverless Valkey cache for K/V caching.

        Provides a low-latency key-value store that inference endpoints and
        jobs can use for prompt caching, session state, feature stores, or
        any shared state across pods. Valkey Serverless auto-scales and
        requires no node management.

        The cache is placed in the VPC private subnets. The ingress rule
        below admits the entire VPC CIDR on port 6379 — broader than the
        Aurora pgvector rule, which is scoped to the EKS cluster security
        group. NOTE(review): confirm VPC-wide reachability is intentional.

        No-op (creates nothing) unless ``valkey.enabled`` is true in
        cdk.json.
        """
        valkey_config = self.config.get_valkey_config()
        # Feature toggle — bail out early when Valkey is disabled.
        if not valkey_config.get("enabled", False):
            return

        # Imported lazily so deployments with Valkey disabled never load it.
        from aws_cdk import aws_elasticache as elasticache

        # Security group for Valkey (allow access from EKS cluster)
        valkey_sg = ec2.SecurityGroup(
            self,
            "ValkeySG",
            vpc=self.vpc,
            description="Security group for Valkey Serverless cache",
            allow_all_outbound=False,
        )
        # 6379 is the standard Redis/Valkey port. Peer is the whole VPC CIDR
        # (see docstring note), not just the cluster security group.
        valkey_sg.add_ingress_rule(
            ec2.Peer.ipv4(self.vpc.vpc_cidr_block),
            ec2.Port.tcp(6379),
            "Allow Valkey access from VPC",
        )

        private_subnet_ids = [s.subnet_id for s in self.vpc.private_subnets]

        self.valkey_cache = elasticache.CfnServerlessCache(
            self,
            "ValkeyCache",
            engine="valkey",
            serverless_cache_name=f"gco-{self.deployment_region}",
            description=f"GCO K/V cache for {self.deployment_region}",
            major_engine_version="8",
            security_group_ids=[valkey_sg.security_group_id],
            subnet_ids=private_subnet_ids,
            # Usage ceilings cap serverless auto-scaling (and cost); both
            # maxima are overridable via cdk.json.
            cache_usage_limits=elasticache.CfnServerlessCache.CacheUsageLimitsProperty(
                data_storage=elasticache.CfnServerlessCache.DataStorageProperty(
                    maximum=valkey_config.get("max_data_storage_gb", 5),
                    minimum=1,
                    unit="GB",
                ),
                ecpu_per_second=elasticache.CfnServerlessCache.ECPUPerSecondProperty(
                    maximum=valkey_config.get("max_ecpu_per_second", 5000),
                    minimum=1000,
                ),
            ),
            snapshot_retention_limit=valkey_config.get("snapshot_retention_limit", 1),
            tags=[
                CfnTag(key="Project", value="gco"),
                CfnTag(key="Region", value=self.deployment_region),
            ],
        )

        CfnOutput(
            self,
            "ValkeyEndpoint",
            value=self.valkey_cache.attr_endpoint_address,
            description="Valkey Serverless cache endpoint",
        )
        CfnOutput(
            self,
            "ValkeyPort",
            value=self.valkey_cache.attr_endpoint_port,
            description="Valkey Serverless cache port",
        )

        # Store endpoint in SSM for discovery by pods
        ssm.StringParameter(
            self,
            "ValkeyEndpointParam",
            parameter_name=f"/{self.config.get_project_name()}/valkey-endpoint-{self.deployment_region}",
            string_value=self.valkey_cache.attr_endpoint_address,
            description=f"Valkey endpoint for {self.deployment_region}",
        )
    def _create_aurora_pgvector(self) -> None:
        """Create an Aurora Serverless v2 PostgreSQL cluster with pgvector.

        Provides a fully managed vector database that inference endpoints and
        jobs can use for RAG (retrieval-augmented generation), semantic search,
        embedding storage, and similarity queries. Aurora Serverless v2
        auto-scales capacity and requires no instance management.

        The cluster is placed in the VPC private subnets and accessible from
        any pod via the cluster security group. Credentials are stored in
        Secrets Manager and the endpoint is published to SSM + a K8s ConfigMap
        for automatic discovery.

        No-op (creates nothing) unless ``aurora_pgvector.enabled`` is true
        in cdk.json.

        See: https://aws.amazon.com/blogs/database/accelerate-generative-ai-workloads-on-amazon-aurora-with-optimized-reads-and-pgvector/
        """
        aurora_config = self.config.get_aurora_pgvector_config()
        # Feature toggle — bail out early when Aurora pgvector is disabled.
        if not aurora_config.get("enabled", False):
            return

        # Imported lazily so deployments with Aurora disabled never load it.
        from aws_cdk import aws_rds as rds

        project_name = self.config.get_project_name()

        # Security group for Aurora (allow PostgreSQL access from EKS cluster only)
        aurora_sg = ec2.SecurityGroup(
            self,
            "AuroraPgvectorSG",
            vpc=self.vpc,
            description="Security group for Aurora Serverless v2 pgvector",
            allow_all_outbound=False,
        )
        # Ingress is scoped to the EKS cluster SG (tighter than the VPC-wide
        # Valkey rule); 5432 is the standard PostgreSQL port.
        aurora_sg.add_ingress_rule(
            self.cluster.cluster_security_group,
            ec2.Port.tcp(5432),
            "Allow PostgreSQL access from EKS cluster",
        )

        # Subnet group for Aurora (private subnets only)
        subnet_group = rds.SubnetGroup(
            self,
            "AuroraPgvectorSubnetGroup",
            description=f"Subnet group for GCO Aurora pgvector in {self.deployment_region}",
            vpc=self.vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
        )

        # Aurora Serverless v2 cluster with PostgreSQL 16 + pgvector
        self.aurora_cluster = rds.DatabaseCluster(
            self,
            "AuroraPgvectorCluster",
            # Engine version is resolved from the module-level constant so
            # every stack pins the same PostgreSQL release.
            engine=rds.DatabaseClusterEngine.aurora_postgres(
                version=getattr(rds.AuroraPostgresEngineVersion, AURORA_POSTGRES_VERSION),
            ),
            # min_acu 0 allows the cluster to scale to zero when idle.
            # NOTE(review): 0-ACU auto-pause requires a sufficiently recent
            # engine version — confirm AURORA_POSTGRES_VERSION supports it.
            serverless_v2_min_capacity=aurora_config.get("min_acu", 0),
            serverless_v2_max_capacity=aurora_config.get("max_acu", 16),
            writer=rds.ClusterInstance.serverless_v2(
                "Writer",
                auto_minor_version_upgrade=True,
            ),
            readers=[
                rds.ClusterInstance.serverless_v2(
                    "Reader",
                    auto_minor_version_upgrade=True,
                    # Size the reader with the writer so failover capacity
                    # matches.
                    scale_with_writer=True,
                ),
            ],
            vpc=self.vpc,
            vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS),
            subnet_group=subnet_group,
            security_groups=[aurora_sg],
            default_database_name="gco_vectors",
            backup=rds.BackupProps(
                retention=Duration.days(aurora_config.get("backup_retention_days", 7)),
            ),
            # Deletion protection is operator-configurable; the removal
            # policy is intentionally DESTROY for dev/test teardown (see the
            # nag suppressions below for the production guidance).
            deletion_protection=aurora_config.get("deletion_protection", False),
            removal_policy=RemovalPolicy.DESTROY,
            storage_encrypted=True,
            iam_authentication=True,
            cloudwatch_logs_exports=["postgresql"],
            monitoring_interval=Duration.seconds(60),
            cluster_identifier=f"{project_name}-pgvector-{self.deployment_region}",
        )

        # Construct-level cdk-nag suppressions for Aurora pgvector
        from cdk_nag import NagPackSuppression, NagSuppressions

        NagSuppressions.add_resource_suppressions(
            self.aurora_cluster,
            [
                NagPackSuppression(
                    id="AwsSolutions-RDS10",
                    reason=(
                        "Deletion protection is intentionally disabled for dev/test deployments. "
                        "Production deployments should set aurora_pgvector.deletion_protection=true "
                        "in cdk.json."
                    ),
                ),
                NagPackSuppression(
                    id="AwsSolutions-SMG4",
                    reason=(
                        "Aurora manages credential rotation via the RDS integration with Secrets "
                        "Manager. Manual Secrets Manager rotation is not required. "
                        "See: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/rds-secrets-manager.html"
                    ),
                ),
                NagPackSuppression(
                    id="HIPAA.Security-RDSInstanceDeletionProtectionEnabled",
                    reason=(
                        "Deletion protection is intentionally disabled for dev/test deployments. "
                        "Production deployments should set aurora_pgvector.deletion_protection=true "
                        "in cdk.json."
                    ),
                ),
                NagPackSuppression(
                    id="NIST.800.53.R5-RDSInstanceDeletionProtectionEnabled",
                    reason=(
                        "Deletion protection is intentionally disabled for dev/test deployments. "
                        "Production deployments should set aurora_pgvector.deletion_protection=true "
                        "in cdk.json."
                    ),
                ),
                NagPackSuppression(
                    id="PCI.DSS.321-SecretsManagerUsingKMSKey",
                    reason=(
                        "Aurora Serverless v2 credentials in Secrets Manager are encrypted with "
                        "AWS-managed keys by default. Customer-managed KMS can be enabled if "
                        "required for PCI compliance."
                    ),
                ),
            ],
            apply_to_children=True,
        )

        # Outputs
        CfnOutput(
            self,
            "AuroraPgvectorEndpoint",
            value=self.aurora_cluster.cluster_endpoint.hostname,
            description="Aurora pgvector cluster writer endpoint",
        )
        CfnOutput(
            self,
            "AuroraPgvectorReaderEndpoint",
            value=self.aurora_cluster.cluster_read_endpoint.hostname,
            description="Aurora pgvector cluster reader endpoint",
        )
        CfnOutput(
            self,
            "AuroraPgvectorPort",
            value=str(self.aurora_cluster.cluster_endpoint.port),
            description="Aurora pgvector cluster port",
        )
        # secret can be None when credentials are supplied externally, so
        # guard the attribute access.
        CfnOutput(
            self,
            "AuroraPgvectorSecretArn",
            value=self.aurora_cluster.secret.secret_arn if self.aurora_cluster.secret else "",
            description="Aurora pgvector credentials secret ARN",
        )

        # Store endpoint in SSM for discovery by pods and external tools
        ssm.StringParameter(
            self,
            "AuroraPgvectorEndpointParam",
            parameter_name=f"/{project_name}/aurora-pgvector-endpoint-{self.deployment_region}",
            string_value=self.aurora_cluster.cluster_endpoint.hostname,
            description=f"Aurora pgvector endpoint for {self.deployment_region}",
        )

        # Grant the ServiceAccountRole read access to the Aurora secret
        # so pods can retrieve credentials via the ConfigMap + Secrets Manager.
        if self.aurora_cluster.secret:
            self.aurora_cluster.secret.grant_read(self.service_account_role)
    def _create_fsx_csi_driver_addon(self) -> None:
        """Create FSx CSI Driver add-on for Kubernetes integration.

        The FSx CSI driver enables Kubernetes pods to mount FSx for Lustre
        file systems as persistent volumes.

        Sequence (order matters):
        1. Create an IRSA role for the fsx-csi-controller-sa service account.
        2. Install the aws-fsx-csi-driver EKS add-on.
        3. Patch the add-on with the IRSA role via an AwsCustomResource
           (updateAddon), since the role is attached after installation.
        4. Register a Pod Identity Association for the same service account.
        """
        # Create IAM role for FSx CSI Driver using IRSA + Pod Identity
        self.fsx_csi_role = GCORegionalStack._create_irsa_role(
            self,
            "FsxCsiDriverRole",
            oidc_provider_arn=self.oidc_provider.open_id_connect_provider_arn,
            oidc_issuer_url=self.cluster.cluster_open_id_connect_issuer_url,
            service_account_names=["fsx-csi-controller-sa"],
            namespaces=["kube-system"],
        )

        # Add FSx CSI driver permissions.
        # NOTE(review): fsx:CreateVolume/DeleteVolume/TagResource also use
        # Resource:* here — confirm they cannot be scoped more tightly.
        self.fsx_csi_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "fsx:DescribeFileSystems",
                    "fsx:DescribeVolumes",
                    "fsx:CreateVolume",
                    "fsx:DeleteVolume",
                    "fsx:TagResource",
                ],
                resources=["*"],
            )
        )

        # EC2 discovery permissions the driver needs to resolve network
        # placement for mounts.
        self.fsx_csi_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=[
                    "ec2:DescribeInstances",
                    "ec2:DescribeVolumes",
                    "ec2:DescribeVpcs",
                    "ec2:DescribeSubnets",
                    "ec2:DescribeSecurityGroups",
                ],
                resources=["*"],
            )
        )

        # cdk-nag suppression: the FSx CSI driver role grants
        # ec2:Describe* APIs that don't support resource-level scoping.
        from cdk_nag import NagSuppressions

        NagSuppressions.add_resource_suppressions(
            self.fsx_csi_role,
            [
                {
                    "id": "AwsSolutions-IAM5",
                    "reason": (
                        "The FSx CSI driver role grants ec2:Describe* for volume "
                        "and network discovery. These AWS APIs do not support "
                        "resource-level IAM scoping — Resource: * is the only "
                        "valid form."
                    ),
                    "appliesTo": ["Resource::*"],
                },
            ],
            apply_to_children=True,
        )

        # Create FSx CSI Driver add-on
        fsx_addon = eks.Addon(
            self,
            "FsxCsiDriverAddon",
            cluster=self.cluster,  # type: ignore[arg-type]
            addon_name="aws-fsx-csi-driver",
            addon_version=EKS_ADDON_FSX_CSI_DRIVER,
            preserve_on_delete=False,
            # Tolerations let the driver pods schedule onto the tainted
            # system/GPU node pools (shared class-level constant).
            configuration_values={
                "node": {
                    "tolerations": self._ADDON_NODE_TOLERATIONS,
                },
                "controller": {
                    "tolerations": self._ADDON_NODE_TOLERATIONS,
                },
            },
        )

        # Append the PassRole statement for the FSx CSI role to the shared
        # AwsCustomResource execution role. See
        # _create_aws_custom_resource_role for the full rationale.
        self.aws_custom_resource_role.add_to_policy(
            iam.PolicyStatement(
                effect=iam.Effect.ALLOW,
                actions=["iam:PassRole"],
                resources=[self.fsx_csi_role.role_arn],
            )
        )

        # Update the add-on to use the IRSA role (the Addon construct here
        # does not attach it at install time, so patch it afterwards).
        update_fsx_addon = cr.AwsCustomResource(
            self,
            "UpdateFsxCsiAddonRole",
            on_create=cr.AwsSdkCall(
                service="EKS",
                action="updateAddon",
                parameters={
                    "clusterName": self.cluster.cluster_name,
                    "addonName": "aws-fsx-csi-driver",
                    "serviceAccountRoleArn": self.fsx_csi_role.role_arn,
                },
                physical_resource_id=cr.PhysicalResourceId.of(
                    f"{self.cluster.cluster_name}-fsx-csi-role-update"
                ),
            ),
            on_update=cr.AwsSdkCall(
                service="EKS",
                action="updateAddon",
                parameters={
                    "clusterName": self.cluster.cluster_name,
                    "addonName": "aws-fsx-csi-driver",
                    "serviceAccountRoleArn": self.fsx_csi_role.role_arn,
                },
            ),
            role=self.aws_custom_resource_role,
        )

        # Explicit ordering: the updateAddon call must run after both the
        # add-on and the role (plus its execution role) exist.
        update_fsx_addon.node.add_dependency(fsx_addon)
        update_fsx_addon.node.add_dependency(self.fsx_csi_role)
        update_fsx_addon.node.add_dependency(self.aws_custom_resource_role)

        # Expose the update-addon resource so _apply_kubernetes_manifests can
        # make the kubectl Lambda wait for the IRSA annotation patch to land
        # before it rollout-restarts the fsx-csi-controller. See the EFS CSI
        # equivalent for the full rationale — same race, same fix, same
        # symptom (PVCs stuck Pending with "no EC2 IMDS role found").
        self._fsx_csi_addon_role_update = update_fsx_addon

        # Create Pod Identity Association for FSx CSI driver
        eks_l1.CfnPodIdentityAssociation(
            self,
            "PodIdentity-fsx-csi",
            cluster_name=self.cluster.cluster_name,
            namespace="kube-system",
            service_account="fsx-csi-controller-sa",
            role_arn=self.fsx_csi_role.role_arn,
        )
2887 def _create_drift_detection(self) -> None:
2888 """Create CloudFormation drift detection on a daily schedule.
2890 Creates:
2891 - SNS topic (KMS-encrypted) for drift alerts
2892 - Lambda function that initiates drift detection on this stack, polls
2893 until detection completes, and publishes to SNS if drift is found
2894 - EventBridge rule on a daily schedule (configurable via cdk.json
2895 ``drift_detection.schedule_hours``) that invokes the Lambda
2897 Operators can disable drift detection entirely by setting
2898 ``drift_detection.enabled`` to ``false`` in cdk.json. When disabled,
2899 no resources are created.
2900 """
2901 drift_config = self.node.try_get_context("drift_detection") or {}
2902 if not drift_config.get("enabled", True):
2903 return
2905 schedule_hours = int(drift_config.get("schedule_hours", 24))
2907 # KMS key for SNS topic encryption. SNS with AWS-managed keys doesn't
2908 # allow CloudFormation/Lambda to publish, so we use a customer-managed
2909 # key we can grant publish access on.
2910 drift_topic_key = kms.Key(
2911 self,
2912 "DriftDetectionTopicKey",
2913 description="KMS key for GCO drift detection SNS topic",
2914 enable_key_rotation=True,
2915 removal_policy=RemovalPolicy.DESTROY,
2916 )
2918 self.drift_detection_topic = sns.Topic(
2919 self,
2920 "DriftDetectionTopic",
2921 display_name="GCO CloudFormation Drift Alerts",
2922 master_key=drift_topic_key,
2923 )
2925 # IAM role for the drift detection Lambda
2926 drift_lambda_role = iam.Role(
2927 self,
2928 "DriftDetectionLambdaRole",
2929 assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
2930 managed_policies=[
2931 iam.ManagedPolicy.from_aws_managed_policy_name(
2932 "service-role/AWSLambdaBasicExecutionRole"
2933 ),
2934 ],
2935 )
2937 # CloudFormation drift APIs operate at the stack level; the API does
2938 # not support resource-level ARN scoping for these actions, so we scope
2939 # to this stack's ARN where supported and accept "*" where not.
2940 drift_lambda_role.add_to_policy(
2941 iam.PolicyStatement(
2942 effect=iam.Effect.ALLOW,
2943 actions=[
2944 "cloudformation:DetectStackDrift",
2945 "cloudformation:DescribeStackDriftDetectionStatus",
2946 "cloudformation:DescribeStackResourceDrifts",
2947 "cloudformation:DescribeStackResource",
2948 "cloudformation:DescribeStackResources",
2949 ],
2950 resources=["*"],
2951 )
2952 )
2954 self.drift_detection_topic.grant_publish(drift_lambda_role)
2956 # Lambda function — one per stack; stack name is baked into env vars
2957 drift_lambda = lambda_.Function(
2958 self,
2959 "DriftDetectionFunction",
2960 runtime=getattr(lambda_.Runtime, LAMBDA_PYTHON_RUNTIME),
2961 handler="handler.lambda_handler",
2962 code=lambda_.Code.from_asset("lambda/drift-detection"),
2963 timeout=Duration.minutes(14), # Leave headroom under Lambda 15-min cap
2964 memory_size=256,
2965 role=drift_lambda_role,
2966 environment={
2967 "STACK_NAME": self.stack_name,
2968 "SNS_TOPIC_ARN": self.drift_detection_topic.topic_arn,
2969 "REGION": self.deployment_region,
2970 },
2971 tracing=lambda_.Tracing.ACTIVE,
2972 )
2974 # Dead-letter queue for EventBridge → Lambda target failures.
2975 # Captures events that fail to reach the Lambda (e.g. due to
2976 # throttling or permission issues) so operators can retry or
2977 # investigate. Required by Serverless-EventBusDLQ cdk-nag rule.
2978 drift_rule_dlq = sqs.Queue(
2979 self,
2980 "DriftDetectionRuleDlq",
2981 retention_period=Duration.days(14),
2982 enforce_ssl=True,
2983 encryption=sqs.QueueEncryption.SQS_MANAGED,
2984 removal_policy=RemovalPolicy.DESTROY,
2985 )
2987 # DLQs themselves are terminal — they don't need their own DLQ.
2988 # Suppress the circular AwsSolutions-SQS3 nag finding.
2989 from cdk_nag import NagSuppressions as _DlqNagSuppressions
2991 _DlqNagSuppressions.add_resource_suppressions(
2992 drift_rule_dlq,
2993 [
2994 {
2995 "id": "AwsSolutions-SQS3",
2996 "reason": (
2997 "This queue IS the dead-letter queue for the "
2998 "DriftDetectionSchedule EventBridge rule. A DLQ for a "
2999 "DLQ is circular; if events fail to reach this queue "
3000 "they are captured by EventBridge's own retry metrics "
3001 "(CloudWatch FailedInvocations)."
3002 ),
3003 },
3004 ],
3005 )
3007 # EventBridge rule — daily schedule by default
3008 events.Rule(
3009 self,
3010 "DriftDetectionSchedule",
3011 description=(f"Daily CloudFormation drift detection for {self.stack_name}"),
3012 schedule=events.Schedule.rate(Duration.hours(schedule_hours)),
3013 targets=[
3014 events_targets.LambdaFunction(
3015 drift_lambda,
3016 dead_letter_queue=drift_rule_dlq,
3017 retry_attempts=2,
3018 )
3019 ],
3020 )
3022 # Outputs for operators to subscribe to the topic
3023 CfnOutput(
3024 self,
3025 "DriftDetectionTopicArn",
3026 value=self.drift_detection_topic.topic_arn,
3027 description=(
3028 f"SNS topic ARN for CloudFormation drift alerts in "
3029 f"{self.deployment_region}. Subscribe an endpoint (email, "
3030 f"Slack, PagerDuty) to receive drift notifications."
3031 ),
3032 )
3034 # cdk-nag suppressions for this component
3035 from cdk_nag import NagSuppressions
3037 NagSuppressions.add_resource_suppressions(
3038 drift_lambda_role,
3039 [
3040 {
3041 "id": "AwsSolutions-IAM4",
3042 "reason": (
3043 "AWSLambdaBasicExecutionRole provides standard "
3044 "CloudWatch Logs permissions required for Lambda "
3045 "logging. This is the AWS-recommended managed policy."
3046 ),
3047 },
3048 {
3049 "id": "AwsSolutions-IAM5",
3050 "reason": (
3051 "CloudFormation drift detection APIs (DetectStackDrift, "
3052 "DescribeStackDriftDetectionStatus, "
3053 "DescribeStackResourceDrifts) cannot be scoped to a "
3054 "specific stack resource via IAM; the action-level "
3055 "scoping requires wildcard resources. The Lambda's "
3056 "environment pins it to a single stack name, so the "
3057 "effective blast radius is limited."
3058 ),
3059 },
3060 ],
3061 apply_to_children=True,
3062 )
3064 def _create_mcp_role(self) -> None:
3065 """Create dedicated IAM role for the MCP server.
3067 The MCP server exposes GCO CLI tools to LLM agents. Without a dedicated
3068 role, the server would inherit the full ambient credentials of the user
3069 who launches it (often an administrator). This method creates a
3070 least-privilege role that the MCP server can assume at startup via
3071 ``GCO_MCP_ROLE_ARN``.
3073 Permissions are scoped to the minimum needed by the tools exposed:
3075 - ``eks:DescribeCluster`` on this regional EKS cluster ARN only.
3076 - ``s3:GetObject`` on model weights buckets. The model bucket lives in
3077 the global stack, so we scope to the same name pattern used by the
3078 service account role (``{project_name}-*``). This is a deliberate
3079 compromise: a precise cross-stack ARN export would force a tight
3080 dependency on the global stack, and cdk-nag will flag it anyway
3081 because the bucket name is auto-generated.
3082 - ``cloudwatch:GetMetricData`` / ``cloudwatch:ListMetrics``. These APIs
3083 do not support resource-level IAM, so wildcard is required. Read-only.
3084 - ``sqs:SendMessage`` scoped to this region's job queue ARN only.
3086 The trust policy uses ``AccountRootPrincipal`` so any IAM user/role in
3087 the account can assume it (gated by an explicit sts:AssumeRole
3088 permission on the caller — standard AWS behavior). Operators who want
3089 to restrict assumption further should add an external-id or principal
3090 condition to the trust policy after deployment.
3092 Operators can disable this component entirely by setting
3093 ``mcp_server.enabled`` to ``false`` in cdk.json.
3094 """
3095 mcp_config = self.node.try_get_context("mcp_server") or {}
3096 if not mcp_config.get("enabled", True):
3097 return
3099 project_name = self.config.get_project_name()
3101 self.mcp_server_role = iam.Role(
3102 self,
3103 "McpServerRole",
3104 assumed_by=iam.AccountRootPrincipal(),
3105 description=(
3106 "Least-privilege role assumed by the GCO MCP server at startup. "
3107 "Grants only the permissions needed by MCP tools: eks:DescribeCluster, "
3108 "s3:GetObject on model buckets, cloudwatch read-only metrics, and "
3109 "sqs:SendMessage to the regional job queue."
3110 ),
3111 max_session_duration=Duration.hours(12),
3112 )
3114 # eks:DescribeCluster on this region's cluster only
3115 self.mcp_server_role.add_to_policy(
3116 iam.PolicyStatement(
3117 effect=iam.Effect.ALLOW,
3118 actions=["eks:DescribeCluster"],
3119 resources=[self.cluster.cluster_arn],
3120 )
3121 )
3123 # s3:GetObject on model weights buckets. Bucket name is auto-generated
3124 # in the global stack, so we match the same prefix pattern used by the
3125 # service account role.
3126 self.mcp_server_role.add_to_policy(
3127 iam.PolicyStatement(
3128 effect=iam.Effect.ALLOW,
3129 actions=["s3:GetObject", "s3:ListBucket"],
3130 resources=[
3131 f"arn:aws:s3:::{project_name}-*",
3132 f"arn:aws:s3:::{project_name}-*/*",
3133 ],
3134 )
3135 )
3137 # CloudWatch read-only metrics APIs. These APIs do not support
3138 # resource-level IAM so wildcard is required.
3139 self.mcp_server_role.add_to_policy(
3140 iam.PolicyStatement(
3141 effect=iam.Effect.ALLOW,
3142 actions=[
3143 "cloudwatch:GetMetricData",
3144 "cloudwatch:GetMetricStatistics",
3145 "cloudwatch:ListMetrics",
3146 ],
3147 resources=["*"],
3148 )
3149 )
3151 # sqs:SendMessage scoped to the regional job queue only
3152 self.mcp_server_role.add_to_policy(
3153 iam.PolicyStatement(
3154 effect=iam.Effect.ALLOW,
3155 actions=["sqs:SendMessage", "sqs:GetQueueUrl", "sqs:GetQueueAttributes"],
3156 resources=[self.job_queue.queue_arn],
3157 )
3158 )
3160 # Export the role ARN so operators can set GCO_MCP_ROLE_ARN in their
3161 # MCP server environment.
3162 CfnOutput(
3163 self,
3164 "McpServerRoleArn",
3165 value=self.mcp_server_role.role_arn,
3166 description=(
3167 "IAM role ARN for the GCO MCP server. Set GCO_MCP_ROLE_ARN to "
3168 "this value when launching the MCP server so it assumes a "
3169 "least-privilege role instead of ambient credentials."
3170 ),
3171 export_name=f"{project_name}-mcp-server-role-arn-{self.deployment_region}",
3172 )
3174 # cdk-nag suppressions: CloudWatch metrics APIs cannot be scoped.
3175 from cdk_nag import NagSuppressions
3177 NagSuppressions.add_resource_suppressions(
3178 self.mcp_server_role,
3179 [
3180 {
3181 "id": "AwsSolutions-IAM5",
3182 "reason": (
3183 "The CloudWatch metrics APIs (GetMetricData, "
3184 "GetMetricStatistics, ListMetrics) do not support "
3185 "resource-level IAM; wildcard resource is required. "
3186 "The S3 permissions use the {project_name}-* prefix "
3187 "pattern because the model weights bucket name is "
3188 "auto-generated by CDK in the global stack and a "
3189 "cross-stack ARN export would create tight stack "
3190 "coupling. All actions are read-only or scoped "
3191 "send-only (SQS)."
3192 ),
3193 },
3194 ],
3195 apply_to_children=True,
3196 )
3198 def _create_outputs(self) -> None:
3199 """Create CloudFormation outputs for cluster information"""
3200 project_name = self.config.get_project_name()
3202 # Export cluster information
3203 CfnOutput(
3204 self,
3205 "ClusterName",
3206 value=self.cluster.cluster_name,
3207 description=f"EKS cluster name for {self.deployment_region}",
3208 export_name=f"{project_name}-cluster-name-{self.deployment_region}",
3209 )
3211 CfnOutput(
3212 self,
3213 "ClusterArn",
3214 value=self.cluster.cluster_arn,
3215 description=f"EKS cluster ARN for {self.deployment_region}",
3216 export_name=f"{project_name}-cluster-arn-{self.deployment_region}",
3217 )
3219 CfnOutput(
3220 self,
3221 "ClusterEndpoint",
3222 value=self.cluster.cluster_endpoint,
3223 description=f"EKS cluster endpoint for {self.deployment_region}",
3224 export_name=f"{project_name}-cluster-endpoint-{self.deployment_region}",
3225 )
3227 CfnOutput(
3228 self,
3229 "ClusterSecurityGroupId",
3230 value=self.cluster.cluster_security_group_id,
3231 description=f"EKS cluster security group ID for {self.deployment_region}",
3232 export_name=f"{project_name}-cluster-sg-{self.deployment_region}",
3233 )
3235 CfnOutput(
3236 self,
3237 "VpcId",
3238 value=self.vpc.vpc_id,
3239 description=f"VPC ID for {self.deployment_region}",
3240 export_name=f"{project_name}-vpc-id-{self.deployment_region}",
3241 )
3243 # Export public subnet IDs for ALB
3244 public_subnet_ids = [subnet.subnet_id for subnet in self.vpc.public_subnets]
3245 CfnOutput(
3246 self,
3247 "PublicSubnetIds",
3248 value=Fn.join(",", public_subnet_ids),
3249 description=f"Public subnet IDs for ALB in {self.deployment_region}",
3250 export_name=f"{project_name}-public-subnets-{self.deployment_region}",
3251 )
3253 # Note: ALB is created by AWS Load Balancer Controller via Ingress
3254 # The ALB ARN is registered with Global Accelerator by the GA registration Lambda
    def get_cluster(self) -> eks.Cluster:
        """Return the EKS cluster construct created by this stack."""
        return self.cluster
    def get_vpc(self) -> ec2.Vpc:
        """Return the regional VPC construct created by this stack."""
        return self.vpc